src/composer/internal/char_chunk.cc - mozc - Git at Google

 // Copyright 2010, Google Inc.
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
 //
 //     * Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
 //     * Redistributions in binary form must reproduce the above
 // copyright notice, this list of conditions and the following disclaimer
 // in the documentation and/or other materials provided with the
 // distribution.
 //     * Neither the name of Google Inc. nor the names of its
 // contributors may be used to endorse or promote products derived from
 // this software without specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #include "composer/internal/char_chunk.h"

 #include "base/util.h"
 #include "composer/internal/transliterators.h"
 #include "composer/table.h"

 namespace mozc {
 namespace composer {

 namespace {
 // Delete "end" from "target", if "target" ends with the "end".
 bool DeleteEnd(const string &end, string *target) {
   const string::size_type rindex = target->rfind(end);
   if (rindex == string::npos) {
     return false;
   }
   target->erase(rindex);
   return true;
 }
 };

 CharChunk::CharChunk()
     : transliterator_(Transliterators::GetConversionStringSelector()),
       status_mask_(0) {}

 void CharChunk::Clear() {
   raw_.clear();
   conversion_.clear();
   pending_.clear();
   ambiguous_.clear();
   clear_status();
 }

 size_t CharChunk::GetLength(const TransliteratorInterface *t12r) const {
   const string t13n =
       GetTransliterator(t12r)->Transliterate(raw_, conversion_ + pending_);
   return Util::CharsLen(t13n);
 }

 void CharChunk::AppendResult(const Table &table,
                              const TransliteratorInterface *t12r,
                              string *result) const {
   if (has_status(NO_CONVERSION)) {
     result->append(raw_);
   } else {
     const string t13n =
         GetTransliterator(t12r)->Transliterate(raw_, conversion_ + pending_);
     result->append(t13n);
   }
 }

 void CharChunk::AppendTrimedResult(const Table &table,
                                    const TransliteratorInterface *t12r,
                                    string *result) const {
   if (has_status(NO_CONVERSION)) {
     result->append(raw_);
   } else {
     // Only determined value (e.g. |conversion_| only) is added.
     string converted = conversion_;
     if (!pending_.empty()) {
       size_t key_length = 0;
       bool fixed = false;
       const Entry* entry = table.LookUpPrefix(pending_, &key_length, &fixed);
       if (entry != NULL && entry->input() == entry->result()) {
         converted.append(entry->result());
       }
     }
     result->append(GetTransliterator(t12r)->Transliterate(raw_, converted));
   }
 }

 void CharChunk::AppendFixedResult(const Table &table,
                                   const TransliteratorInterface *t12r,
                                   string *result) const {
   if (has_status(NO_CONVERSION)) {
     result->append(raw_);
   } else {
     string converted = conversion_;
     if (!ambiguous_.empty()) {
       // Add the |ambiguous_| value as a fixed value.  |ambiguous_|
       // contains an undetermined result string like "ん" converted
       // from a single 'n'.
       converted.append(ambiguous_);
     } else {
       // If |pending_| exists but |ambiguous_| does not exist,
       // |pending_| is appended.  If |ambiguous_| exists, the value of
       // |pending_| is usually equal to |ambiguous_| so it is not
       // appended.
       converted.append(pending_);
     }
     result->append(GetTransliterator(t12r)->Transliterate(raw_, converted));
   }
 }

 bool CharChunk::IsFixed() const {
   return pending_.empty();
 }

 bool CharChunk::IsAppendable(const TransliteratorInterface *t12r) const {
   return !pending_.empty() && (t12r == NULL || t12r == transliterator_);
 }

 bool CharChunk::AddInputInternal(const Table &table, string *input) {
   const bool kNoLoop = false;

   size_t key_length = 0;
   bool fixed = false;
   string key = pending_ + *input;
   const Entry* entry = table.LookUpPrefix(key, &key_length, &fixed);

   if (entry == NULL) {
     if (key_length == 0) {
       // No prefix character is not contained in the table, fallback
       // operation is performed.
       if (pending_.empty()) {
         // Conversion data was not found.
         AddConvertedChar(input);
       }
       return kNoLoop;
     }

     // Some prefix character is contained in the table, but not
     // reached any conversion result (like "t" with "ta->た").
     DCHECK_GE(key_length, pending_.size());
     key_length -= pending_.size();

     // Conversion data had only pending.
     const string new_pending_chars = input->substr(0, key_length);
     raw_.append(new_pending_chars);
     pending_.append(new_pending_chars);
     input->erase(0, key_length);
     return kNoLoop;
   }
   // The prefix of key reached a conversion result, thus entry is not NULL.

   if (key.size() == key_length) {
     raw_.append(*input);
     input->clear();
     if (fixed) {
       // The whole key has been used, table lookup has reached a fixed
       // value.  It is a stable world. (like "ka->か", "q@->だ").
       conversion_.append(entry->result());
       pending_ = entry->pending();
       ambiguous_.clear();
     } else {  // !fixed
       // The whole string of key reached a conversion result, but the
       // result is ambiguous (like "n" with "n->ん and na->な").
       pending_ = key;
       ambiguous_ = entry->result();
     }
     return kNoLoop;
   }

   // Delete pending_ from raw_ if matched.
   DeleteEnd(pending_, &raw_);

   // A result was found without any ambiguity.
   input->assign(key.substr(key_length));
   raw_.append(key.substr(0, key_length));
   conversion_.append(entry->result());
   pending_ = entry->pending();
   ambiguous_.clear();

   if (input->empty() || pending_.empty()) {
     // If the remaining input character or pending character is empty,
     // there is no reason to continue the looping.
     return kNoLoop;
   }

   const bool kLoop = true;
   return kLoop;
 }

 void CharChunk::AddInput(const Table &table, string *input) {
   while (AddInputInternal(table, input));
 }

 void CharChunk::AddConvertedChar(string *input) {
   // TODO(komatsu) Nice to make "string Util::PopOneChar(string *str);".
   string first_char = Util::SubString(*input, 0, 1);
   conversion_.append(first_char);
   raw_.append(first_char);
   *input = Util::SubString(*input, 1, string::npos);
 }

 void CharChunk::AddInputAndConvertedChar(const Table &table,
                                          string *key,
                                          string *converted_char) {
   // If this chunk is empty, the key and converted_char are simply
   // copied.
   if (raw_.empty() && pending_.empty() && conversion_.empty()) {
     raw_ = *key;
     key->clear();
     pending_ = *converted_char;
     // TODO(komatsu): We should check if the |converted_char| is
     // really ambigous or not, otherwise the last character of the
     // preedit on Kana input is always dropped.
     ambiguous_ = *converted_char;
     converted_char->clear();
     return;
   }

   const string input = pending_ + *converted_char;
   size_t key_length = 0;
   bool fixed = false;
   const Entry* entry = table.LookUpPrefix(input, &key_length, &fixed);
   if (entry == NULL) {
     // Do not modify this char_chunk, all key and converted_char
     // values will be used by the next char_chunk.
     return;
   }

   // The whole input string was used.
   if (key_length == input.size()) {
     raw_.append(*key);
     if (fixed) {
       conversion_.append(entry->result());
       pending_ = entry->pending();
       ambiguous_.clear();
     } else {
       // |conversion_| remains the current value.
       pending_ = entry->result();
       ambiguous_ = entry->result();
     }
     key->clear();
     converted_char->clear();
     return;
   }

   // The following key_length == pending_.size() means the new key and
   // and converted_char does not affect at all.  Do not any thing here
   // and a new char_chunk will be made for the new key and
   // converted_char.
   if (key_length == pending_.size()) {
     return;
   }

   // The input is partially used.
   raw_.append(*key);
   conversion_.append(entry->result());
   pending_ = entry->pending();
   // While the whole key is used in this char_chunk, the
   // converted_char is separated to this char_chunk and the next
   // char_chunk.  This is not a preferred behavior, but there is no
   // better way to work around this limitation.
   key->clear();
   converted_char->assign(input.substr(key_length));
 }

 void CharChunk::SetTransliterator(
     const TransliteratorInterface *transliterator) {
   if (transliterator == NULL) {
     return;
   }
   transliterator_ = transliterator;
 }

 const string &CharChunk::raw() const {
   return raw_;
 }

 void CharChunk::set_raw(const string &raw) {
   raw_ = raw;
 }

 const string &CharChunk::conversion() const {
   return conversion_;
 }

 void CharChunk::set_conversion(const string &conversion) {
   conversion_ = conversion;
 }

 const string &CharChunk::pending() const {
   return pending_;
 }

 void CharChunk::set_pending(const string &pending) {
   pending_ = pending;
 }

 void CharChunk::set_status(const uint32 status_mask) {
   status_mask_ = status_mask;
 }

 void CharChunk::add_status(const uint32 status_mask) {
   status_mask_ |= status_mask;
 }

 bool CharChunk::has_status(const uint32 status_mask) const {
   return (status_mask == (status_mask_ & status_mask));
 }

 void CharChunk::clear_status() {
   status_mask_ = 0;
 }

 bool CharChunk::SplitChunk(const TransliteratorInterface *t12r,
                            const size_t position,
                            CharChunk* left_new_chunk) {
   if (position <= 0 || position >= GetLength(t12r)) {
     LOG(WARNING) << "Invalid position: " << position;
     return false;
   }

   string raw_lhs, raw_rhs, converted_lhs, converted_rhs;
   GetTransliterator(t12r)->Split(
       position, raw_, conversion_ + pending_,
       &raw_lhs, &raw_rhs, &converted_lhs, &converted_rhs);

   left_new_chunk->SetTransliterator(transliterator_);
   left_new_chunk->set_raw(raw_lhs);
   set_raw(raw_rhs);

   if (converted_lhs.size() > conversion_.size()) {
     // [ conversion | pending ] => [ conv | pend#1 ] [ pend#2 ]
     const string pending_lhs = converted_lhs.substr(conversion_.size());
     left_new_chunk->set_conversion(conversion_);
     left_new_chunk->set_pending(pending_lhs);

     conversion_.clear();
     pending_ = converted_rhs;
   } else {
     // [ conversion | pending ] => [ conv#1 ] [ conv#2 | pending ]
     left_new_chunk->set_conversion(converted_lhs);
     // left_new_chunk->set_pending("");
     const size_t pending_pos = converted_rhs.size() - pending_.size();
     conversion_ = converted_rhs.substr(0, pending_pos);
     // pending_ = pending_;
   }
   return true;
 }

 const TransliteratorInterface *CharChunk::GetTransliterator(
     const TransliteratorInterface *transliterator) const {
   DCHECK(transliterator_);
   return transliterator != NULL ? transliterator : transliterator_;
 }


 }  // namespace composer
 }  // namespace mozc
	// Copyright 2010, Google Inc.
	// All rights reserved.
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions are
	// met:
	//
	// * Redistributions of source code must retain the above copyright
	// notice, this list of conditions and the following disclaimer.
	// * Redistributions in binary form must reproduce the above
	// copyright notice, this list of conditions and the following disclaimer
	// in the documentation and/or other materials provided with the
	// distribution.
	// * Neither the name of Google Inc. nor the names of its
	// contributors may be used to endorse or promote products derived from
	// this software without specific prior written permission.
	//
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	#include "composer/internal/char_chunk.h"

	#include "base/util.h"
	#include "composer/internal/transliterators.h"
	#include "composer/table.h"

	namespace mozc {
	namespace composer {

	namespace {
	// Delete "end" from "target", if "target" ends with the "end".
	bool DeleteEnd(const string &end, string *target) {
	const string::size_type rindex = target->rfind(end);
	if (rindex == string::npos) {
	return false;
	}
	target->erase(rindex);
	return true;
	}
	};

	CharChunk::CharChunk()
	: transliterator_(Transliterators::GetConversionStringSelector()),
	status_mask_(0) {}

	void CharChunk::Clear() {
	raw_.clear();
	conversion_.clear();
	pending_.clear();
	ambiguous_.clear();
	clear_status();
	}

	size_t CharChunk::GetLength(const TransliteratorInterface *t12r) const {
	const string t13n =
	GetTransliterator(t12r)->Transliterate(raw_, conversion_ + pending_);
	return Util::CharsLen(t13n);
	}

	void CharChunk::AppendResult(const Table &table,
	const TransliteratorInterface *t12r,
	string *result) const {
	if (has_status(NO_CONVERSION)) {
	result->append(raw_);
	} else {
	const string t13n =
	GetTransliterator(t12r)->Transliterate(raw_, conversion_ + pending_);
	result->append(t13n);
	}
	}

	void CharChunk::AppendTrimedResult(const Table &table,
	const TransliteratorInterface *t12r,
	string *result) const {
	if (has_status(NO_CONVERSION)) {
	result->append(raw_);
	} else {
	// Only determined value (e.g. \|conversion_\| only) is added.
	string converted = conversion_;
	if (!pending_.empty()) {
	size_t key_length = 0;
	bool fixed = false;
	const Entry* entry = table.LookUpPrefix(pending_, &key_length, &fixed);
	if (entry != NULL && entry->input() == entry->result()) {
	converted.append(entry->result());
	}
	}
	result->append(GetTransliterator(t12r)->Transliterate(raw_, converted));
	}
	}

	void CharChunk::AppendFixedResult(const Table &table,
	const TransliteratorInterface *t12r,
	string *result) const {
	if (has_status(NO_CONVERSION)) {
	result->append(raw_);
	} else {
	string converted = conversion_;
	if (!ambiguous_.empty()) {
	// Add the \|ambiguous_\| value as a fixed value. \|ambiguous_\|
	// contains an undetermined result string like "ん" converted
	// from a single 'n'.
	converted.append(ambiguous_);
	} else {
	// If \|pending_\| exists but \|ambiguous_\| does not exist,
	// \|pending_\| is appended. If \|ambiguous_\| exists, the value of
	// \|pending_\| is usually equal to \|ambiguous_\| so it is not
	// appended.
	converted.append(pending_);
	}
	result->append(GetTransliterator(t12r)->Transliterate(raw_, converted));
	}
	}

	bool CharChunk::IsFixed() const {
	return pending_.empty();
	}

	bool CharChunk::IsAppendable(const TransliteratorInterface *t12r) const {
	return !pending_.empty() && (t12r == NULL \|\| t12r == transliterator_);
	}

	bool CharChunk::AddInputInternal(const Table &table, string *input) {
	const bool kNoLoop = false;

	size_t key_length = 0;
	bool fixed = false;
	string key = pending_ + *input;
	const Entry* entry = table.LookUpPrefix(key, &key_length, &fixed);

	if (entry == NULL) {
	if (key_length == 0) {
	// No prefix character is not contained in the table, fallback
	// operation is performed.
	if (pending_.empty()) {
	// Conversion data was not found.
	AddConvertedChar(input);
	}
	return kNoLoop;
	}

	// Some prefix character is contained in the table, but not
	// reached any conversion result (like "t" with "ta->た").
	DCHECK_GE(key_length, pending_.size());
	key_length -= pending_.size();

	// Conversion data had only pending.
	const string new_pending_chars = input->substr(0, key_length);
	raw_.append(new_pending_chars);
	pending_.append(new_pending_chars);
	input->erase(0, key_length);
	return kNoLoop;
	}
	// The prefix of key reached a conversion result, thus entry is not NULL.

	if (key.size() == key_length) {
	raw_.append(*input);
	input->clear();
	if (fixed) {
	// The whole key has been used, table lookup has reached a fixed
	// value. It is a stable world. (like "ka->か", "q@->だ").
	conversion_.append(entry->result());
	pending_ = entry->pending();
	ambiguous_.clear();
	} else { // !fixed
	// The whole string of key reached a conversion result, but the
	// result is ambiguous (like "n" with "n->ん and na->な").
	pending_ = key;
	ambiguous_ = entry->result();
	}
	return kNoLoop;
	}

	// Delete pending_ from raw_ if matched.
	DeleteEnd(pending_, &raw_);

	// A result was found without any ambiguity.
	input->assign(key.substr(key_length));
	raw_.append(key.substr(0, key_length));
	conversion_.append(entry->result());
	pending_ = entry->pending();
	ambiguous_.clear();

	if (input->empty() \|\| pending_.empty()) {
	// If the remaining input character or pending character is empty,
	// there is no reason to continue the looping.
	return kNoLoop;
	}

	const bool kLoop = true;
	return kLoop;
	}

	void CharChunk::AddInput(const Table &table, string *input) {
	while (AddInputInternal(table, input));
	}

	void CharChunk::AddConvertedChar(string *input) {
	// TODO(komatsu) Nice to make "string Util::PopOneChar(string *str);".
	string first_char = Util::SubString(*input, 0, 1);
	conversion_.append(first_char);
	raw_.append(first_char);
	input = Util::SubString(input, 1, string::npos);
	}

	void CharChunk::AddInputAndConvertedChar(const Table &table,
	string *key,
	string *converted_char) {
	// If this chunk is empty, the key and converted_char are simply
	// copied.
	if (raw_.empty() && pending_.empty() && conversion_.empty()) {
	raw_ = *key;
	key->clear();
	pending_ = *converted_char;
	// TODO(komatsu): We should check if the \|converted_char\| is
	// really ambigous or not, otherwise the last character of the
	// preedit on Kana input is always dropped.
	ambiguous_ = *converted_char;
	converted_char->clear();
	return;
	}

	const string input = pending_ + *converted_char;
	size_t key_length = 0;
	bool fixed = false;
	const Entry* entry = table.LookUpPrefix(input, &key_length, &fixed);
	if (entry == NULL) {
	// Do not modify this char_chunk, all key and converted_char
	// values will be used by the next char_chunk.
	return;
	}

	// The whole input string was used.
	if (key_length == input.size()) {
	raw_.append(*key);
	if (fixed) {
	conversion_.append(entry->result());
	pending_ = entry->pending();
	ambiguous_.clear();
	} else {
	// \|conversion_\| remains the current value.
	pending_ = entry->result();
	ambiguous_ = entry->result();
	}
	key->clear();
	converted_char->clear();
	return;
	}

	// The following key_length == pending_.size() means the new key and
	// and converted_char does not affect at all. Do not any thing here
	// and a new char_chunk will be made for the new key and
	// converted_char.
	if (key_length == pending_.size()) {
	return;
	}

	// The input is partially used.
	raw_.append(*key);
	conversion_.append(entry->result());
	pending_ = entry->pending();
	// While the whole key is used in this char_chunk, the
	// converted_char is separated to this char_chunk and the next
	// char_chunk. This is not a preferred behavior, but there is no
	// better way to work around this limitation.
	key->clear();
	converted_char->assign(input.substr(key_length));
	}

	void CharChunk::SetTransliterator(
	const TransliteratorInterface *transliterator) {
	if (transliterator == NULL) {
	return;
	}
	transliterator_ = transliterator;
	}

	const string &CharChunk::raw() const {
	return raw_;
	}

	void CharChunk::set_raw(const string &raw) {
	raw_ = raw;
	}

	const string &CharChunk::conversion() const {
	return conversion_;
	}

	void CharChunk::set_conversion(const string &conversion) {
	conversion_ = conversion;
	}

	const string &CharChunk::pending() const {
	return pending_;
	}

	void CharChunk::set_pending(const string &pending) {
	pending_ = pending;
	}

	void CharChunk::set_status(const uint32 status_mask) {
	status_mask_ = status_mask;
	}

	void CharChunk::add_status(const uint32 status_mask) {
	status_mask_ \|= status_mask;
	}

	bool CharChunk::has_status(const uint32 status_mask) const {
	return (status_mask == (status_mask_ & status_mask));
	}

	void CharChunk::clear_status() {
	status_mask_ = 0;
	}

	bool CharChunk::SplitChunk(const TransliteratorInterface *t12r,
	const size_t position,
	CharChunk* left_new_chunk) {
	if (position <= 0 \|\| position >= GetLength(t12r)) {
	LOG(WARNING) << "Invalid position: " << position;
	return false;
	}

	string raw_lhs, raw_rhs, converted_lhs, converted_rhs;
	GetTransliterator(t12r)->Split(
	position, raw_, conversion_ + pending_,
	&raw_lhs, &raw_rhs, &converted_lhs, &converted_rhs);

	left_new_chunk->SetTransliterator(transliterator_);
	left_new_chunk->set_raw(raw_lhs);
	set_raw(raw_rhs);

	if (converted_lhs.size() > conversion_.size()) {
	// [ conversion \| pending ] => [ conv \| pend#1 ] [ pend#2 ]
	const string pending_lhs = converted_lhs.substr(conversion_.size());
	left_new_chunk->set_conversion(conversion_);
	left_new_chunk->set_pending(pending_lhs);

	conversion_.clear();
	pending_ = converted_rhs;
	} else {
	// [ conversion \| pending ] => [ conv#1 ] [ conv#2 \| pending ]
	left_new_chunk->set_conversion(converted_lhs);
	// left_new_chunk->set_pending("");
	const size_t pending_pos = converted_rhs.size() - pending_.size();
	conversion_ = converted_rhs.substr(0, pending_pos);
	// pending_ = pending_;
	}
	return true;
	}

	const TransliteratorInterface *CharChunk::GetTransliterator(
	const TransliteratorInterface *transliterator) const {
	DCHECK(transliterator_);
	return transliterator != NULL ? transliterator : transliterator_;
	}


	} // namespace composer
	} // namespace mozc