| // Copyright 2010-2015, Google Inc. |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| #include "composer/internal/char_chunk.h" |
| |
| #include <set> |
| #include <string> |
| #include <vector> |
| |
| #include "base/logging.h" |
| #include "base/util.h" |
| #include "composer/internal/composition_input.h" |
| #include "composer/internal/transliterators.h" |
| #include "composer/table.h" |
| |
| namespace mozc { |
| namespace composer { |
| |
| namespace { |
| // Max recursion count for looking up pending loop. |
| const int kMaxRecursion = 4; |
| |
| // Delete "end" from "target", if "target" ends with the "end". |
| bool DeleteEnd(const string &end, string *target) { |
| const string::size_type rindex = target->rfind(end); |
| if (rindex == string::npos) { |
| return false; |
| } |
| target->erase(rindex); |
| return true; |
| } |
| |
| // Get from pending rules recursively |
| // The recursion will be stopped if recursion_count is 0. |
| // When returns false, the caller doesn't append result entries. |
| // If we have the rule, |
| // '1' -> '', 'あ' |
| // 'あ1' -> '', 'い' |
| // 'い1' -> '', 'う' |
| // 'う1' -> '', 'え' |
| // 'え1' -> '', 'お' |
| // 'お1' -> '', 'あ' |
| // 'あ*' -> '', '{*}ぁ' |
| // '{*}ぁ' -> '', '{*}あ' |
| // '{*}あ' -> '', '{*}ぁ' |
| // 'い*' -> '', '{*}ぃ' |
| // '{*}ぃ' -> '', '{*}い' |
| // '{*}い' -> '', '{*}ぃ' |
| // Here, we want to get '{*}あ' <-> '{*}ぁ' loop from the input, 'あ' |
| bool GetFromPending(const Table *table, const string &key, |
| int recursion_count, set<string> *result) { |
| DCHECK(result); |
| if (recursion_count == 0) { |
| // Don't find the loop within the |recursion_count|. |
| return false; |
| } |
| if (result->find(key) != result->end()) { |
| // Found the entry that is already looked up. |
| // Return true because we found the loop. |
| return true; |
| } |
| result->insert(key); |
| |
| vector<const Entry *> entries; |
| table->LookUpPredictiveAll(key, &entries); |
| for (size_t i = 0; i < entries.size(); ++i) { |
| if (!entries[i]->result().empty()) { |
| // skip rules with result, because this causes too many results. |
| // for example, if we have |
| // 'k' -> 'っ', 'k' |
| // 'ka' -> 'か', '' |
| // From the input 'k', this causes 'か', 'っ', 'っか', ... |
| // So here we stop calling recursion. |
| return false; |
| } |
| if (!GetFromPending(table, entries[i]->pending(), |
| recursion_count - 1, result)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| } // anonymous namespace |
| |
| CharChunk::CharChunk(Transliterators::Transliterator transliterator, |
| const Table *table) |
| : transliterator_(transliterator), |
| table_(table), |
| attributes_(NO_TABLE_ATTRIBUTE) { |
| DCHECK_NE(Transliterators::LOCAL, transliterator); |
| } |
| |
| void CharChunk::Clear() { |
| raw_.clear(); |
| conversion_.clear(); |
| pending_.clear(); |
| ambiguous_.clear(); |
| } |
| |
| size_t CharChunk::GetLength(Transliterators::Transliterator t12r) const { |
| const string t13n = Transliterate( |
| t12r, |
| Table::DeleteSpecialKey(raw_), |
| Table::DeleteSpecialKey(conversion_ + pending_)); |
| return Util::CharsLen(t13n); |
| } |
| |
| void CharChunk::AppendResult(Transliterators::Transliterator t12r, |
| string *result) const { |
| const string t13n = Transliterate( |
| t12r, |
| Table::DeleteSpecialKey(raw_), |
| Table::DeleteSpecialKey(conversion_ + pending_)); |
| result->append(t13n); |
| } |
| |
| void CharChunk::AppendTrimedResult(Transliterators::Transliterator t12r, |
| string *result) const { |
| // Only determined value (e.g. |conversion_| only) is added. |
| string converted = conversion_; |
| if (!pending_.empty()) { |
| size_t key_length = 0; |
| bool fixed = false; |
| const Entry *entry = table_->LookUpPrefix(pending_, &key_length, &fixed); |
| if (entry != NULL && entry->input() == entry->result()) { |
| converted.append(entry->result()); |
| } |
| } |
| result->append(Transliterate( |
| t12r, |
| Table::DeleteSpecialKey(raw_), |
| Table::DeleteSpecialKey(converted))); |
| } |
| |
| void CharChunk::AppendFixedResult(Transliterators::Transliterator t12r, |
| string *result) const { |
| string converted = conversion_; |
| if (!ambiguous_.empty()) { |
| // Add the |ambiguous_| value as a fixed value. |ambiguous_| |
| // contains an undetermined result string like "ん" converted |
| // from a single 'n'. |
| converted.append(ambiguous_); |
| } else { |
| // If |pending_| exists but |ambiguous_| does not exist, |
| // |pending_| is appended. If |ambiguous_| exists, the value of |
| // |pending_| is usually equal to |ambiguous_| so it is not |
| // appended. |
| converted.append(pending_); |
| } |
| result->append(Transliterate( |
| t12r, |
| Table::DeleteSpecialKey(raw_), |
| Table::DeleteSpecialKey(converted))); |
| } |
| |
| // If we have the rule (roman), |
| // 1: 'ka' -> 'か', '' |
| // 2: 'ki' -> 'き', '' |
| // 3: 'ku' -> 'く', '' |
| // 4: 'kk' -> 'っ', 'k' |
| // From the input 'k', we want to correct 'k', 'か', 'き', 'く', 'っ' |
| // We don't expand for next k for rule 4, because it causes |
| // many useless looped results, like 'っか', 'っっか', 'っっ', etc |
| // |
| // If we have the input 'kk', we will get 'か', 'き', 'く', 'っ' from the |
| // pending 'k' of rule 4. |
| // With the result of AppendTrimedResult, 'っ', we can get |
| // 'っか', 'っき', 'っく', 'っっ' from this input. |
| // |
| // If we have the rule (kana), |
| // 'は゜' -> 'ぱ', '' |
| // 'は゛' -> 'ば', '' |
| // From the input 'は', we want 'は', 'ば', 'ぱ' |
| // |
| // For mobile rules, |
| // If we have the rule, |
| // '1' -> '', 'あ' |
| // 'あ1' -> '', 'い' |
| // 'い1' -> '', 'う' |
| // 'う1' -> '', 'え' |
| // 'え1' -> '', 'お' |
| // 'お1' -> '', 'あ' |
| // 'あ*' -> '', '{*}ぁ' |
| // '{*}ぁ' -> '', '{*}あ' |
| // '{*}あ' -> '', '{*}ぁ' |
| // 'い*' -> '', '{*}ぃ' |
| // '{*}ぃ' -> '', '{*}い' |
| // '{*}い' -> '', '{*}ぃ' |
| // From the input '1', we want 'あ', 'ぁ', not including 'い', 'ぃ', 'う', etc |
| // |
| // Actually, we don't need to recursive lookup for above two patterns. |
| // |
| // What we want to append here is the 'looped rule' in |kMaxRecursion| lookup. |
| // Here, '{*}ぁ' -> '{*}あ' -> '{*}ぁ' is the loop. |
| void CharChunk::GetExpandedResults(set<string> *results) const { |
| DCHECK(results); |
| |
| if (pending_.empty()) { |
| return; |
| } |
| // Append current pending string |
| if (conversion_.empty()) { |
| results->insert(Table::DeleteSpecialKey(pending_)); |
| } |
| vector<const Entry *> entries; |
| table_->LookUpPredictiveAll(pending_, &entries); |
| for (size_t i = 0; i < entries.size(); ++i) { |
| if (!entries[i]->result().empty()) { |
| results->insert(Table::DeleteSpecialKey(entries[i]->result())); |
| } |
| if (entries[i]->pending().empty()) { |
| continue; |
| } |
| set<string> loop_result; |
| if (!GetFromPending(table_, entries[i]->pending(), |
| kMaxRecursion, &loop_result)) { |
| continue; |
| } |
| for (set<string>::const_iterator itr = loop_result.begin(); |
| itr != loop_result.end(); ++itr) { |
| results->insert(Table::DeleteSpecialKey(*itr)); |
| } |
| } |
| } |
| |
| bool CharChunk::IsFixed() const { |
| return pending_.empty(); |
| } |
| |
| bool CharChunk::IsAppendable(Transliterators::Transliterator t12r, |
| const Table *table) const { |
| return !pending_.empty() && |
| (t12r == Transliterators::LOCAL || t12r == transliterator_) && |
| table == table_; |
| } |
| |
| bool CharChunk::IsConvertible( |
| Transliterators::Transliterator t12r, |
| const Table *table, |
| const string &input) const { |
| if (!IsAppendable(t12r, table)) { |
| return false; |
| } |
| |
| size_t key_length = 0; |
| bool fixed = false; |
| string key = pending_ + input; |
| const Entry *entry = table->LookUpPrefix(key, &key_length, &fixed); |
| |
| return entry && (key.size() == key_length) && fixed; |
| } |
| |
| void CharChunk::Combine(const CharChunk &left_chunk) { |
| conversion_ = left_chunk.conversion_ + conversion_; |
| raw_ = left_chunk.raw_ + raw_; |
| // TODO(komatsu): This is a hacky way. We should look up the |
| // conversion table with the new |raw_| value. |
| if (left_chunk.ambiguous_.empty()) { |
| ambiguous_.clear(); |
| } else { |
| if (ambiguous_.empty()) { |
| ambiguous_ = left_chunk.ambiguous_ + pending_; |
| } else { |
| ambiguous_ = left_chunk.ambiguous_ + ambiguous_; |
| } |
| } |
| pending_ = left_chunk.pending_ + pending_; |
| } |
| |
| bool CharChunk::AddInputInternal(string *input) { |
| const bool kNoLoop = false; |
| |
| size_t key_length = 0; |
| bool fixed = false; |
| string key = pending_ + *input; |
| const Entry *entry = table_->LookUpPrefix(key, &key_length, &fixed); |
| |
| if (entry == NULL) { |
| if (key_length == 0) { |
| // No prefix character is not contained in the table, fallback |
| // operation is performed. |
| if (pending_.empty()) { |
| // Conversion data was not found. |
| AddConvertedChar(input); |
| } |
| return kNoLoop; |
| } |
| |
| if (key_length < pending_.size()) { |
| // Do not modify this char_chunk, all key characters will be used |
| // by the next char_chunk. |
| return kNoLoop; |
| } |
| |
| DCHECK_GE(key_length, pending_.size()); |
| // Some prefix character is contained in the table, but not |
| // reached any conversion result (like "t" with "ta->た"). |
| key_length -= pending_.size(); |
| |
| // Conversion data had only pending. |
| const string new_pending_chars(*input, 0, key_length); |
| raw_.append(new_pending_chars); |
| pending_.append(new_pending_chars); |
| if (!ambiguous_.empty()) { |
| // If ambiguos_ has already a value, ambiguous_ is appended. |
| // ex. "ny" makes ambiguous_ "んy", but "sh" leaves ambiguous_ empty. |
| ambiguous_.append(new_pending_chars); |
| } |
| input->erase(0, key_length); |
| return kNoLoop; |
| } |
| |
| // The prefix of key reached a conversion result, thus entry is not NULL. |
| |
| if (key.size() == key_length) { |
| const bool is_following_entry = ( |
| !conversion_.empty() || |
| (!raw_.empty() && !pending_.empty() && raw_ != pending_)); |
| |
| raw_.append(*input); |
| input->clear(); |
| if (fixed) { |
| // The whole key has been used, table lookup has reached a fixed |
| // value. It is a stable world. (like "ka->か", "q@->だ"). |
| conversion_.append(entry->result()); |
| pending_ = entry->pending(); |
| ambiguous_.clear(); |
| |
| // If this entry is the first entry, the table attributes are |
| // applied to this chunk. |
| if (!is_following_entry) { |
| attributes_ = entry->attributes(); |
| } |
| } else { // !fixed |
| // The whole string of key reached a conversion result, but the |
| // result is ambiguous (like "n" with "n->ん and na->な"). |
| pending_ = key; |
| ambiguous_ = entry->result(); |
| } |
| return kNoLoop; |
| } |
| |
| // Delete pending_ from raw_ if matched. |
| DeleteEnd(pending_, &raw_); |
| |
| // A result was found without any ambiguity. |
| input->assign(key, key_length, key.size() - key_length); |
| raw_.append(key, 0, key_length); |
| conversion_.append(entry->result()); |
| pending_ = entry->pending(); |
| ambiguous_.clear(); |
| |
| if (input->empty() || pending_.empty()) { |
| // If the remaining input character or pending character is empty, |
| // there is no reason to continue the looping. |
| return kNoLoop; |
| } |
| |
| const bool kLoop = true; |
| return kLoop; |
| } |
| |
| void CharChunk::AddInput(string *input) { |
| while (AddInputInternal(input)) {} |
| } |
| |
| void CharChunk::AddConvertedChar(string *input) { |
| // TODO(komatsu) Nice to make "string Util::PopOneChar(string *str);". |
| string first_char = Util::SubString(*input, 0, 1); |
| conversion_.append(first_char); |
| raw_.append(first_char); |
| *input = Util::SubString(*input, 1, string::npos); |
| } |
| |
| void CharChunk::AddInputAndConvertedChar(string *key, |
| string *converted_char) { |
| // If this chunk is empty, the key and converted_char are simply |
| // copied. |
| if (raw_.empty() && pending_.empty() && conversion_.empty()) { |
| raw_ = *key; |
| key->clear(); |
| pending_ = *converted_char; |
| // TODO(komatsu): We should check if the |converted_char| is |
| // really ambigous or not, otherwise the last character of the |
| // preedit on Kana input is always dropped. |
| ambiguous_ = *converted_char; |
| converted_char->clear(); |
| |
| // If this entry is the first entry, the table attributes are |
| // applied to this chunk. |
| const Entry *entry = table_->LookUp(pending_); |
| if (entry != NULL) { |
| attributes_ = entry->attributes(); |
| } |
| return; |
| } |
| |
| const string input = pending_ + *converted_char; |
| size_t key_length = 0; |
| bool fixed = false; |
| const Entry *entry = table_->LookUpPrefix(input, &key_length, &fixed); |
| if (entry == NULL) { |
| // Do not modify this char_chunk, all key and converted_char |
| // values will be used by the next char_chunk. |
| return; |
| } |
| |
| // The whole input string was used. |
| if (key_length == input.size()) { |
| raw_.append(*key); |
| if (fixed) { |
| conversion_.append(entry->result()); |
| pending_ = entry->pending(); |
| ambiguous_.clear(); |
| } else { // !fixed |
| // |conversion_| remains the current value. |
| pending_ = entry->result(); |
| ambiguous_ = entry->result(); |
| } |
| key->clear(); |
| converted_char->clear(); |
| return; |
| } |
| |
| // The following key_length == pending_.size() means the new key and |
| // and converted_char does not affect at all. Do not any thing here |
| // and a new char_chunk will be made for the new key and |
| // converted_char. |
| if (key_length == pending_.size()) { |
| return; |
| } |
| |
| // The input is partially used. |
| raw_.append(*key); |
| conversion_.append(entry->result()); |
| pending_ = entry->pending(); |
| // While the whole key is used in this char_chunk, the |
| // converted_char is separated to this char_chunk and the next |
| // char_chunk. This is not a preferred behavior, but there is no |
| // better way to work around this limitation. |
| key->clear(); |
| converted_char->assign(input, key_length, input.size() - key_length); |
| } |
| |
| bool CharChunk::ShouldCommit() const { |
| return (attributes_ & DIRECT_INPUT) && pending_.empty(); |
| } |
| |
| bool CharChunk::ShouldInsertNewChunk(const CompositionInput &input) const { |
| if (raw_.empty() && conversion_.empty() && pending_.empty()) { |
| return false; |
| } |
| |
| const bool is_new_input = |
| input.is_new_input() || |
| ((attributes_ & END_CHUNK) && pending_.empty()); |
| |
| if (is_new_input && |
| (table_->HasNewChunkEntry(input.raw()) || |
| !table_->HasSubRules(input.raw()))) { |
| // Such input which is not on the table is treated as NewChunk entry. |
| return true; |
| } |
| return false; |
| } |
| |
| void CharChunk::AddCompositionInput(CompositionInput *input) { |
| if (input->has_conversion()) { |
| AddInputAndConvertedChar( |
| input->mutable_raw(), |
| input->mutable_conversion()); |
| return; |
| } |
| |
| if (ShouldInsertNewChunk(*input)) { |
| return; |
| } |
| AddInput(input->mutable_raw()); |
| } |
| |
| void CharChunk::SetTransliterator( |
| const Transliterators::Transliterator transliterator) { |
| if (transliterator == Transliterators::LOCAL) { |
| // LOCAL transliterator shouldn't be set as local transliterator. |
| // Just ignore. |
| return; |
| } |
| transliterator_ = transliterator; |
| } |
| |
| const string &CharChunk::raw() const { |
| return raw_; |
| } |
| |
| void CharChunk::set_raw(const string &raw) { |
| raw_ = raw; |
| } |
| |
| const string &CharChunk::conversion() const { |
| return conversion_; |
| } |
| |
| void CharChunk::set_conversion(const string &conversion) { |
| conversion_ = conversion; |
| } |
| |
| const string &CharChunk::pending() const { |
| return pending_; |
| } |
| |
| void CharChunk::set_pending(const string &pending) { |
| pending_ = pending; |
| } |
| |
| const string &CharChunk::ambiguous() const { |
| return ambiguous_; |
| } |
| |
| void CharChunk::set_ambiguous(const string &ambiguous) { |
| ambiguous_ = ambiguous; |
| } |
| |
| bool CharChunk::SplitChunk(Transliterators::Transliterator t12r, |
| const size_t position, |
| CharChunk **left_new_chunk) { |
| if (position <= 0 || position >= GetLength(t12r)) { |
| LOG(WARNING) << "Invalid position: " << position; |
| return false; |
| } |
| |
| string raw_lhs, raw_rhs, converted_lhs, converted_rhs; |
| Transliterators::GetTransliterator(GetTransliterator(t12r))->Split( |
| position, |
| Table::DeleteSpecialKey(raw_), |
| Table::DeleteSpecialKey(conversion_ + pending_), |
| &raw_lhs, &raw_rhs, &converted_lhs, &converted_rhs); |
| |
| *left_new_chunk = new CharChunk(transliterator_, table_); |
| (*left_new_chunk)->set_raw(raw_lhs); |
| set_raw(raw_rhs); |
| |
| if (converted_lhs.size() > conversion_.size()) { |
| // [ conversion | pending ] => [ conv | pend#1 ] [ pend#2 ] |
| const string pending_lhs(converted_lhs, conversion_.size()); |
| (*left_new_chunk)->set_conversion(conversion_); |
| (*left_new_chunk)->set_pending(pending_lhs); |
| |
| conversion_.clear(); |
| pending_ = converted_rhs; |
| ambiguous_.clear(); |
| } else { |
| // [ conversion | pending ] => [ conv#1 ] [ conv#2 | pending ] |
| (*left_new_chunk)->set_conversion(converted_lhs); |
| // left_new_chunk->set_pending(""); |
| const size_t pending_pos = converted_rhs.size() - pending_.size(); |
| conversion_.assign(converted_rhs, 0, pending_pos); |
| // pending_ = pending_; |
| } |
| return true; |
| } |
| |
| Transliterators::Transliterator |
| CharChunk::GetTransliterator( |
| Transliterators::Transliterator transliterator) const { |
| if (attributes_ && NO_TRANSLITERATION) { |
| if (transliterator == Transliterators::LOCAL || |
| transliterator == Transliterators::HALF_ASCII || |
| transliterator == Transliterators::FULL_ASCII) { |
| return Transliterators::CONVERSION_STRING; |
| } |
| return transliterator; |
| } |
| if (transliterator == Transliterators::LOCAL) { |
| return transliterator_; |
| } |
| return transliterator; |
| } |
| |
| string CharChunk::Transliterate( |
| Transliterators::Transliterator transliterator, |
| const string &raw, const string &converted) const { |
| return Transliterators::GetTransliterator( |
| GetTransliterator(transliterator))->Transliterate(raw, converted); |
| } |
| |
| |
| CharChunk *CharChunk::Clone() const { |
| // TODO(hsumita): Implements TransliteratorFactory and uses it instead of |
| // copying a pointer. |
| CharChunk *new_char_chunk = new CharChunk(transliterator_, table_); |
| new_char_chunk->raw_.assign(raw_); |
| new_char_chunk->conversion_.assign(conversion_); |
| new_char_chunk->pending_.assign(pending_); |
| new_char_chunk->ambiguous_.assign(ambiguous_); |
| new_char_chunk->attributes_ = attributes_; |
| return new_char_chunk; |
| } |
| |
| } // namespace composer |
| } // namespace mozc |