| // Copyright 2010-2015, Google Inc. |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| // System dictionary maintains following sections |
| // (1) Key trie |
| // Trie containing encoded key. Returns ids for lookup. |
| // We can get the key using the id by performing reverse lookup |
| // against the trie. |
| // (2) Value trie |
| // Trie containing encoded value. Returns ids for lookup. |
| // We can get the value using the id by performing reverse lookup |
| // against the trie. |
| // (3) Token array |
| // Array containing encoded tokens. Array index is the id in key trie. |
| // Token contains cost, POS, the id in key trie, etc. |
| // (4) Table for high frequent POS(left/right ID) |
| // Frequenty appearing POSs are stored as POS ids in token info for |
| // reducing binary size. This table is the map from the id to the |
| // actual ids. |
| |
| #include "dictionary/system/system_dictionary.h" |
| |
| #include <algorithm> |
| #include <cstring> |
| #include <limits> |
| #include <map> |
| #include <queue> |
| #include <string> |
| #include <utility> |
| #include <vector> |
| |
| #include "base/logging.h" |
| #include "base/port.h" |
| #include "base/string_piece.h" |
| #include "base/system_util.h" |
| #include "base/util.h" |
| #include "dictionary/dictionary_token.h" |
| #include "dictionary/file/dictionary_file.h" |
| #include "dictionary/system/codec_interface.h" |
| #include "dictionary/system/words_info.h" |
| #include "storage/louds/bit_vector_based_array.h" |
| #include "storage/louds/louds_trie.h" |
| |
| namespace mozc { |
| namespace dictionary { |
| |
| using mozc::storage::louds::BitVectorBasedArray; |
| using mozc::storage::louds::LoudsTrie; |
| |
| namespace { |
| |
| const int kMinTokenArrayBlobSize = 4; |
| |
| // Expansion table format: |
| // "<Character to expand>[<Expanded character 1><Expanded character 2>...]" |
| // |
| // Only characters that will be encoded into 1-byte ASCII char are allowed in |
| // the table. |
| // |
| // Note that this implementation has potential issue that the key/values may |
| // be mixed. |
| // TODO(hidehiko): Clean up this hacky implementation. |
| const char *kHiraganaExpansionTable[] = { |
| "\xe3\x81\x82\xe3\x81\x82\xe3\x81\x81", // "ああぁ" |
| "\xe3\x81\x84\xe3\x81\x84\xe3\x81\x83", // "いいぃ" |
| "\xe3\x81\x86\xe3\x81\x86\xe3\x81\x85\xe3\x82\x94", // "ううぅゔ" |
| "\xe3\x81\x88\xe3\x81\x88\xe3\x81\x87", // "ええぇ" |
| "\xe3\x81\x8a\xe3\x81\x8a\xe3\x81\x89", // "おおぉ" |
| "\xe3\x81\x8b\xe3\x81\x8b\xe3\x81\x8c", // "かかが" |
| "\xe3\x81\x8d\xe3\x81\x8d\xe3\x81\x8e", // "ききぎ" |
| "\xe3\x81\x8f\xe3\x81\x8f\xe3\x81\x90", // "くくぐ" |
| "\xe3\x81\x91\xe3\x81\x91\xe3\x81\x92", // "けけげ" |
| "\xe3\x81\x93\xe3\x81\x93\xe3\x81\x94", // "ここご" |
| "\xe3\x81\x95\xe3\x81\x95\xe3\x81\x96", // "ささざ" |
| "\xe3\x81\x97\xe3\x81\x97\xe3\x81\x98", // "ししじ" |
| "\xe3\x81\x99\xe3\x81\x99\xe3\x81\x9a", // "すすず" |
| "\xe3\x81\x9b\xe3\x81\x9b\xe3\x81\x9c", // "せせぜ" |
| "\xe3\x81\x9d\xe3\x81\x9d\xe3\x81\x9e", // "そそぞ" |
| "\xe3\x81\x9f\xe3\x81\x9f\xe3\x81\xa0", // "たただ" |
| "\xe3\x81\xa1\xe3\x81\xa1\xe3\x81\xa2", // "ちちぢ" |
| "\xe3\x81\xa4\xe3\x81\xa4\xe3\x81\xa3\xe3\x81\xa5", // "つつっづ" |
| "\xe3\x81\xa6\xe3\x81\xa6\xe3\x81\xa7", // "ててで" |
| "\xe3\x81\xa8\xe3\x81\xa8\xe3\x81\xa9", // "ととど" |
| "\xe3\x81\xaf\xe3\x81\xaf\xe3\x81\xb0\xe3\x81\xb1", // "ははばぱ" |
| "\xe3\x81\xb2\xe3\x81\xb2\xe3\x81\xb3\xe3\x81\xb4", // "ひひびぴ" |
| "\xe3\x81\xb5\xe3\x81\xb5\xe3\x81\xb6\xe3\x81\xb7", // "ふふぶぷ" |
| "\xe3\x81\xb8\xe3\x81\xb8\xe3\x81\xb9\xe3\x81\xba", // "へへべぺ" |
| "\xe3\x81\xbb\xe3\x81\xbb\xe3\x81\xbc\xe3\x81\xbd", // "ほほぼぽ" |
| "\xe3\x82\x84\xe3\x82\x84\xe3\x82\x83", // "ややゃ" |
| "\xe3\x82\x86\xe3\x82\x86\xe3\x82\x85", // "ゆゆゅ" |
| "\xe3\x82\x88\xe3\x82\x88\xe3\x82\x87", // "よよょ" |
| "\xe3\x82\x8f\xe3\x82\x8f\xe3\x82\x8e", // "わわゎ" |
| }; |
| |
| const uint32 kAsciiRange = 0x80; |
| |
| // Confirm that all the characters are within ASCII range. |
| bool ContainsAsciiCodeOnly(const string &str) { |
| for (string::const_iterator it = str.begin(); it != str.end(); ++it) { |
| if (static_cast<unsigned char>(*it) >= kAsciiRange) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| void SetKeyExpansion(char key, const string &expansion, |
| KeyExpansionTable *key_expansion_table) { |
| key_expansion_table->Add(key, expansion); |
| } |
| |
| void BuildHiraganaExpansionTable( |
| const SystemDictionaryCodecInterface &codec, |
| KeyExpansionTable *encoded_table) { |
| for (size_t index = 0; index < arraysize(kHiraganaExpansionTable); ++index) { |
| string encoded; |
| codec.EncodeKey(kHiraganaExpansionTable[index], &encoded); |
| DCHECK(ContainsAsciiCodeOnly(encoded)) << |
| "Encoded expansion data are supposed to fit within ASCII"; |
| |
| DCHECK_GT(encoded.size(), 0) << "Expansion data is empty"; |
| |
| if (encoded.size() == 1) { |
| continue; |
| } else { |
| SetKeyExpansion(encoded[0], encoded.substr(1), encoded_table); |
| } |
| } |
| } |
| |
| // Note that this class is just introduced due to performance reason. |
| // Conceptually, it should be in somewhere close to the codec implementation |
| // (see comments in Next method for details). |
| // However, it is necessary to refactor a bit larger area, especially around |
| // codec implementations, to make it happen. |
| // Considering the merit to introduce this class, we temporarily put it here. |
| // TODO(hidehiko): Move this class into a Codec related file. |
| class TokenDecodeIterator { |
| public: |
| TokenDecodeIterator( |
| const SystemDictionaryCodecInterface *codec, |
| const LoudsTrie &value_trie, |
| const uint32 *frequent_pos, |
| StringPiece key, |
| const uint8 *ptr) |
| : codec_(codec), |
| value_trie_(&value_trie), |
| frequent_pos_(frequent_pos), |
| key_(key), |
| state_(HAS_NEXT), |
| ptr_(ptr), |
| token_info_(NULL) { |
| key.CopyToString(&token_.key); |
| NextInternal(); |
| } |
| |
| ~TokenDecodeIterator() { |
| } |
| |
| const TokenInfo& Get() const { return token_info_; } |
| bool Done() const { return state_ == DONE; } |
| |
| void Next() { |
| DCHECK_NE(state_, DONE); |
| if (state_ == LAST_TOKEN) { |
| state_ = DONE; |
| return; |
| } |
| NextInternal(); |
| } |
| |
| private: |
| enum State { |
| HAS_NEXT, |
| LAST_TOKEN, |
| DONE, |
| }; |
| |
| void NextInternal() { |
| // Reset token_info with preserving some needed info in previous token. |
| int prev_id_in_value_trie = token_info_.id_in_value_trie; |
| token_info_.Clear(); |
| token_info_.token = &token_; |
| |
| // Do not clear key in token. |
| token_info_.token->attributes = Token::NONE; |
| |
| // This implementation is depending on the internal behavior of DecodeToken |
| // especially which fields are updated or not. Important fields are: |
| // Token::key, Token::value : key and value are never updated. |
| // Token::cost : always updated. |
| // Token::lid, Token::rid : updated iff the pos_type is neither |
| // FREQUENT_POS nor SAME_AS_PREV_POS. |
| // Token::attributes : updated iff the value is SPELLING_COLLECTION. |
| // TokenInfo::id_in_value_trie : updated iff the value_type is |
| // DEFAULT_VALUE. |
| // Thus, by not-reseting Token instance intentionally, we can skip most |
| // SAME_AS_PREV operations. |
| // The exception is Token::attributes. It is not-always set, so we need |
| // reset it everytime. |
| // This kind of structure should be packed in the codec or some |
| // related but new class. |
| int read_bytes; |
| if (!codec_->DecodeToken(ptr_, &token_info_, &read_bytes)) { |
| state_ = LAST_TOKEN; |
| } |
| ptr_ += read_bytes; |
| |
| // Fill remaining values. |
| switch (token_info_.value_type) { |
| case TokenInfo::DEFAULT_VALUE: { |
| token_.value.clear(); |
| LookupValue(token_info_.id_in_value_trie, &token_.value); |
| break; |
| } |
| case TokenInfo::SAME_AS_PREV_VALUE: { |
| DCHECK_NE(prev_id_in_value_trie, -1); |
| token_info_.id_in_value_trie = prev_id_in_value_trie; |
| // We can keep the current value here. |
| break; |
| } |
| case TokenInfo::AS_IS_HIRAGANA: { |
| token_.value = token_.key; |
| break; |
| } |
| case TokenInfo::AS_IS_KATAKANA: { |
| if (!key_.empty() && key_katakana_.empty()) { |
| Util::HiraganaToKatakana(key_, &key_katakana_); |
| } |
| token_.value = key_katakana_; |
| break; |
| } |
| default: { |
| LOG(DFATAL) << "unknown value_type: " << token_info_.value_type; |
| break; |
| } |
| } |
| |
| if (token_info_.accent_encoding_type == TokenInfo::EMBEDDED_IN_TOKEN) { |
| token_.value.append(1, '_') |
| .append(Util::StringPrintf("%d", token_info_.accent_type)); |
| } |
| |
| if (token_info_.pos_type == TokenInfo::FREQUENT_POS) { |
| const uint32 pos = frequent_pos_[token_info_.id_in_frequent_pos_map]; |
| token_.lid = pos >> 16; |
| token_.rid = pos & 0xffff; |
| } |
| } |
| |
| void LookupValue(int id, string *value) const { |
| char buffer[LoudsTrie::kMaxDepth + 1]; |
| const StringPiece encoded_value = value_trie_->RestoreKeyString(id, buffer); |
| codec_->DecodeValue(encoded_value, value); |
| } |
| |
| const SystemDictionaryCodecInterface *codec_; |
| const LoudsTrie *value_trie_; |
| const uint32 *frequent_pos_; |
| |
| const StringPiece key_; |
| // Katakana key will be lazily initialized. |
| string key_katakana_; |
| |
| State state_; |
| const uint8 *ptr_; |
| |
| TokenInfo token_info_; |
| Token token_; |
| |
| DISALLOW_COPY_AND_ASSIGN(TokenDecodeIterator); |
| }; |
| |
| inline const uint8 *GetTokenArrayPtr(const BitVectorBasedArray &token_array, |
| int key_id) { |
| size_t length = 0; |
| return reinterpret_cast<const uint8*>(token_array.Get(key_id, &length)); |
| } |
| |
| // Iterator for scanning token array. |
| // This iterator does not return actual token info but returns |
| // id data and the position only. |
| // This will be used only for reverse lookup. |
| // Forward lookup does not need such iterator because it can access |
| // a token directly without linear scan. |
| // |
| // Usage: |
| // for (TokenScanIterator iter(codec_, token_array_); |
| // !iter.Done(); iter.Next()) { |
| // const TokenScanIterator::Result &result = iter.Get(); |
| // // Do something with |result|. |
| // } |
| class TokenScanIterator { |
| public: |
| struct Result { |
| // Value id for the current token |
| int value_id; |
| // Index (= key id) for the current token |
| int index; |
| // Offset from the tokens section beginning. |
| // (token_array_->Get(id_in_key_trie) == |
| // token_array_->Get(0) + tokens_offset) |
| int tokens_offset; |
| }; |
| |
| TokenScanIterator( |
| const SystemDictionaryCodecInterface *codec, |
| const BitVectorBasedArray &token_array) |
| : codec_(codec), |
| termination_flag_(codec->GetTokensTerminationFlag()), |
| state_(HAS_NEXT), |
| offset_(0), |
| tokens_offset_(0), |
| index_(0) { |
| encoded_tokens_ptr_ = GetTokenArrayPtr(token_array, 0); |
| NextInternal(); |
| } |
| |
| ~TokenScanIterator() { |
| } |
| |
| const Result &Get() const { return result_; } |
| |
| bool Done() const { return state_ == DONE; } |
| |
| void Next() { |
| DCHECK_NE(state_, DONE); |
| NextInternal(); |
| } |
| |
| private: |
| enum State { |
| HAS_NEXT, |
| DONE, |
| }; |
| |
| void NextInternal() { |
| if (encoded_tokens_ptr_[offset_] == termination_flag_) { |
| state_ = DONE; |
| return; |
| } |
| int read_bytes; |
| result_.value_id = -1; |
| result_.index = index_; |
| result_.tokens_offset = tokens_offset_; |
| const bool is_last_token = |
| !(codec_->ReadTokenForReverseLookup(encoded_tokens_ptr_ + offset_, |
| &result_.value_id, &read_bytes)); |
| if (is_last_token) { |
| int tokens_size = offset_ + read_bytes - tokens_offset_; |
| if (tokens_size < kMinTokenArrayBlobSize) { |
| tokens_size = kMinTokenArrayBlobSize; |
| } |
| tokens_offset_ += tokens_size; |
| ++index_; |
| offset_ = tokens_offset_; |
| } else { |
| offset_ += read_bytes; |
| } |
| } |
| |
| const SystemDictionaryCodecInterface *codec_; |
| const uint8 *encoded_tokens_ptr_; |
| const uint8 termination_flag_; |
| State state_; |
| Result result_; |
| int offset_; |
| int tokens_offset_; |
| int index_; |
| |
| private: |
| DISALLOW_COPY_AND_ASSIGN(TokenScanIterator); |
| }; |
| |
| struct ReverseLookupResult { |
| ReverseLookupResult() : tokens_offset(-1), id_in_key_trie(-1) {} |
| // Offset from the tokens section beginning. |
| // (token_array_.Get(id_in_key_trie) == token_array_.Get(0) + tokens_offset) |
| int tokens_offset; |
| // Id in key trie |
| int id_in_key_trie; |
| }; |
| |
| } // namespace |
| |
| class SystemDictionary::ReverseLookupCache { |
| public: |
| ReverseLookupCache() {} |
| |
| bool IsAvailable(const set<int> &id_set) const { |
| for (set<int>::const_iterator itr = id_set.begin(); |
| itr != id_set.end(); |
| ++itr) { |
| if (results.find(*itr) == results.end()) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| multimap<int, ReverseLookupResult> results; |
| |
| private: |
| DISALLOW_COPY_AND_ASSIGN(ReverseLookupCache); |
| }; |
| |
| class SystemDictionary::ReverseLookupIndex { |
| public: |
| ReverseLookupIndex( |
| const SystemDictionaryCodecInterface *codec, |
| const BitVectorBasedArray &token_array) { |
| // Gets id size. |
| int value_id_max = -1; |
| for (TokenScanIterator iter(codec, token_array); |
| !iter.Done(); iter.Next()) { |
| const TokenScanIterator::Result &result = iter.Get(); |
| value_id_max = max(value_id_max, result.value_id); |
| } |
| |
| CHECK_GE(value_id_max, 0); |
| index_size_ = value_id_max + 1; |
| index_.reset(new ReverseLookupResultArray[index_size_]); |
| |
| // Gets result size for each ids. |
| for (TokenScanIterator iter(codec, token_array); |
| !iter.Done(); iter.Next()) { |
| const TokenScanIterator::Result &result = iter.Get(); |
| if (result.value_id != -1) { |
| DCHECK_LT(result.value_id, index_size_); |
| ++(index_[result.value_id].size); |
| } |
| } |
| |
| for (size_t i = 0; i < index_size_; ++i) { |
| index_[i].results.reset(new ReverseLookupResult[index_[i].size]); |
| } |
| |
| // Builds index. |
| for (TokenScanIterator iter(codec, token_array); |
| !iter.Done(); iter.Next()) { |
| const TokenScanIterator::Result &result = iter.Get(); |
| if (result.value_id == -1) { |
| continue; |
| } |
| |
| DCHECK_LT(result.value_id, index_size_); |
| ReverseLookupResultArray *result_array = &index_[result.value_id]; |
| |
| // Finds uninitialized result. |
| size_t result_index = 0; |
| for (result_index = 0; |
| result_index < result_array->size; |
| ++result_index) { |
| const ReverseLookupResult &lookup_result = |
| result_array->results[result_index]; |
| if (lookup_result.tokens_offset == -1 && |
| lookup_result.id_in_key_trie == -1) { |
| result_array->results[result_index].tokens_offset = |
| result.tokens_offset; |
| result_array->results[result_index].id_in_key_trie = result.index; |
| break; |
| } |
| } |
| } |
| |
| CHECK(index_.get() != NULL); |
| } |
| |
| ~ReverseLookupIndex() {} |
| |
| void FillResultMap(const set<int> &id_set, |
| multimap<int, ReverseLookupResult> *result_map) { |
| for (set<int>::const_iterator id_itr = id_set.begin(); |
| id_itr != id_set.end(); ++id_itr) { |
| const ReverseLookupResultArray &result_array = index_[*id_itr]; |
| for (size_t i = 0; i < result_array.size; ++i) { |
| result_map->insert(make_pair(*id_itr, result_array.results[i])); |
| } |
| } |
| } |
| |
| private: |
| struct ReverseLookupResultArray { |
| ReverseLookupResultArray() : size(0) {} |
| // Use scoped_ptr for reducing memory consumption as possible. |
| // Using vector requires 90 MB even when we call resize explicitly. |
| // On the other hand, scoped_ptr requires 57 MB. |
| scoped_ptr<ReverseLookupResult[]> results; |
| size_t size; |
| }; |
| |
| // Use scoped array for reducing memory consumption as possible. |
| scoped_ptr<ReverseLookupResultArray[]> index_; |
| size_t index_size_; |
| |
| DISALLOW_COPY_AND_ASSIGN(ReverseLookupIndex); |
| }; |
| |
| struct SystemDictionary::PredictiveLookupSearchState { |
| PredictiveLookupSearchState() : key_pos(0), is_expanded(false) {} |
| PredictiveLookupSearchState(const storage::louds::LoudsTrie::Node &n, |
| size_t pos, bool expanded) |
| : node(n), key_pos(pos), is_expanded(expanded) {} |
| |
| storage::louds::LoudsTrie::Node node; |
| size_t key_pos; |
| bool is_expanded; |
| }; |
| |
| struct SystemDictionary::Builder::Specification { |
| enum InputType { |
| FILENAME, |
| IMAGE, |
| }; |
| |
| Specification(InputType t, const string &fn, const char *p, int l, Options o, |
| const SystemDictionaryCodecInterface *codec) |
| : type(t), filename(fn), ptr(p), len(l), options(o), codec(codec) {} |
| |
| InputType type; |
| |
| // For InputType::FILENAME |
| const string filename; |
| |
| // For InputType::IMAGE |
| const char *ptr; |
| const int len; |
| |
| Options options; |
| const SystemDictionaryCodecInterface *codec; |
| }; |
| |
| SystemDictionary::Builder::Builder(const string &filename) |
| : spec_(new Specification(Specification::FILENAME, |
| filename, NULL, -1, NONE, NULL)) {} |
| |
| SystemDictionary::Builder::Builder(const char *ptr, int len) |
| : spec_(new Specification(Specification::IMAGE, |
| "", ptr, len, NONE, NULL)) {} |
| |
| SystemDictionary::Builder::~Builder() {} |
| |
| SystemDictionary::Builder & SystemDictionary::Builder::SetOptions( |
| Options options) { |
| spec_->options = options; |
| return *this; |
| } |
| |
| SystemDictionary::Builder &SystemDictionary::Builder::SetCodec( |
| const SystemDictionaryCodecInterface *codec) { |
| spec_->codec = codec; |
| return *this; |
| } |
| |
| SystemDictionary *SystemDictionary::Builder::Build() { |
| if (spec_->codec == NULL) { |
| spec_->codec = SystemDictionaryCodecFactory::GetCodec(); |
| } |
| |
| scoped_ptr<SystemDictionary> instance(new SystemDictionary(spec_->codec)); |
| |
| switch (spec_->type) { |
| case Specification::FILENAME: |
| if (!instance->dictionary_file_->OpenFromFile(spec_->filename)) { |
| LOG(ERROR) << "Failed to open system dictionary file"; |
| return NULL; |
| } |
| break; |
| case Specification::IMAGE: |
| // Make the dictionary not to be paged out. |
| // We don't check the return value because the process doesn't necessarily |
| // has the priviledge to mlock. |
| // Note that we don't munlock the space because it's always better to keep |
| // the singleton system dictionary paged in as long as the process runs. |
| SystemUtil::MaybeMLock(spec_->ptr, spec_->len); |
| if (!instance->dictionary_file_->OpenFromImage(spec_->ptr, spec_->len)) { |
| LOG(ERROR) << "Failed to open system dictionary image"; |
| return NULL; |
| } |
| break; |
| default: |
| LOG(ERROR) << "Invalid input type."; |
| return NULL; |
| } |
| |
| if (!instance->OpenDictionaryFile( |
| (spec_->options & ENABLE_REVERSE_LOOKUP_INDEX) != 0)) { |
| LOG(ERROR) << "Failed to create system dictionary"; |
| return NULL; |
| } |
| |
| return instance.release(); |
| } |
| |
| SystemDictionary::SystemDictionary(const SystemDictionaryCodecInterface *codec) |
| : frequent_pos_(NULL), |
| codec_(codec), |
| dictionary_file_(new DictionaryFile) {} |
| |
| SystemDictionary::~SystemDictionary() {} |
| |
| bool SystemDictionary::OpenDictionaryFile(bool enable_reverse_lookup_index) { |
| int len; |
| |
| const uint8 *key_image = reinterpret_cast<const uint8 *>( |
| dictionary_file_->GetSection(codec_->GetSectionNameForKey(), &len)); |
| if (!key_trie_.Open(key_image)) { |
| LOG(ERROR) << "cannot open key trie"; |
| return false; |
| } |
| |
| BuildHiraganaExpansionTable(*codec_, &hiragana_expansion_table_); |
| |
| const uint8 *value_image = reinterpret_cast<const uint8 *>( |
| dictionary_file_->GetSection(codec_->GetSectionNameForValue(), &len)); |
| if (!value_trie_.Open(value_image)) { |
| LOG(ERROR) << "can not open value trie"; |
| return false; |
| } |
| |
| const unsigned char *token_image = reinterpret_cast<const unsigned char *>( |
| dictionary_file_->GetSection(codec_->GetSectionNameForTokens(), &len)); |
| token_array_.Open(token_image); |
| |
| frequent_pos_ = reinterpret_cast<const uint32*>( |
| dictionary_file_->GetSection(codec_->GetSectionNameForPos(), &len)); |
| if (frequent_pos_ == NULL) { |
| LOG(ERROR) << "can not find frequent pos section"; |
| return false; |
| } |
| |
| if (enable_reverse_lookup_index) { |
| InitReverseLookupIndex(); |
| } |
| |
| return true; |
| } |
| |
| void SystemDictionary::InitReverseLookupIndex() { |
| if (reverse_lookup_index_.get() != NULL) { |
| return; |
| } |
| reverse_lookup_index_.reset(new ReverseLookupIndex(codec_, token_array_)); |
| } |
| |
| bool SystemDictionary::HasKey(StringPiece key) const { |
| string encoded_key; |
| codec_->EncodeKey(key, &encoded_key); |
| return key_trie_.HasKey(encoded_key); |
| } |
| |
| bool SystemDictionary::HasValue(StringPiece value) const { |
| string encoded_value; |
| codec_->EncodeValue(value, &encoded_value); |
| if (value_trie_.HasKey(encoded_value)) { |
| return true; |
| } |
| |
| // Because Hiragana, Katakana and Alphabet words are not stored in the |
| // value_trie for the data compression. They are only stored in the |
| // key_trie with flags. So we also need to check the existence in |
| // the key_trie. |
| |
| // Normalize the value as the key. This process depends on the |
| // implementation of SystemDictionaryBuilder::BuildValueTrie. |
| string key; |
| Util::KatakanaToHiragana(value, &key); |
| |
| string encoded_key; |
| codec_->EncodeKey(key, &encoded_key); |
| const int key_id = key_trie_.ExactSearch(encoded_key); |
| if (key_id == -1) { |
| return false; |
| } |
| |
| // We need to check the contents of value_trie for Katakana values. |
| // If (key, value) = (かな, カナ) is in the dictionary, "カナ" is |
| // not used as a key for value_trie or key_trie. Only "かな" is |
| // used as a key for key_trie. If we accept this limitation, we can |
| // skip the following code. |
| // |
| // If we add "if (key == value) return true;" here, we can check |
| // almost all cases of Hiragana and Alphabet words without the |
| // following iteration. However, when (mozc, MOZC) is stored but |
| // (mozc, mozc) is NOT stored, HasValue("mozc") wrongly returns |
| // true. |
| |
| // Get the block of tokens for this key. |
| const uint8 *encoded_tokens_ptr = GetTokenArrayPtr(token_array_, key_id); |
| |
| // Check tokens. |
| for (TokenDecodeIterator iter(codec_, value_trie_, frequent_pos_, key, |
| encoded_tokens_ptr); |
| !iter.Done(); iter.Next()) { |
| const Token *token = iter.Get().token; |
| if (value == token->value) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| void SystemDictionary::CollectPredictiveNodesInBfsOrder( |
| StringPiece encoded_key, |
| const KeyExpansionTable &table, |
| size_t limit, |
| vector<PredictiveLookupSearchState> *result) const { |
| queue<PredictiveLookupSearchState> queue; |
| queue.push(PredictiveLookupSearchState(LoudsTrie::Node(), 0, false)); |
| do { |
| PredictiveLookupSearchState state = queue.front(); |
| queue.pop(); |
| |
| // Update traversal state for |encoded_key| and its expanded keys. |
| if (state.key_pos < encoded_key.size()) { |
| const char target_char = encoded_key[state.key_pos]; |
| const ExpandedKey &chars = table.ExpandKey(target_char); |
| |
| for (key_trie_.MoveToFirstChild(&state.node); |
| key_trie_.IsValidNode(state.node); |
| key_trie_.MoveToNextSibling(&state.node)) { |
| const char c = key_trie_.GetEdgeLabelToParentNode(state.node); |
| if (!chars.IsHit(c)) { |
| continue; |
| } |
| const bool is_expanded = state.is_expanded || c != target_char; |
| queue.push(PredictiveLookupSearchState(state.node, |
| state.key_pos + 1, |
| is_expanded)); |
| } |
| continue; |
| } |
| |
| // Collect prediction keys (state.key_pos >= encoded_key.size()). |
| if (key_trie_.IsTerminalNode(state.node)) { |
| result->push_back(state); |
| } |
| |
| // Collected enough entries. Collect all the remaining keys that have the |
| // same length as the longest key. |
| if (result->size() > limit) { |
| // The current key is the longest because of BFS property. |
| const size_t max_key_len = state.key_pos; |
| while (!queue.empty()) { |
| state = queue.front(); |
| queue.pop(); |
| if (state.key_pos > max_key_len) { |
| // Key length in the queue is monotonically increasing because of BFS |
| // property. So we don't need to check all the elements in the queue. |
| break; |
| } |
| DCHECK_EQ(state.key_pos, max_key_len); |
| if (key_trie_.IsTerminalNode(state.node)) { |
| result->push_back(state); |
| } |
| } |
| break; |
| } |
| |
| // Update traversal state for children. |
| for (key_trie_.MoveToFirstChild(&state.node); |
| key_trie_.IsValidNode(state.node); |
| key_trie_.MoveToNextSibling(&state.node)) { |
| queue.push(PredictiveLookupSearchState(state.node, |
| state.key_pos + 1, |
| state.is_expanded)); |
| } |
| } while (!queue.empty()); |
| } |
| |
| void SystemDictionary::LookupPredictive( |
| StringPiece key, bool use_kana_modifier_insensitive_lookup, |
| Callback *callback) const { |
| // Do nothing for empty key, although looking up all the entries with empty |
| // string seems natural. |
| if (key.empty()) { |
| return; |
| } |
| |
| string encoded_key; |
| codec_->EncodeKey(key, &encoded_key); |
| if (encoded_key.size() > LoudsTrie::kMaxDepth) { |
| return; |
| } |
| |
| const KeyExpansionTable &table = use_kana_modifier_insensitive_lookup |
| ? hiragana_expansion_table_ |
| : KeyExpansionTable::GetDefaultInstance(); |
| |
| // TODO(noriyukit): Lookup limit should be implemented at caller side by using |
| // callback mechanism. This hard-coding limits the capability and generality |
| // of dictionary module. CollectPredictiveNodesInBfsOrder() and the following |
| // loop for callback should be integrated for this purpose. |
| const size_t kLookupLimit = 64; |
| vector<PredictiveLookupSearchState> result; |
| result.reserve(kLookupLimit); |
| CollectPredictiveNodesInBfsOrder(encoded_key, table, kLookupLimit, &result); |
| |
| // Reused buffer and instances inside the following loop. |
| char encoded_actual_key_buffer[LoudsTrie::kMaxDepth + 1]; |
| string decoded_key, actual_key_str; |
| decoded_key.reserve(key.size() * 2); |
| actual_key_str.reserve(key.size() * 2); |
| for (size_t i = 0; i < result.size(); ++i) { |
| const PredictiveLookupSearchState &state = result[i]; |
| |
| // Computes the actual key. For example: |
| // key = "くー" |
| // encoded_actual_key = encode("ぐーぐる") [expanded] |
| // encoded_actual_key_prediction_suffix = encode("ぐる") |
| const StringPiece encoded_actual_key = |
| key_trie_.RestoreKeyString(state.node, encoded_actual_key_buffer); |
| const StringPiece encoded_actual_key_prediction_suffix( |
| encoded_actual_key, |
| encoded_key.size(), |
| encoded_actual_key.size() - encoded_key.size()); |
| |
| // decoded_key = "くーぐる" (= key + prediction suffix) |
| decoded_key.clear(); |
| key.CopyToString(&decoded_key); |
| codec_->DecodeKey(encoded_actual_key_prediction_suffix, &decoded_key); |
| switch (callback->OnKey(decoded_key)) { |
| case Callback::TRAVERSE_DONE: |
| return; |
| case Callback::TRAVERSE_NEXT_KEY: |
| continue; |
| case DictionaryInterface::Callback::TRAVERSE_CULL: |
| LOG(FATAL) << "Culling is not implemented."; |
| continue; |
| default: |
| break; |
| } |
| |
| StringPiece actual_key; |
| if (state.is_expanded) { |
| actual_key_str.clear(); |
| codec_->DecodeKey(encoded_actual_key, &actual_key_str); |
| actual_key = actual_key_str; |
| } else { |
| actual_key = decoded_key; |
| } |
| switch (callback->OnActualKey(decoded_key, actual_key, state.is_expanded)) { |
| case Callback::TRAVERSE_DONE: |
| return; |
| case Callback::TRAVERSE_NEXT_KEY: |
| continue; |
| case Callback::TRAVERSE_CULL: |
| LOG(FATAL) << "Culling is not implemented."; |
| continue; |
| default: |
| break; |
| } |
| |
| const int key_id = key_trie_.GetKeyIdOfTerminalNode(state.node); |
| for (TokenDecodeIterator iter(codec_, value_trie_, |
| frequent_pos_, actual_key, |
| GetTokenArrayPtr(token_array_, key_id)); |
| !iter.Done(); iter.Next()) { |
| const TokenInfo &token_info = iter.Get(); |
| const Callback::ResultType result = |
| callback->OnToken(decoded_key, actual_key, *token_info.token); |
| if (result == Callback::TRAVERSE_DONE) { |
| return; |
| } |
| if (result == Callback::TRAVERSE_NEXT_KEY) { |
| break; |
| } |
| DCHECK_NE(Callback::TRAVERSE_CULL, result) << "Not implemented"; |
| } |
| } |
| } |
| |
| namespace { |
| |
| // An implementation of prefix search without key expansion. Runs |callback| |
| // for prefixes of |encoded_key| in |key_trie|. |
| // Args: |
| // key_trie, value_trie, token_array, codec, frequent_pos: |
| // Members in SystemDictionary. |
| // key: |
| // The head address of the original key before applying codec. |
| // encoded_key: |
| // The encoded |key|. |
| // callback: |
| // A callback function to be called. |
| // token_filter: |
| // A functor of signature bool(const TokenInfo &). Only tokens for which |
| // this functor returns true are passed to callback function. |
| template <typename Func> |
| void RunCallbackOnEachPrefix(const LoudsTrie &key_trie, |
| const LoudsTrie &value_trie, |
| const BitVectorBasedArray &token_array, |
| const SystemDictionaryCodecInterface *codec, |
| const uint32 *frequent_pos, |
| const char *key, |
| StringPiece encoded_key, |
| DictionaryInterface::Callback *callback, |
| Func token_filter) { |
| typedef DictionaryInterface::Callback Callback; |
| LoudsTrie::Node node; |
| for (StringPiece::size_type i = 0; i < encoded_key.size(); ) { |
| if (!key_trie.MoveToChildByLabel(encoded_key[i], &node)) { |
| return; |
| } |
| ++i; // Increment here for next loop and |encoded_prefix| defined below. |
| if (!key_trie.IsTerminalNode(node)) { |
| continue; |
| } |
| const StringPiece encoded_prefix(encoded_key, 0, i); |
| const StringPiece prefix(key, codec->GetDecodedKeyLength(encoded_prefix)); |
| |
| switch (callback->OnKey(prefix)) { |
| case Callback::TRAVERSE_DONE: |
| case Callback::TRAVERSE_CULL: |
| return; |
| case Callback::TRAVERSE_NEXT_KEY: |
| continue; |
| default: |
| break; |
| } |
| |
| switch (callback->OnActualKey(prefix, prefix, false)) { |
| case Callback::TRAVERSE_DONE: |
| case Callback::TRAVERSE_CULL: |
| return; |
| case Callback::TRAVERSE_NEXT_KEY: |
| continue; |
| default: |
| break; |
| } |
| |
| const int key_id = key_trie.GetKeyIdOfTerminalNode(node); |
| for (TokenDecodeIterator iter(codec, value_trie, frequent_pos, prefix, |
| GetTokenArrayPtr(token_array, key_id)); |
| !iter.Done(); iter.Next()) { |
| const TokenInfo &token_info = iter.Get(); |
| if (!token_filter(token_info)) { |
| continue; |
| } |
| const Callback::ResultType res = |
| callback->OnToken(prefix, prefix, *token_info.token); |
| if (res == Callback::TRAVERSE_DONE || res == Callback::TRAVERSE_CULL) { |
| return; |
| } |
| if (res == Callback::TRAVERSE_NEXT_KEY) { |
| break; |
| } |
| } |
| } |
| } |
| |
| struct SelectAllTokens { |
| bool operator()(const TokenInfo &token_info) const { return true; } |
| }; |
| |
| class ReverseLookupCallbackWrapper : public DictionaryInterface::Callback { |
| public: |
| explicit ReverseLookupCallbackWrapper(DictionaryInterface::Callback *callback) |
| : callback_(callback) {} |
| virtual ~ReverseLookupCallbackWrapper() {} |
| virtual SystemDictionary::Callback::ResultType OnToken( |
| StringPiece key, StringPiece actual_key, const Token &token) { |
| Token modified_token = token; |
| modified_token.key.swap(modified_token.value); |
| return callback_->OnToken(key, actual_key, modified_token); |
| } |
| |
| DictionaryInterface::Callback *callback_; |
| }; |
| |
| } // namespace |
| |
| // Recursive implemention of depth-first prefix search with key expansion. |
| // Input parameters: |
| // key: |
| // The head address of the original key before applying codec. |
| // encoded_key: |
| // The encoded |key|. |
| // table: |
| // Key expansion table. |
| // callback: |
| // A callback function to be called. |
| // Parameters for recursion: |
| // node: |
| // Stores the current location in |key_trie_|. |
| // key_pos: |
| // Depth of node, i.e., encoded_key.substr(0, key_pos) is the current prefix |
| // for search. |
| // is_expanded: |
| // true is stored if the current node is reached by key expansion. |
| // actual_key_buffer: |
| // Buffer for storing actually used characters to reach this node, i.e., |
| // StringPiece(actual_key_buffer, key_pos) is the matched prefix using key |
| // expansion. |
| // actual_prefix: |
| // A reused string for decoded actual key. This is just for performance |
| // purpose. |
| DictionaryInterface::Callback::ResultType |
| SystemDictionary::LookupPrefixWithKeyExpansionImpl( |
| const char *key, |
| StringPiece encoded_key, |
| const KeyExpansionTable &table, |
| Callback *callback, |
| LoudsTrie::Node node, |
| StringPiece::size_type key_pos, |
| bool is_expanded, |
| char *actual_key_buffer, |
| string *actual_prefix) const { |
| // This do-block handles a terminal node and callback. do-block is used to |
| // break the block and continue to the subsequent traversal phase. |
| do { |
| if (!key_trie_.IsTerminalNode(node)) { |
| break; |
| } |
| |
| const StringPiece encoded_prefix(encoded_key, 0, key_pos); |
| const StringPiece prefix(key, codec_->GetDecodedKeyLength(encoded_prefix)); |
| Callback::ResultType result = callback->OnKey(prefix); |
| if (result == Callback::TRAVERSE_DONE || |
| result == Callback::TRAVERSE_CULL) { |
| return result; |
| } |
| if (result == Callback::TRAVERSE_NEXT_KEY) { |
| break; // Go to the traversal phase. |
| } |
| |
| const StringPiece encoded_actual_prefix(actual_key_buffer, key_pos); |
| actual_prefix->clear(); |
| codec_->DecodeKey(encoded_actual_prefix, actual_prefix); |
| result = callback->OnActualKey(prefix, *actual_prefix, is_expanded); |
| if (result == Callback::TRAVERSE_DONE || |
| result == Callback::TRAVERSE_CULL) { |
| return result; |
| } |
| if (result == Callback::TRAVERSE_NEXT_KEY) { |
| break; // Go to the traversal phase. |
| } |
| |
| const int key_id = key_trie_.GetKeyIdOfTerminalNode(node); |
| for (TokenDecodeIterator iter(codec_, value_trie_, frequent_pos_, |
| *actual_prefix, |
| GetTokenArrayPtr(token_array_, key_id)); |
| !iter.Done(); iter.Next()) { |
| const TokenInfo &token_info = iter.Get(); |
| result = callback->OnToken(prefix, *actual_prefix, *token_info.token); |
| if (result == Callback::TRAVERSE_DONE || |
| result == Callback::TRAVERSE_CULL) { |
| return result; |
| } |
| if (result == Callback::TRAVERSE_NEXT_KEY) { |
| break; // Go to the traversal phase. |
| } |
| } |
| } while (false); |
| |
| // Traversal phase. |
| if (key_pos == encoded_key.size()) { |
| return Callback::TRAVERSE_CONTINUE; |
| } |
| const char current_char = encoded_key[key_pos]; |
| const ExpandedKey &chars = table.ExpandKey(current_char); |
| for (key_trie_.MoveToFirstChild(&node); key_trie_.IsValidNode(node); |
| key_trie_.MoveToNextSibling(&node)) { |
| const char c = key_trie_.GetEdgeLabelToParentNode(node); |
| if (!chars.IsHit(c)) { |
| continue; |
| } |
| actual_key_buffer[key_pos] = c; |
| const Callback::ResultType result = LookupPrefixWithKeyExpansionImpl( |
| key, encoded_key, table, callback, node, key_pos + 1, |
| is_expanded || c != current_char, |
| actual_key_buffer, actual_prefix); |
| if (result == Callback::TRAVERSE_DONE) { |
| return Callback::TRAVERSE_DONE; |
| } |
| } |
| |
| return Callback::TRAVERSE_CONTINUE; |
| } |
| |
| void SystemDictionary::LookupPrefix( |
| StringPiece key, |
| bool use_kana_modifier_insensitive_lookup, |
| Callback *callback) const { |
| string encoded_key; |
| codec_->EncodeKey(key, &encoded_key); |
| |
| if (!use_kana_modifier_insensitive_lookup) { |
| RunCallbackOnEachPrefix(key_trie_, value_trie_, token_array_, codec_, |
| frequent_pos_, key.data(), encoded_key, callback, |
| SelectAllTokens()); |
| return; |
| } |
| |
| char actual_key_buffer[LoudsTrie::kMaxDepth + 1]; |
| string actual_prefix; |
| actual_prefix.reserve(key.size() * 3); |
| LookupPrefixWithKeyExpansionImpl(key.data(), encoded_key, |
| hiragana_expansion_table_, callback, |
| LoudsTrie::Node(), 0, false, |
| actual_key_buffer, &actual_prefix); |
| } |
| |
| void SystemDictionary::LookupExact(StringPiece key, Callback *callback) const { |
| // Find the key in the key trie. |
| string encoded_key; |
| codec_->EncodeKey(key, &encoded_key); |
| const int key_id = key_trie_.ExactSearch(encoded_key); |
| if (key_id == -1) { |
| return; |
| } |
| if (callback->OnKey(key) != Callback::TRAVERSE_CONTINUE) { |
| return; |
| } |
| |
| // Callback on each token. |
| for (TokenDecodeIterator iter(codec_, value_trie_, frequent_pos_, key, |
| GetTokenArrayPtr(token_array_, key_id)); |
| !iter.Done(); iter.Next()) { |
| if (callback->OnToken(key, key, *iter.Get().token) != |
| Callback::TRAVERSE_CONTINUE) { |
| break; |
| } |
| } |
| } |
| |
| void SystemDictionary::LookupReverse(StringPiece str, |
| Callback *callback) const { |
| // 1st step: Hiragana/Katakana are not in the value trie |
| // 2nd step: Reverse lookup in value trie |
| ReverseLookupCallbackWrapper callback_wrapper(callback); |
| RegisterReverseLookupTokensForT13N(str, &callback_wrapper); |
| RegisterReverseLookupTokensForValue(str, &callback_wrapper); |
| } |
| |
| namespace { |
| |
| class AddKeyIdsToSet { |
| public: |
| explicit AddKeyIdsToSet(set<int> *output) : output_(output) {} |
| |
| void operator()(StringPiece key, size_t prefix_len, |
| const LoudsTrie &trie, LoudsTrie::Node node) { |
| output_->insert(trie.GetKeyIdOfTerminalNode(node)); |
| } |
| |
| private: |
| set<int> *output_; |
| }; |
| |
| inline void AddKeyIdsOfAllPrefixes(const LoudsTrie &trie, StringPiece key, |
| set<int> *key_ids) { |
| trie.PrefixSearch(key, AddKeyIdsToSet(key_ids)); |
| } |
| |
| } // namespace |
| |
| void SystemDictionary::PopulateReverseLookupCache(StringPiece str) const { |
| if (reverse_lookup_index_ != NULL) { |
| // We don't need to prepare cache for the current reverse conversion, |
| // as we have already built the index for reverse lookup. |
| return; |
| } |
| reverse_lookup_cache_.reset(new ReverseLookupCache); |
| DCHECK(reverse_lookup_cache_.get()); |
| |
| // Iterate each suffix and collect IDs of all substrings. |
| set<int> id_set; |
| int pos = 0; |
| string lookup_key; |
| lookup_key.reserve(str.size()); |
| while (pos < str.size()) { |
| const StringPiece suffix(str, pos); |
| lookup_key.clear(); |
| codec_->EncodeValue(suffix, &lookup_key); |
| AddKeyIdsOfAllPrefixes(value_trie_, lookup_key, &id_set); |
| pos += Util::OneCharLen(suffix.data()); |
| } |
| // Collect tokens for all IDs. |
| ScanTokens(id_set, reverse_lookup_cache_.get()); |
| } |
| |
| void SystemDictionary::ClearReverseLookupCache() const { |
| reverse_lookup_cache_.reset(NULL); |
| } |
| |
| namespace { |
| |
| class FilterTokenForRegisterReverseLookupTokensForT13N { |
| public: |
| FilterTokenForRegisterReverseLookupTokensForT13N() { |
| tmp_str_.reserve(LoudsTrie::kMaxDepth * 3); |
| } |
| |
| bool operator()(const TokenInfo &token_info) { |
| // Skip spelling corrections. |
| if (token_info.token->attributes & Token::SPELLING_CORRECTION) { |
| return false; |
| } |
| if (token_info.value_type != TokenInfo::AS_IS_HIRAGANA && |
| token_info.value_type != TokenInfo::AS_IS_KATAKANA) { |
| // SAME_AS_PREV_VALUE may be t13n token. |
| tmp_str_.clear(); |
| Util::KatakanaToHiragana(token_info.token->value, &tmp_str_); |
| if (token_info.token->key != tmp_str_) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| private: |
| string tmp_str_; |
| }; |
| |
| } // namespace |
| |
| void SystemDictionary::RegisterReverseLookupTokensForT13N( |
| StringPiece value, Callback *callback) const { |
| string hiragana_value, encoded_key; |
| Util::KatakanaToHiragana(value, &hiragana_value); |
| codec_->EncodeKey(hiragana_value, &encoded_key); |
| RunCallbackOnEachPrefix(key_trie_, value_trie_, token_array_, codec_, |
| frequent_pos_, hiragana_value.data(), |
| encoded_key, callback, |
| FilterTokenForRegisterReverseLookupTokensForT13N()); |
| } |
| |
| void SystemDictionary::RegisterReverseLookupTokensForValue( |
| StringPiece value, Callback *callback) const { |
| string lookup_key; |
| codec_->EncodeValue(value, &lookup_key); |
| |
| set<int> id_set; |
| AddKeyIdsOfAllPrefixes(value_trie_, lookup_key, &id_set); |
| |
| ReverseLookupCache *results = NULL; |
| ReverseLookupCache non_cached_results; |
| if (reverse_lookup_index_ != NULL) { |
| reverse_lookup_index_->FillResultMap(id_set, &non_cached_results.results); |
| results = &non_cached_results; |
| } else if (reverse_lookup_cache_.get() != NULL && |
| reverse_lookup_cache_->IsAvailable(id_set)) { |
| results = reverse_lookup_cache_.get(); |
| } else { |
| // Cache is not available. Get token for each ID. |
| ScanTokens(id_set, &non_cached_results); |
| results = &non_cached_results; |
| } |
| DCHECK(results != NULL); |
| |
| RegisterReverseLookupResults(id_set, *results, callback); |
| } |
| |
| void SystemDictionary::ScanTokens( |
| const set<int> &id_set, ReverseLookupCache *cache) const { |
| for (TokenScanIterator iter(codec_, token_array_); |
| !iter.Done(); iter.Next()) { |
| const TokenScanIterator::Result &result = iter.Get(); |
| if (result.value_id != -1 && |
| id_set.find(result.value_id) != id_set.end()) { |
| ReverseLookupResult lookup_result; |
| lookup_result.tokens_offset = result.tokens_offset; |
| lookup_result.id_in_key_trie = result.index; |
| cache->results.insert(make_pair(result.value_id, lookup_result)); |
| } |
| } |
| } |
| |
| void SystemDictionary::RegisterReverseLookupResults( |
| const set<int> &id_set, |
| const ReverseLookupCache &cache, |
| Callback *callback) const { |
| const uint8 *encoded_tokens_ptr = GetTokenArrayPtr(token_array_, 0); |
| char buffer[LoudsTrie::kMaxDepth + 1]; |
| for (set<int>::const_iterator set_itr = id_set.begin(); |
| set_itr != id_set.end(); |
| ++set_itr) { |
| const int value_id = *set_itr; |
| typedef multimap<int, ReverseLookupResult>::const_iterator ResultItr; |
| pair<ResultItr, ResultItr> range = cache.results.equal_range(*set_itr); |
| for (ResultItr result_itr = range.first; |
| result_itr != range.second; |
| ++result_itr) { |
| const ReverseLookupResult &reverse_result = result_itr->second; |
| |
| const StringPiece encoded_key = |
| key_trie_.RestoreKeyString(reverse_result.id_in_key_trie, buffer); |
| string tokens_key; |
| codec_->DecodeKey(encoded_key, &tokens_key); |
| if (callback->OnKey(tokens_key) != Callback::TRAVERSE_CONTINUE) { |
| continue; |
| } |
| for (TokenDecodeIterator iter( |
| codec_, value_trie_, frequent_pos_, tokens_key, |
| encoded_tokens_ptr + reverse_result.tokens_offset); |
| !iter.Done(); iter.Next()) { |
| const TokenInfo &token_info = iter.Get(); |
| if (token_info.token->attributes & Token::SPELLING_CORRECTION || |
| token_info.id_in_value_trie != value_id) { |
| continue; |
| } |
| callback->OnToken(tokens_key, tokens_key, *token_info.token); |
| } |
| } |
| } |
| } |
| |
| } // namespace dictionary |
| } // namespace mozc |