src/dictionary/system/system_dictionary.cc - mozc - Git at Google

 // Copyright 2010-2015, Google Inc.
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
 //
 //     * Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
 //     * Redistributions in binary form must reproduce the above
 // copyright notice, this list of conditions and the following disclaimer
 // in the documentation and/or other materials provided with the
 // distribution.
 //     * Neither the name of Google Inc. nor the names of its
 // contributors may be used to endorse or promote products derived from
 // this software without specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 // System dictionary maintains following sections
 //  (1) Key trie
 //       Trie containing encoded key. Returns ids for lookup.
 //       We can get the key using the id by performing reverse lookup
 //       against the trie.
 //  (2) Value trie
 //       Trie containing encoded value. Returns ids for lookup.
 //       We can get the value using the id by performing reverse lookup
 //       against the trie.
 //  (3) Token array
 //       Array containing encoded tokens. Array index is the id in key trie.
 //       Token contains cost, POS, the id in key trie, etc.
 //  (4) Table for high frequent POS(left/right ID)
 //       Frequenty appearing POSs are stored as POS ids in token info for
 //       reducing binary size. This table is the map from the id to the
 //       actual ids.

 #include "dictionary/system/system_dictionary.h"

 #include <algorithm>
 #include <cstring>
 #include <limits>
 #include <map>
 #include <queue>
 #include <string>
 #include <utility>
 #include <vector>

 #include "base/logging.h"
 #include "base/port.h"
 #include "base/string_piece.h"
 #include "base/system_util.h"
 #include "base/util.h"
 #include "dictionary/dictionary_token.h"
 #include "dictionary/file/dictionary_file.h"
 #include "dictionary/system/codec_interface.h"
 #include "dictionary/system/words_info.h"
 #include "storage/louds/bit_vector_based_array.h"
 #include "storage/louds/louds_trie.h"

 namespace mozc {
 namespace dictionary {

 using mozc::storage::louds::BitVectorBasedArray;
 using mozc::storage::louds::LoudsTrie;

 namespace {

 const int kMinTokenArrayBlobSize = 4;

 // Expansion table format:
 // "<Character to expand>[<Expanded character 1><Expanded character 2>...]"
 //
 // Only characters that will be encoded into 1-byte ASCII char are allowed in
 // the table.
 //
 // Note that this implementation has potential issue that the key/values may
 // be mixed.
 // TODO(hidehiko): Clean up this hacky implementation.
 const char *kHiraganaExpansionTable[] = {
   "\xe3\x81\x82\xe3\x81\x82\xe3\x81\x81",  // "ああぁ"
   "\xe3\x81\x84\xe3\x81\x84\xe3\x81\x83",  // "いいぃ"
   "\xe3\x81\x86\xe3\x81\x86\xe3\x81\x85\xe3\x82\x94",  // "ううぅゔ"
   "\xe3\x81\x88\xe3\x81\x88\xe3\x81\x87",  // "ええぇ"
   "\xe3\x81\x8a\xe3\x81\x8a\xe3\x81\x89",  // "おおぉ"
   "\xe3\x81\x8b\xe3\x81\x8b\xe3\x81\x8c",  // "かかが"
   "\xe3\x81\x8d\xe3\x81\x8d\xe3\x81\x8e",  // "ききぎ"
   "\xe3\x81\x8f\xe3\x81\x8f\xe3\x81\x90",  // "くくぐ"
   "\xe3\x81\x91\xe3\x81\x91\xe3\x81\x92",  // "けけげ"
   "\xe3\x81\x93\xe3\x81\x93\xe3\x81\x94",  // "ここご"
   "\xe3\x81\x95\xe3\x81\x95\xe3\x81\x96",  // "ささざ"
   "\xe3\x81\x97\xe3\x81\x97\xe3\x81\x98",  // "ししじ"
   "\xe3\x81\x99\xe3\x81\x99\xe3\x81\x9a",  // "すすず"
   "\xe3\x81\x9b\xe3\x81\x9b\xe3\x81\x9c",  // "せせぜ"
   "\xe3\x81\x9d\xe3\x81\x9d\xe3\x81\x9e",  // "そそぞ"
   "\xe3\x81\x9f\xe3\x81\x9f\xe3\x81\xa0",  // "たただ"
   "\xe3\x81\xa1\xe3\x81\xa1\xe3\x81\xa2",  // "ちちぢ"
   "\xe3\x81\xa4\xe3\x81\xa4\xe3\x81\xa3\xe3\x81\xa5",  // "つつっづ"
   "\xe3\x81\xa6\xe3\x81\xa6\xe3\x81\xa7",  // "ててで"
   "\xe3\x81\xa8\xe3\x81\xa8\xe3\x81\xa9",  // "ととど"
   "\xe3\x81\xaf\xe3\x81\xaf\xe3\x81\xb0\xe3\x81\xb1",  // "ははばぱ"
   "\xe3\x81\xb2\xe3\x81\xb2\xe3\x81\xb3\xe3\x81\xb4",  // "ひひびぴ"
   "\xe3\x81\xb5\xe3\x81\xb5\xe3\x81\xb6\xe3\x81\xb7",  // "ふふぶぷ"
   "\xe3\x81\xb8\xe3\x81\xb8\xe3\x81\xb9\xe3\x81\xba",  // "へへべぺ"
   "\xe3\x81\xbb\xe3\x81\xbb\xe3\x81\xbc\xe3\x81\xbd",  // "ほほぼぽ"
   "\xe3\x82\x84\xe3\x82\x84\xe3\x82\x83",  // "ややゃ"
   "\xe3\x82\x86\xe3\x82\x86\xe3\x82\x85",  // "ゆゆゅ"
   "\xe3\x82\x88\xe3\x82\x88\xe3\x82\x87",  // "よよょ"
   "\xe3\x82\x8f\xe3\x82\x8f\xe3\x82\x8e",  // "わわゎ"
 };

 const uint32 kAsciiRange = 0x80;

 // Confirm that all the characters are within ASCII range.
 bool ContainsAsciiCodeOnly(const string &str) {
   for (string::const_iterator it = str.begin(); it != str.end(); ++it) {
     if (static_cast<unsigned char>(*it) >= kAsciiRange) {
       return false;
     }
   }
   return true;
 }

 void SetKeyExpansion(char key, const string &expansion,
                      KeyExpansionTable *key_expansion_table) {
   key_expansion_table->Add(key, expansion);
 }

 void BuildHiraganaExpansionTable(
     const SystemDictionaryCodecInterface &codec,
     KeyExpansionTable *encoded_table) {
   for (size_t index = 0; index < arraysize(kHiraganaExpansionTable); ++index) {
     string encoded;
     codec.EncodeKey(kHiraganaExpansionTable[index], &encoded);
     DCHECK(ContainsAsciiCodeOnly(encoded)) <<
         "Encoded expansion data are supposed to fit within ASCII";

     DCHECK_GT(encoded.size(), 0) << "Expansion data is empty";

     if (encoded.size() == 1) {
       continue;
     } else {
       SetKeyExpansion(encoded[0], encoded.substr(1), encoded_table);
     }
   }
 }

 // Note that this class is just introduced due to performance reason.
 // Conceptually, it should be in somewhere close to the codec implementation
 // (see comments in Next method for details).
 // However, it is necessary to refactor a bit larger area, especially around
 // codec implementations, to make it happen.
 // Considering the merit to introduce this class, we temporarily put it here.
 // TODO(hidehiko): Move this class into a Codec related file.
 class TokenDecodeIterator {
  public:
   TokenDecodeIterator(
       const SystemDictionaryCodecInterface *codec,
       const LoudsTrie &value_trie,
       const uint32 *frequent_pos,
       StringPiece key,
       const uint8 *ptr)
       : codec_(codec),
         value_trie_(&value_trie),
         frequent_pos_(frequent_pos),
         key_(key),
         state_(HAS_NEXT),
         ptr_(ptr),
         token_info_(NULL) {
     key.CopyToString(&token_.key);
     NextInternal();
   }

   ~TokenDecodeIterator() {
   }

   const TokenInfo& Get() const { return token_info_; }
   bool Done() const { return state_ == DONE; }

   void Next() {
     DCHECK_NE(state_, DONE);
     if (state_ == LAST_TOKEN) {
       state_ = DONE;
       return;
     }
     NextInternal();
   }

  private:
   enum State {
     HAS_NEXT,
     LAST_TOKEN,
     DONE,
   };

   void NextInternal() {
     // Reset token_info with preserving some needed info in previous token.
     int prev_id_in_value_trie = token_info_.id_in_value_trie;
     token_info_.Clear();
     token_info_.token = &token_;

     // Do not clear key in token.
     token_info_.token->attributes = Token::NONE;

     // This implementation is depending on the internal behavior of DecodeToken
     // especially which fields are updated or not. Important fields are:
     // Token::key, Token::value : key and value are never updated.
     // Token::cost : always updated.
     // Token::lid, Token::rid : updated iff the pos_type is neither
     //   FREQUENT_POS nor SAME_AS_PREV_POS.
     // Token::attributes : updated iff the value is SPELLING_COLLECTION.
     // TokenInfo::id_in_value_trie : updated iff the value_type is
     //   DEFAULT_VALUE.
     // Thus, by not-reseting Token instance intentionally, we can skip most
     //   SAME_AS_PREV operations.
     // The exception is Token::attributes. It is not-always set, so we need
     // reset it everytime.
     // This kind of structure should be packed in the codec or some
     // related but new class.
     int read_bytes;
     if (!codec_->DecodeToken(ptr_, &token_info_, &read_bytes)) {
       state_ = LAST_TOKEN;
     }
     ptr_ += read_bytes;

     // Fill remaining values.
     switch (token_info_.value_type) {
       case TokenInfo::DEFAULT_VALUE: {
         token_.value.clear();
         LookupValue(token_info_.id_in_value_trie, &token_.value);
         break;
       }
       case TokenInfo::SAME_AS_PREV_VALUE: {
         DCHECK_NE(prev_id_in_value_trie, -1);
         token_info_.id_in_value_trie = prev_id_in_value_trie;
         // We can keep the current value here.
         break;
       }
       case TokenInfo::AS_IS_HIRAGANA: {
         token_.value = token_.key;
         break;
       }
       case TokenInfo::AS_IS_KATAKANA: {
         if (!key_.empty() && key_katakana_.empty()) {
           Util::HiraganaToKatakana(key_, &key_katakana_);
         }
         token_.value = key_katakana_;
         break;
       }
       default: {
         LOG(DFATAL) << "unknown value_type: " << token_info_.value_type;
         break;
       }
     }

     if (token_info_.accent_encoding_type == TokenInfo::EMBEDDED_IN_TOKEN) {
       token_.value.append(1, '_')
                   .append(Util::StringPrintf("%d", token_info_.accent_type));
     }

     if (token_info_.pos_type == TokenInfo::FREQUENT_POS) {
       const uint32 pos = frequent_pos_[token_info_.id_in_frequent_pos_map];
       token_.lid = pos >> 16;
       token_.rid = pos & 0xffff;
     }
   }

   void LookupValue(int id, string *value) const {
     char buffer[LoudsTrie::kMaxDepth + 1];
     const StringPiece encoded_value = value_trie_->RestoreKeyString(id, buffer);
     codec_->DecodeValue(encoded_value, value);
   }

   const SystemDictionaryCodecInterface *codec_;
   const LoudsTrie *value_trie_;
   const uint32 *frequent_pos_;

   const StringPiece key_;
   // Katakana key will be lazily initialized.
   string key_katakana_;

   State state_;
   const uint8 *ptr_;

   TokenInfo token_info_;
   Token token_;

   DISALLOW_COPY_AND_ASSIGN(TokenDecodeIterator);
 };

 inline const uint8 *GetTokenArrayPtr(const BitVectorBasedArray &token_array,
                                      int key_id) {
   size_t length = 0;
   return reinterpret_cast<const uint8*>(token_array.Get(key_id, &length));
 }

 // Iterator for scanning token array.
 // This iterator does not return actual token info but returns
 // id data and the position only.
 // This will be used only for reverse lookup.
 // Forward lookup does not need such iterator because it can access
 // a token directly without linear scan.
 //
 //  Usage:
 //    for (TokenScanIterator iter(codec_, token_array_);
 //         !iter.Done(); iter.Next()) {
 //      const TokenScanIterator::Result &result = iter.Get();
 //      // Do something with |result|.
 //    }
 class TokenScanIterator {
  public:
   struct Result {
     // Value id for the current token
     int value_id;
     // Index (= key id) for the current token
     int index;
     // Offset from the tokens section beginning.
     // (token_array_->Get(id_in_key_trie) ==
     //  token_array_->Get(0) + tokens_offset)
     int tokens_offset;
   };

   TokenScanIterator(
       const SystemDictionaryCodecInterface *codec,
       const BitVectorBasedArray &token_array)
       : codec_(codec),
         termination_flag_(codec->GetTokensTerminationFlag()),
         state_(HAS_NEXT),
         offset_(0),
         tokens_offset_(0),
         index_(0) {
     encoded_tokens_ptr_ = GetTokenArrayPtr(token_array, 0);
     NextInternal();
   }

   ~TokenScanIterator() {
   }

   const Result &Get() const { return result_; }

   bool Done() const { return state_ == DONE; }

   void Next() {
     DCHECK_NE(state_, DONE);
     NextInternal();
   }

  private:
   enum State {
     HAS_NEXT,
     DONE,
   };

   void NextInternal() {
     if (encoded_tokens_ptr_[offset_] == termination_flag_) {
       state_ = DONE;
       return;
     }
     int read_bytes;
     result_.value_id = -1;
     result_.index = index_;
     result_.tokens_offset = tokens_offset_;
     const bool is_last_token =
         !(codec_->ReadTokenForReverseLookup(encoded_tokens_ptr_ + offset_,
                                             &result_.value_id, &read_bytes));
     if (is_last_token) {
       int tokens_size = offset_ + read_bytes - tokens_offset_;
       if (tokens_size < kMinTokenArrayBlobSize) {
         tokens_size = kMinTokenArrayBlobSize;
       }
       tokens_offset_ += tokens_size;
       ++index_;
       offset_ = tokens_offset_;
     } else {
       offset_ += read_bytes;
     }
   }

   const SystemDictionaryCodecInterface *codec_;
   const uint8 *encoded_tokens_ptr_;
   const uint8 termination_flag_;
   State state_;
   Result result_;
   int offset_;
   int tokens_offset_;
   int index_;

  private:
   DISALLOW_COPY_AND_ASSIGN(TokenScanIterator);
 };

 struct ReverseLookupResult {
   ReverseLookupResult() : tokens_offset(-1), id_in_key_trie(-1) {}
   // Offset from the tokens section beginning.
   // (token_array_.Get(id_in_key_trie) == token_array_.Get(0) + tokens_offset)
   int tokens_offset;
   // Id in key trie
   int id_in_key_trie;
 };

 }  // namespace

 class SystemDictionary::ReverseLookupCache {
  public:
   ReverseLookupCache() {}

   bool IsAvailable(const set<int> &id_set) const {
     for (set<int>::const_iterator itr = id_set.begin();
          itr != id_set.end();
          ++itr) {
       if (results.find(*itr) == results.end()) {
         return false;
       }
     }
     return true;
   }

   multimap<int, ReverseLookupResult> results;

  private:
   DISALLOW_COPY_AND_ASSIGN(ReverseLookupCache);
 };

 class SystemDictionary::ReverseLookupIndex {
  public:
   ReverseLookupIndex(
       const SystemDictionaryCodecInterface *codec,
       const BitVectorBasedArray &token_array) {
     // Gets id size.
     int value_id_max = -1;
     for (TokenScanIterator iter(codec, token_array);
          !iter.Done(); iter.Next()) {
       const TokenScanIterator::Result &result = iter.Get();
       value_id_max = max(value_id_max, result.value_id);
     }

     CHECK_GE(value_id_max, 0);
     index_size_ = value_id_max + 1;
     index_.reset(new ReverseLookupResultArray[index_size_]);

     // Gets result size for each ids.
     for (TokenScanIterator iter(codec, token_array);
          !iter.Done(); iter.Next()) {
       const TokenScanIterator::Result &result = iter.Get();
       if (result.value_id != -1) {
         DCHECK_LT(result.value_id, index_size_);
         ++(index_[result.value_id].size);
       }
     }

     for (size_t i = 0; i < index_size_; ++i) {
       index_[i].results.reset(new ReverseLookupResult[index_[i].size]);
     }

     // Builds index.
     for (TokenScanIterator iter(codec, token_array);
          !iter.Done(); iter.Next()) {
       const TokenScanIterator::Result &result = iter.Get();
       if (result.value_id == -1) {
         continue;
       }

       DCHECK_LT(result.value_id, index_size_);
       ReverseLookupResultArray *result_array = &index_[result.value_id];

       // Finds uninitialized result.
       size_t result_index = 0;
       for (result_index = 0;
            result_index < result_array->size;
            ++result_index) {
         const ReverseLookupResult &lookup_result =
             result_array->results[result_index];
         if (lookup_result.tokens_offset == -1 &&
             lookup_result.id_in_key_trie == -1) {
           result_array->results[result_index].tokens_offset =
               result.tokens_offset;
           result_array->results[result_index].id_in_key_trie = result.index;
           break;
         }
       }
     }

     CHECK(index_.get() != NULL);
   }

   ~ReverseLookupIndex() {}

   void FillResultMap(const set<int> &id_set,
                      multimap<int, ReverseLookupResult> *result_map) {
     for (set<int>::const_iterator id_itr  = id_set.begin();
          id_itr != id_set.end(); ++id_itr) {
       const ReverseLookupResultArray &result_array = index_[*id_itr];
       for (size_t i = 0; i < result_array.size; ++i) {
         result_map->insert(make_pair(*id_itr, result_array.results[i]));
       }
     }
   }

  private:
   struct ReverseLookupResultArray {
     ReverseLookupResultArray() : size(0) {}
     // Use scoped_ptr for reducing memory consumption as possible.
     // Using vector requires 90 MB even when we call resize explicitly.
     // On the other hand, scoped_ptr requires 57 MB.
     scoped_ptr<ReverseLookupResult[]> results;
     size_t size;
   };

   // Use scoped array for reducing memory consumption as possible.
   scoped_ptr<ReverseLookupResultArray[]> index_;
   size_t index_size_;

   DISALLOW_COPY_AND_ASSIGN(ReverseLookupIndex);
 };

 struct SystemDictionary::PredictiveLookupSearchState {
   PredictiveLookupSearchState() : key_pos(0), is_expanded(false) {}
   PredictiveLookupSearchState(const storage::louds::LoudsTrie::Node &n,
                               size_t pos, bool expanded)
       : node(n), key_pos(pos), is_expanded(expanded) {}

   storage::louds::LoudsTrie::Node node;
   size_t key_pos;
   bool is_expanded;
 };

 struct SystemDictionary::Builder::Specification {
   enum InputType {
     FILENAME,
     IMAGE,
   };

   Specification(InputType t, const string &fn, const char *p, int l, Options o,
                 const SystemDictionaryCodecInterface *codec)
       : type(t), filename(fn), ptr(p), len(l), options(o), codec(codec) {}

   InputType type;

   // For InputType::FILENAME
   const string filename;

   // For InputType::IMAGE
   const char *ptr;
   const int len;

   Options options;
   const SystemDictionaryCodecInterface *codec;
 };

 SystemDictionary::Builder::Builder(const string &filename)
     : spec_(new Specification(Specification::FILENAME,
                               filename, NULL, -1, NONE, NULL)) {}

 SystemDictionary::Builder::Builder(const char *ptr, int len)
     : spec_(new Specification(Specification::IMAGE,
                               "", ptr, len, NONE, NULL)) {}

 SystemDictionary::Builder::~Builder() {}

 SystemDictionary::Builder & SystemDictionary::Builder::SetOptions(
     Options options) {
   spec_->options = options;
   return *this;
 }

 SystemDictionary::Builder &SystemDictionary::Builder::SetCodec(
     const SystemDictionaryCodecInterface *codec) {
   spec_->codec = codec;
   return *this;
 }

 SystemDictionary *SystemDictionary::Builder::Build() {
   if (spec_->codec == NULL) {
     spec_->codec = SystemDictionaryCodecFactory::GetCodec();
   }

   scoped_ptr<SystemDictionary> instance(new SystemDictionary(spec_->codec));

   switch (spec_->type) {
     case Specification::FILENAME:
       if (!instance->dictionary_file_->OpenFromFile(spec_->filename)) {
         LOG(ERROR) << "Failed to open system dictionary file";
         return NULL;
       }
       break;
     case Specification::IMAGE:
       // Make the dictionary not to be paged out.
       // We don't check the return value because the process doesn't necessarily
       // has the priviledge to mlock.
       // Note that we don't munlock the space because it's always better to keep
       // the singleton system dictionary paged in as long as the process runs.
       SystemUtil::MaybeMLock(spec_->ptr, spec_->len);
       if (!instance->dictionary_file_->OpenFromImage(spec_->ptr, spec_->len)) {
         LOG(ERROR) << "Failed to open system dictionary image";
         return NULL;
       }
       break;
     default:
       LOG(ERROR) << "Invalid input type.";
       return NULL;
   }

   if (!instance->OpenDictionaryFile(
           (spec_->options & ENABLE_REVERSE_LOOKUP_INDEX) != 0)) {
     LOG(ERROR) << "Failed to create system dictionary";
     return NULL;
   }

   return instance.release();
 }

 SystemDictionary::SystemDictionary(const SystemDictionaryCodecInterface *codec)
     : frequent_pos_(NULL),
       codec_(codec),
       dictionary_file_(new DictionaryFile) {}

 SystemDictionary::~SystemDictionary() {}

 bool SystemDictionary::OpenDictionaryFile(bool enable_reverse_lookup_index) {
   int len;

   const uint8 *key_image = reinterpret_cast<const uint8 *>(
       dictionary_file_->GetSection(codec_->GetSectionNameForKey(), &len));
   if (!key_trie_.Open(key_image)) {
     LOG(ERROR) << "cannot open key trie";
     return false;
   }

   BuildHiraganaExpansionTable(*codec_, &hiragana_expansion_table_);

   const uint8 *value_image = reinterpret_cast<const uint8 *>(
       dictionary_file_->GetSection(codec_->GetSectionNameForValue(), &len));
   if (!value_trie_.Open(value_image)) {
     LOG(ERROR) << "can not open value trie";
     return false;
   }

   const unsigned char *token_image = reinterpret_cast<const unsigned char *>(
       dictionary_file_->GetSection(codec_->GetSectionNameForTokens(), &len));
   token_array_.Open(token_image);

   frequent_pos_ = reinterpret_cast<const uint32*>(
       dictionary_file_->GetSection(codec_->GetSectionNameForPos(), &len));
   if (frequent_pos_ == NULL) {
     LOG(ERROR) << "can not find frequent pos section";
     return false;
   }

   if (enable_reverse_lookup_index) {
     InitReverseLookupIndex();
   }

   return true;
 }

 void SystemDictionary::InitReverseLookupIndex() {
   if (reverse_lookup_index_.get() != NULL) {
     return;
   }
   reverse_lookup_index_.reset(new ReverseLookupIndex(codec_, token_array_));
 }

 bool SystemDictionary::HasKey(StringPiece key) const {
   string encoded_key;
   codec_->EncodeKey(key, &encoded_key);
   return key_trie_.HasKey(encoded_key);
 }

 bool SystemDictionary::HasValue(StringPiece value) const {
   string encoded_value;
   codec_->EncodeValue(value, &encoded_value);
   if (value_trie_.HasKey(encoded_value)) {
     return true;
   }

   // Because Hiragana, Katakana and Alphabet words are not stored in the
   // value_trie for the data compression.  They are only stored in the
   // key_trie with flags.  So we also need to check the existence in
   // the key_trie.

   // Normalize the value as the key.  This process depends on the
   // implementation of SystemDictionaryBuilder::BuildValueTrie.
   string key;
   Util::KatakanaToHiragana(value, &key);

   string encoded_key;
   codec_->EncodeKey(key, &encoded_key);
   const int key_id = key_trie_.ExactSearch(encoded_key);
   if (key_id == -1) {
     return false;
   }

   // We need to check the contents of value_trie for Katakana values.
   // If (key, value) = (かな, カナ) is in the dictionary, "カナ" is
   // not used as a key for value_trie or key_trie.  Only "かな" is
   // used as a key for key_trie.  If we accept this limitation, we can
   // skip the following code.
   //
   // If we add "if (key == value) return true;" here, we can check
   // almost all cases of Hiragana and Alphabet words without the
   // following iteration.  However, when (mozc, MOZC) is stored but
   // (mozc, mozc) is NOT stored, HasValue("mozc") wrongly returns
   // true.

   // Get the block of tokens for this key.
   const uint8 *encoded_tokens_ptr = GetTokenArrayPtr(token_array_, key_id);

   // Check tokens.
   for (TokenDecodeIterator iter(codec_, value_trie_, frequent_pos_, key,
                                 encoded_tokens_ptr);
        !iter.Done(); iter.Next()) {
     const Token *token = iter.Get().token;
     if (value == token->value) {
       return true;
     }
   }
   return false;
 }

 void SystemDictionary::CollectPredictiveNodesInBfsOrder(
     StringPiece encoded_key,
     const KeyExpansionTable &table,
     size_t limit,
     vector<PredictiveLookupSearchState> *result) const {
   queue<PredictiveLookupSearchState> queue;
   queue.push(PredictiveLookupSearchState(LoudsTrie::Node(), 0, false));
   do {
     PredictiveLookupSearchState state = queue.front();
     queue.pop();

     // Update traversal state for |encoded_key| and its expanded keys.
     if (state.key_pos < encoded_key.size()) {
       const char target_char = encoded_key[state.key_pos];
       const ExpandedKey &chars = table.ExpandKey(target_char);

       for (key_trie_.MoveToFirstChild(&state.node);
            key_trie_.IsValidNode(state.node);
            key_trie_.MoveToNextSibling(&state.node)) {
         const char c = key_trie_.GetEdgeLabelToParentNode(state.node);
         if (!chars.IsHit(c)) {
           continue;
         }
         const bool is_expanded = state.is_expanded || c != target_char;
         queue.push(PredictiveLookupSearchState(state.node,
                                                state.key_pos + 1,
                                                is_expanded));
       }
       continue;
     }

     // Collect prediction keys (state.key_pos >= encoded_key.size()).
     if (key_trie_.IsTerminalNode(state.node)) {
       result->push_back(state);
     }

     // Collected enough entries.  Collect all the remaining keys that have the
     // same length as the longest key.
     if (result->size() > limit) {
       // The current key is the longest because of BFS property.
       const size_t max_key_len = state.key_pos;
       while (!queue.empty()) {
         state = queue.front();
         queue.pop();
         if (state.key_pos > max_key_len) {
           // Key length in the queue is monotonically increasing because of BFS
           // property.  So we don't need to check all the elements in the queue.
           break;
         }
         DCHECK_EQ(state.key_pos, max_key_len);
         if (key_trie_.IsTerminalNode(state.node)) {
           result->push_back(state);
         }
       }
       break;
     }

     // Update traversal state for children.
     for (key_trie_.MoveToFirstChild(&state.node);
          key_trie_.IsValidNode(state.node);
          key_trie_.MoveToNextSibling(&state.node)) {
       queue.push(PredictiveLookupSearchState(state.node,
                                              state.key_pos + 1,
                                              state.is_expanded));
     }
   } while (!queue.empty());
 }

 void SystemDictionary::LookupPredictive(
     StringPiece key, bool use_kana_modifier_insensitive_lookup,
     Callback *callback) const {
   // Do nothing for empty key, although looking up all the entries with empty
   // string seems natural.
   if (key.empty()) {
     return;
   }

   string encoded_key;
   codec_->EncodeKey(key, &encoded_key);
   if (encoded_key.size() > LoudsTrie::kMaxDepth) {
     return;
   }

   const KeyExpansionTable &table = use_kana_modifier_insensitive_lookup
       ? hiragana_expansion_table_
       : KeyExpansionTable::GetDefaultInstance();

   // TODO(noriyukit): Lookup limit should be implemented at caller side by using
   // callback mechanism.  This hard-coding limits the capability and generality
   // of dictionary module.  CollectPredictiveNodesInBfsOrder() and the following
   // loop for callback should be integrated for this purpose.
   const size_t kLookupLimit = 64;
   vector<PredictiveLookupSearchState> result;
   result.reserve(kLookupLimit);
   CollectPredictiveNodesInBfsOrder(encoded_key, table, kLookupLimit, &result);

   // Reused buffer and instances inside the following loop.
   char encoded_actual_key_buffer[LoudsTrie::kMaxDepth + 1];
   string decoded_key, actual_key_str;
   decoded_key.reserve(key.size() * 2);
   actual_key_str.reserve(key.size() * 2);
   for (size_t i = 0; i < result.size(); ++i) {
     const PredictiveLookupSearchState &state = result[i];

     // Computes the actual key.  For example:
     // key = "くー"
     // encoded_actual_key = encode("ぐーぐる")  [expanded]
     // encoded_actual_key_prediction_suffix = encode("ぐる")
     const StringPiece encoded_actual_key =
         key_trie_.RestoreKeyString(state.node, encoded_actual_key_buffer);
     const StringPiece encoded_actual_key_prediction_suffix(
         encoded_actual_key,
         encoded_key.size(),
         encoded_actual_key.size() - encoded_key.size());

     // decoded_key = "くーぐる" (= key + prediction suffix)
     decoded_key.clear();
     key.CopyToString(&decoded_key);
     codec_->DecodeKey(encoded_actual_key_prediction_suffix, &decoded_key);
     switch (callback->OnKey(decoded_key)) {
       case Callback::TRAVERSE_DONE:
         return;
       case Callback::TRAVERSE_NEXT_KEY:
         continue;
       case DictionaryInterface::Callback::TRAVERSE_CULL:
         LOG(FATAL) << "Culling is not implemented.";
         continue;
       default:
         break;
     }

     StringPiece actual_key;
     if (state.is_expanded) {
       actual_key_str.clear();
       codec_->DecodeKey(encoded_actual_key, &actual_key_str);
       actual_key = actual_key_str;
     } else {
       actual_key = decoded_key;
     }
     switch (callback->OnActualKey(decoded_key, actual_key, state.is_expanded)) {
       case Callback::TRAVERSE_DONE:
         return;
       case Callback::TRAVERSE_NEXT_KEY:
         continue;
       case Callback::TRAVERSE_CULL:
         LOG(FATAL) << "Culling is not implemented.";
         continue;
       default:
         break;
     }

     const int key_id = key_trie_.GetKeyIdOfTerminalNode(state.node);
     for (TokenDecodeIterator iter(codec_, value_trie_,
                                   frequent_pos_, actual_key,
                                   GetTokenArrayPtr(token_array_, key_id));
          !iter.Done(); iter.Next()) {
       const TokenInfo &token_info = iter.Get();
       const Callback::ResultType result =
           callback->OnToken(decoded_key, actual_key, *token_info.token);
       if (result == Callback::TRAVERSE_DONE) {
         return;
       }
       if (result == Callback::TRAVERSE_NEXT_KEY) {
         break;
       }
       DCHECK_NE(Callback::TRAVERSE_CULL, result) << "Not implemented";
     }
   }
 }

 namespace {

 // An implementation of prefix search without key expansion.  Runs |callback|
 // for prefixes of |encoded_key| in |key_trie|.
 // Args:
 //   key_trie, value_trie, token_array, codec, frequent_pos:
 //     Members in SystemDictionary.
 //   key:
 //     The head address of the original key before applying codec.
 //   encoded_key:
 //     The encoded |key|.
 //   callback:
 //     A callback function to be called.
 //   token_filter:
 //     A functor of signature bool(const TokenInfo &).  Only tokens for which
 //     this functor returns true are passed to callback function.
 template <typename Func>
 void RunCallbackOnEachPrefix(const LoudsTrie &key_trie,
                              const LoudsTrie &value_trie,
                              const BitVectorBasedArray &token_array,
                              const SystemDictionaryCodecInterface *codec,
                              const uint32 *frequent_pos,
                              const char *key,
                              StringPiece encoded_key,
                              DictionaryInterface::Callback *callback,
                              Func token_filter) {
   typedef DictionaryInterface::Callback Callback;
   LoudsTrie::Node node;
   for (StringPiece::size_type i = 0; i < encoded_key.size(); ) {
     if (!key_trie.MoveToChildByLabel(encoded_key[i], &node)) {
       return;
     }
     ++i;  // Increment here for next loop and |encoded_prefix| defined below.
     if (!key_trie.IsTerminalNode(node)) {
       continue;
     }
     const StringPiece encoded_prefix(encoded_key, 0, i);
     const StringPiece prefix(key, codec->GetDecodedKeyLength(encoded_prefix));

     switch (callback->OnKey(prefix)) {
       case Callback::TRAVERSE_DONE:
       case Callback::TRAVERSE_CULL:
         return;
       case Callback::TRAVERSE_NEXT_KEY:
         continue;
       default:
         break;
     }

     switch (callback->OnActualKey(prefix, prefix, false)) {
       case Callback::TRAVERSE_DONE:
       case Callback::TRAVERSE_CULL:
         return;
       case Callback::TRAVERSE_NEXT_KEY:
         continue;
       default:
         break;
     }

     const int key_id = key_trie.GetKeyIdOfTerminalNode(node);
     for (TokenDecodeIterator iter(codec, value_trie, frequent_pos, prefix,
                                   GetTokenArrayPtr(token_array, key_id));
          !iter.Done(); iter.Next()) {
       const TokenInfo &token_info = iter.Get();
       if (!token_filter(token_info)) {
         continue;
       }
       const Callback::ResultType res =
           callback->OnToken(prefix, prefix, *token_info.token);
       if (res == Callback::TRAVERSE_DONE || res == Callback::TRAVERSE_CULL) {
         return;
       }
       if (res == Callback::TRAVERSE_NEXT_KEY) {
         break;
       }
     }
   }
 }

 struct SelectAllTokens {
   bool operator()(const TokenInfo &token_info) const { return true; }
 };

 class ReverseLookupCallbackWrapper : public DictionaryInterface::Callback {
  public:
   explicit ReverseLookupCallbackWrapper(DictionaryInterface::Callback *callback)
       : callback_(callback) {}
   virtual ~ReverseLookupCallbackWrapper() {}
   virtual SystemDictionary::Callback::ResultType OnToken(
       StringPiece key, StringPiece actual_key, const Token &token) {
     Token modified_token = token;
     modified_token.key.swap(modified_token.value);
     return callback_->OnToken(key, actual_key, modified_token);
   }

   DictionaryInterface::Callback *callback_;
 };

 }  // namespace

 // Recursive implemention of depth-first prefix search with key expansion.
 // Input parameters:
 //   key:
 //     The head address of the original key before applying codec.
 //   encoded_key:
 //     The encoded |key|.
 //   table:
 //     Key expansion table.
 //   callback:
 //     A callback function to be called.
 // Parameters for recursion:
 //   node:
 //     Stores the current location in |key_trie_|.
 //   key_pos:
 //     Depth of node, i.e., encoded_key.substr(0, key_pos) is the current prefix
 //     for search.
 //   is_expanded:
 //     true is stored if the current node is reached by key expansion.
 //   actual_key_buffer:
 //     Buffer for storing actually used characters to reach this node, i.e.,
 //     StringPiece(actual_key_buffer, key_pos) is the matched prefix using key
 //     expansion.
 //   actual_prefix:
 //     A reused string for decoded actual key.  This is just for performance
 //     purpose.
 DictionaryInterface::Callback::ResultType
 SystemDictionary::LookupPrefixWithKeyExpansionImpl(
     const char *key,
     StringPiece encoded_key,
     const KeyExpansionTable &table,
     Callback *callback,
     LoudsTrie::Node node,
     StringPiece::size_type key_pos,
     bool is_expanded,
     char *actual_key_buffer,
     string *actual_prefix) const {
   // This do-block handles a terminal node and callback.  do-block is used to
   // break the block and continue to the subsequent traversal phase.
   do {
     if (!key_trie_.IsTerminalNode(node)) {
       break;
     }

     const StringPiece encoded_prefix(encoded_key, 0, key_pos);
     const StringPiece prefix(key, codec_->GetDecodedKeyLength(encoded_prefix));
     Callback::ResultType result = callback->OnKey(prefix);
     if (result == Callback::TRAVERSE_DONE ||
         result == Callback::TRAVERSE_CULL) {
       return result;
     }
     if (result == Callback::TRAVERSE_NEXT_KEY) {
       break;  // Go to the traversal phase.
     }

     const StringPiece encoded_actual_prefix(actual_key_buffer, key_pos);
     actual_prefix->clear();
     codec_->DecodeKey(encoded_actual_prefix, actual_prefix);
     result = callback->OnActualKey(prefix, *actual_prefix, is_expanded);
     if (result == Callback::TRAVERSE_DONE ||
         result == Callback::TRAVERSE_CULL) {
       return result;
     }
     if (result == Callback::TRAVERSE_NEXT_KEY) {
       break;  // Go to the traversal phase.
     }

     const int key_id = key_trie_.GetKeyIdOfTerminalNode(node);
     for (TokenDecodeIterator iter(codec_, value_trie_, frequent_pos_,
                                   *actual_prefix,
                                   GetTokenArrayPtr(token_array_, key_id));
          !iter.Done(); iter.Next()) {
       const TokenInfo &token_info = iter.Get();
       result = callback->OnToken(prefix, *actual_prefix, *token_info.token);
       if (result == Callback::TRAVERSE_DONE ||
           result == Callback::TRAVERSE_CULL) {
         return result;
       }
       if (result == Callback::TRAVERSE_NEXT_KEY) {
         break;  // Go to the traversal phase.
       }
     }
   } while (false);

   // Traversal phase.
   if (key_pos == encoded_key.size()) {
     return Callback::TRAVERSE_CONTINUE;
   }
   const char current_char = encoded_key[key_pos];
   const ExpandedKey &chars = table.ExpandKey(current_char);
   for (key_trie_.MoveToFirstChild(&node); key_trie_.IsValidNode(node);
        key_trie_.MoveToNextSibling(&node)) {
     const char c = key_trie_.GetEdgeLabelToParentNode(node);
     if (!chars.IsHit(c)) {
       continue;
     }
     actual_key_buffer[key_pos] = c;
     const Callback::ResultType result = LookupPrefixWithKeyExpansionImpl(
         key, encoded_key, table, callback, node, key_pos + 1,
         is_expanded || c != current_char,
         actual_key_buffer, actual_prefix);
     if (result == Callback::TRAVERSE_DONE) {
       return Callback::TRAVERSE_DONE;
     }
   }

   return Callback::TRAVERSE_CONTINUE;
 }

 void SystemDictionary::LookupPrefix(
     StringPiece key,
     bool use_kana_modifier_insensitive_lookup,
     Callback *callback) const {
   string encoded_key;
   codec_->EncodeKey(key, &encoded_key);

   if (!use_kana_modifier_insensitive_lookup) {
     RunCallbackOnEachPrefix(key_trie_, value_trie_, token_array_, codec_,
                             frequent_pos_, key.data(), encoded_key, callback,
                             SelectAllTokens());
     return;
   }

   char actual_key_buffer[LoudsTrie::kMaxDepth + 1];
   string actual_prefix;
   actual_prefix.reserve(key.size() * 3);
   LookupPrefixWithKeyExpansionImpl(key.data(), encoded_key,
                                    hiragana_expansion_table_, callback,
                                    LoudsTrie::Node(), 0, false,
                                    actual_key_buffer, &actual_prefix);
 }

 void SystemDictionary::LookupExact(StringPiece key, Callback *callback) const {
   // Find the key in the key trie.
   string encoded_key;
   codec_->EncodeKey(key, &encoded_key);
   const int key_id = key_trie_.ExactSearch(encoded_key);
   if (key_id == -1) {
     return;
   }
   if (callback->OnKey(key) != Callback::TRAVERSE_CONTINUE) {
     return;
   }

   // Callback on each token.
   for (TokenDecodeIterator iter(codec_, value_trie_, frequent_pos_, key,
                                 GetTokenArrayPtr(token_array_, key_id));
        !iter.Done(); iter.Next()) {
     if (callback->OnToken(key, key, *iter.Get().token) !=
         Callback::TRAVERSE_CONTINUE) {
       break;
     }
   }
 }

 void SystemDictionary::LookupReverse(StringPiece str,
                                      Callback *callback) const {
   // 1st step: Hiragana/Katakana are not in the value trie
   // 2nd step: Reverse lookup in value trie
   ReverseLookupCallbackWrapper callback_wrapper(callback);
   RegisterReverseLookupTokensForT13N(str, &callback_wrapper);
   RegisterReverseLookupTokensForValue(str, &callback_wrapper);
 }

 namespace {

 class AddKeyIdsToSet {
  public:
   explicit AddKeyIdsToSet(set<int> *output) : output_(output) {}

   void operator()(StringPiece key, size_t prefix_len,
                   const LoudsTrie &trie, LoudsTrie::Node node) {
     output_->insert(trie.GetKeyIdOfTerminalNode(node));
   }

  private:
   set<int> *output_;
 };

 inline void AddKeyIdsOfAllPrefixes(const LoudsTrie &trie, StringPiece key,
                                    set<int> *key_ids) {
   trie.PrefixSearch(key, AddKeyIdsToSet(key_ids));
 }

 }  // namespace

 void SystemDictionary::PopulateReverseLookupCache(StringPiece str) const {
   if (reverse_lookup_index_ != NULL) {
     // We don't need to prepare cache for the current reverse conversion,
     // as we have already built the index for reverse lookup.
     return;
   }
   reverse_lookup_cache_.reset(new ReverseLookupCache);
   DCHECK(reverse_lookup_cache_.get());

   // Iterate each suffix and collect IDs of all substrings.
   set<int> id_set;
   int pos = 0;
   string lookup_key;
   lookup_key.reserve(str.size());
   while (pos < str.size()) {
     const StringPiece suffix(str, pos);
     lookup_key.clear();
     codec_->EncodeValue(suffix, &lookup_key);
     AddKeyIdsOfAllPrefixes(value_trie_, lookup_key, &id_set);
     pos += Util::OneCharLen(suffix.data());
   }
   // Collect tokens for all IDs.
   ScanTokens(id_set, reverse_lookup_cache_.get());
 }

 void SystemDictionary::ClearReverseLookupCache() const {
   reverse_lookup_cache_.reset(NULL);
 }

 namespace {

 class FilterTokenForRegisterReverseLookupTokensForT13N {
  public:
   FilterTokenForRegisterReverseLookupTokensForT13N() {
     tmp_str_.reserve(LoudsTrie::kMaxDepth * 3);
   }

   bool operator()(const TokenInfo &token_info) {
     // Skip spelling corrections.
     if (token_info.token->attributes & Token::SPELLING_CORRECTION) {
       return false;
     }
     if (token_info.value_type != TokenInfo::AS_IS_HIRAGANA &&
         token_info.value_type != TokenInfo::AS_IS_KATAKANA) {
       // SAME_AS_PREV_VALUE may be t13n token.
       tmp_str_.clear();
       Util::KatakanaToHiragana(token_info.token->value, &tmp_str_);
       if (token_info.token->key != tmp_str_) {
         return false;
       }
     }
     return true;
   }

  private:
   string tmp_str_;
 };

 }  // namespace

 void SystemDictionary::RegisterReverseLookupTokensForT13N(
     StringPiece value, Callback *callback) const {
   string hiragana_value, encoded_key;
   Util::KatakanaToHiragana(value, &hiragana_value);
   codec_->EncodeKey(hiragana_value, &encoded_key);
   RunCallbackOnEachPrefix(key_trie_, value_trie_, token_array_, codec_,
                           frequent_pos_, hiragana_value.data(),
                           encoded_key, callback,
                           FilterTokenForRegisterReverseLookupTokensForT13N());
 }

 void SystemDictionary::RegisterReverseLookupTokensForValue(
     StringPiece value, Callback *callback) const {
   string lookup_key;
   codec_->EncodeValue(value, &lookup_key);

   set<int> id_set;
   AddKeyIdsOfAllPrefixes(value_trie_, lookup_key, &id_set);

   ReverseLookupCache *results = NULL;
   ReverseLookupCache non_cached_results;
   if (reverse_lookup_index_ != NULL) {
     reverse_lookup_index_->FillResultMap(id_set, &non_cached_results.results);
     results = &non_cached_results;
   } else if (reverse_lookup_cache_.get() != NULL &&
              reverse_lookup_cache_->IsAvailable(id_set)) {
     results = reverse_lookup_cache_.get();
   } else {
     // Cache is not available. Get token for each ID.
     ScanTokens(id_set, &non_cached_results);
     results = &non_cached_results;
   }
   DCHECK(results != NULL);

   RegisterReverseLookupResults(id_set, *results, callback);
 }

 void SystemDictionary::ScanTokens(
     const set<int> &id_set, ReverseLookupCache *cache) const {
   for (TokenScanIterator iter(codec_, token_array_);
        !iter.Done(); iter.Next()) {
     const TokenScanIterator::Result &result = iter.Get();
     if (result.value_id != -1 &&
         id_set.find(result.value_id) != id_set.end()) {
       ReverseLookupResult lookup_result;
       lookup_result.tokens_offset = result.tokens_offset;
       lookup_result.id_in_key_trie = result.index;
       cache->results.insert(make_pair(result.value_id, lookup_result));
     }
   }
 }

 void SystemDictionary::RegisterReverseLookupResults(
     const set<int> &id_set,
     const ReverseLookupCache &cache,
     Callback *callback) const {
   const uint8 *encoded_tokens_ptr = GetTokenArrayPtr(token_array_, 0);
   char buffer[LoudsTrie::kMaxDepth + 1];
   for (set<int>::const_iterator set_itr = id_set.begin();
        set_itr != id_set.end();
        ++set_itr) {
     const int value_id = *set_itr;
     typedef multimap<int, ReverseLookupResult>::const_iterator ResultItr;
     pair<ResultItr, ResultItr> range = cache.results.equal_range(*set_itr);
     for (ResultItr result_itr = range.first;
          result_itr != range.second;
          ++result_itr) {
       const ReverseLookupResult &reverse_result = result_itr->second;

       const StringPiece encoded_key =
           key_trie_.RestoreKeyString(reverse_result.id_in_key_trie, buffer);
       string tokens_key;
       codec_->DecodeKey(encoded_key, &tokens_key);
       if (callback->OnKey(tokens_key) != Callback::TRAVERSE_CONTINUE) {
         continue;
       }
       for (TokenDecodeIterator iter(
                codec_, value_trie_, frequent_pos_, tokens_key,
                encoded_tokens_ptr  + reverse_result.tokens_offset);
            !iter.Done(); iter.Next()) {
         const TokenInfo &token_info = iter.Get();
         if (token_info.token->attributes & Token::SPELLING_CORRECTION ||
             token_info.id_in_value_trie != value_id) {
           continue;
         }
         callback->OnToken(tokens_key, tokens_key, *token_info.token);
       }
     }
   }
 }

 }  // namespace dictionary
 }  // namespace mozc