| // Copyright 2010-2014, Google Inc. |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| #include "prediction/user_history_predictor.h" |
| |
| #include <algorithm> |
| #include <cctype> |
| #include <climits> |
| #include <string> |
| |
| #include "base/config_file_stream.h" |
| #include "base/flags.h" |
| #include "base/init.h" |
| #include "base/logging.h" |
| #include "base/thread.h" |
| #include "base/trie.h" |
| #include "base/util.h" |
| #include "composer/composer.h" |
| #include "config/config.pb.h" |
| #include "config/config_handler.h" |
| #include "converter/conversion_request.h" |
| #include "converter/segments.h" |
| #include "dictionary/dictionary_interface.h" |
| #include "dictionary/pos_matcher.h" |
| #include "dictionary/suppression_dictionary.h" |
| #include "prediction/predictor_interface.h" |
| #include "prediction/user_history_predictor.pb.h" |
| #include "rewriter/variants_rewriter.h" |
| #include "session/commands.pb.h" |
| #include "storage/encrypted_string_storage.h" |
| #include "storage/lru_cache.h" |
| #include "usage_stats/usage_stats.h" |
| |
| |
| // This flag is set by predictor.cc |
| // We can remove this after the ambiguity expansion feature get stable. |
| DEFINE_bool(enable_expansion_for_user_history_predictor, |
| false, |
| "enable ambiguity expansion for user_history_predictor."); |
| |
| namespace mozc { |
| |
| using commands::Request; |
| |
| namespace { |
| // find suggestion candidates from the most recent 3000 history in LRU. |
| // We don't check all history, since suggestion is called every key event |
| const size_t kMaxSuggestionTrial = 3000; |
| |
| // find suffix matches of history_segments from the most recent 500 histories |
| // in LRU. |
| const size_t kMaxPrevValueTrial = 500; |
| |
| // cache size |
| // Typically memory/storage footprint becomes kLRUCacheSize * 70 bytes. |
| #ifdef OS_ANDROID |
| const size_t kLRUCacheSize = 2000; |
| #else // OS_ANDROID |
| const size_t kLRUCacheSize = 10000; |
| #endif // OS_ANDROID |
| |
| // don't save key/value that are |
| // longer than kMaxCandidateSize to avoid memory explosion |
| const size_t kMaxStringLength = 256; |
| |
| // maximum size of next_entries |
| const size_t kMaxNextEntriesSize = 4; |
| |
| // revert id for user_history_predictor |
| const uint16 kRevertId = 1; |
| |
| // default object pool size for EntryPriorityQueue |
| const size_t kEntryPoolSize = 16; |
| |
| // file name for the history |
| #ifdef OS_WIN |
| const char kFileName[] = "user://history.db"; |
| #else |
| const char kFileName[] = "user://.history.db"; |
| #endif |
| |
| // use '\t' as a key/value delimiter |
| const char kDelimiter[] = "\t"; |
| |
| // "絵文字" |
| const char kEmojiDescription[] = "\xE7\xB5\xB5\xE6\x96\x87\xE5\xAD\x97"; |
| |
| // TODO(peria, hidehiko): Unify this checker and IsEmojiCandidate in |
| // EmojiRewriter. If you make similar functions before the merging in |
| // case, put a similar note to avoid twisted dependency. |
| bool IsEmojiEntry(const UserHistoryPredictor::Entry &entry) { |
| return (entry.has_description() && |
| entry.description().find(kEmojiDescription) != string::npos); |
| } |
| |
| bool IsPunctuation(const string &value) { |
| // return (value == "。" || value == "." || |
| // value == "、" || value == "," || |
| // value == "?" || value == "?" || |
| // value == "!" || value == "!" || |
| // value == "," || value == "."); |
| return (value == "\xE3\x80\x82" || value == "." || |
| value == "\xE3\x80\x81" || value == "," || |
| value == "\xEF\xBC\x9F" || value == "?" || |
| value == "\xEF\xBC\x81" || value == "!" || |
| value == "\xEF\xBC\x8C" || value == "\xEF\xBC\x8E"); |
| } |
| |
| // Return romanaized string. |
| string ToRoman(const string &str) { |
| string result; |
| Util::HiraganaToRomanji(str, &result); |
| return result; |
| } |
| |
| // return true if value looks like a content word. |
| // Currently, just checks the script type. |
| bool IsContentWord(const string &value) { |
| return Util::CharsLen(value) > 1 || |
| Util::GetScriptType(value) != Util::UNKNOWN_SCRIPT; |
| } |
| |
| // Return candidate description. |
| // If candidate is spelling correction, typing correction |
| // or auto partial suggestion, |
| // don't use the description, since "did you mean" like description must be |
| // provided at an appropriate timing and context. |
| string GetDescription(const Segment::Candidate &candidate) { |
| if (candidate.attributes & |
| (Segment::Candidate::SPELLING_CORRECTION | |
| Segment::Candidate::TYPING_CORRECTION | |
| Segment::Candidate::AUTO_PARTIAL_SUGGESTION)) { |
| return ""; |
| } |
| return candidate.description; |
| } |
| |
| } // namespace |
| |
| // Returns true if the input first candidate seems to be a privacy sensitive |
| // such like password. |
| bool UserHistoryPredictor::IsPrivacySensitive(const Segments *segments) const { |
| const bool kNonSensitive = false; |
| const bool kSensitive = true; |
| |
| // Skip privacy sensitive check if |segments| consists of multiple conversion |
| // segment. That is, segments like "パスワードは|x7LAGhaR" where '|' |
| // represents segment boundary is not considered to be privacy sensitive. |
| // TODO(team): Revisit this rule if necessary. |
| if (segments->conversion_segments_size() != 1) { |
| return kNonSensitive; |
| } |
| |
| // Hereafter, we must have only one conversion segment. |
| const Segment &conversion_segment = segments->conversion_segment(0); |
| const string &segment_key = conversion_segment.key(); |
| |
| // The top candidate, which is about to be commited. |
| const Segment::Candidate &candidate = conversion_segment.candidate(0); |
| const string &candidate_value = candidate.value; |
| |
| // If |candidate_value| contains any non-ASCII character, do not treat |
| // it as privacy sensitive information. |
| // TODO(team): Improve the following rule. For example, |
| // "0000-0000-0000-0000" is not treated as privacy sensitive |
| // because of this rule. When a user commits his password in |
| // full-width form by mistake, like "x7LAGhaR", it is not |
| // treated as privacy sensitive too. |
| if (Util::GetCharacterSet(candidate_value) != Util::ASCII) { |
| return kNonSensitive; |
| } |
| |
| // Hereafter, |candidate_value| consists of ASCII characters only. |
| |
| // Note: if the key looks like hiragana, the candidate might be Katakana to |
| // English transliteration. Don't suppress transliterated candidates. |
| // http://b/4394325 |
| |
| // If the key consists of number characters only, treat it as privacy |
| // sensitive. |
| if (Util::GetScriptType(segment_key) == Util::NUMBER) { |
| return kSensitive; |
| } |
| |
| // If the key contains any alphabetical character but it is in our dictionary, |
| // it can be treated as privacy nonsensitive word; cf. b/5995529. Besides, |
| // short words would be considered as privacy nonsensitive word as well. |
| if (segment_key.size() <= 3) { |
| return kNonSensitive; |
| } |
| |
| // Dictionary-based sensitivity test. If the word user typed is in dictionary, |
| // treat it as privacy insensitive. For English (ASCII) words, |
| // dictionary-based test is extended to the following forms: |
| // 1) All lower case (e.g., hello) |
| // 2) All upper case (e.g., HELLO) |
| // 3) Capitalized (e.g., Hello) |
| // 4) As-is (e.g., HeLlO) |
| // Since English words are stored in lower case, in case of upper case and |
| // capitalized keys, we convert it to lower case in advance. |
| if (Util::IsUpperOrCapitalizedAscii(candidate_value)) { |
| // Look up for keys that are all in upper case or capitalized ASCII. |
| string lower_case_value(candidate_value); |
| Util::LowerString(&lower_case_value); |
| if (dictionary_->HasValue(lower_case_value)) { |
| return kNonSensitive; |
| } |
| } else { |
| // Look up for the original key, including those that are all in lower case |
| // ASCII. |
| if (dictionary_->HasValue(candidate_value)) { |
| return kNonSensitive; |
| } |
| } |
| // If the key contains any alphabetical character and is not in our |
| // dictionary, treat it as privacy sensitive. There also remains some cases to |
| // be considered. Compare following two cases. |
| // Case A: |
| // 1. Type "ywwz1sxm" in Roman-input style then get "yっwz1sxm". |
| // 2. Hit F10 key to convert it to "ywwz1sxm" by |
| // ConvertToHalfAlphanumeric command. |
| // 3. Commit it. |
| // In this case, |segment_key| is "yっwz1sxm" and actually contains |
| // alphabetical characters. So kSensitive will be returned. |
| // So far so good. |
| // Case B: |
| // 1. type "ia1bo3xu" in Roman-input style then get "いあ1ぼ3ぅ". |
| // 2. hit F10 key to convert it to "ia1bo3xu" by |
| // ConvertToHalfAlphanumeric command. |
| // 3. commit it. |
| // In this case, |segment_key| is "ia1bo3xu" and contains no |
| // alphabetical character. So the following check does nothing. |
| // TODO(team): Improve the following rule so that our user experience |
| // can be consistent between case A and B. |
| if (Util::ContainsScriptType(segment_key, Util::ALPHABET)) { |
| return kSensitive; |
| } |
| |
| return kNonSensitive; |
| } |
| |
| UserHistoryStorage::UserHistoryStorage(const string &filename) |
| : storage_(new storage::EncryptedStringStorage(filename)) { |
| } |
| |
| UserHistoryStorage::~UserHistoryStorage() {} |
| |
| bool UserHistoryStorage::Load() { |
| string input; |
| if (!storage_->Load(&input)) { |
| LOG(ERROR) << "Can't load user history data."; |
| return false; |
| } |
| |
| if (!ParseFromString(input)) { |
| LOG(ERROR) << "ParseFromString failed. message looks broken"; |
| return false; |
| } |
| |
| VLOG(1) << "Loaded user histroy, size=" << entries_size(); |
| return true; |
| } |
| |
| bool UserHistoryStorage::Save() const { |
| if (entries_size() == 0) { |
| LOG(WARNING) << "etries size is 0. Not saved"; |
| return false; |
| } |
| |
| string output; |
| if (!AppendToString(&output)) { |
| LOG(ERROR) << "AppendToString failed"; |
| return false; |
| } |
| |
| if (!storage_->Save(output)) { |
| LOG(ERROR) << "Can't save user history data."; |
| return false; |
| } |
| |
| return true; |
| } |
| |
| UserHistoryPredictor::EntryPriorityQueue::EntryPriorityQueue() |
| : pool_(kEntryPoolSize) {} |
| |
| UserHistoryPredictor::EntryPriorityQueue::~EntryPriorityQueue() {} |
| |
| bool UserHistoryPredictor::EntryPriorityQueue::Push(Entry *entry) { |
| DCHECK(entry); |
| if (!seen_.insert(Util::Fingerprint32(entry->value())).second) { |
| VLOG(2) << "found dups"; |
| return false; |
| } |
| const uint32 score = UserHistoryPredictor::GetScore(*entry); |
| agenda_.push(make_pair(score, entry)); |
| return true; |
| } |
| |
| UserHistoryPredictor::Entry * |
| UserHistoryPredictor::EntryPriorityQueue::Pop() { |
| if (agenda_.empty()) { |
| return NULL; |
| } |
| const QueueElement &element = agenda_.top(); |
| Entry *result = element.second; |
| DCHECK(result); |
| agenda_.pop(); |
| return result; |
| } |
| |
| UserHistoryPredictor::Entry * |
| UserHistoryPredictor::EntryPriorityQueue::NewEntry() { |
| return pool_.Alloc(); |
| } |
| |
| class UserHistoryPredictorSyncer : public Thread { |
| public: |
| enum RequestType { |
| LOAD, |
| SAVE |
| }; |
| |
| UserHistoryPredictorSyncer(UserHistoryPredictor *predictor, |
| RequestType type) |
| : predictor_(predictor), type_(type) { |
| DCHECK(predictor_); |
| } |
| |
| virtual void Run() { |
| switch (type_) { |
| case LOAD: |
| VLOG(1) << "Executing Reload method"; |
| predictor_->Load(); |
| break; |
| case SAVE: |
| VLOG(1) << "Executing Sync method"; |
| predictor_->Save(); |
| break; |
| default: |
| LOG(ERROR) << "Unknown request: " << static_cast<int>(type_); |
| } |
| } |
| |
| virtual ~UserHistoryPredictorSyncer() { |
| Join(); |
| } |
| |
| UserHistoryPredictor *predictor_; |
| RequestType type_; |
| }; |
| |
| UserHistoryPredictor::UserHistoryPredictor( |
| const DictionaryInterface *dictionary, |
| const POSMatcher *pos_matcher, |
| const SuppressionDictionary *suppression_dictionary, |
| bool enable_content_word_learning) |
| : dictionary_(dictionary), |
| pos_matcher_(pos_matcher), |
| suppression_dictionary_(suppression_dictionary), |
| predictor_name_("UserHistoryPredictor"), |
| content_word_learning_enabled_(enable_content_word_learning), |
| updated_(false), |
| dic_(new DicCache(UserHistoryPredictor::cache_size())) { |
| AsyncLoad(); // non-blocking |
| // Load() blocking version can be used if any |
| } |
| |
| UserHistoryPredictor::~UserHistoryPredictor() { |
| // In destructor, must call blocking version |
| WaitForSyncer(); |
| Save(); // blocking |
| } |
| |
| string UserHistoryPredictor::GetUserHistoryFileName() { |
| return ConfigFileStream::GetFileName(kFileName); |
| } |
| |
| // return revert id |
| // static |
| uint16 UserHistoryPredictor::revert_id() { |
| return kRevertId; |
| } |
| |
| void UserHistoryPredictor::WaitForSyncer() { |
| if (syncer_.get() != NULL) { |
| syncer_->Join(); |
| syncer_.reset(NULL); |
| } |
| } |
| |
| bool UserHistoryPredictor::WaitForSyncerForTest() { |
| WaitForSyncer(); |
| return true; |
| } |
| |
| bool UserHistoryPredictor::CheckSyncerAndDelete() const { |
| if (syncer_.get() != NULL) { |
| if (syncer_->IsRunning()) { |
| return false; |
| } else { |
| syncer_.reset(NULL); // remove |
| } |
| } |
| |
| return true; |
| } |
| |
| bool UserHistoryPredictor::Sync() { |
| return AsyncSave(); |
| // return Save(); blocking version |
| } |
| |
| bool UserHistoryPredictor::Reload() { |
| WaitForSyncer(); |
| return AsyncLoad(); |
| } |
| |
| bool UserHistoryPredictor::AsyncLoad() { |
| if (!CheckSyncerAndDelete()) { // now loading/saving |
| return true; |
| } |
| |
| syncer_.reset(new UserHistoryPredictorSyncer( |
| this, |
| UserHistoryPredictorSyncer::LOAD)); |
| syncer_->Start(); |
| |
| return true; |
| } |
| |
| bool UserHistoryPredictor::AsyncSave() { |
| if (!updated_) { |
| return true; |
| } |
| |
| if (!CheckSyncerAndDelete()) { // now loading/saving |
| return true; |
| } |
| |
| syncer_.reset(new UserHistoryPredictorSyncer( |
| this, |
| UserHistoryPredictorSyncer::SAVE)); |
| syncer_->Start(); |
| |
| return true; |
| } |
| |
| bool UserHistoryPredictor::Load() { |
| const string filename = GetUserHistoryFileName(); |
| |
| UserHistoryStorage history(filename); |
| if (!history.Load()) { |
| LOG(ERROR) << "UserHistoryStorage::Load() failed"; |
| return false; |
| } |
| |
| for (size_t i = 0; i < history.entries_size(); ++i) { |
| dic_->Insert(EntryFingerprint(history.entries(i)), |
| history.entries(i)); |
| } |
| |
| VLOG(1) << "Loaded user histroy, size=" << history.entries_size(); |
| |
| return true; |
| } |
| |
| bool UserHistoryPredictor::Save() { |
| if (!updated_) { |
| return true; |
| } |
| |
| if (GET_CONFIG(incognito_mode)) { |
| VLOG(2) << "incognito mode"; |
| return true; |
| } |
| |
| if (!GET_CONFIG(use_history_suggest)) { |
| VLOG(2) << "no history suggest"; |
| return true; |
| } |
| |
| const DicElement *tail = dic_->Tail(); |
| if (tail == NULL) { |
| return true; |
| } |
| |
| const string filename = GetUserHistoryFileName(); |
| |
| UserHistoryStorage history(filename); |
| for (const DicElement *elm = tail; elm != NULL; elm = elm->prev) { |
| history.add_entries()->CopyFrom(elm->value); |
| } |
| |
| // update usage stats here. |
| usage_stats::UsageStats::SetInteger( |
| "UserHistoryPredictorEntrySize", |
| static_cast<int>(history.entries_size())); |
| |
| if (!history.Save()) { |
| LOG(ERROR) << "UserHistoryStorage::Save() failed"; |
| return false; |
| } |
| |
| updated_ = false; |
| |
| return true; |
| } |
| |
| bool UserHistoryPredictor::ClearAllHistory() { |
| // Wait until syncer finishes |
| WaitForSyncer(); |
| |
| VLOG(1) << "Clearing user prediction"; |
| // renew DicCache as LRUCache tries to reuse the internal value by |
| // using FreeList |
| dic_.reset(new DicCache(UserHistoryPredictor::cache_size())); |
| |
| // insert a dummy event entry. |
| InsertEvent(Entry::CLEAN_ALL_EVENT); |
| |
| updated_ = true; |
| |
| Sync(); |
| |
| return true; |
| } |
| |
| bool UserHistoryPredictor::ClearUnusedHistory() { |
| // Wait until syncer finishes |
| WaitForSyncer(); |
| |
| VLOG(1) << "Clearing unused prediction"; |
| const DicElement *head = dic_->Head(); |
| if (head == NULL) { |
| VLOG(2) << "dic head is NULL"; |
| return false; |
| } |
| |
| vector<uint32> keys; |
| for (const DicElement *elm = head; elm != NULL; elm = elm->next) { |
| VLOG(3) << elm->key << " " << elm->value.suggestion_freq(); |
| if (elm->value.suggestion_freq() == 0) { |
| keys.push_back(elm->key); |
| } |
| } |
| |
| for (size_t i = 0; i < keys.size(); ++i) { |
| VLOG(2) << "Removing: " << keys[i]; |
| if (!dic_->Erase(keys[i])) { |
| LOG(ERROR) << "cannot erase " << keys[i]; |
| } |
| } |
| |
| // insert a dummy event entry. |
| InsertEvent(Entry::CLEAN_UNUSED_EVENT); |
| |
| updated_ = true; |
| |
| Sync(); |
| |
| VLOG(1) << keys.size() << " removed"; |
| |
| return true; |
| } |
| |
| // Erases all the next_entries whose entry_fp field equals |fp|. |
| void UserHistoryPredictor::EraseNextEntries(uint32 fp, Entry *entry) { |
| const size_t orig_size = entry->next_entries_size(); |
| size_t new_size = orig_size; |
| for (size_t pos = 0; pos < new_size; ) { |
| if (entry->next_entries(pos).entry_fp() == fp) { |
| entry->mutable_next_entries()->SwapElements(pos, --new_size); |
| } else { |
| ++pos; |
| } |
| } |
| for (size_t i = 0; i < orig_size - new_size; ++i) { |
| entry->mutable_next_entries()->RemoveLast(); |
| } |
| } |
| |
| // Recursively finds the Ngram history that produces |target_key| and |
| // |target_value| and removes the last link. For example, if there exists a |
| // chain like |
| // ("aaa", "AAA") -- ("bbb", "BBB") -- ("ccc", "CCC"), |
| // and if target_key == "aaabbbccc" and target_value == "AAABBBCCC", the link |
| // from ("bbb", "BBB") to ("ccc", "CCC") is removed. If a link was removed, this |
| // method returns DONE. If no history entries can produce the target key |
| // value, then NOT_FOUND is returned. TAIL is returned only when the |
| // tail was found, e.g., in the above example, when the method finds the tail |
| // node ("ccc", "CCC"). |
| UserHistoryPredictor::RemoveNgramChainResult |
| UserHistoryPredictor::RemoveNgramChain(const string &target_key, |
| const string &target_value, |
| Entry *entry, |
| vector<StringPiece> *key_ngrams, |
| size_t key_ngrams_len, |
| vector<StringPiece> *value_ngrams, |
| size_t value_ngrams_len) { |
| DCHECK(entry); |
| DCHECK(key_ngrams); |
| DCHECK(value_ngrams); |
| |
| // Update the lengths with the current entry node. |
| key_ngrams_len += entry->key().size(); |
| value_ngrams_len += entry->value().size(); |
| |
| // This is the case where ngram key and value are shorter than the target key |
| // and value, respectively. In this case, we need to find further entries to |
| // concatenate in order to make |target_key| and |target_value|. |
| if (key_ngrams_len < target_key.size() && |
| value_ngrams_len < target_value.size()) { |
| key_ngrams->push_back(entry->key()); |
| value_ngrams->push_back(entry->value()); |
| for (size_t i = 0; i < entry->next_entries().size(); ++i) { |
| const uint32 fp = entry->next_entries(i).entry_fp(); |
| Entry *e = dic_->MutableLookupWithoutInsert(fp); |
| if (e == NULL) { |
| continue; |
| } |
| const RemoveNgramChainResult r = RemoveNgramChain(target_key, |
| target_value, |
| e, |
| key_ngrams, |
| key_ngrams_len, |
| value_ngrams, |
| value_ngrams_len); |
| switch (r) { |
| case DONE: |
| return DONE; |
| case TAIL: |
| // |entry| is the second-to-the-last node. So cut the link to the |
| // child entry. |
| EraseNextEntries(fp, entry); |
| return DONE; |
| default: |
| break; |
| } |
| } |
| // Recover the state. |
| key_ngrams->pop_back(); |
| value_ngrams->pop_back(); |
| return NOT_FOUND; |
| } |
| |
| // This is the case where the current ngram key and value have the same |
| // lengths as those of |target_key| and |target_value|, respectively. |
| if (key_ngrams_len == target_key.size() && |
| value_ngrams_len == target_value.size()) { |
| key_ngrams->push_back(entry->key()); |
| value_ngrams->push_back(entry->value()); |
| string ngram_key, ngram_value; |
| Util::JoinStringPieces(*key_ngrams, "", &ngram_key); |
| Util::JoinStringPieces(*value_ngrams, "", &ngram_value); |
| if (ngram_key == target_key && ngram_value == target_value) { |
| // |entry| is the last node. So return TAIL to tell the caller so |
| // that it can remove the link to this last node. |
| return TAIL; |
| } |
| key_ngrams->pop_back(); |
| value_ngrams->pop_back(); |
| return NOT_FOUND; |
| } |
| |
| return NOT_FOUND; |
| } |
| |
| bool UserHistoryPredictor::ClearHistoryEntry(const string &key, |
| const string &value) { |
| bool deleted = false; |
| { |
| // Find the history entry that has the exactly same key and value and has |
| // not been removed yet. If exists, remove it. |
| Entry *entry = dic_->MutableLookupWithoutInsert(Fingerprint(key, value)); |
| if (entry != NULL && !entry->removed()) { |
| entry->set_suggestion_freq(0); |
| entry->set_conversion_freq(0); |
| entry->set_removed(true); |
| // We don't clear entry->next_entries() so that we can generate prediction |
| // by chaining. |
| deleted = true; |
| } |
| } |
| { |
| // Find a chain of history entries that produces key and value. If exists, |
| // remove the link so that N-gram history prediction never generates this |
| // key value pair.. |
| for (DicElement *elm = dic_->MutableHead(); elm != NULL; elm = elm->next) { |
| Entry *entry = &elm->value; |
| if (!Util::StartsWith(key, entry->key()) || |
| !Util::StartsWith(value, entry->value())) { |
| continue; |
| } |
| vector<StringPiece> key_ngrams, value_ngrams; |
| if (RemoveNgramChain( |
| key, value, entry, &key_ngrams, 0, &value_ngrams, 0) == DONE) { |
| deleted = true; |
| } |
| } |
| } |
| if (deleted) { |
| updated_ = true; |
| } |
| return deleted; |
| } |
| |
| // return true if prev_entry has a next_fp link to entry |
| // static |
| bool UserHistoryPredictor::HasBigramEntry( |
| const UserHistoryPredictor::Entry &entry, |
| const UserHistoryPredictor::Entry &prev_entry) { |
| const uint32 fp = EntryFingerprint(entry); |
| for (int i = 0; i < prev_entry.next_entries_size(); ++i) { |
| if (fp == prev_entry.next_entries(i).entry_fp()) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| // static |
| string UserHistoryPredictor::GetRomanMisspelledKey( |
| const Segments &segments) { |
| if (GET_CONFIG(preedit_method) != config::Config::ROMAN) { |
| return ""; |
| } |
| |
| const string &preedit = segments.conversion_segment(0).key(); |
| // TODO(team): Use composer if it is available. |
| // segments.composer()->GetQueryForConversion(&preedit); |
| // Since ConverterInterface doesn't have StartPredictionWithComposer, |
| // we cannot use composer currently. |
| if (!preedit.empty() && MaybeRomanMisspelledKey(preedit)) { |
| return ToRoman(preedit); |
| } |
| |
| return ""; |
| } |
| |
| // static |
| bool UserHistoryPredictor::MaybeRomanMisspelledKey(const string &key) { |
| int num_alpha = 0; |
| int num_hiragana = 0; |
| int num_unknown = 0; |
| for (ConstChar32Iterator iter(key); !iter.Done(); iter.Next()) { |
| const char32 w = iter.Get(); |
| const Util::ScriptType type = Util::GetScriptType(w); |
| if (type == Util::HIRAGANA || w == 0x30FC) { // "ー". |
| ++num_hiragana; |
| continue; |
| } |
| if (type == Util::UNKNOWN_SCRIPT && num_unknown <= 0) { |
| ++num_unknown; |
| continue; |
| } |
| if (type == Util::ALPHABET && num_alpha <= 0) { |
| ++num_alpha; |
| continue; |
| } |
| return false; |
| } |
| |
| return (num_hiragana > 0 && |
| ((num_alpha == 1 && num_unknown == 0) || |
| (num_alpha == 0 && num_unknown == 1))); |
| } |
| |
| // static |
| bool UserHistoryPredictor::RomanFuzzyPrefixMatch( |
| const string &str, const string &prefix) { |
| if (prefix.empty() || prefix.size() > str.size()) { |
| return false; |
| } |
| |
| // 1. allow one character delete in Romanji sequence. |
| // 2. allow one swap in Romanji sequence. |
| for (size_t i = 0; i < prefix.size(); ++i) { |
| if (prefix[i] == str[i]) { |
| continue; |
| } |
| |
| if (str[i] == '-') { |
| // '-' voice sound mark can be matched to any |
| // non-alphanum character. |
| if (!isalnum(prefix[i])) { |
| string replaced_prefix = prefix; |
| replaced_prefix[i] = str[i]; |
| if (Util::StartsWith(str, replaced_prefix)) { |
| return true; |
| } |
| } |
| } else { |
| // deletion. |
| string inserted_prefix = prefix; |
| inserted_prefix.insert(i, 1, str[i]); |
| if (Util::StartsWith(str, inserted_prefix)) { |
| return true; |
| } |
| |
| // swap. |
| if (i + 1 < prefix.size()) { |
| string swapped_prefix = prefix; |
| swap(swapped_prefix[i], swapped_prefix[i + 1]); |
| if (Util::StartsWith(str, swapped_prefix)) { |
| return true; |
| } |
| } |
| } |
| |
| return false; |
| } |
| |
| // |prefix| is an exact suffix of |str|. |
| return false; |
| } |
| |
| bool UserHistoryPredictor::RomanFuzzyLookupEntry( |
| const string &roman_input_key, |
| const UserHistoryPredictor::Entry *entry, |
| EntryPriorityQueue *results) const { |
| if (roman_input_key.empty()) { |
| return false; |
| } |
| |
| DCHECK(entry); |
| DCHECK(results); |
| |
| if (!RomanFuzzyPrefixMatch(ToRoman(entry->key()), |
| roman_input_key)) { |
| return false; |
| } |
| |
| Entry *result = results->NewEntry(); |
| DCHECK(result); |
| result->Clear(); |
| result->CopyFrom(*entry); |
| result->set_spelling_correction(true); |
| results->Push(result); |
| |
| return true; |
| } |
| |
| UserHistoryPredictor::Entry *UserHistoryPredictor::AddEntry( |
| const Entry &entry, EntryPriorityQueue *results) const { |
| // We add an entry even if it was marked as removed so that it can be used to |
| // generate prediction by entry chaining. The deleted entry itself is never |
| // shown in the final prediction result as it is filtered finally. |
| Entry *new_entry = results->NewEntry(); |
| new_entry->Clear(); |
| new_entry->CopyFrom(entry); |
| return new_entry; |
| } |
| |
| UserHistoryPredictor::Entry *UserHistoryPredictor::AddEntryWithNewKeyValue( |
| const string &key, const string &value, const Entry &entry, |
| EntryPriorityQueue *results) const { |
| // We add an entry even if it was marked as removed so that it can be used to |
| // generate prediction by entry chaining. The deleted entry itself is never |
| // shown in the final prediction result as it is filtered finally. |
| Entry *new_entry = results->NewEntry(); |
| new_entry->Clear(); |
| new_entry->CopyFrom(entry); |
| new_entry->set_key(key); |
| new_entry->set_value(value); |
| |
| // Set removed field true if the new key and value were removed. |
| const Entry *e = dic_->LookupWithoutInsert(Fingerprint(key, value)); |
| new_entry->set_removed(e != NULL && e->removed()); |
| |
| return new_entry; |
| } |
| |
| bool UserHistoryPredictor::LookupEntry( |
| const string &input_key, |
| const string &key_base, |
| const Trie<string> *key_expanded, |
| const UserHistoryPredictor::Entry *entry, |
| const UserHistoryPredictor::Entry *prev_entry, |
| EntryPriorityQueue *results) const { |
| CHECK(entry); |
| CHECK(results); |
| |
| Entry *result = NULL; |
| |
| const Entry *last_entry = NULL; |
| |
| // last_access_time of the left-closest content word. |
| uint64 left_last_access_time = 0; |
| |
| // last_access_time of the left-most content word. |
| uint64 left_most_last_access_time = 0; |
| |
| // Example: [a|B|c|D] |
| // a,c: functional word |
| // B,D: content word |
| // left_last_access_time: timestamp of D |
| // left_most_last_access_time: timestamp of B |
| |
| // |input_key| is a query user is now typing. |
| // |entry->key()| is a target value saved in the database. |
| // const string input_key = key_base; |
| |
| const MatchType mtype = GetMatchTypeFromInput( |
| input_key, key_base, key_expanded, entry->key()); |
| if (mtype == NO_MATCH) { |
| return false; |
| } else if (mtype == LEFT_EMPTY_MATCH) { // zero-query-suggestion |
| // if |input_key| is empty, the |prev_entry| and |entry| must |
| // have bigram relation. |
| if (prev_entry != NULL && HasBigramEntry(*entry, *prev_entry)) { |
| result = AddEntry(*entry, results); |
| if (result) { |
| last_entry = entry; |
| left_last_access_time = entry->last_access_time(); |
| left_most_last_access_time = IsContentWord(entry->value()) ? |
| left_last_access_time : 0; |
| } |
| } else { |
| return false; |
| } |
| } else if (mtype == LEFT_PREFIX_MATCH) { |
| // |input_key| is shorter than |entry->key()| |
| // This scenario is a simple prefix match. |
| // e.g., |input_key|="foo", |entry->key()|="foobar" |
| result = AddEntry(*entry, results); |
| if (result) { |
| last_entry = entry; |
| left_last_access_time = entry->last_access_time(); |
| left_most_last_access_time = IsContentWord(entry->value()) ? |
| left_last_access_time : 0; |
| } |
| } else if (mtype == RIGHT_PREFIX_MATCH || mtype == EXACT_MATCH) { |
| // |input_key| is longer than or the same as |entry->key()|. |
| // In this case, recursively traverse "next_entries" until |
| // target entry gets longer than input_key. |
| // e.g., |input_key|="foobar", |entry->key()|="foo" |
| left_last_access_time = entry->last_access_time(); |
| left_most_last_access_time = IsContentWord(entry->value()) ? |
| left_last_access_time : 0; |
| string key = entry->key(); |
| string value = entry->value(); |
| const Entry *current_entry = entry; |
| set<uint32> seen; |
| seen.insert(EntryFingerprint(*current_entry)); |
| // Until target entry gets longer than input_key. |
| while (key.size() <= input_key.size()) { |
| const Entry *latest_entry = NULL; |
| const Entry *left_same_timestamp_entry = NULL; |
| const Entry *left_most_same_timestamp_entry = NULL; |
| for (size_t i = 0; i < current_entry->next_entries_size(); ++i) { |
| const Entry *tmp_entry = dic_->LookupWithoutInsert( |
| current_entry->next_entries(i).entry_fp()); |
| if (tmp_entry == NULL || tmp_entry->key().empty()) { |
| continue; |
| } |
| const MatchType mtype2 = GetMatchType(key + tmp_entry->key(), |
| input_key); |
| if (mtype2 == NO_MATCH || mtype2 == LEFT_EMPTY_MATCH) { |
| continue; |
| } |
| if (latest_entry == NULL || |
| latest_entry->last_access_time() < tmp_entry->last_access_time()) { |
| latest_entry = tmp_entry; |
| } |
| if (tmp_entry->last_access_time() == left_last_access_time) { |
| left_same_timestamp_entry = tmp_entry; |
| } |
| if (tmp_entry->last_access_time() == left_most_last_access_time) { |
| left_most_same_timestamp_entry = tmp_entry; |
| } |
| } |
| |
| // Prefer bigrams which are generated at the same time. |
| // When last_access_time are the same, these two bigrams were |
| // input together. |
| // The preferences: |
| // (1). The current entry's time stamp is equal to that of |
| // left most content word |
| // (2). The current entry's time stamp is equal to that of |
| // left closest content word |
| // (3). The current entry is the latest |
| const Entry *next_entry = left_most_same_timestamp_entry; |
| if (next_entry == NULL) { |
| next_entry = left_same_timestamp_entry; |
| } |
| if (next_entry == NULL) { |
| next_entry = latest_entry; |
| } |
| |
| if (next_entry == NULL || next_entry->key().empty()) { |
| break; |
| } |
| |
| // if duplicate entry is found, don't expand more. |
| // This is because an entry only has one timestamp. |
| // we cannot trust the timestamp if there are duplicate values |
| // in one input. |
| if (!seen.insert(EntryFingerprint(*next_entry)).second) { |
| break; |
| } |
| |
| key += next_entry->key(); |
| value += next_entry->value(); |
| current_entry = next_entry; |
| last_entry = next_entry; |
| |
| // Don't update left_access_time if the current entry is |
| // not a content word. |
| // The time-stamp of non-content-word will be updated frequently. |
| // The time-stamp of the previous candidate is more trustful. |
| // It partially fixes the bug http://b/2843371. |
| const bool is_content_word = IsContentWord(current_entry->value()); |
| |
| if (is_content_word) { |
| left_last_access_time = current_entry->last_access_time(); |
| } |
| |
| // if left_most entry is a functional word (symbols/punctuations), |
| // we don't take it as a canonical candidate. |
| if (left_most_last_access_time == 0 && is_content_word) { |
| left_most_last_access_time = current_entry->last_access_time(); |
| } |
| } |
| |
| if (key.size() < input_key.size()) { |
| VLOG(3) << "Cannot find prefix match even after chain rules"; |
| return false; |
| } |
| |
| result = AddEntryWithNewKeyValue(key, value, *entry, results); |
| } else { |
| LOG(ERROR) << "Unknown match mode: " << mtype; |
| return false; |
| } |
| |
| if (result == NULL) { |
| return false; |
| } |
| |
| // if prev entry is not NULL, check whether there is a bigram |
| // from |prev_entry| to |entry|. |
| result->set_bigram_boost(false); |
| |
| if (prev_entry != NULL && HasBigramEntry(*entry, *prev_entry)) { |
| // set bigram_boost flag so that this entry is boosted |
| // against LRU policy. |
| result->set_bigram_boost(true); |
| } |
| |
| if (!result->removed()) { |
| results->Push(result); |
| } |
| |
| // Expand new entry which was input just after "last_entry" |
| if (last_entry != NULL && |
| Util::CharsLen(result->key()) >= 1 && |
| 2 * Util::CharsLen(input_key) >= Util::CharsLen(result->key())) { |
| const Entry *latest_entry = NULL; |
| const Entry *left_same_timestamp_entry = NULL; |
| const Entry *left_most_same_timestamp_entry = NULL; |
| for (int i = 0; i < last_entry->next_entries_size(); ++i) { |
| const Entry *tmp_entry = dic_->LookupWithoutInsert( |
| last_entry->next_entries(i).entry_fp()); |
| if (tmp_entry == NULL || tmp_entry->key().empty()) { |
| continue; |
| } |
| if (latest_entry == NULL || |
| latest_entry->last_access_time() < tmp_entry->last_access_time()) { |
| latest_entry = tmp_entry; |
| } |
| if (tmp_entry->last_access_time() == left_last_access_time) { |
| left_same_timestamp_entry = tmp_entry; |
| } |
| if (tmp_entry->last_access_time() == left_most_last_access_time) { |
| left_most_same_timestamp_entry = tmp_entry; |
| } |
| } |
| |
| const Entry *next_entry = left_most_same_timestamp_entry; |
| if (next_entry == NULL) { |
| next_entry = left_same_timestamp_entry; |
| } |
| if (next_entry == NULL) { |
| next_entry = latest_entry; |
| } |
| |
| // the new entry was input within 10 seconds. |
| // TODO(taku): This is a simple heuristics. |
| if (next_entry != NULL && !next_entry->key().empty() && |
| abs(static_cast<int32>(next_entry->last_access_time() - |
| last_entry->last_access_time())) <= 10 && |
| IsContentWord(next_entry->value())) { |
| Entry *result2 = AddEntryWithNewKeyValue( |
| result->key() + next_entry->key(), |
| result->value() + next_entry->value(), |
| *result, |
| results); |
| if (!result2->removed()) { |
| results->Push(result2); |
| } |
| } |
| } |
| |
| return true; |
| } |
| |
| bool UserHistoryPredictor::Predict(Segments *segments) const { |
| ConversionRequest default_request; |
| return PredictForRequest(default_request, segments); |
| } |
| |
| bool UserHistoryPredictor::PredictForRequest(const ConversionRequest &request, |
| Segments *segments) const { |
| if (!CheckSyncerAndDelete()) { |
| LOG(WARNING) << "Syncer is running"; |
| return false; |
| } |
| |
| if (GET_CONFIG(incognito_mode)) { |
| VLOG(2) << "incognito mode"; |
| return false; |
| } |
| |
| if (segments->request_type() == Segments::CONVERSION) { |
| VLOG(2) << "request type is CONVERSION"; |
| return false; |
| } |
| |
| if (!GET_CONFIG(use_history_suggest) && |
| segments->request_type() == Segments::SUGGESTION) { |
| VLOG(2) << "no history suggest"; |
| return false; |
| } |
| |
| if (segments->conversion_segments_size() < 1) { |
| VLOG(2) << "segment size < 1"; |
| return false; |
| } |
| |
| if (dic_->Head() == NULL) { |
| VLOG(2) << "dic head is NULL"; |
| return false; |
| } |
| |
| const RequestType request_type = request.request().zero_query_suggestion() ? |
| ZERO_QUERY_SUGGESTION : DEFAULT; |
| const string &input_key = segments->conversion_segment(0).key(); |
| if (IsPunctuation(Util::SubString(input_key, 0, 1))) { |
| VLOG(2) << "input_key starts with punctuations"; |
| return false; |
| } |
| |
| const size_t input_key_len = Util::CharsLen(input_key); |
| if (input_key_len == 0 && request_type == DEFAULT) { |
| VLOG(2) << "key length is 0"; |
| return false; |
| } |
| |
| const Entry *prev_entry = |
| LookupPrevEntry(*segments, request.request().available_emoji_carrier()); |
| if (input_key_len == 0 && prev_entry == NULL) { |
| VLOG(1) << "If input_key_len is 0, prev_entry must be set"; |
| return false; |
| } |
| |
| EntryPriorityQueue results; |
| GetResultsFromHistoryDictionary(request, *segments, prev_entry, &results); |
| if (results.size() == 0) { |
| VLOG(2) << "no prefix match candiate is found."; |
| return false; |
| } |
| |
| return InsertCandidates(request_type, request, segments, &results); |
| } |
| |
| const UserHistoryPredictor::Entry *UserHistoryPredictor::LookupPrevEntry( |
| const Segments &segments, uint32 available_emoji_carrier) const { |
| const size_t history_segments_size = segments.history_segments_size(); |
| const Entry *prev_entry = NULL; |
| // when threre are non-zero history segments, lookup an entry |
| // from the LRU dictionary, which is correspoinding to the last |
| // history segment. |
| if (history_segments_size == 0) { |
| return NULL; |
| } |
| |
| const Segment &history_segment = |
| segments.history_segment(history_segments_size - 1); |
| |
| // Simply lookup the history_segment. |
| prev_entry = dic_->LookupWithoutInsert(SegmentFingerprint(history_segment)); |
| |
| // When |prev_entry| is NULL or |prev_entry| has no valid next_entries, |
| // do linear-search over the LRU. |
| if ((prev_entry == NULL && history_segment.candidates_size() > 0) || |
| (prev_entry != NULL && prev_entry->next_entries_size() == 0)) { |
| const string &prev_value = prev_entry == NULL ? |
| history_segment.candidate(0).value : prev_entry->value(); |
| int trial = 0; |
| for (const DicElement *elm = dic_->Head(); |
| trial++ < kMaxPrevValueTrial && elm != NULL; elm = elm->next) { |
| const Entry *entry = &(elm->value); |
| // entry->value() equals to the prev_value or |
| // entry->value() is a SUFFIX of prev_value. |
| // length of entry->value() must be >= 2, as single-length |
| // match would be noisy. |
| if (IsValidEntry(*entry, available_emoji_carrier) && |
| entry != prev_entry && |
| entry->next_entries_size() > 0 && |
| Util::CharsLen(entry->value()) >= 2 && |
| (entry->value() == prev_value || |
| Util::EndsWith(prev_value, entry->value()))) { |
| prev_entry = entry; |
| break; |
| } |
| } |
| } |
| return prev_entry; |
| } |
| |
| void UserHistoryPredictor::GetResultsFromHistoryDictionary( |
| const ConversionRequest &request, |
| const Segments &segments, const Entry *prev_entry, |
| EntryPriorityQueue *results) const { |
| DCHECK(results); |
| const size_t max_results_size = 5 * segments.max_prediction_candidates_size(); |
| |
| // Get romanized input key if the given preedit looks misspelled. |
| const string roman_input_key = GetRomanMisspelledKey(segments); |
| |
| // TODO(team): make GetKanaMisspelledKey(segments); |
| // const string kana_input_key = GetKanaMisspelledKey(segments); |
| |
| // If we have ambiguity for the input, get expanded key. |
| // Example1 roman input: for "あk", we will get |base|, "あ" and |expanded|, |
| // "か", "き", etc |
| // Example2 kana input: for "あか", we will get |base|, "あ" and |expanded|, |
| // "か", and "が". |
| |
| // |base_key| and |input_key| could be different |
| // For kana-input, we will expand the ambiguity for "゛". |
| // When we input "もす", |
| // |base_key|: "も" |
| // |expanded|: "す", "ず" |
| // |input_key|: "もす" |
| // In this case, we want to show candidates for "もす" as EXACT match, |
| // and candidates for "もず" as LEFT_PREFIX_MATCH |
| // |
| // For roman-input, when we input "あk", |
| // |input_key| is "あk" and |base_key| is "あ" |
| string input_key; |
| string base_key; |
| scoped_ptr<Trie<string> > expanded; |
| GetInputKeyFromSegments(request, segments, &input_key, &base_key, &expanded); |
| |
| int trial = 0; |
| for (const DicElement *elm = dic_->Head(); elm != NULL; elm = elm->next) { |
| if (!IsValidEntryIgnoringRemovedField( |
| elm->value, request.request().available_emoji_carrier())) { |
| continue; |
| } |
| if (segments.request_type() == Segments::SUGGESTION && |
| trial++ >= kMaxSuggestionTrial) { |
| VLOG(2) << "too many trials"; |
| break; |
| } |
| |
| // lookup key from elm_value and prev_entry. |
| // If a new entry is found, the entry is pushed to the results. |
| // TODO(team): make KanaFuzzyLookupEntry(). |
| if (!LookupEntry(input_key, base_key, expanded.get(), &(elm->value), |
| prev_entry, results) && |
| !RomanFuzzyLookupEntry(roman_input_key, &(elm->value), results)) { |
| continue; |
| } |
| |
| // already found enough results. |
| if (results->size() >= max_results_size) { |
| break; |
| } |
| } |
| } |
| |
| // static |
| void UserHistoryPredictor::GetInputKeyFromSegments( |
| const ConversionRequest &request, const Segments &segments, |
| string *input_key, string *base, |
| scoped_ptr<Trie<string> > *expanded) { |
| DCHECK(input_key); |
| DCHECK(base); |
| |
| if (!request.has_composer() || |
| !FLAGS_enable_expansion_for_user_history_predictor) { |
| *input_key = segments.conversion_segment(0).key(); |
| *base = segments.conversion_segment(0).key(); |
| return; |
| } |
| |
| request.composer().GetStringForPreedit(input_key); |
| set<string> expanded_set; |
| request.composer().GetQueriesForPrediction(base, &expanded_set); |
| if (expanded_set.size() > 0) { |
| expanded->reset(new Trie<string>); |
| for (set<string>::const_iterator itr = expanded_set.begin(); |
| itr != expanded_set.end(); ++itr) { |
| // For getting matched key, insert values |
| (*expanded)->AddEntry(*itr, *itr); |
| } |
| } |
| } |
| |
| bool UserHistoryPredictor::InsertCandidates(RequestType request_type, |
| const ConversionRequest &request, |
| Segments *segments, |
| EntryPriorityQueue *results) const { |
| DCHECK(results); |
| Segment *segment = segments->mutable_conversion_segment(0); |
| if (segment == NULL) { |
| LOG(ERROR) << "segment is NULL"; |
| return false; |
| } |
| const uint32 input_key_len = Util::CharsLen(segment->key()); |
| while (segment->candidates_size() < |
| segments->max_prediction_candidates_size()) { |
| // |results| is a priority queue where the elemtnt |
| // in the queue is sorted by the score defined in GetScore(). |
| const Entry *result_entry = results->Pop(); |
| if (result_entry == NULL) { |
| // Pop() returns NULL when no more valid entry exists. |
| break; |
| } |
| bool is_valid_candidate = false; |
| if (segments->request_type() == Segments::PREDICTION) { |
| is_valid_candidate = true; |
| } else if (segments->request_type() == Segments::SUGGESTION) { |
| // The top result of suggestion should be a VALID suggestion candidate. |
| // i.e., SuggestionTrigerFunc should return true for the first |
| // candidate. |
| // If user types "デスノート" too many times, "デスノート" will be |
| // suggested when user types "で". It is expected, but if user types |
| // "です" after that, showing "デスノート" is annoying. |
| // In this situation, "です" is in the LRU, but SuggestionTrigerFunc |
| // returns false for "です", since it is short. |
| if (IsValidSuggestion(request_type, |
| input_key_len, *result_entry)) { |
| is_valid_candidate = true; |
| } else if (segment->candidates_size() == 0) { |
| VLOG(2) << "candidates size is 0"; |
| return false; |
| } |
| } else { |
| LOG(ERROR) << "Unknown mode"; |
| return false; |
| } |
| |
| if (!is_valid_candidate) { |
| VLOG(2) << "not a valid candidate: " << result_entry->key(); |
| continue; |
| } |
| |
| if (request.request().mixed_conversion() && |
| result_entry->suggestion_freq() < 2 && |
| Util::CharsLen(result_entry->value()) > 8) { |
| // Don't show long history for mixed conversion |
| // TODO(toshiyuki): Better to merge this into IsValidSuggestion logic. |
| VLOG(2) << "long candidate: " << result_entry->value(); |
| continue; |
| } |
| |
| Segment::Candidate *candidate = segment->push_back_candidate(); |
| DCHECK(candidate); |
| candidate->Init(); |
| candidate->key = result_entry->key(); |
| candidate->content_key = result_entry->key(); |
| candidate->value = result_entry->value(); |
| candidate->content_value = result_entry->value(); |
| candidate->attributes |= |
| Segment::Candidate::USER_HISTORY_PREDICTION | |
| Segment::Candidate::NO_VARIANTS_EXPANSION; |
| if (result_entry->spelling_correction()) { |
| candidate->attributes |= Segment::Candidate::SPELLING_CORRECTION; |
| } |
| const string &description = result_entry->description(); |
| // If we have stored description, set it exactly. |
| if (!description.empty()) { |
| candidate->description = description; |
| candidate->attributes |= Segment::Candidate::NO_EXTRA_DESCRIPTION; |
| } else { |
| VariantsRewriter::SetDescriptionForPrediction(*pos_matcher_, candidate); |
| } |
| #if DEBUG |
| if (candidate->description.find("History") == string::npos) { |
| candidate->description += " History"; |
| } |
| #endif // DEBUG |
| } |
| |
| return (segment->candidates_size() > 0); |
| } |
| |
| void UserHistoryPredictor::InsertNextEntry( |
| const UserHistoryPredictor::NextEntry &next_entry, |
| UserHistoryPredictor::Entry *entry) const { |
| if (next_entry.entry_fp() == 0 || entry == NULL) { |
| return; |
| } |
| |
| NextEntry *target_next_entry = NULL; |
| |
| // If next_entries_size is less than kMaxNextEntriesSize, |
| // we simply allocate a new entry. |
| if (entry->next_entries_size() < max_next_entries_size()) { |
| target_next_entry = entry->add_next_entries(); |
| } else { |
| // Otherwise, find the oldest next_entry. |
| uint64 last_access_time = kuint64max; |
| for (int i = 0; i < entry->next_entries_size(); ++i) { |
| // already has the same id |
| if (next_entry.entry_fp() == entry->next_entries(i).entry_fp()) { |
| target_next_entry = entry->mutable_next_entries(i); |
| break; |
| } |
| const Entry *found_entry = dic_->LookupWithoutInsert( |
| entry->next_entries(i).entry_fp()); |
| // reuse the entry if it is already removed from the LRU. |
| if (found_entry == NULL) { |
| target_next_entry = entry->mutable_next_entries(i); |
| break; |
| } |
| // preserve the oldest entry |
| if (target_next_entry == NULL || |
| last_access_time > found_entry->last_access_time()) { |
| target_next_entry = entry->mutable_next_entries(i); |
| last_access_time = found_entry->last_access_time(); |
| } |
| } |
| } |
| |
| if (target_next_entry == NULL) { |
| LOG(ERROR) << "cannot find a room for inserting next fp"; |
| return; |
| } |
| |
| target_next_entry->CopyFrom(next_entry); |
| } |
| |
| bool UserHistoryPredictor::IsValidEntry( |
| const Entry &entry, uint32 available_emoji_carrier) const { |
| return !entry.removed() && |
| IsValidEntryIgnoringRemovedField(entry, available_emoji_carrier); |
| } |
| |
| bool UserHistoryPredictor::IsValidEntryIgnoringRemovedField( |
| const Entry &entry, uint32 available_emoji_carrier) const { |
| if (entry.entry_type() != Entry::DEFAULT_ENTRY || |
| suppression_dictionary_->SuppressEntry(entry.key(), entry.value())) { |
| return false; |
| } |
| |
| if (IsEmojiEntry(entry)) { |
| if (Util::IsAndroidPuaEmoji(entry.value())) { |
| // Android carrier dependent emoji. |
| const uint32 kAndroidCarrier = |
| Request::DOCOMO_EMOJI | |
| Request::SOFTBANK_EMOJI | |
| Request::KDDI_EMOJI; |
| if (!(available_emoji_carrier & kAndroidCarrier)) { |
| return false; |
| } |
| } else { |
| // Unicode 6.0 emoji. |
| if (!(available_emoji_carrier & Request::UNICODE_EMOJI)) { |
| return false; |
| } |
| } |
| } |
| |
| return true; |
| } |
| |
| void UserHistoryPredictor::InsertEvent(EntryType type) { |
| if (type == Entry::DEFAULT_ENTRY) { |
| return; |
| } |
| |
| const uint64 last_access_time = Util::GetTime(); |
| const uint32 dic_key = Fingerprint("", "", type); |
| |
| CHECK(dic_.get()); |
| DicElement *e = dic_->Insert(dic_key); |
| if (e == NULL) { |
| VLOG(2) << "insert failed"; |
| return; |
| } |
| |
| Entry *entry = &(e->value); |
| DCHECK(entry); |
| entry->Clear(); |
| entry->set_entry_type(type); |
| entry->set_last_access_time(last_access_time); |
| } |
| |
| void UserHistoryPredictor::Insert(const string &key, |
| const string &value, |
| const string &description, |
| bool is_suggestion_selected, |
| uint32 next_fp, |
| uint64 last_access_time, |
| Segments *segments) { |
| if (key.empty() || value.empty() || |
| key.size() > kMaxStringLength || |
| value.size() > kMaxStringLength || |
| description.size() > kMaxStringLength) { |
| return; |
| } |
| |
| const uint32 dic_key = Fingerprint(key, value); |
| |
| if (!dic_->HasKey(dic_key)) { |
| // the key is a new key inserted in the last Finish method. |
| // Here we push a new RevertEntry so that the new "key" can be |
| // removed when Revert() method is called. |
| Segments::RevertEntry *revert_entry = segments->push_back_revert_entry(); |
| DCHECK(revert_entry); |
| revert_entry->key = Uint32ToString(dic_key); |
| revert_entry->id = UserHistoryPredictor::revert_id(); |
| revert_entry->revert_entry_type = Segments::RevertEntry::CREATE_ENTRY; |
| } else { |
| // the key is a old key not inserted in the last Finish method |
| // TODO(taku): |
| // add a treatment for UPDATE_ENTRY mode |
| } |
| |
| DicElement *e = dic_->Insert(dic_key); |
| if (e == NULL) { |
| VLOG(2) << "insert failed"; |
| return; |
| } |
| |
| Entry *entry = &(e->value); |
| DCHECK(entry); |
| |
| entry->set_key(key); |
| entry->set_value(value); |
| entry->set_removed(false); |
| |
| if (description.empty()) { |
| entry->clear_description(); |
| } else { |
| entry->set_description(description); |
| } |
| |
| entry->set_last_access_time(last_access_time); |
| if (is_suggestion_selected) { |
| entry->set_suggestion_freq(entry->suggestion_freq() + 1); |
| } else { |
| entry->set_conversion_freq(entry->conversion_freq() + 1); |
| } |
| |
| // Insert next_fp to the entry |
| if (next_fp != 0) { |
| NextEntry next_entry; |
| next_entry.set_entry_fp(next_fp); |
| InsertNextEntry(next_entry, entry); |
| } |
| |
| VLOG(2) << key << " " << value << " has inserted: " |
| << entry->Utf8DebugString(); |
| |
| // new entry is inserted to the cache |
| updated_ = true; |
| } |
| |
| void UserHistoryPredictor::Finish(Segments *segments) { |
| if (segments->request_type() == Segments::REVERSE_CONVERSION) { |
| // Do nothing for REVERSE_CONVERSION. |
| return; |
| } |
| |
| if (GET_CONFIG(incognito_mode)) { |
| VLOG(2) << "incognito mode"; |
| return; |
| } |
| |
| if (!GET_CONFIG(use_history_suggest)) { |
| VLOG(2) << "no history suggest"; |
| return; |
| } |
| |
| if (!CheckSyncerAndDelete()) { |
| LOG(WARNING) << "Syncer is running"; |
| return; |
| } |
| |
| const bool is_suggestion = segments->request_type() != Segments::CONVERSION; |
| const uint64 last_access_time = Util::GetTime(); |
| |
| // If user inputs a punctuation just after some long sentence, |
| // we make a new candidate by concatinating the top element in LRU and |
| // the punctuation user input. The top element in LRU is supposed to be |
| // the long sentence user input before. |
| // This is a fix for http://b/issue?id=2216838 |
| if (dic_->Head() != NULL && |
| dic_->Head()->value.last_access_time() + 5 > last_access_time && |
| // Check if the current value is a punctuation. |
| segments->conversion_segments_size() == 1 && |
| segments->conversion_segment(0).candidates_size() > 0 && |
| IsPunctuation(segments->conversion_segment(0).candidate(0).value) && |
| // Check if the previous value looks like a sentence. |
| segments->history_segments_size() > 0 && |
| segments->history_segment( |
| segments->history_segments_size() - 1).candidates_size() > 0) { |
| const Entry *entry = &(dic_->Head()->value); |
| DCHECK(entry); |
| const string &last_value = |
| segments->history_segment( |
| segments->history_segments_size() - 1).candidate(0).value; |
| // Check if the head value in LRU ends with the candidate value in history |
| // segments. |
| if (Util::EndsWith(entry->value(), last_value)) { |
| const Segment::Candidate &candidate = |
| segments->conversion_segment(0).candidate(0); |
| const string key = entry->key() + candidate.key; |
| const string value = entry->value() + candidate.value; |
| // use the same last_access_time stored in the top element |
| // so that this item can be grouped together. |
| Insert(key, value, entry->description(), is_suggestion, 0, |
| entry->last_access_time(), segments); |
| } |
| } |
| |
| string all_key, all_value; |
| const size_t history_segments_size = segments->history_segments_size(); |
| |
| // Check every segment is valid. |
| for (size_t i = history_segments_size; i < segments->segments_size(); ++i) { |
| const Segment &segment = segments->segment(i); |
| if (segment.candidates_size() < 1) { |
| VLOG(2) << "candidates size < 1"; |
| return; |
| } |
| if (segment.segment_type() != Segment::FIXED_VALUE) { |
| VLOG(2) << "segment is not FIXED_VALUE"; |
| return; |
| } |
| const Segment::Candidate &candidate = segment.candidate(0); |
| if (candidate.attributes & |
| Segment::Candidate::NO_SUGGEST_LEARNING) { |
| VLOG(2) << "NO_SUGGEST_LEARNING"; |
| return; |
| } |
| } |
| |
| if (IsPrivacySensitive(segments)) { |
| VLOG(2) << "do not remember privacy sensitive input"; |
| return; |
| } |
| |
| InsertHistory(is_suggestion, last_access_time, segments); |
| return; |
| } |
| |
| void UserHistoryPredictor::MakeLearningSegments( |
| const Segments &segments, SegmentsForLearning *learning_segments) const { |
| DCHECK(learning_segments); |
| |
| for (size_t i = 0; i < segments.history_segments_size(); ++i) { |
| const Segment &segment = segments.history_segment(i); |
| SegmentForLearning learning_segment; |
| DCHECK_LE(1, segment.candidates_size()); |
| learning_segment.key = segment.candidate(0).key; |
| learning_segment.value = segment.candidate(0).value; |
| learning_segment.content_key = segment.candidate(0).content_key; |
| learning_segment.content_value = segment.candidate(0).content_value; |
| learning_segment.description = GetDescription(segment.candidate(0)); |
| learning_segments->push_back_history_segment(learning_segment); |
| } |
| for (size_t i = 0; i < segments.conversion_segments_size(); ++i) { |
| const Segment &segment = segments.conversion_segment(i); |
| const Segment::Candidate &candidate = segment.candidate(0); |
| if (candidate.inner_segment_boundary.empty()) { |
| SegmentForLearning learning_segment; |
| learning_segment.key = candidate.key; |
| learning_segment.value = candidate.value; |
| learning_segment.content_key = candidate.content_key; |
| learning_segment.content_value = candidate.content_value; |
| learning_segment.description = GetDescription(candidate); |
| learning_segments->push_back_conversion_segment(learning_segment); |
| } else { |
| SegmentForLearning learning_segment; |
| for (Segment::Candidate::InnerSegmentIterator iter(&candidate); |
| !iter.Done(); iter.Next()) { |
| iter.GetKey().CopyToString(&learning_segment.key); |
| iter.GetValue().CopyToString(&learning_segment.value); |
| iter.GetContentKey().CopyToString(&learning_segment.content_key); |
| iter.GetContentValue().CopyToString(&learning_segment.content_value); |
| learning_segments->push_back_conversion_segment(learning_segment); |
| } |
| } |
| } |
| } |
| |
| void UserHistoryPredictor::InsertHistory(bool is_suggestion_selected, |
| uint64 last_access_time, |
| Segments *segments) { |
| SegmentsForLearning learning_segments; |
| MakeLearningSegments(*segments, &learning_segments); |
| |
| string all_key, all_value; |
| set<uint32> seen; |
| bool this_was_seen = false; |
| const size_t history_segments_size = |
| learning_segments.history_segments_size(); |
| |
| for (size_t i = history_segments_size; |
| i < learning_segments.all_segments_size(); ++i) { |
| const SegmentForLearning &segment = learning_segments.all_segment(i); |
| all_key += segment.key; |
| all_value += segment.value; |
| uint32 next_fp = (i == learning_segments.all_segments_size() - 1) ? |
| 0 : LearningSegmentFingerprint(learning_segments.all_segment(i + 1)); |
| // remember the first segment |
| if (i == history_segments_size) { |
| seen.insert(LearningSegmentFingerprint(segment)); |
| } |
| uint32 next_fp_to_set = next_fp; |
| // If two duplicate segments exist, kills the link |
| // TO/FROM the second one to prevent loops. |
| // Only killing "TO" link caused bug #2982886: |
| // after converting "らいおん(もうじゅう)とぞうりむし(びせいぶつ)" |
| // and typing "ぞうりむし", "ゾウリムシ(猛獣" was suggested. |
| if (this_was_seen) { |
| next_fp_to_set = 0; |
| } |
| if (!seen.insert(next_fp).second) { |
| next_fp_to_set = 0; |
| this_was_seen = true; |
| } else { |
| this_was_seen = false; |
| } |
| Insert(segment.key, |
| segment.value, |
| segment.description, |
| is_suggestion_selected, next_fp_to_set, |
| last_access_time, segments); |
| if (content_word_learning_enabled_ && |
| segment.content_key != segment.key && |
| segment.content_value != segment.value) { |
| Insert(segment.content_key, |
| segment.content_value, |
| segment.description, |
| is_suggestion_selected, 0, |
| last_access_time, segments); |
| } |
| } |
| |
| // Insert all_key/all_value |
| if (learning_segments.conversion_segments_size() > 1 && |
| !all_key.empty() && !all_value.empty()) { |
| Insert(all_key, all_value, "", |
| is_suggestion_selected, |
| 0, last_access_time, segments); |
| } |
| |
| // Make a link from the last history_segment to the first conversion segment |
| // or to the entire user input. |
| if (learning_segments.history_segments_size() > 0 && |
| learning_segments.conversion_segments_size() > 0) { |
| const SegmentForLearning &history_segment = |
| learning_segments.history_segment( |
| segments->history_segments_size() - 1); |
| const SegmentForLearning &conversion_segment = |
| learning_segments.conversion_segment(0); |
| Entry *history_entry = dic_->MutableLookupWithoutInsert( |
| LearningSegmentFingerprint(history_segment)); |
| NextEntry next_entry; |
| if (segments->request_type() == Segments::CONVERSION) { |
| next_entry.set_entry_fp(LearningSegmentFingerprint(conversion_segment)); |
| InsertNextEntry(next_entry, history_entry); |
| } |
| |
| // Entire user input or SUGGESTION |
| if (segments->request_type() != Segments::CONVERSION || |
| learning_segments.conversion_segments_size() > 1) { |
| next_entry.set_entry_fp(Fingerprint(all_key, all_value)); |
| InsertNextEntry(next_entry, history_entry); |
| } |
| } |
| |
| return; |
| } |
| |
| void UserHistoryPredictor::Revert(Segments *segments) { |
| if (!CheckSyncerAndDelete()) { |
| LOG(WARNING) << "Syncer is running"; |
| return; |
| } |
| |
| for (size_t i = 0; i < segments->revert_entries_size(); ++i) { |
| const Segments::RevertEntry &revert_entry = |
| segments->revert_entry(i); |
| if (revert_entry.id == UserHistoryPredictor::revert_id() && |
| revert_entry.revert_entry_type == Segments::RevertEntry::CREATE_ENTRY) { |
| VLOG(2) << "Erasing the key: " << StringToUint32(revert_entry.key); |
| dic_->Erase(StringToUint32(revert_entry.key)); |
| } |
| } |
| } |
| |
| // static |
| UserHistoryPredictor::MatchType UserHistoryPredictor::GetMatchType( |
| const string &lstr, const string &rstr) { |
| if (lstr.empty() && !rstr.empty()) { |
| return LEFT_EMPTY_MATCH; |
| } |
| |
| const size_t size = min(lstr.size(), rstr.size()); |
| if (size == 0) { |
| return NO_MATCH; |
| } |
| |
| const int result = memcmp(lstr.data(), rstr.data(), size); |
| if (result != 0) { |
| return NO_MATCH; |
| } |
| |
| if (lstr.size() == rstr.size()) { |
| return EXACT_MATCH; |
| } else if (lstr.size() < rstr.size()) { |
| return LEFT_PREFIX_MATCH; |
| } else { |
| return RIGHT_PREFIX_MATCH; |
| } |
| |
| return NO_MATCH; |
| } |
| |
| // static |
| UserHistoryPredictor::MatchType UserHistoryPredictor::GetMatchTypeFromInput( |
| const string &input_key, |
| const string &key_base, const Trie<string> *key_expanded, |
| const string &target) { |
| if (key_expanded == NULL) { |
| // |input_key| and |key_base| can be different by compoesr modification. |
| // For example, |input_key|, "8,+", and |base| "8、+". |
| return GetMatchType(key_base, target); |
| } |
| |
| // we can assume key_expanded != NULL from here. |
| if (key_base.empty()) { |
| string value; |
| size_t key_length = 0; |
| bool has_subtrie = false; |
| if (!key_expanded->LookUpPrefix(target, &value, |
| &key_length, &has_subtrie)) { |
| return NO_MATCH; |
| } else if (value == target && value == input_key) { |
| return EXACT_MATCH; |
| } else { |
| return LEFT_PREFIX_MATCH; |
| } |
| } else { |
| const size_t size = min(key_base.size(), target.size()); |
| if (size == 0) { |
| return NO_MATCH; |
| } |
| const int result = memcmp(key_base.data(), target.data(), size); |
| if (result != 0) { |
| return NO_MATCH; |
| } |
| if (target.size() <= key_base.size()) { |
| return RIGHT_PREFIX_MATCH; |
| } |
| string value; |
| size_t key_length = 0; |
| bool has_subtrie = false; |
| if (!key_expanded->LookUpPrefix(target.data() + key_base.size(), |
| &value, |
| &key_length, &has_subtrie)) { |
| return NO_MATCH; |
| } |
| const string matched = key_base + value; |
| if (matched == target && matched == input_key) { |
| return EXACT_MATCH; |
| } else { |
| return LEFT_PREFIX_MATCH; |
| } |
| } |
| |
| DCHECK(false) << "Should not come here"; |
| return NO_MATCH; |
| } |
| |
| // static |
| uint32 UserHistoryPredictor::Fingerprint(const string &key, |
| const string &value, |
| EntryType type) { |
| if (type == Entry::DEFAULT_ENTRY) { |
| // Since we have already used the fingerprint function for next entries and |
| // next entries are saved in user's local machine, we are not able |
| // to change the Fingerprint function for the old key/value type. |
| return Util::Fingerprint32(key + kDelimiter + value); |
| } else { |
| uint8 id = static_cast<uint8>(type); |
| return Util::Fingerprint32(reinterpret_cast<char *>(&id), sizeof(id)); |
| } |
| } |
| |
| // static |
| uint32 UserHistoryPredictor::Fingerprint(const string &key, |
| const string &value) { |
| return Fingerprint(key, value, Entry::DEFAULT_ENTRY); |
| } |
| |
| uint32 UserHistoryPredictor::EntryFingerprint( |
| const UserHistoryPredictor::Entry &entry) { |
| return Fingerprint(entry.key(), entry.value()); |
| } |
| |
| // static |
| uint32 UserHistoryPredictor::SegmentFingerprint(const Segment &segment) { |
| if (segment.candidates_size() > 0) { |
| return Fingerprint(segment.candidate(0).key, segment.candidate(0).value); |
| } |
| return 0; |
| } |
| |
| // static |
| uint32 UserHistoryPredictor::LearningSegmentFingerprint( |
| const SegmentForLearning &segment) { |
| return Fingerprint(segment.key, segment.value); |
| } |
| |
| // static |
| string UserHistoryPredictor::Uint32ToString(uint32 fp) { |
| string buf(reinterpret_cast<const char *>(&fp), sizeof(fp)); |
| return buf; |
| } |
| |
| // static |
| uint32 UserHistoryPredictor::StringToUint32(const string &input) { |
| uint32 result = 0; |
| if (input.size() == sizeof(result)) { |
| memcpy(reinterpret_cast<char *>(&result), input.data(), input.size()); |
| } |
| return result; |
| } |
| |
| // static |
| bool UserHistoryPredictor::IsValidSuggestion( |
| RequestType request_type, |
| uint32 prefix_len, |
| const UserHistoryPredictor::Entry &entry) { |
| // when bigram_boost is true, that means that previous user input |
| // and current input have bigram relation. |
| if (entry.bigram_boost()) { |
| return true; |
| } |
| // when zero_query_suggestion is true, that means that |
| // predictor is running on mobile device. In this case, |
| // make the behavior more aggressive. |
| if (request_type == ZERO_QUERY_SUGGESTION) { |
| return true; |
| } |
| // Handle suggestion_freq and conversion_freq differently. |
| // conversion_freq is less aggressively affecting to the final decision. |
| const uint32 freq = max(entry.suggestion_freq(), |
| entry.conversion_freq() / 4); |
| |
| // TODO(taku,komatsu): better to make it simpler and easier to be understood. |
| const uint32 base_prefix_len = 3 - min(static_cast<uint32>(2), freq); |
| return (prefix_len >= base_prefix_len); |
| } |
| |
| // static |
| // 1) sort by last_access_time, which is basically the same as LRU policy. |
| // 2) boost shorter candidate, if having the same last_access_time. |
| // 3) add a bigram boost as a special bonus. |
| // TODO(taku): better to take "frequency" into consideration |
| uint32 UserHistoryPredictor::GetScore( |
| const UserHistoryPredictor::Entry &entry) { |
| const uint32 kBigramBoostAsTime = 7 * 24 * 60 * 60; // 1 week. |
| return |
| entry.last_access_time() - Util::CharsLen(entry.value()) + |
| (entry.bigram_boost() ? kBigramBoostAsTime : 0); |
| } |
| |
| // return the size of cache. |
| // static |
| uint32 UserHistoryPredictor::cache_size() { |
| return kLRUCacheSize; |
| } |
| |
| // return the size of next entries. |
| // static |
| uint32 UserHistoryPredictor::max_next_entries_size() { |
| return kMaxNextEntriesSize; |
| } |
| |
| } // namespace mozc |