| // Copyright 2010-2015, Google Inc. |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| #ifndef MOZC_PREDICTION_USER_HISTORY_PREDICTOR_H_ |
| #define MOZC_PREDICTION_USER_HISTORY_PREDICTOR_H_ |
| |
| #include <queue> |
| #include <set> |
| #include <string> |
| #include <utility> |
| #include <vector> |
| |
| #include "base/freelist.h" |
| #include "base/scoped_ptr.h" |
| #include "base/string_piece.h" |
| #include "base/trie.h" |
| #include "prediction/predictor_interface.h" |
| #include "prediction/user_history_predictor.pb.h" |
| #include "storage/lru_cache.h" |
| // for FRIEND_TEST |
| #include "testing/base/public/gunit_prod.h" |
| |
| namespace mozc { |
| |
| namespace storage { |
| class StringStorageInterface; |
| } // namespace storage |
| |
| class ConversionRequest; |
| class DictionaryInterface; |
| class POSMatcher; |
| class Segment; |
| class Segments; |
| class SuppressionDictionary; |
| class UserHistoryPredictorSyncer; |
| |
| // Added serialization method for UserHistory. |
| class UserHistoryStorage : public mozc::user_history_predictor::UserHistory { |
| public: |
| explicit UserHistoryStorage(const string &filename); |
| ~UserHistoryStorage(); |
| |
| // Load from encrypted file. |
| bool Load(); |
| |
| // Save history into encrypted file. |
| bool Save() const; |
| |
| private: |
| scoped_ptr<storage::StringStorageInterface> storage_; |
| }; |
| |
| // UserHistoryPredictor is NOT thread safe. |
| // Currently, all methods of UserHistoryPredictor is called |
| // by single thread. Although AsyncSave() and AsyncLoad() make |
| // worker threads internally, these two functions won't be |
| // called by multiple-threads at the same time |
| class UserHistoryPredictor : public PredictorInterface { |
| public: |
| UserHistoryPredictor(const DictionaryInterface *dictionary, |
| const POSMatcher *pos_matcher, |
| const SuppressionDictionary *suppression_dictionary, |
| bool enable_content_word_learning); |
| virtual ~UserHistoryPredictor(); |
| |
| void set_content_word_learning_enabled(bool value) { |
| content_word_learning_enabled_ = value; |
| } |
| |
| virtual bool Predict(Segments *segments) const; |
| virtual bool PredictForRequest(const ConversionRequest &request, |
| Segments *segments) const; |
| |
| // Hook(s) for all mutable operations. |
| virtual void Finish(Segments *segments); |
| |
| // Revert last Finish operation. |
| virtual void Revert(Segments *segments); |
| |
| // Sync user history data to local file. |
| // You can call either Save() or AsyncSave(). |
| virtual bool Sync(); |
| |
| // Reloads from local disk. |
| // Do not call Sync() before Reload(). |
| virtual bool Reload(); |
| |
| // Clears LRU data. |
| virtual bool ClearAllHistory(); |
| |
| // Clears unused data. |
| virtual bool ClearUnusedHistory(); |
| |
| // Clears a specific history entry. |
| virtual bool ClearHistoryEntry(const string &key, const string &value); |
| |
| // Implements PredictorInterface. |
| virtual bool WaitForSyncerForTest(); |
| |
| // Gets user history filename. |
| static string GetUserHistoryFileName(); |
| |
| virtual const string &GetPredictorName() const { return predictor_name_; } |
| |
| // Used in user_history_sync_util. |
| typedef user_history_predictor::UserHistory::Entry Entry; |
| typedef user_history_predictor::UserHistory::NextEntry NextEntry; |
| typedef user_history_predictor::UserHistory::Entry::EntryType EntryType; |
| |
| // return fingerprints from various object. |
| static uint32 Fingerprint(const string &key, const string &value); |
| static uint32 Fingerprint(const string &key, const string &value, |
| EntryType type); |
| static uint32 EntryFingerprint(const Entry &entry); |
| static uint32 SegmentFingerprint(const Segment &segment); |
| |
| // return the size of cache. |
| static uint32 cache_size(); |
| |
| // return the size of next entries. |
| static uint32 max_next_entries_size(); |
| |
| private: |
| struct SegmentForLearning { |
| string key; |
| string value; |
| string content_key; |
| string content_value; |
| string description; |
| }; |
| static uint32 LearningSegmentFingerprint(const SegmentForLearning &segment); |
| |
| class SegmentsForLearning { |
| public: |
| SegmentsForLearning() {} |
| |
| virtual ~SegmentsForLearning() {} |
| |
| void push_back_history_segment(const SegmentForLearning &val) { |
| history_segments_.push_back(val); |
| } |
| |
| void push_back_conversion_segment(const SegmentForLearning &val) { |
| conversion_segments_.push_back(val); |
| } |
| |
| size_t history_segments_size() const { |
| return history_segments_.size(); |
| } |
| |
| size_t conversion_segments_size() const { |
| return conversion_segments_.size(); |
| } |
| |
| size_t all_segments_size() const { |
| return history_segments_size() + conversion_segments_size(); |
| } |
| |
| const SegmentForLearning &history_segment(size_t i) const { |
| return history_segments_[i]; |
| } |
| |
| const SegmentForLearning &conversion_segment(size_t i) const { |
| return conversion_segments_[i]; |
| } |
| |
| const SegmentForLearning &all_segment(size_t i) const { |
| if (i < history_segments_size()) { |
| return history_segment(i); |
| } else { |
| return conversion_segments_[i - history_segments_size()]; |
| } |
| } |
| |
| private: |
| vector<SegmentForLearning> history_segments_; |
| vector<SegmentForLearning> conversion_segments_; |
| }; |
| |
| friend class UserHistoryPredictorSyncer; |
| friend class UserHistoryPredictorTest; |
| |
| FRIEND_TEST(UserHistoryPredictorTest, UserHistoryPredictorTest); |
| FRIEND_TEST(UserHistoryPredictorTest, UserHistoryPredictorTest_suggestion); |
| FRIEND_TEST(UserHistoryPredictorTest, DescriptionTest); |
| FRIEND_TEST(UserHistoryPredictorTest, |
| UserHistoryPredictorUnusedHistoryTest); |
| FRIEND_TEST(UserHistoryPredictorTest, UserHistoryPredictorRevertTest); |
| FRIEND_TEST(UserHistoryPredictorTest, UserHistoryPredictorClearTest); |
| FRIEND_TEST(UserHistoryPredictorTest, |
| UserHistoryPredictorTrailingPunctuation); |
| FRIEND_TEST(UserHistoryPredictorTest, HistoryToPunctuation); |
| FRIEND_TEST(UserHistoryPredictorTest, |
| UserHistoryPredictorPreceedingPunctuation); |
| FRIEND_TEST(UserHistoryPredictorTest, StartsWithPunctuations); |
| FRIEND_TEST(UserHistoryPredictorTest, ZeroQuerySuggestionTest); |
| FRIEND_TEST(UserHistoryPredictorTest, MultiSegmentsMultiInput); |
| FRIEND_TEST(UserHistoryPredictorTest, MultiSegmentsSingleInput); |
| FRIEND_TEST(UserHistoryPredictorTest, Regression2843371_Case1); |
| FRIEND_TEST(UserHistoryPredictorTest, Regression2843371_Case2); |
| FRIEND_TEST(UserHistoryPredictorTest, Regression2843371_Case3); |
| FRIEND_TEST(UserHistoryPredictorTest, Regression2843775); |
| FRIEND_TEST(UserHistoryPredictorTest, DuplicateString); |
| FRIEND_TEST(UserHistoryPredictorTest, SyncTest); |
| FRIEND_TEST(UserHistoryPredictorTest, GetMatchTypeTest); |
| FRIEND_TEST(UserHistoryPredictorTest, FingerPrintTest); |
| FRIEND_TEST(UserHistoryPredictorTest, Uint32ToStringTest); |
| FRIEND_TEST(UserHistoryPredictorTest, GetScore); |
| FRIEND_TEST(UserHistoryPredictorTest, IsValidEntry); |
| FRIEND_TEST(UserHistoryPredictorTest, IsValidSuggestion); |
| FRIEND_TEST(UserHistoryPredictorTest, EntryPriorityQueueTest); |
| FRIEND_TEST(UserHistoryPredictorTest, PrivacySensitiveTest); |
| FRIEND_TEST(UserHistoryPredictorTest, PrivacySensitiveMultiSegmentsTest); |
| FRIEND_TEST(UserHistoryPredictorTest, UserHistoryStorage); |
| FRIEND_TEST(UserHistoryPredictorTest, RomanFuzzyPrefixMatch); |
| FRIEND_TEST(UserHistoryPredictorTest, MaybeRomanMisspelledKey); |
| FRIEND_TEST(UserHistoryPredictorTest, GetRomanMisspelledKey); |
| FRIEND_TEST(UserHistoryPredictorTest, RomanFuzzyLookupEntry); |
| FRIEND_TEST(UserHistoryPredictorTest, ExpandedLookupRoman); |
| FRIEND_TEST(UserHistoryPredictorTest, ExpandedLookupKana); |
| FRIEND_TEST(UserHistoryPredictorTest, GetMatchTypeFromInputRoman); |
| FRIEND_TEST(UserHistoryPredictorTest, GetMatchTypeFromInputKana); |
| FRIEND_TEST(UserHistoryPredictorTest, GetInputKeyFromSegmentsRoman); |
| FRIEND_TEST(UserHistoryPredictorTest, GetInputKeyFromSegmentsRomanN); |
| FRIEND_TEST(UserHistoryPredictorTest, GetInputKeyFromSegmentsRomanRandom); |
| FRIEND_TEST(UserHistoryPredictorTest, GetInputKeyFromSegmentsShouldNotCrash); |
| FRIEND_TEST(UserHistoryPredictorTest, GetInputKeyFromSegmentsFlickN); |
| FRIEND_TEST(UserHistoryPredictorTest, GetInputKeyFromSegments12KeyN); |
| FRIEND_TEST(UserHistoryPredictorTest, GetInputKeyFromSegmentsKana); |
| FRIEND_TEST(UserHistoryPredictorTest, RealtimeConversionInnerSegment); |
| FRIEND_TEST(UserHistoryPredictorTest, ZeroQueryFromRealtimeConversion); |
| FRIEND_TEST(UserHistoryPredictorTest, LongCandidateForMobile); |
| FRIEND_TEST(UserHistoryPredictorTest, EraseNextEntries); |
| FRIEND_TEST(UserHistoryPredictorTest, RemoveNgramChain); |
| FRIEND_TEST(UserHistoryPredictorTest, ClearHistoryEntry_Unigram); |
| FRIEND_TEST(UserHistoryPredictorTest, ClearHistoryEntry_Bigram_DeleteWhole); |
| FRIEND_TEST(UserHistoryPredictorTest, ClearHistoryEntry_Bigram_DeleteFirst); |
| FRIEND_TEST(UserHistoryPredictorTest, ClearHistoryEntry_Bigram_DeleteSecond); |
| FRIEND_TEST(UserHistoryPredictorTest, ClearHistoryEntry_Trigram_DeleteWhole); |
| FRIEND_TEST(UserHistoryPredictorTest, ClearHistoryEntry_Trigram_DeleteFirst); |
| FRIEND_TEST(UserHistoryPredictorTest, ClearHistoryEntry_Trigram_DeleteSecond); |
| FRIEND_TEST(UserHistoryPredictorTest, ClearHistoryEntry_Trigram_DeleteThird); |
| FRIEND_TEST(UserHistoryPredictorTest, |
| ClearHistoryEntry_Trigram_DeleteFirstBigram); |
| FRIEND_TEST(UserHistoryPredictorTest, |
| ClearHistoryEntry_Trigram_DeleteSecondBigram); |
| FRIEND_TEST(UserHistoryPredictorTest, ClearHistoryEntry_Scenario1); |
| FRIEND_TEST(UserHistoryPredictorTest, ClearHistoryEntry_Scenario2); |
| |
| enum MatchType { |
| NO_MATCH, // no match |
| LEFT_PREFIX_MATCH, // left string is a prefix of right string |
| RIGHT_PREFIX_MATCH, // right string is a prefix of left string |
| LEFT_EMPTY_MATCH, // left string is empty (for zero_query_suggestion) |
| EXACT_MATCH, // right string == left string |
| }; |
| |
| enum RequestType { |
| DEFAULT, |
| ZERO_QUERY_SUGGESTION, |
| }; |
| |
| // Return value of RemoveNgramChain() method. See the comments in |
| // implementation. |
| enum RemoveNgramChainResult { |
| DONE, |
| TAIL, |
| NOT_FOUND, |
| }; |
| |
| // Load user history data to LRU from local file |
| bool Load(); |
| |
| // Save user history data in LRU to local file |
| bool Save(); |
| |
| // non-blocking version of Load |
| // This makes a new thread and call Load() |
| bool AsyncSave(); |
| |
| // non-blocking version of Sync |
| // This makes a new thread and call Save() |
| bool AsyncLoad(); |
| |
| // Waits until syncer finishes. |
| void WaitForSyncer(); |
| |
| // return id for RevertEntry |
| static uint16 revert_id(); |
| |
| // Get match type from two strings |
| static MatchType GetMatchType(const string &lstr, const string &rstr); |
| |
| // Get match type with ambiguity expansion |
| static MatchType GetMatchTypeFromInput(const string &input_key, |
| const string &key_base, |
| const Trie<string> *key_expanded, |
| const string &target); |
| |
| // Uint32 <=> string conversion |
| static string Uint32ToString(uint32 fp); |
| static uint32 StringToUint32(const string &input); |
| |
| // return true if prev_entry has a next_fp link to entry |
| static bool HasBigramEntry(const Entry &entry, |
| const Entry &prev_entry); |
| |
| // return true |result_entry| can be handled as |
| // a valid result if the length of user input is |prefix_len|. |
| static bool IsValidSuggestion(RequestType request_type, |
| uint32 prefix_len, |
| const Entry &result_entry); |
| |
| // Returns true if entry is DEFAULT_ENTRY, satisfies certain conditions, and |
| // doesn't have removed flag. |
| bool IsValidEntry(const Entry &entry, uint32 available_emoji_carrier) const; |
| // The same as IsValidEntry except that removed field is ignored. |
| bool IsValidEntryIgnoringRemovedField(const Entry &entry, |
| uint32 available_emoji_carrier) const; |
| |
| // return "tweaked" score of result_entry. |
| // the score is basically determined by "last_access_time", (a.k.a, |
| // LRU policy), but we want to slightly change the score |
| // with different signals, including the length of value and/or |
| // bigram_boost flags. |
| static uint32 GetScore(const Entry &result_entry); |
| |
| // Priority Queue class for entry. New item is sorted by |
| // |score| internally. By calling Pop() in sequence, you |
| // can obtain the list of entry sorted by |score|. |
| class EntryPriorityQueue { |
| public: |
| EntryPriorityQueue(); |
| virtual ~EntryPriorityQueue(); |
| size_t size() const { |
| return agenda_.size(); |
| } |
| bool Push(Entry *entry); |
| Entry *Pop(); |
| Entry *NewEntry(); |
| |
| private: |
| friend class UserHistoryPredictor; |
| typedef pair<uint32, Entry *> QueueElement; |
| typedef priority_queue<QueueElement> Agenda; |
| Agenda agenda_; |
| FreeList<Entry> pool_; |
| set<uint32> seen_; |
| }; |
| |
| typedef mozc::storage::LRUCache<uint32, Entry> DicCache; |
| typedef DicCache::Element DicElement; |
| |
| bool CheckSyncerAndDelete() const; |
| |
| // If |entry| is the target of prediction, |
| // create a new result and insert it to |results|. |
| // Can set |prev_entry| if there is a history segment just before |input_key|. |
| // |prev_entry| is an optional field. If set NULL, this field is just ignored. |
| // This method adds a new result entry with score, pair<score, entry>, to |
| // |results|. |
| bool LookupEntry(const string &input_key, |
| const string &key_base, |
| const Trie<string> *key_expanded, |
| const Entry *entry, |
| const Entry *prev_entry, |
| EntryPriorityQueue *results) const; |
| |
| const Entry *LookupPrevEntry(const Segments &segments, |
| uint32 available_emoji_carrier) const; |
| |
| // Adds an entry to a priority queue. |
| Entry *AddEntry(const Entry &entry, EntryPriorityQueue *results) const; |
| |
| // Adds the entry whose key and value are modified to a priority queue. |
| Entry *AddEntryWithNewKeyValue(const string &key, const string &value, |
| const Entry &entry, |
| EntryPriorityQueue *results) const; |
| |
| void GetResultsFromHistoryDictionary( |
| const ConversionRequest &request, |
| const Segments &segments, |
| const Entry *prev_entry, |
| EntryPriorityQueue *results) const; |
| |
| // Get input data from segments. |
| // These input data include ambiguities. |
| static void GetInputKeyFromSegments( |
| const ConversionRequest &request, const Segments &segments, |
| string *input_key, string *base, |
| scoped_ptr<Trie<string> >*expanded); |
| |
| bool InsertCandidates(RequestType request_type, |
| const ConversionRequest &request, Segments *segments, |
| EntryPriorityQueue *results) const; |
| |
| void MakeLearningSegments(const Segments &segments, |
| SegmentsForLearning *learning_segments) const; |
| |
| // return true if |prefix| is a fuzzy-prefix of |str|. |
| // 'Fuzzy' means that |
| // 1) Allows one character deletation in the |prefix|. |
| // 2) Allows one character swap in the |prefix|. |
| static bool RomanFuzzyPrefixMatch(const string &str, |
| const string &prefix); |
| |
| // return romanized preedit string if the preedit looks |
| // misspelled. It first tries to get the preedit string with |
| // composer() if composer is available. If not, use the key |
| // directory. It also use MaybeRomanMisspelledKey() defined |
| // below to check the preedit looks missspelled or not. |
| static string GetRomanMisspelledKey(const Segments &segments); |
| |
| // return true if |key| may contain miss spelling. |
| // Currently, this function returns true if |
| // 1) key contains only one alphabet. |
| // 2) other characters of key are all hiragana. |
| static bool MaybeRomanMisspelledKey(const string &key); |
| |
| // If roman_input_key can be a target key of entry->key(), creat a new |
| // result and insert it to |results|. |
| // This method adds a new result entry with score, pair<score, entry>, to |
| // |results|. |
| bool RomanFuzzyLookupEntry( |
| const string &roman_input_key, |
| const Entry *entry, |
| EntryPriorityQueue *results) const; |
| |
| void InsertHistory(bool is_suggestion_selected, |
| uint64 last_access_time, |
| Segments *segments); |
| |
| // insert |key,value,description| to the internal dictionary database. |
| // |is_suggestion_selected|: key/value is suggestion or conversion. |
| // |next_fp|: fingerprint of the next segment. |
| // |last_access_time|: the time when this entrty was created |
| void Insert(const string &key, |
| const string &value, |
| const string &description, |
| bool is_suggestion_selected, |
| uint32 next_fp, |
| uint64 last_access_time, |
| Segments *segments); |
| |
| // Insert event entry (CLEAN_ALL_EVENT|CLEAN_UNUSED_EVENT). |
| void InsertEvent(EntryType type); |
| |
| // Insert a new |next_entry| into |entry|. |
| // it makes a bigram connection from entry to next_entry. |
| void InsertNextEntry(const NextEntry &next_entry, |
| UserHistoryPredictor::Entry *entry) const; |
| |
| static void EraseNextEntries(uint32 fp, Entry *entry); |
| |
| // Recursively removes a chain of Entries in |dic_|. See the comment in |
| // implemenetation for details. |
| RemoveNgramChainResult RemoveNgramChain(const string &target_key, |
| const string &target_value, |
| Entry *entry, |
| vector<StringPiece> *key_ngrams, |
| size_t key_ngrams_len, |
| vector<StringPiece> *value_ngrams, |
| size_t value_ngrams_len); |
| |
| // Returns true if the input first candidate seems to be a privacy sensitive |
| // such like password. |
| bool IsPrivacySensitive(const Segments *segments) const; |
| |
| const DictionaryInterface *dictionary_; |
| const POSMatcher *pos_matcher_; |
| const SuppressionDictionary *suppression_dictionary_; |
| const string predictor_name_; |
| |
| bool content_word_learning_enabled_; |
| bool updated_; |
| scoped_ptr<DicCache> dic_; |
| mutable scoped_ptr<UserHistoryPredictorSyncer> syncer_; |
| }; |
| } // namespace mozc |
| |
| #endif // MOZC_PREDICTION_USER_HISTORY_PREDICTOR_H_ |