| // Copyright 2010-2014, Google Inc. |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| #include "prediction/dictionary_predictor.h" |
| |
| #include <algorithm> |
| #include <cctype> |
| #include <climits> // INT_MAX |
| #include <cmath> |
| #include <list> |
| #include <map> |
| #include <set> |
| #include <string> |
| #include <utility> |
| #include <vector> |
| |
| #include "base/flags.h" |
| #include "base/logging.h" |
| #include "base/number_util.h" |
| #include "base/util.h" |
| #include "composer/composer.h" |
| #include "config/config.pb.h" |
| #include "config/config_handler.h" |
| #include "converter/connector_interface.h" |
| #include "converter/conversion_request.h" |
| #include "converter/converter_interface.h" |
| #include "converter/immutable_converter_interface.h" |
| #include "converter/node_list_builder.h" |
| #include "converter/segmenter_interface.h" |
| #include "converter/segments.h" |
| #include "dictionary/dictionary_interface.h" |
| #include "dictionary/pos_matcher.h" |
| #include "prediction/predictor_interface.h" |
| #include "prediction/suggestion_filter.h" |
| #include "prediction/zero_query_number_data.h" |
| #include "session/commands.pb.h" |
| |
| // This flag is set by predictor.cc |
| // We can remove this after the ambiguity expansion feature get stable. |
| DEFINE_bool(enable_expansion_for_dictionary_predictor, |
| false, |
| "enable ambiguity expansion for dictionary_predictor"); |
| |
| DEFINE_bool(enable_mixed_conversion, |
| false, |
| "Enable mixed conversion feature"); |
| |
| DECLARE_bool(enable_typing_correction); |
| |
| namespace mozc { |
| namespace { |
| |
| // Used to emulate positive infinity for cost. This value is set for those |
| // candidates that are thought to be aggressive; thus we can eliminate such |
| // candidates from suggestion or prediction. Note that for this purpose we don't |
| // want to use INT_MAX because someone might further add penalty after cost is |
| // set to INT_MAX, which leads to overflow and consequently aggressive |
| // candidates would appear in the top results. |
| const int kInfinity = (2 << 20); |
| |
| // Note that PREDICTION mode is much slower than SUGGESTION. |
| // Number of prediction calls should be minimized. |
| const size_t kSuggestionMaxResultsSize = 256; |
| const size_t kPredictionMaxResultsSize = 100000; |
| |
| void GetNumberSuffixArray(const string &history_input, |
| vector<string> *suffixes) { |
| DCHECK(suffixes); |
| const char kDefault[] = "default"; |
| const string default_str(kDefault); |
| |
| int default_num = -1; |
| int suffix_num = -1; |
| |
| for (int i = 0; ZeroQueryNum[i]; ++i) { |
| if (default_str == ZeroQueryNum[i][0]) { |
| default_num = i; |
| } else if (history_input == ZeroQueryNum[i][0]) { |
| suffix_num = i; |
| } |
| } |
| DCHECK_GE(default_num, 0); |
| |
| if (suffix_num != -1) { |
| for (int j = 1; ZeroQueryNum[suffix_num][j]; ++j) { |
| suffixes->push_back(ZeroQueryNum[suffix_num][j]); |
| } |
| } |
| for (int j = 1; ZeroQueryNum[default_num][j]; ++j) { |
| suffixes->push_back(ZeroQueryNum[default_num][j]); |
| } |
| } |
| |
| // Returns true if the |target| may be reduncant result. |
| bool MaybeRedundant(const string &reference, const string &target) { |
| return Util::StartsWith(target, reference); |
| } |
| |
| bool IsLatinInputMode(const ConversionRequest &request) { |
| return (request.has_composer() && |
| (request.composer().GetInputMode() == transliteration::HALF_ASCII || |
| request.composer().GetInputMode() == transliteration::FULL_ASCII)); |
| } |
| |
| // Returns true if |segments| contains number history. |
| // Normalized number will be set to |number_key| |
| // Note: |
| // Now this function supports arabic number candidates only and |
| // we don't support kanji number candidates for now. |
| // This is because We have several kanji number styles, for example, |
| // "一二", "十二", "壱拾弐", etc for 12. |
| // TODO(toshiyuki): Define the spec and support Kanji. |
| bool GetNumberHistory(const Segments &segments, string *number_key) { |
| DCHECK(number_key); |
| const size_t history_size = segments.history_segments_size(); |
| if (history_size <= 0) { |
| return false; |
| } |
| |
| const Segment &last_segment = segments.history_segment(history_size - 1); |
| DCHECK_GT(last_segment.candidates_size(), 0); |
| const string &history_value = last_segment.candidate(0).value; |
| if (!NumberUtil::IsArabicNumber(history_value)) { |
| return false; |
| } |
| |
| Util::FullWidthToHalfWidth(history_value, number_key); |
| return true; |
| } |
| |
| bool IsMixedConversionEnabled(const commands::Request& request) { |
| return request.mixed_conversion() || FLAGS_enable_mixed_conversion; |
| } |
| |
| bool IsTypingCorrectionEnabled() { |
| return GET_CONFIG(use_typing_correction) || |
| FLAGS_enable_typing_correction; |
| } |
| |
| } // namespace |
| |
| class DictionaryPredictor::PredictiveLookupCallback : |
| public mozc::DictionaryInterface::Callback { |
| public: |
| PredictiveLookupCallback(DictionaryPredictor::PredictionTypes types, |
| size_t limit, size_t original_key_len, |
| const set<string> *subsequent_chars, |
| vector<DictionaryPredictor::Result> *results) |
| : penalty_(0), types_(types), limit_(limit), |
| original_key_len_(original_key_len), |
| subsequent_chars_(subsequent_chars), results_(results) {} |
| |
| virtual ResultType OnKey(StringPiece key) { |
| if (subsequent_chars_ == NULL) { |
| return TRAVERSE_CONTINUE; |
| } |
| // If |subsequent_chars_| was provided, check if the substring of |key| |
| // obtained by removing the original lookup key starts with a string in the |
| // set. For example, if original key is "he" and "hello" was found, |
| // continue traversing only when one of "l", "ll", or "llo" is in |
| // |subsequent_chars_|. |
| // Implementation note: Although Util::StartsWith is called at most N times |
| // where N = subsequent_chars_.size(), N is very small in practice, less |
| // than 10. Thus, this linear order algorithm is fast enough. |
| // Theoretically, we can construct a trie of strings in |subsequent_chars_| |
| // to get more performance but it's overkill here. |
| // TODO(noriyukit): vector<string> would be better than set<string>. To |
| // this end, we need to fix Comopser as well. |
| const StringPiece rest(key, original_key_len_); |
| for (set<string>::const_iterator iter = subsequent_chars_->begin(); |
| iter != subsequent_chars_->end(); ++iter) { |
| if (Util::StartsWith(rest, *iter)) { |
| return TRAVERSE_CONTINUE; |
| } |
| } |
| return TRAVERSE_NEXT_KEY; |
| } |
| |
| virtual ResultType OnActualKey(StringPiece key, StringPiece actual_key, |
| bool is_expanded) { |
| penalty_ = is_expanded ? kKanaModifierInsensitivePenalty : 0; |
| return TRAVERSE_CONTINUE; |
| } |
| |
| virtual ResultType OnToken(StringPiece, // key |
| StringPiece, // actual_key |
| const Token &token) { |
| results_->push_back(Result()); |
| results_->back().InitializeByTokenAndTypes(token, types_); |
| results_->back().wcost += penalty_; |
| if (results_->size() < limit_) { |
| return TRAVERSE_CONTINUE; |
| } else { |
| return TRAVERSE_DONE; |
| } |
| } |
| |
| private: |
| int32 penalty_; |
| const DictionaryPredictor::PredictionTypes types_; |
| const size_t limit_; |
| const size_t original_key_len_; |
| const set<string> *subsequent_chars_; |
| vector<DictionaryPredictor::Result> *results_; |
| |
| DISALLOW_COPY_AND_ASSIGN(PredictiveLookupCallback); |
| }; |
| |
| class DictionaryPredictor::PredictiveBigramLookupCallback : |
| public PredictiveLookupCallback { |
| public: |
| PredictiveBigramLookupCallback(DictionaryPredictor::PredictionTypes types, |
| size_t limit, size_t original_key_len, |
| const set<string> *subsequent_chars, |
| StringPiece history_value, |
| vector<DictionaryPredictor::Result> *results) |
| : PredictiveLookupCallback(types, limit, original_key_len, |
| subsequent_chars, results), |
| history_value_(history_value) {} |
| |
| virtual ResultType OnToken(StringPiece key, StringPiece expanded_key, |
| const Token &token) { |
| // Skip the token if its value doesn't start with the previous user input, |
| // |history_value_|. |
| if (!Util::StartsWith(token.value, history_value_) || |
| token.value.size() <= history_value_.size()) { |
| return TRAVERSE_CONTINUE; |
| } |
| return PredictiveLookupCallback::OnToken(key, expanded_key, token); |
| } |
| |
| private: |
| StringPiece history_value_; |
| |
| DISALLOW_COPY_AND_ASSIGN(PredictiveBigramLookupCallback); |
| }; |
| |
| // Comparator for sorting prediction candidates. |
| // If we have words A and AB, for example "六本木" and "六本木ヒルズ", |
| // assume that cost(A) < cost(AB). |
| class DictionaryPredictor::ResultWCostLess : |
| public binary_function<Result, Result, bool> { |
| public: |
| bool operator() (const DictionaryPredictor::Result &lhs, |
| const DictionaryPredictor::Result &rhs) const { |
| return lhs.wcost < rhs.wcost; |
| } |
| }; |
| |
| class DictionaryPredictor::ResultCostLess : |
| public binary_function<Result, Result, bool> { |
| public: |
| bool operator() (const DictionaryPredictor::Result &lhs, |
| const DictionaryPredictor::Result &rhs) const { |
| return lhs.cost > rhs.cost; |
| } |
| }; |
| |
| DictionaryPredictor::DictionaryPredictor( |
| const ConverterInterface *converter, |
| const ImmutableConverterInterface *immutable_converter, |
| const DictionaryInterface *dictionary, |
| const DictionaryInterface *suffix_dictionary, |
| const ConnectorInterface *connector, |
| const SegmenterInterface *segmenter, |
| const POSMatcher *pos_matcher, |
| const SuggestionFilter *suggestion_filter) |
| : converter_(converter), |
| immutable_converter_(immutable_converter), |
| dictionary_(dictionary), |
| suffix_dictionary_(suffix_dictionary), |
| connector_(connector), |
| segmenter_(segmenter), |
| suggestion_filter_(suggestion_filter), |
| counter_suffix_word_id_(pos_matcher->GetCounterSuffixWordId()), |
| predictor_name_("DictionaryPredictor") {} |
| |
| DictionaryPredictor::~DictionaryPredictor() {} |
| |
| bool DictionaryPredictor::PredictForRequest(const ConversionRequest &request, |
| Segments *segments) const { |
| if (segments == NULL) { |
| return false; |
| } |
| |
| vector<Result> results; |
| if (!AggregatePrediction(request, segments, &results)) { |
| return false; |
| } |
| |
| SetCost(request, *segments, &results); |
| RemovePrediction(request, *segments, &results); |
| |
| return AddPredictionToCandidates(request, segments, &results); |
| } |
| |
| bool DictionaryPredictor::AggregatePrediction( |
| const ConversionRequest &request, |
| Segments *segments, |
| vector<Result> *results) const { |
| DCHECK(segments); |
| DCHECK(results); |
| |
| const PredictionTypes prediction_types = |
| GetPredictionTypes(request, *segments); |
| if (prediction_types == NO_PREDICTION) { |
| return false; |
| } |
| |
| if (segments->request_type() == Segments::PARTIAL_SUGGESTION || |
| segments->request_type() == Segments::PARTIAL_PREDICTION) { |
| // This request type is used to get conversion before cursor during |
| // composition mode. Thus it should return only the candidates whose key |
| // exactly matches the query. |
| // Therefore, we use only the realtime conversion result. |
| AggregateRealtimeConversion(prediction_types, request, segments, results); |
| } else { |
| AggregateRealtimeConversion(prediction_types, request, segments, results); |
| AggregateUnigramPrediction(prediction_types, request, *segments, results); |
| AggregateBigramPrediction(prediction_types, request, *segments, results); |
| AggregateSuffixPrediction(prediction_types, request, *segments, results); |
| AggregateEnglishPrediction(prediction_types, request, *segments, results); |
| AggregateTypeCorrectingPrediction(prediction_types, request, *segments, |
| results); |
| } |
| |
| if (results->empty()) { |
| VLOG(2) << "|result| is empty"; |
| return false; |
| } else { |
| return true; |
| } |
| } |
| |
| void DictionaryPredictor::SetCost(const ConversionRequest &request, |
| const Segments &segments, |
| vector<Result> *results) const { |
| DCHECK(results); |
| |
| if (IsMixedConversionEnabled(request.request())) { |
| SetLMCost(segments, results); |
| } else { |
| SetPredictionCost(segments, results); |
| } |
| |
| ApplyPenaltyForKeyExpansion(segments, results); |
| } |
| |
| void DictionaryPredictor::RemovePrediction(const ConversionRequest &request, |
| const Segments &segments, |
| vector<Result> *results) const { |
| DCHECK(results); |
| |
| if (!IsMixedConversionEnabled(request.request())) { |
| // Currently, we don't have spelling correction feature on mobile, |
| // so we don't run RemoveMissSpelledCandidates. |
| const string &input_key = segments.conversion_segment(0).key(); |
| const size_t input_key_len = Util::CharsLen(input_key); |
| RemoveMissSpelledCandidates(input_key_len, results); |
| } |
| } |
| |
| bool DictionaryPredictor::AddPredictionToCandidates( |
| const ConversionRequest &request, |
| Segments *segments, vector<Result> *results) const { |
| DCHECK(segments); |
| DCHECK(results); |
| const bool mixed_conversion = IsMixedConversionEnabled(request.request()); |
| const string &input_key = segments->conversion_segment(0).key(); |
| const size_t input_key_len = Util::CharsLen(input_key); |
| |
| string history_key, history_value; |
| GetHistoryKeyAndValue(*segments, &history_key, &history_value); |
| |
| // exact_bigram_key does not contain ambiguity expansion, because |
| // this is used for exact matching for the key. |
| const string exact_bigram_key = history_key + input_key; |
| |
| Segment *segment = segments->mutable_conversion_segment(0); |
| DCHECK(segment); |
| |
| // Instead of sorting all the results, we construct a heap. |
| // This is done in linear time and |
| // we can pop as many results as we need efficiently. |
| make_heap(results->begin(), results->end(), ResultCostLess()); |
| |
| |
| const size_t size = min(segments->max_prediction_candidates_size(), |
| results->size()); |
| |
| int added = 0; |
| set<string> seen; |
| |
| int added_suffix = 0; |
| bool cursor_at_tail = |
| request.has_composer() && |
| request.composer().GetCursor() == request.composer().GetLength(); |
| |
| for (size_t i = 0; i < results->size(); ++i) { |
| // Pop a result from a heap. Please pay attention not to use results->at(i). |
| pop_heap(results->begin(), results->end() - i, ResultCostLess()); |
| const Result &result = results->at(results->size() - i - 1); |
| |
| if (added >= size || result.cost >= kInfinity) { |
| break; |
| } |
| |
| if (result.types == NO_PREDICTION) { |
| continue; |
| } |
| |
| // If mixed_conversion is true, we don't filter the results which have |
| // the exact same key as the input. |
| if (!(mixed_conversion && (result.key == input_key)) && |
| suggestion_filter_->IsBadSuggestion(result.value)) { |
| continue; |
| } |
| |
| // don't suggest exactly the same candidate as key. |
| // if |mixed_conversion| is true, that's not the case. |
| if (!mixed_conversion && |
| !(result.types & REALTIME) && |
| (((result.types & BIGRAM) && |
| exact_bigram_key == result.value) || |
| (!(result.types & BIGRAM) && |
| input_key == result.value))) { |
| continue; |
| } |
| |
| string key, value; |
| if (result.types & BIGRAM) { |
| // remove the prefix of history key and history value. |
| key = result.key.substr(history_key.size(), |
| result.key.size() - history_key.size()); |
| value = result.value.substr(history_value.size(), |
| result.value.size() - history_value.size()); |
| } else { |
| key = result.key; |
| value = result.value; |
| } |
| |
| if (!seen.insert(value).second) { |
| continue; |
| } |
| |
| // User input: "おーすとり" (len = 5) |
| // key/value: "おーすとりら" "オーストラリア" (miss match pos = 4) |
| if ((result.candidate_attributes & |
| Segment::Candidate::SPELLING_CORRECTION) && |
| key != input_key && |
| input_key_len <= GetMissSpelledPosition(key, value) + 1) { |
| continue; |
| } |
| |
| if (result.types == SUFFIX && added_suffix++ >= 20) { |
| // TODO(toshiyuki): Need refactoring for controlling suffix |
| // prediction number after we will fix the appropriate number. |
| continue; |
| } |
| |
| Segment::Candidate *candidate = segment->push_back_candidate(); |
| DCHECK(candidate); |
| |
| candidate->Init(); |
| candidate->content_key = key; |
| candidate->content_value = value; |
| candidate->key = key; |
| candidate->value = value; |
| candidate->lid = result.lid; |
| candidate->rid = result.rid; |
| candidate->wcost = result.wcost; |
| candidate->cost = result.cost; |
| candidate->attributes = result.candidate_attributes; |
| if (!(candidate->attributes & Segment::Candidate::SPELLING_CORRECTION) && |
| IsLatinInputMode(request)) { |
| candidate->attributes |= Segment::Candidate::NO_VARIANTS_EXPANSION; |
| candidate->attributes |= Segment::Candidate::NO_EXTRA_DESCRIPTION; |
| } |
| if (candidate->attributes & Segment::Candidate::PARTIALLY_KEY_CONSUMED) { |
| candidate->consumed_key_size = result.consumed_key_size; |
| // There are two scenarios to reach here. |
| // 1. Auto partial suggestion. |
| // e.g. composition わたしのなまえ| -> candidate 私の |
| // 2. Partial suggestion. |
| // e.g. composition わたしの|なまえ -> candidate 私の |
| // To distinguish auto partial suggestion from (non-auto) partial |
| // suggestion, see the cursor position. If the cursor is at the tail |
| // of the composition, this is auto partial suggestion. |
| if (cursor_at_tail) { |
| candidate->attributes |= Segment::Candidate::AUTO_PARTIAL_SUGGESTION; |
| } |
| } |
| if (result.types & REALTIME) { |
| candidate->inner_segment_boundary = result.inner_segment_boundary; |
| } |
| if (result.types & TYPING_CORRECTION) { |
| candidate->attributes |= Segment::Candidate::TYPING_CORRECTION; |
| } |
| |
| SetDescription(result.types, candidate->attributes, |
| &candidate->description); |
| #ifdef DEBUG |
| SetDebugDescription(result.types, &candidate->description); |
| #endif // DEBUG |
| |
| ++added; |
| } |
| return added > 0; |
| } |
| |
| void DictionaryPredictor::SetDescription(PredictionTypes types, |
| uint32 attributes, |
| string *description) { |
| if (types & TYPING_CORRECTION) { |
| // <入力補正> |
| Util::AppendStringWithDelimiter( |
| " ", |
| "<" "\xE5\x85\xA5\xE5\x8A\x9B\xE8\xA3\x9C\xE6\xAD\xA3" ">", |
| description); |
| } |
| if (attributes & Segment::Candidate::AUTO_PARTIAL_SUGGESTION) { |
| // <部分確定> |
| Util::AppendStringWithDelimiter( |
| " ", |
| "<" "\xE9\x83\xA8\xE5\x88\x86\xE7\xA2\xBA\xE5\xAE\x9A" ">", |
| description); |
| } |
| } |
| |
| void DictionaryPredictor::SetDebugDescription(PredictionTypes types, |
| string *description) { |
| if (types & UNIGRAM) { |
| Util::AppendStringWithDelimiter(" ", "Unigram", description); |
| } |
| if (types & BIGRAM) { |
| Util::AppendStringWithDelimiter(" ", "Bigram", description); |
| } |
| if (types & REALTIME_TOP) { |
| Util::AppendStringWithDelimiter(" ", "Realtime Top", description); |
| } else if (types & REALTIME) { |
| Util::AppendStringWithDelimiter(" ", "Realtime", description); |
| } |
| if (types & SUFFIX) { |
| Util::AppendStringWithDelimiter(" ", "Suffix", description); |
| } |
| if (types & ENGLISH) { |
| Util::AppendStringWithDelimiter(" ", "English", description); |
| } |
| // Note that description for TYPING_CORRECTION is omitted |
| // because it is appended by SetDescription. |
| } |
| |
| // return transition_cost[rid][result.lid] + result.wcost (+ penalties). |
| int DictionaryPredictor::GetLMCost(const Result &result, int rid) const { |
| int lm_cost = connector_->GetTransitionCost(rid, result.lid) + result.wcost; |
| if (!(result.types & REALTIME)) { |
| // Relatime conversion already adds perfix/suffix penalties to the result. |
| // Note that we don't add prefix penalty the role of "bunsetsu" is |
| // ambigous on zero-query suggestion. |
| lm_cost += segmenter_->GetSuffixPenalty(result.rid); |
| } |
| |
| return lm_cost; |
| } |
| |
| namespace { |
| |
| class FindValueCallback : public DictionaryInterface::Callback { |
| public: |
| explicit FindValueCallback(StringPiece target_value) |
| : target_value_(target_value), found_(false) {} |
| |
| virtual ResultType OnToken(StringPiece, // key |
| StringPiece, // actual_key |
| const Token &token) { |
| if (token.value != target_value_) { |
| return TRAVERSE_CONTINUE; |
| } |
| found_ = true; |
| token_ = token; |
| return TRAVERSE_DONE; |
| } |
| |
| bool found() const { |
| return found_; |
| } |
| |
| const Token &token() const { |
| return token_; |
| } |
| |
| private: |
| StringPiece target_value_; |
| bool found_; |
| Token token_; |
| |
| DISALLOW_COPY_AND_ASSIGN(FindValueCallback); |
| }; |
| |
| } // namespace |
| |
| void DictionaryPredictor::Result::InitializeByTokenAndTypes( |
| const Token &token, PredictionTypes types) { |
| SetTypesAndTokenAttributes(types, token.attributes); |
| key = token.key; |
| value = token.value; |
| wcost = token.cost; |
| lid = token.lid; |
| rid = token.rid; |
| } |
| |
| void DictionaryPredictor::Result::SetTypesAndTokenAttributes( |
| PredictionTypes prediction_types, Token::AttributesBitfield token_attr) { |
| types = prediction_types; |
| candidate_attributes = 0; |
| if (types & TYPING_CORRECTION) { |
| candidate_attributes |= Segment::Candidate::TYPING_CORRECTION; |
| } |
| if (types & (REALTIME | REALTIME_TOP)) { |
| candidate_attributes |= Segment::Candidate::REALTIME_CONVERSION; |
| } |
| if (token_attr & Token::SPELLING_CORRECTION) { |
| candidate_attributes |= Segment::Candidate::SPELLING_CORRECTION; |
| } |
| if (token_attr & Token::USER_DICTIONARY) { |
| candidate_attributes |= (Segment::Candidate::USER_DICTIONARY | |
| Segment::Candidate::NO_VARIANTS_EXPANSION); |
| } |
| } |
| |
| bool DictionaryPredictor::GetHistoryKeyAndValue( |
| const Segments &segments, string *key, string *value) const { |
| DCHECK(key); |
| DCHECK(value); |
| if (segments.history_segments_size() == 0) { |
| return false; |
| } |
| |
| const Segment &history_segment = |
| segments.history_segment(segments.history_segments_size() - 1); |
| if (history_segment.candidates_size() == 0) { |
| return false; |
| } |
| |
| key->assign(history_segment.candidate(0).key); |
| value->assign(history_segment.candidate(0).value); |
| return true; |
| } |
| |
| void DictionaryPredictor::SetPredictionCost(const Segments &segments, |
| vector<Result> *results) const { |
| DCHECK(results); |
| |
| int rid = 0; // 0 (BOS) is default |
| if (segments.history_segments_size() > 0) { |
| const Segment &history_segment = |
| segments.history_segment(segments.history_segments_size() - 1); |
| if (history_segment.candidates_size() > 0) { |
| rid = history_segment.candidate(0).rid; // use history segment's id |
| } |
| } |
| |
| const string &input_key = segments.conversion_segment(0).key(); |
| string history_key, history_value; |
| GetHistoryKeyAndValue(segments, &history_key, &history_value); |
| const string bigram_key = history_key + input_key; |
| const bool is_suggestion = (segments.request_type() == |
| Segments::SUGGESTION); |
| |
| // use the same scoring function for both unigram/bigram. |
| // Bigram will be boosted because we pass the previous |
| // key as a context information. |
| const size_t bigram_key_len = Util::CharsLen(bigram_key); |
| const size_t unigram_key_len = Util::CharsLen(input_key); |
| |
| // In the loop below, we track the minimum cost among those REALTIME |
| // candidates that have the same key length as |input_key| so that we can set |
| // a slightly smaller cost to REALTIME_TOP than these. |
| int realtime_cost_min = kInfinity; |
| Result *realtime_top_result = NULL; |
| |
| for (size_t i = 0; i < results->size(); ++i) { |
| const Result &result = results->at(i); |
| |
| // The cost of REALTIME_TOP is determined after the loop based on the |
| // minimum cost for REALTIME. Just remember the pointer of result. |
| if (result.types & REALTIME_TOP) { |
| realtime_top_result = &results->at(i); |
| continue; |
| } |
| |
| const int cost = GetLMCost(result, rid); |
| const size_t query_len = |
| (result.types & BIGRAM) ? bigram_key_len : unigram_key_len; |
| const size_t key_len = Util::CharsLen(result.key); |
| |
| if (IsAggressiveSuggestion(query_len, key_len, cost, |
| is_suggestion, results->size())) { |
| results->at(i).cost = kInfinity; |
| continue; |
| } |
| |
| // cost = -500 * log(lang_prob(w) * (1 + remain_length)) -- (1) |
| // where lang_prob(w) is a language model probability of the word "w", and |
| // remain_length the length of key user must type to input "w". |
| // |
| // Example: |
| // key/value = "とうきょう/東京" |
| // user_input = "とう" |
| // remain_length = len("とうきょう") - len("とう") = 3 |
| // |
| // By taking the log of (1), |
| // cost = -500 [log(lang_prob(w)) + log(1 + ramain_length)] |
| // = -500 * log(lang_prob(w)) + 500 * log(1 + remain_length) |
| // = cost - 500 * log(1 + remain_length) |
| // Because 500 * log(lang_prob(w)) = -cost. |
| // |
| // lang_prob(w) * (1 + remain_length) represents how user can reduce |
| // the total types by choosing this candidate. |
| // Before this simple algorithm, we have been using an SVM-base scoring, |
| // but we stop usign it with the following reasons. |
| // 1) Hard to maintain the ranking. |
| // 2) Hard to control the final results of SVM. |
| // 3) Hard to debug. |
| // 4) Since we used the log(remain_length) as a feature, |
| // the new ranking algorithm and SVM algorithm was essentially |
| // the same. |
| // 5) Since we used the length of value as a feature, we find |
| // inconsistencies between the conversion and the prediction |
| // -- the results of top prediction and the top conversion |
| // (the candidate shown after the space key) may differ. |
| // |
| // The new function brings consistent results. If two candidate |
| // have the same reading (key), they should have the same cost bonus |
| // from the length part. This implies that the result is reranked by |
| // the language model probability as long as the key part is the same. |
| // This behavior is baisically the same as the converter. |
| // |
| // TODO(team): want find the best parameter instread of kCostFactor. |
| const int kCostFactor = 500; |
| results->at(i).cost = cost - |
| kCostFactor * log(1.0 + max(0, static_cast<int>(key_len - query_len))); |
| |
| // Update the minimum cost for REALTIME candidates that have the same key |
| // length as input_key. |
| if (result.types & REALTIME && |
| result.cost < realtime_cost_min && |
| result.key.size() == input_key.size()) { |
| realtime_cost_min = result.cost; |
| } |
| } |
| |
| // Ensure that the REALTIME_TOP candidate has relatively smaller cost than |
| // those of REALTIME candidates. |
| if (realtime_top_result != NULL) { |
| realtime_top_result->cost = max(0, realtime_cost_min - 10); |
| } |
| } |
| |
| void DictionaryPredictor::SetLMCost(const Segments &segments, |
| vector<Result> *results) const { |
| DCHECK(results); |
| |
| // ranking for mobile |
| int rid = 0; // 0 (BOS) is default |
| int prev_cost = 0; |
| if (segments.history_segments_size() > 0) { |
| const Segment &history_segment = |
| segments.history_segment(segments.history_segments_size() - 1); |
| if (history_segment.candidates_size() > 0) { |
| rid = history_segment.candidate(0).rid; // use history segment's id |
| prev_cost = history_segment.candidate(0).cost; |
| if (prev_cost == 0) { |
| // if prev_cost is set to be 0 for some reason, use default cost. |
| prev_cost = 5000; |
| } |
| } |
| } |
| |
| const size_t input_key_len = Util::CharsLen( |
| segments.conversion_segment(0).key()); |
| for (size_t i = 0; i < results->size(); ++i) { |
| const Result &result = results->at(i); |
| |
| int cost = GetLMCost(result, rid); |
| // Demote filtered word here, because they are not filtered for exact match. |
| // Even for exact match, we don't want to show aggressive words |
| // with high ranking. |
| if (suggestion_filter_->IsBadSuggestion(result.value)) { |
| // Cost penalty means for bad suggestion. |
| // 3453 = 500 * log(1000) |
| const int kBadSuggestionPenalty = 3453; |
| cost += kBadSuggestionPenalty; |
| } |
| |
| // Make exact candidates to have higher ranking. |
| // Because for mobile, suggestion is the main candidates and |
| // users expect the candidates for the input key on the candidates. |
| if (result.types & (UNIGRAM | TYPING_CORRECTION)) { |
| const size_t key_len = Util::CharsLen(result.key); |
| if (key_len > input_key_len) { |
| // Cost penalty means that exact candiates are evaluated |
| // 50 times bigger in frequency. |
| // Note that the cost is calculated by cost = -500 * log(prob) |
| // 1956 = 500 * log(50) |
| const int kNotExactPenalty = 1956; |
| cost += kNotExactPenalty; |
| } |
| } |
| if (result.types & BIGRAM) { |
| // When user inputs "六本木" and there is an entry |
| // "六本木ヒルズ" in the dictionary, we can suggest |
| // "ヒルズ" as a ZeroQuery suggestion. In this case, |
| // We can't calcurate the transition cost between "六本木" |
| // and "ヒルズ". If we ignore the transition cost, |
| // bigram-based suggestion will be overestimated. |
| // Here we use |default_transition_cost| as an |
| // transition cost between "六本木" and "ヒルズ". Currently, |
| // the cost is basically the same as the cost between |
| // "名詞,一般" and "名詞,一般". |
| const int kDefaultTransitionCost = 1347; |
| // Promoting bigram candidates. |
| const int kBigramBonus = 800; // ~= 500*ln(5) |
| cost += (kDefaultTransitionCost - kBigramBonus - prev_cost); |
| } |
| results->at(i).cost = cost; |
| } |
| } |
| |
| void DictionaryPredictor::ApplyPenaltyForKeyExpansion( |
| const Segments &segments, vector<Result> *results) const { |
| if (segments.conversion_segments_size() == 0) { |
| return; |
| } |
| // Cost penalty 1151 means that expanded candiates are evaluated |
| // 10 times smaller in frequency. |
| // Note that the cost is calcurated by cost = -500 * log(prob) |
| // 1151 = 500 * log(10) |
| const int kKeyExpansionPenalty = 1151; |
| const string &conversion_key = segments.conversion_segment(0).key(); |
| for (size_t i = 0; i < results->size(); ++i) { |
| const Result &result = results->at(i); |
| if (result.types & TYPING_CORRECTION) { |
| continue; |
| } |
| if (!Util::StartsWith(result.key, conversion_key)) { |
| results->at(i).cost += kKeyExpansionPenalty; |
| } |
| } |
| } |
| |
| size_t DictionaryPredictor::GetMissSpelledPosition( |
| const string &key, const string &value) const { |
| string hiragana_value; |
| Util::KatakanaToHiragana(value, &hiragana_value); |
| // value is mixed type. return true if key == request_key. |
| if (Util::GetScriptType(hiragana_value) != Util::HIRAGANA) { |
| return Util::CharsLen(key); |
| } |
| |
| // Find the first position of character where miss spell occurs. |
| int position = 0; |
| ConstChar32Iterator key_iter(key); |
| for (ConstChar32Iterator hiragana_iter(hiragana_value); |
| !hiragana_iter.Done() && !key_iter.Done(); |
| hiragana_iter.Next(), key_iter.Next(), ++position) { |
| if (hiragana_iter.Get() != key_iter.Get()) { |
| return position; |
| } |
| } |
| |
| // not find. return the length of key. |
| while (!key_iter.Done()) { |
| ++position; |
| key_iter.Next(); |
| } |
| |
| return position; |
| } |
| |
| void DictionaryPredictor::RemoveMissSpelledCandidates( |
| size_t request_key_len, |
| vector<Result> *results) const { |
| DCHECK(results); |
| |
| if (results->size() <= 1) { |
| return; |
| } |
| |
| int spelling_correction_size = 5; |
| for (size_t i = 0; i < results->size(); ++i) { |
| const Result &result = (*results)[i]; |
| if (!(result.candidate_attributes & |
| Segment::Candidate::SPELLING_CORRECTION)) { |
| continue; |
| } |
| |
| // Only checks at most 5 spelling corrections to avoid the case |
| // like all candidates have SPELLING_CORRECTION. |
| if (--spelling_correction_size == 0) { |
| return; |
| } |
| |
| vector<size_t> same_key_index, same_value_index; |
| for (size_t j = 0; j < results->size(); ++j) { |
| if (i == j) { |
| continue; |
| } |
| const Result &target_result = (*results)[j]; |
| if (target_result.candidate_attributes & |
| Segment::Candidate::SPELLING_CORRECTION) { |
| continue; |
| } |
| if (target_result.key == result.key) { |
| same_key_index.push_back(j); |
| } |
| if (target_result.value == result.value) { |
| same_value_index.push_back(j); |
| } |
| } |
| |
| // delete same_key_index and same_value_index |
| if (!same_key_index.empty() && !same_value_index.empty()) { |
| results->at(i).types = NO_PREDICTION; |
| for (size_t k = 0; k < same_key_index.size(); ++k) { |
| results->at(same_key_index[k]).types = NO_PREDICTION; |
| } |
| } else if (same_key_index.empty() && !same_value_index.empty()) { |
| results->at(i).types = NO_PREDICTION; |
| } else if (!same_key_index.empty() && same_value_index.empty()) { |
| for (size_t k = 0; k < same_key_index.size(); ++k) { |
| results->at(same_key_index[k]).types = NO_PREDICTION; |
| } |
| if (request_key_len <= GetMissSpelledPosition(result.key, result.value)) { |
| results->at(i).types = NO_PREDICTION; |
| } |
| } |
| } |
| } |
| |
| bool DictionaryPredictor::IsAggressiveSuggestion( |
| size_t query_len, size_t key_len, int cost, |
| bool is_suggestion, size_t total_candidates_size) const { |
| // Temporal workaround for fixing the problem where longer sentence-like |
| // suggestions are shown when user input is very short. |
| // "ただしい" => "ただしいけめんにかぎる" |
| // "それでもぼ" => "それでもぼくはやっていない". |
| // If total_candidates_size is small enough, we don't perform |
| // special filtering. e.g., "せんとち" has only two candidates, so |
| // showing "千と千尋の神隠し" is OK. |
| // Also, if the cost is too small (< 5000), we allow to display |
| // long phrases. Examples include "よろしくおねがいします". |
| if (is_suggestion && total_candidates_size >= 10 && key_len >= 8 && |
| cost >= 5000 && query_len <= static_cast<size_t>(0.4 * key_len)) { |
| return true; |
| } |
| |
| return false; |
| } |
| |
| size_t DictionaryPredictor::GetRealtimeCandidateMaxSize( |
| const Segments &segments, bool mixed_conversion, size_t max_size) const { |
| const Segments::RequestType request_type = segments.request_type(); |
| DCHECK(request_type == Segments::PREDICTION || |
| request_type == Segments::SUGGESTION || |
| request_type == Segments::PARTIAL_PREDICTION || |
| request_type == Segments::PARTIAL_SUGGESTION); |
| const int kFewResultThreshold = 8; |
| size_t default_size = 10; |
| if (segments.segments_size() > 0 && |
| Util::CharsLen(segments.segment(0).key()) >= kFewResultThreshold) { |
| // We don't make so many realtime conversion prediction |
| // even if we have enough margin, as it's expected less useful. |
| max_size = min(max_size, static_cast<size_t>(8)); |
| default_size = 5; |
| } |
| size_t size = 0; |
| switch (request_type) { |
| case Segments::PREDICTION: |
| size = mixed_conversion ? max_size : default_size; |
| break; |
| case Segments::SUGGESTION: |
| // Fewer candidatats are needed basically. |
| // But on mixed_conversion mode we should behave like as conversion mode. |
| size = mixed_conversion ? default_size : 1; |
| break; |
| case Segments::PARTIAL_PREDICTION: |
| // This is kind of prediction so richer result than PARTIAL_SUGGESTION |
| // is needed. |
| size = max_size; |
| break; |
| case Segments::PARTIAL_SUGGESTION: |
| // PARTIAL_SUGGESTION works like as conversion mode so returning |
| // some candidates is needed. |
| size = default_size; |
| break; |
| default: |
| size = 0; // Never reach here |
| } |
| |
| return min(max_size, size); |
| } |
| |
| bool DictionaryPredictor::PushBackTopConversionResult( |
| const ConversionRequest &request, |
| const Segments &segments, |
| vector<Result> *results) const { |
| DCHECK_EQ(1, segments.conversion_segments_size()); |
| |
| Segments tmp_segments; |
| tmp_segments.CopyFrom(segments); |
| tmp_segments.set_max_conversion_candidates_size(20); |
| ConversionRequest tmp_request; |
| tmp_request.CopyFrom(request); |
| tmp_request.set_composer_key_selection(ConversionRequest::PREDICTION_KEY); |
| // Some rewriters cause significant performance loss. So we skip them. |
| tmp_request.set_skip_slow_rewriters(true); |
| // This method emulates usual converter's behavior so here disable |
| // partial candidates. |
| tmp_request.set_create_partial_candidates(false); |
| if (!converter_->StartConversionForRequest(tmp_request, &tmp_segments)) { |
| return false; |
| } |
| |
| results->push_back(Result()); |
| Result *result = &results->back(); |
| result->key = segments.conversion_segment(0).key(); |
| result->lid = tmp_segments.conversion_segment(0).candidate(0).lid; |
| result->rid = tmp_segments.conversion_segment( |
| tmp_segments.conversion_segments_size() - 1).candidate(0).rid; |
| result->SetTypesAndTokenAttributes(REALTIME | REALTIME_TOP, Token::NONE); |
| result->candidate_attributes |= Segment::Candidate::NO_VARIANTS_EXPANSION; |
| |
| // Concatenate the top candidates. |
| // Note that since StartConversionForRequest() runs in conversion mode, the |
| // resulting |tmp_segments| doesn't have inner_segment_boundary. We need to |
| // construct it manually here. |
| // TODO(noriyukit): This is code duplicate in converter/nbest_generator.cc and |
| // we should refactor code after finding more good design. |
| for (size_t i = 0; i < tmp_segments.conversion_segments_size(); ++i) { |
| const Segment &segment = tmp_segments.conversion_segment(i); |
| const Segment::Candidate &candidate = segment.candidate(0); |
| result->value.append(candidate.value); |
| result->wcost += candidate.cost; |
| result->inner_segment_boundary.push_back( |
| make_pair(Util::CharsLen(candidate.key), |
| Util::CharsLen(candidate.value))); |
| } |
| |
| return true; |
| } |
| |
| void DictionaryPredictor::AggregateRealtimeConversion( |
| PredictionTypes types, |
| const ConversionRequest &request, |
| Segments *segments, |
| vector<Result> *results) const { |
| if (!(types & REALTIME)) { |
| return; |
| } |
| |
| DCHECK(converter_); |
| DCHECK(immutable_converter_); |
| DCHECK(segments); |
| DCHECK(results); |
| |
| // TODO(noriyukit): Currently, |segments| is abused as a temporary output from |
| // the immutable converter. Therefore, the first segment needs to be |
| // mutable. Fix this bad abuse. |
| Segment *segment = segments->mutable_conversion_segment(0); |
| DCHECK(!segment->key().empty()); |
| |
| // First insert a top conversion result. |
| if (request.use_actual_converter_for_realtime_conversion()) { |
| if (!PushBackTopConversionResult(request, *segments, results)) { |
| LOG(WARNING) << "Realtime conversion with converter failed"; |
| } |
| } |
| |
| // In what follows, add results from immutable converter. |
| // TODO(noriyukit): The |immutable_converter_| used below can be replaced by |
| // |converter_| in principle. There's a problem of ranking when we get |
| // multiple segments, i.e., how to concatenate candidates in each |
| // segment. Currently, immutable converter handles such ranking in prediction |
| // mode to generate single segment results. So we want to share that code. |
| |
| // Preserve the current max_prediction_candidates_size and candidates_size to |
| // restore them at the end of this method. |
| const size_t prev_candidates_size = segment->candidates_size(); |
| const size_t prev_max_prediction_candidates_size = |
| segments->max_prediction_candidates_size(); |
| |
| // Set how many candidates we want to obtain with the immutable |
| // converter. |
| const bool mixed_conversion = IsMixedConversionEnabled(request.request()); |
| size_t realtime_candidates_size = GetRealtimeCandidateMaxSize( |
| *segments, |
| mixed_conversion, |
| prev_max_prediction_candidates_size - prev_candidates_size); |
| if (realtime_candidates_size == 0) { |
| return; |
| } |
| |
| segments->set_max_prediction_candidates_size( |
| prev_candidates_size + realtime_candidates_size); |
| |
| if (!immutable_converter_->ConvertForRequest(request, segments) || |
| prev_candidates_size >= segment->candidates_size()) { |
| LOG(WARNING) << "Convert failed"; |
| return; |
| } |
| |
| // A little tricky treatment: |
| // Since ImmutableConverter::Convert creates a set of new candidates, |
| // copy them into the array of Results. |
| for (size_t i = prev_candidates_size; |
| i < segment->candidates_size(); ++i) { |
| const Segment::Candidate &candidate = segment->candidate(i); |
| results->push_back(Result()); |
| Result *result = &results->back(); |
| result->key = candidate.key; |
| result->value = candidate.value; |
| result->wcost = candidate.wcost; |
| result->lid = candidate.lid; |
| result->rid = candidate.rid; |
| result->inner_segment_boundary = candidate.inner_segment_boundary; |
| result->SetTypesAndTokenAttributes(REALTIME, Token::NONE); |
| result->candidate_attributes |= candidate.attributes; |
| result->consumed_key_size = candidate.consumed_key_size; |
| } |
| // Remove candidates created by ImmutableConverter. |
| segment->erase_candidates(prev_candidates_size, |
| segment->candidates_size() - |
| prev_candidates_size); |
| // Restore the max_prediction_candidates_size. |
| segments->set_max_prediction_candidates_size( |
| prev_max_prediction_candidates_size); |
| } |
| |
| size_t DictionaryPredictor::GetCandidateCutoffThreshold( |
| const Segments &segments) const { |
| DCHECK(segments.request_type() == Segments::PREDICTION || |
| segments.request_type() == Segments::SUGGESTION); |
| if (segments.request_type() == Segments::PREDICTION) { |
| // If PREDICTION, many candidates are needed than SUGGESTION. |
| return kPredictionMaxResultsSize; |
| } |
| return kSuggestionMaxResultsSize; |
| } |
| |
| void DictionaryPredictor::AggregateUnigramPrediction( |
| PredictionTypes types, |
| const ConversionRequest &request, |
| const Segments &segments, |
| vector<Result> *results) const { |
| if (!(types & UNIGRAM)) { |
| return; |
| } |
| |
| DCHECK(results); |
| DCHECK(segments.request_type() == Segments::PREDICTION || |
| segments.request_type() == Segments::SUGGESTION); |
| |
| const bool mixed_conversion = IsMixedConversionEnabled(request.request()); |
| if (!mixed_conversion) { |
| AggregateUnigramCandidate(request, segments, results); |
| } else { |
| AggregateUnigramCandidateForMixedConversion(request, segments, results); |
| } |
| } |
| |
| void DictionaryPredictor::AggregateUnigramCandidate( |
| const ConversionRequest &request, |
| const Segments &segments, |
| vector<Result> *results) const { |
| DCHECK(results); |
| DCHECK(dictionary_); |
| |
| const size_t cutoff_threshold = GetCandidateCutoffThreshold(segments); |
| const size_t prev_results_size = results->size(); |
| GetPredictiveResults(*dictionary_, "", request, segments, UNIGRAM, |
| cutoff_threshold, results); |
| const size_t unigram_results_size = results->size() - prev_results_size; |
| |
| // If size reaches max_results_size (== cutoff_threshold). |
| // we don't show the candidates, since disambiguation from |
| // 256 candidates is hard. (It may exceed max_results_size, because this is |
| // just a limit for each backend, so total number may be larger) |
| if (unigram_results_size >= cutoff_threshold) { |
| results->resize(prev_results_size); |
| } |
| } |
| |
| void DictionaryPredictor::AggregateUnigramCandidateForMixedConversion( |
| const ConversionRequest &request, |
| const Segments &segments, |
| vector<Result> *results) const { |
| const size_t cutoff_threshold = kPredictionMaxResultsSize; |
| |
| vector<Result> raw_result; |
| // No history key |
| GetPredictiveResults(*dictionary_, "", request, segments, UNIGRAM, |
| cutoff_threshold, &raw_result); |
| |
| // Hereafter, we split "Needed Results" and "(maybe) Unneeded Results." |
| // The algorithm is: |
| // 1) Take the Result with minimum cost. |
| // 2) Remove results which is "redundant" (defined by MaybeRedundant), |
| // from remaining results. |
| // 3) Repeat 1) and 2) five times. |
| // Note: to reduce the number of memory allocation, we swap out the |
| // "redundant" results to the end of the |results| vector. |
| const size_t kDeleteTrialNum = 5; |
| |
| // min_iter is the beginning of the remaining results (inclusive), and |
| // max_iter is the end of the remaining results (exclusive). |
| typedef vector<Result>::iterator Iter; |
| Iter min_iter = raw_result.begin(); |
| Iter max_iter = raw_result.end(); |
| for (size_t i = 0; i < kDeleteTrialNum; ++i) { |
| if (min_iter == max_iter) { |
| break; |
| } |
| |
| // Find the Result with minimum cost. Swap it with the beginning element. |
| iter_swap(min_iter, min_element(min_iter, max_iter, ResultWCostLess())); |
| |
| const Result &reference_result = *min_iter; |
| |
| // Preserve the reference result. |
| ++min_iter; |
| |
| // Traverse all remaining elements and check if each result is redundant. |
| for (Iter iter = min_iter; iter != max_iter; ) { |
| if (MaybeRedundant(reference_result.value, iter->value)) { |
| // Swap out the redundant result. |
| --max_iter; |
| iter_swap(iter, max_iter); |
| } else { |
| ++iter; |
| } |
| } |
| } |
| |
| // Then the |raw_result| contains; |
| // [begin, min_iter): reference results in the above loop. |
| // [max_iter, end): (maybe) redundant results. |
| // [min_iter, max_iter): remaining results. |
| // Here, we revive the redundant results up to five in the result cost order. |
| const size_t kDoNotDeleteNum = 5; |
| if (distance(max_iter, raw_result.end()) >= kDoNotDeleteNum) { |
| partial_sort(max_iter, max_iter + kDoNotDeleteNum, raw_result.end(), |
| ResultWCostLess()); |
| max_iter += kDoNotDeleteNum; |
| } else { |
| max_iter = raw_result.end(); |
| } |
| |
| // Finally output the result. |
| results->insert(results->end(), raw_result.begin(), max_iter); |
| } |
| |
| void DictionaryPredictor::AggregateBigramPrediction( |
| PredictionTypes types, |
| const ConversionRequest &request, |
| const Segments &segments, |
| vector<Result> *results) const { |
| if (!(types & BIGRAM)) { |
| return; |
| } |
| |
| DCHECK(results); |
| DCHECK(dictionary_); |
| |
| // TODO(toshiyuki): Support suggestion from the last 2 histories. |
| // ex) "六本木"+"ヒルズ"->"レジデンス" |
| string history_key, history_value; |
| if (!GetHistoryKeyAndValue(segments, &history_key, &history_value)) { |
| return; |
| } |
| AddBigramResultsFromHistory( |
| history_key, history_value, request, segments, results); |
| } |
| |
| void DictionaryPredictor::AddBigramResultsFromHistory( |
| const string &history_key, |
| const string &history_value, |
| const ConversionRequest &request, |
| const Segments &segments, |
| vector<Result> *results) const { |
| // Check that history_key/history_value are in the dictionary. |
| FindValueCallback find_history_callback(history_value); |
| dictionary_->LookupPrefix(history_key, false, &find_history_callback); |
| |
| // History value is not found in the dictionary. |
| // User may create this the history candidate from T13N or segment |
| // expand/shrinkg operations. |
| if (!find_history_callback.found()) { |
| return; |
| } |
| |
| const size_t cutoff_threshold = GetCandidateCutoffThreshold(segments); |
| const size_t prev_results_size = results->size(); |
| GetPredictiveResultsForBigram( |
| *dictionary_, history_key, history_value, request, segments, BIGRAM, |
| cutoff_threshold, results); |
| const size_t bigram_results_size = results->size() - prev_results_size; |
| |
| // if size reaches max_results_size, |
| // we don't show the candidates, since disambiguation from |
| // 256 candidates is hard. (It may exceed max_results_size, because this is |
| // just a limit for each backend, so total number may be larger) |
| if (bigram_results_size >= cutoff_threshold) { |
| results->resize(prev_results_size); |
| return; |
| } |
| |
| // Obtain the character type of the last history value. |
| const size_t history_value_size = Util::CharsLen(history_value); |
| if (history_value_size == 0) { |
| return; |
| } |
| |
| const Util::ScriptType history_ctype = Util::GetScriptType(history_value); |
| const Util::ScriptType last_history_ctype = |
| Util::GetScriptType(Util::SubString(history_value, |
| history_value_size - 1, 1)); |
| for (size_t i = prev_results_size; i < results->size(); ++i) { |
| CheckBigramResult(find_history_callback.token(), history_ctype, |
| last_history_ctype, &(*results)[i]); |
| } |
| } |
| |
| // Filter out irrelevant bigrams. For example, we don't want to |
| // suggest "リカ" from the history "アメ". |
| void DictionaryPredictor::CheckBigramResult( |
| const Token &history_token, |
| const Util::ScriptType history_ctype, |
| const Util::ScriptType last_history_ctype, |
| Result *result) const { |
| DCHECK(result); |
| |
| const string &history_key = history_token.key; |
| const string &history_value = history_token.value; |
| const string key(result->key, history_key.size(), |
| result->key.size() - history_key.size()); |
| const string value(result->value, history_value.size(), |
| result->value.size() - history_value.size()); |
| |
| // Don't suggest 0-length key/value. |
| if (key.empty() || value.empty()) { |
| result->types = NO_PREDICTION; |
| return; |
| } |
| |
| const Util::ScriptType ctype = |
| Util::GetScriptType(Util::SubString(value, 0, 1)); |
| |
| if (history_ctype == Util::KANJI && |
| ctype == Util::KATAKANA) { |
| // Do not filter "六本木ヒルズ" |
| return; |
| } |
| |
| // If freq("アメ") < freq("アメリカ"), we don't |
| // need to suggest it. As "アメリカ" should already be |
| // suggested when user type "アメ". |
| // Note that wcost = -500 * log(prob). |
| if (ctype != Util::KANJI && |
| history_token.cost > result->wcost) { |
| result->types = NO_PREDICTION; |
| return; |
| } |
| |
| // If character type doesn't change, this boundary might NOT |
| // be a word boundary. If character type is HIRAGANA, |
| // we don't trust it. If Katakana, only trust iif the |
| // entire key is reasonably long. |
| if (ctype == last_history_ctype && |
| (ctype == Util::HIRAGANA || |
| (ctype == Util::KATAKANA && Util::CharsLen(result->key) <= 5))) { |
| result->types = NO_PREDICTION; |
| return; |
| } |
| |
| // The suggested key/value pair must exist in the dictionary. |
| // For example, we don't want to suggest "ターネット" from |
| // the history "イン". |
| // If character type is Kanji and the suggestion is not a |
| // zero_query_suggestion, we relax this condition, as there are |
| // many Kanji-compounds which may not in the dictionary. For example, |
| // we want to suggest "霊長類研究所" from the history "京都大学". |
| if (ctype == Util::KANJI && Util::CharsLen(value) >= 2) { |
| // Do not filter this. |
| // TODO(toshiyuki): one-length kanji prediciton may be annoying other than |
| // some exceptions, "駅", "口", etc |
| return; |
| } |
| |
| FindValueCallback callback(value); |
| dictionary_->LookupPrefix(key, false, &callback); |
| if (!callback.found()) { |
| result->types = NO_PREDICTION; |
| return; |
| } |
| } |
| |
| void DictionaryPredictor::GetPredictiveResults( |
| const DictionaryInterface &dictionary, |
| const string &history_key, |
| const ConversionRequest &request, |
| const Segments &segments, |
| PredictionTypes types, |
| size_t lookup_limit, |
| vector<Result> *results) const { |
| if (!request.has_composer() || |
| !FLAGS_enable_expansion_for_dictionary_predictor) { |
| const string input_key = history_key + segments.conversion_segment(0).key(); |
| PredictiveLookupCallback callback(types, lookup_limit, input_key.size(), |
| NULL, results); |
| dictionary.LookupPredictive(input_key, false, &callback); |
| return; |
| } |
| |
| // If we have ambiguity for the input, get expanded key. |
| // Example1 roman input: for "あk", we will get |base|, "あ" and |expanded|, |
| // "か", "き", etc |
| // Example2 kana input: for "あか", we will get |base|, "あ" and |expanded|, |
| // "か", and "が". |
| string base; |
| set<string> expanded; |
| request.composer().GetQueriesForPrediction(&base, &expanded); |
| const string input_key = history_key + base; |
| PredictiveLookupCallback callback( |
| types, lookup_limit, input_key.size(), |
| expanded.empty() ? NULL : &expanded, results); |
| dictionary.LookupPredictive( |
| input_key, request.IsKanaModifierInsensitiveConversion(), &callback); |
| } |
| |
| void DictionaryPredictor::GetPredictiveResultsForBigram( |
| const DictionaryInterface &dictionary, |
| const string &history_key, |
| const string &history_value, |
| const ConversionRequest &request, |
| const Segments &segments, |
| PredictionTypes types, |
| size_t lookup_limit, |
| vector<Result> *results) const { |
| if (!request.has_composer() || |
| !FLAGS_enable_expansion_for_dictionary_predictor) { |
| const string input_key = history_key + segments.conversion_segment(0).key(); |
| PredictiveBigramLookupCallback callback( |
| types, lookup_limit, input_key.size(), NULL, history_value, results); |
| dictionary.LookupPredictive(input_key, false, &callback); |
| return; |
| } |
| |
| // If we have ambiguity for the input, get expanded key. |
| // Example1 roman input: for "あk", we will get |base|, "あ" and |expanded|, |
| // "か", "き", etc |
| // Example2 kana input: for "あか", we will get |base|, "あ" and |expanded|, |
| // "か", and "が". |
| string base; |
| set<string> expanded; |
| request.composer().GetQueriesForPrediction(&base, &expanded); |
| const string input_key = history_key + base; |
| PredictiveBigramLookupCallback callback(types, lookup_limit, input_key.size(), |
| expanded.empty() ? NULL : &expanded, |
| history_value, results); |
| dictionary.LookupPredictive( |
| input_key, request.IsKanaModifierInsensitiveConversion(), &callback); |
| } |
| |
| void DictionaryPredictor::GetPredictiveResultsForEnglish( |
| const DictionaryInterface &dictionary, |
| const string &history_key, |
| const ConversionRequest &request, |
| const Segments &segments, |
| PredictionTypes types, |
| size_t lookup_limit, |
| vector<Result> *results) const { |
| if (!request.has_composer()) { |
| GetPredictiveResults(dictionary, history_key, request, segments, types, |
| lookup_limit, results); |
| return; |
| } |
| |
| string input_key; |
| request.composer().GetQueryForPrediction(&input_key); |
| // We don't look up English words when key length is one. |
| if (input_key.size() < 2) { |
| return; |
| } |
| const size_t prev_results_size = results->size(); |
| if (Util::IsUpperAscii(input_key)) { |
| // For upper case key, look up its lower case version and then transform the |
| // results to upper case. |
| string key(input_key); |
| Util::LowerString(&key); |
| PredictiveLookupCallback callback(types, lookup_limit, key.size(), NULL, |
| results); |
| dictionary.LookupPredictive(key, false, &callback); |
| for (size_t i = prev_results_size; i < results->size(); ++i) { |
| Util::UpperString(&results->at(i).value); |
| } |
| } else if (Util::IsCapitalizedAscii(input_key)) { |
| // For capitalized key, look up its lower case version and then transform |
| // the results to capital. |
| string key(input_key); |
| Util::LowerString(&key); |
| PredictiveLookupCallback callback(types, lookup_limit, key.size(), NULL, |
| results); |
| dictionary.LookupPredictive(key, false, &callback); |
| for (size_t i = prev_results_size; i < results->size(); ++i) { |
| Util::CapitalizeString(&results->at(i).value); |
| } |
| } else { |
| // For other cases (lower and as-is), just look up directly. |
| PredictiveLookupCallback callback(types, lookup_limit, input_key.size(), |
| NULL, results); |
| dictionary.LookupPredictive(input_key, false, &callback); |
| } |
| // If input mode is FULL_ASCII, then convert the results to full-width. |
| if (request.composer().GetInputMode() == transliteration::FULL_ASCII) { |
| string tmp; |
| for (size_t i = prev_results_size; i < results->size(); ++i) { |
| tmp.assign(results->at(i).value); |
| Util::HalfWidthAsciiToFullWidthAscii(tmp, &results->at(i).value); |
| } |
| } |
| } |
| |
| void DictionaryPredictor::GetPredictiveResultsUsingTypingCorrection( |
| const DictionaryInterface &dictionary, |
| const string &history_key, |
| const ConversionRequest &request, |
| const Segments &segments, |
| PredictionTypes types, |
| size_t lookup_limit, |
| vector<Result> *results) const { |
| if (!request.has_composer()) { |
| return; |
| } |
| |
| vector<composer::TypeCorrectedQuery> queries; |
| request.composer().GetTypeCorrectedQueriesForPrediction(&queries); |
| for (size_t query_index = 0; query_index < queries.size(); ++query_index) { |
| const composer::TypeCorrectedQuery &query = queries[query_index]; |
| const string input_key = history_key + query.base; |
| const size_t previous_results_size = results->size(); |
| PredictiveLookupCallback callback( |
| types, lookup_limit, input_key.size(), |
| query.expanded.empty() ? NULL : &query.expanded, results); |
| dictionary.LookupPredictive( |
| input_key, request.IsKanaModifierInsensitiveConversion(), &callback); |
| |
| for (size_t i = previous_results_size; i < results->size(); ++i) { |
| results->at(i).wcost += query.cost; |
| } |
| lookup_limit -= results->size() - previous_results_size; |
| if (lookup_limit <= 0) { |
| break; |
| } |
| } |
| } |
| |
| void DictionaryPredictor::AggregateSuffixPrediction( |
| PredictionTypes types, |
| const ConversionRequest &request, |
| const Segments &segments, |
| vector<Result> *results) const { |
| if (!(types & SUFFIX)) { |
| return; |
| } |
| |
| DCHECK_GT(segments.conversion_segments_size(), 0); |
| |
| const bool is_zero_query = segments.conversion_segment(0).key().empty(); |
| if (is_zero_query) { |
| string number_key; |
| if (GetNumberHistory(segments, &number_key)) { |
| // Use number suffixes and do not add normal zero query. |
| vector<string> suffixes; |
| GetNumberSuffixArray(number_key, &suffixes); |
| DCHECK_GT(suffixes.size(), 0); |
| int cost = 0; |
| |
| for (vector<string>::const_iterator it = suffixes.begin(); |
| it != suffixes.end(); ++it) { |
| // Increment cost to show the candidates in order. |
| const int kSuffixPenalty = 10; |
| |
| results->push_back(Result()); |
| Result *result = &results->back(); |
| result->SetTypesAndTokenAttributes(SUFFIX, Token::NONE); |
| result->key = *it; |
| result->value = *it; |
| result->wcost = cost; |
| result->lid = counter_suffix_word_id_; |
| result->rid = counter_suffix_word_id_; |
| |
| cost += kSuffixPenalty; |
| } |
| return; |
| } |
| // Fall through |
| // Use normal suffix predictions |
| } |
| |
| const size_t cutoff_threshold = GetCandidateCutoffThreshold(segments); |
| const string kEmptyHistoryKey = ""; |
| GetPredictiveResults(*suffix_dictionary_, kEmptyHistoryKey, request, |
| segments, SUFFIX, cutoff_threshold, results); |
| } |
| |
| void DictionaryPredictor::AggregateEnglishPrediction( |
| PredictionTypes types, |
| const ConversionRequest &request, |
| const Segments &segments, |
| vector<Result> *results) const { |
| if (!(types & ENGLISH)) { |
| return; |
| } |
| DCHECK(results); |
| DCHECK(dictionary_); |
| |
| const size_t cutoff_threshold = GetCandidateCutoffThreshold(segments); |
| const size_t prev_results_size = results->size(); |
| |
| // Currently, history key is never utilized. |
| // TODO(noriyukit): Come up with a way of utilizing it. |
| const string kEmptyHistoryKey = ""; |
| GetPredictiveResultsForEnglish(*dictionary_, kEmptyHistoryKey, request, |
| segments, ENGLISH, cutoff_threshold, results); |
| |
| size_t unigram_results_size = results->size() - prev_results_size; |
| if (unigram_results_size >= cutoff_threshold) { |
| results->resize(prev_results_size); |
| return; |
| } |
| } |
| |
| void DictionaryPredictor::AggregateTypeCorrectingPrediction( |
| PredictionTypes types, |
| const ConversionRequest &request, |
| const Segments &segments, |
| vector<Result> *results) const { |
| if (!(types & TYPING_CORRECTION)) { |
| return; |
| } |
| DCHECK(results); |
| DCHECK(dictionary_); |
| |
| const size_t prev_results_size = results->size(); |
| if (prev_results_size > 10000) { |
| return; |
| } |
| |
| const size_t cutoff_threshold = GetCandidateCutoffThreshold(segments); |
| |
| // Currently, history key is never utilized. |
| const string kEmptyHistoryKey = ""; |
| GetPredictiveResultsUsingTypingCorrection( |
| *dictionary_, kEmptyHistoryKey, request, segments, TYPING_CORRECTION, |
| cutoff_threshold, results); |
| if (results->size() - prev_results_size >= cutoff_threshold) { |
| results->resize(prev_results_size); |
| return; |
| } |
| } |
| |
| DictionaryPredictor::PredictionTypes DictionaryPredictor::GetPredictionTypes( |
| const ConversionRequest &request, const Segments &segments) { |
| if (segments.request_type() == Segments::CONVERSION) { |
| VLOG(2) << "request type is CONVERSION"; |
| return NO_PREDICTION; |
| } |
| |
| if (segments.conversion_segments_size() < 1) { |
| VLOG(2) << "segment size < 1"; |
| return NO_PREDICTION; |
| } |
| |
| PredictionTypes result = NO_PREDICTION; |
| |
| // Check if realtime conversion should be used. |
| if (ShouldRealTimeConversionEnabled(request, segments)) { |
| result |= REALTIME; |
| } |
| |
| const bool zero_query_suggestion = request.request().zero_query_suggestion(); |
| if (IsLatinInputMode(request) && !zero_query_suggestion) { |
| if (GET_CONFIG(use_dictionary_suggest)) { |
| // By following the dictionary_suggest config, enable English prediction. |
| result |= ENGLISH; |
| } |
| |
| // Returns regardless of whether use_dictionary_suggest is enabled or not |
| // in the config, in order to avoid full-width candidates of English words. |
| return result; |
| } |
| |
| if (!GET_CONFIG(use_dictionary_suggest) && |
| segments.request_type() == Segments::SUGGESTION) { |
| VLOG(2) << "no_dictionary_suggest"; |
| return result; |
| } |
| |
| const string &key = segments.conversion_segment(0).key(); |
| const size_t key_len = Util::CharsLen(key); |
| if (key_len == 0 && !zero_query_suggestion) { |
| return result; |
| } |
| |
| // Never trigger prediction if key looks like zip code. |
| if (segments.request_type() == Segments::SUGGESTION && |
| DictionaryPredictor::IsZipCodeRequest(key) && key_len < 6) { |
| return result; |
| } |
| |
| const int kMinUnigramKeyLen = zero_query_suggestion ? 1 : 3; |
| |
| // unigram based suggestion requires key_len >= kMinUnigramKeyLen. |
| // Providing suggestions from very short user input key is annoying. |
| if ((segments.request_type() == Segments::PREDICTION && key_len >= 1) || |
| key_len >= kMinUnigramKeyLen) { |
| result |= UNIGRAM; |
| } |
| |
| const size_t history_segments_size = segments.history_segments_size(); |
| if (history_segments_size > 0) { |
| const Segment &history_segment = |
| segments.history_segment(history_segments_size - 1); |
| const int kMinHistoryKeyLen = zero_query_suggestion ? 2 : 3; |
| // even in PREDICTION mode, bigram-based suggestion requires that |
| // the length of previous key is >= kMinBigramKeyLen. |
| // It also implies that bigram-based suggestion will be triggered, |
| // even if the current key length is short enough. |
| // TOOD(taku): this setting might be aggressive if the current key |
| // looks like Japanese particle like "が|で|は" |
| // If the current key looks like particle, we can make the behavior |
| // less aggressive. |
| if (history_segment.candidates_size() > 0 && |
| Util::CharsLen(history_segment.candidate(0).key) >= kMinHistoryKeyLen) { |
| result |= BIGRAM; |
| } |
| } |
| |
| if (history_segments_size > 0 && zero_query_suggestion) { |
| result |= SUFFIX; |
| } |
| |
| if (IsTypingCorrectionEnabled() && key_len >= 3) { |
| result |= TYPING_CORRECTION; |
| } |
| |
| return result; |
| } |
| |
| bool DictionaryPredictor::ShouldRealTimeConversionEnabled( |
| const ConversionRequest &request, |
| const Segments &segments) { |
| const size_t kMaxRealtimeKeySize = 300; // 300 bytes in UTF8 |
| const string &key = segments.conversion_segment(0).key(); |
| if (key.empty() || key.size() >= kMaxRealtimeKeySize) { |
| // 1) If key is empty, realtime conversion doesn't work. |
| // 2) If the key is too long, we'll hit a performance issue. |
| return false; |
| } |
| |
| return (segments.request_type() == Segments::PARTIAL_SUGGESTION || |
| GET_CONFIG(use_realtime_conversion) || |
| IsMixedConversionEnabled(request.request())); |
| } |
| |
| bool DictionaryPredictor::IsZipCodeRequest(const string &key) { |
| if (key.empty()) { |
| return false; |
| } |
| |
| for (ConstChar32Iterator iter(key); !iter.Done(); iter.Next()) { |
| const char32 c = iter.Get(); |
| if (!('0' <= c && c <= '9') && (c != '-')) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| } // namespace mozc |