// Copyright 2010-2015, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef MOZC_PREDICTION_DICTIONARY_PREDICTOR_H_
#define MOZC_PREDICTION_DICTIONARY_PREDICTOR_H_
#include <functional>
#include <string>
#include <vector>
#include "base/util.h"
#include "dictionary/dictionary_token.h"
#include "prediction/predictor_interface.h"
// for FRIEND_TEST()
#include "testing/base/public/gunit_prod.h"
namespace mozc {
class Connector;
class ConversionRequest;
class ConverterInterface;
class DictionaryInterface;
class ImmutableConverterInterface;
class POSMatcher;
class SegmenterInterface;
class Segments;
class SuggestionFilter;
// Dictionary-based predictor
class DictionaryPredictor : public PredictorInterface {
public:
// Initializes a predictor with the given references to submodules. Note that
// the pointers are not owned by this class and are to be deleted by the caller.
DictionaryPredictor(const ConverterInterface *converter,
const ImmutableConverterInterface *immutable_converter,
const DictionaryInterface *dictionary,
const DictionaryInterface *suffix_dictionary,
const Connector *connector,
const SegmenterInterface *segmenter,
const POSMatcher *pos_matcher,
const SuggestionFilter *suggestion_filter);
virtual ~DictionaryPredictor();
virtual bool PredictForRequest(const ConversionRequest &request,
Segments *segments) const;
virtual const string &GetPredictorName() const { return predictor_name_; }
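// Illustrative usage sketch (not part of this header). The submodule
// pointers below are placeholders obtained elsewhere (e.g. from the engine's
// data manager), and the Segments setup is an assumed, hypothetical example.
//
//   DictionaryPredictor predictor(converter, immutable_converter,
//                                 dictionary, suffix_dictionary, connector,
//                                 segmenter, pos_matcher, suggestion_filter);
//   ConversionRequest request;
//   Segments segments;
//   segments.set_request_type(Segments::SUGGESTION);  // assumed Segments API
//   segments.add_segment()->set_key("きょうの");      // assumed Segments API
//   if (predictor.PredictForRequest(request, &segments)) {
//     // Suggestion candidates have been appended to |segments|.
//   }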
protected:
// Protected members for unit testing.
// To let test utility methods access private members, they are made
// protected. See:
// http://code.google.com/p/googletest/wiki/FAQ
enum PredictionType {
// No suggestions need to be shown.
NO_PREDICTION = 0,
// Suggests from the current key the user is typing.
UNIGRAM = 1,
// Suggests from the history key the user typed before.
BIGRAM = 2,
// Suggests from |immutable_converter_|.
REALTIME = 4,
// Adds suffixes such as "さん" and "が" that match the previous context.
SUFFIX = 8,
// Adds English words.
ENGLISH = 16,
// Adds predictions for type-corrected keys.
TYPING_CORRECTION = 32,
// Suggests from |converter_|. The difference from REALTIME is that it uses
// the full converter with rewriter, history, etc.
// TODO(noriyukit): This label should be integrated with REALTIME. The value
// 65536 is used to indicate that this assignment is temporary.
REALTIME_TOP = 65536,
};
// Bitfield to store a set of PredictionType.
typedef int32 PredictionTypes;
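// Illustrative sketch (the variable names are for illustration only):
// because each PredictionType occupies a distinct bit, several types can be
// stored and tested in a single PredictionTypes value.
//
//   PredictionTypes types = UNIGRAM | REALTIME;  // == 1 | 4 == 5
//   if (types & BIGRAM) {
//     // Not taken here: the BIGRAM bit (2) is not set.
//   }
//   types |= SUFFIX;                             // adds the SUFFIX bit (8)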
struct Result {
Result() : types(NO_PREDICTION), wcost(0), cost(0), lid(0), rid(0),
candidate_attributes(0), consumed_key_size(0) {}
void InitializeByTokenAndTypes(const Token &token, PredictionTypes types);
void SetTypesAndTokenAttributes(PredictionTypes prediction_types,
Token::AttributesBitfield token_attr);
string key;
string value;
// Indicates which PredictionType created this instance.
// Exactly one of UNIGRAM, BIGRAM, REALTIME, SUFFIX, ENGLISH, or
// TYPING_CORRECTION is set.
// TODO(matsuzakit): Using PredictionTypes both as input and output
// makes the code complex. Let's split them.
PredictionTypes types;
// Context "insensitive" candidate cost.
int wcost;
// Context "sensitive" candidate cost.
int cost;
int lid;
int rid;
// Boundary information for realtime conversion.
// This will be set only for realtime conversion result candidates.
// This contains inner segment size for key and value.
// If the candidate key and value are
// "わたしの|なまえは|なかのです" and "私の|名前は|中野です",
// |inner_segment_boundary| has [(4, 2), (4, 3), (5, 4)].
vector<uint32> inner_segment_boundary;
uint32 candidate_attributes;
size_t consumed_key_size;
};
// On MSVS2008/2010, the constructor of TestableDictionaryPredictor::Result
// causes a compile error even if you change its access level to public.
// Use TestableDictionaryPredictor::MakeEmptyResult() instead.
static Result MakeEmptyResult() {
return Result();
}
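// Illustrative test-side sketch (TestableDictionaryPredictor is the test
// subclass mentioned above; the field values are hypothetical, and Token::NONE
// is assumed to be the neutral attribute defined in dictionary_token.h):
//
//   TestableDictionaryPredictor::Result result =
//       TestableDictionaryPredictor::MakeEmptyResult();
//   result.key = "てすと";
//   result.value = "テスト";
//   result.wcost = 100;
//   result.SetTypesAndTokenAttributes(UNIGRAM, Token::NONE);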
class PredictiveLookupCallback;
class PredictiveBigramLookupCallback;
class ResultWCostLess;
class ResultCostLess;
void AggregateRealtimeConversion(PredictionTypes types,
const ConversionRequest &request,
Segments *segments,
vector<Result> *results) const;
void AggregateUnigramPrediction(PredictionTypes types,
const ConversionRequest &request,
const Segments &segments,
vector<Result> *results) const;
void AggregateBigramPrediction(PredictionTypes types,
const ConversionRequest &request,
const Segments &segments,
vector<Result> *results) const;
void AggregateSuffixPrediction(PredictionTypes types,
const ConversionRequest &request,
const Segments &segments,
vector<Result> *results) const;
void AggregateEnglishPrediction(PredictionTypes types,
const ConversionRequest &request,
const Segments &segments,
vector<Result> *results) const;
void AggregateTypeCorrectingPrediction(PredictionTypes types,
const ConversionRequest &request,
const Segments &segments,
vector<Result> *results) const;
bool AggregateNumberZeroQueryPrediction(const Segments &segments,
vector<Result> *results) const;
bool AggregateZeroQueryPrediction(const Segments &segments,
vector<Result> *result) const;
void ApplyPenaltyForKeyExpansion(const Segments &segments,
vector<Result> *results) const;
bool AddPredictionToCandidates(const ConversionRequest &request,
Segments *segments,
vector<Result> *results) const;
private:
FRIEND_TEST(DictionaryPredictorTest, GetPredictionTypes);
FRIEND_TEST(DictionaryPredictorTest,
GetPredictionTypesTestWithTypingCorrection);
FRIEND_TEST(DictionaryPredictorTest,
GetPredictionTypesTestWithZeroQuerySuggestion);
FRIEND_TEST(DictionaryPredictorTest, IsZipCodeRequest);
FRIEND_TEST(DictionaryPredictorTest, GetRealtimeCandidateMaxSize);
FRIEND_TEST(DictionaryPredictorTest, GetRealtimeCandidateMaxSizeForMixed);
FRIEND_TEST(DictionaryPredictorTest,
GetRealtimeCandidateMaxSizeWithActualConverter);
FRIEND_TEST(DictionaryPredictorTest, GetCandidateCutoffThreshold);
FRIEND_TEST(DictionaryPredictorTest, AggregateUnigramPrediction);
FRIEND_TEST(DictionaryPredictorTest, AggregateBigramPrediction);
FRIEND_TEST(DictionaryPredictorTest, AggregateSuffixPrediction);
FRIEND_TEST(DictionaryPredictorTest, ZeroQuerySuggestionAfterNumbers);
FRIEND_TEST(DictionaryPredictorTest, TriggerNumberZeroQuerySuggestion);
FRIEND_TEST(DictionaryPredictorTest, TriggerZeroQuerySuggestion);
FRIEND_TEST(DictionaryPredictorTest, GetHistoryKeyAndValue);
FRIEND_TEST(DictionaryPredictorTest, RealtimeConversionStartingWithAlphabets);
FRIEND_TEST(DictionaryPredictorTest, IsAggressiveSuggestion);
FRIEND_TEST(DictionaryPredictorTest,
RealtimeConversionWithSpellingCorrection);
FRIEND_TEST(DictionaryPredictorTest, GetMissSpelledPosition);
FRIEND_TEST(DictionaryPredictorTest, RemoveMissSpelledCandidates);
FRIEND_TEST(DictionaryPredictorTest, ConformCharacterWidthToPreference);
FRIEND_TEST(DictionaryPredictorTest, SetLMCost);
FRIEND_TEST(DictionaryPredictorTest, SetDescription);
FRIEND_TEST(DictionaryPredictorTest, SetDebugDescription);
// Returns false if no results were aggregated.
bool AggregatePrediction(const ConversionRequest &request,
Segments *segments,
vector<Result> *results) const;
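// Illustrative sketch of how the methods above fit together (a reading of
// this header, not a verbatim copy of the implementation; the control flow
// shown is an assumption):
//
//   vector<Result> results;
//   const PredictionTypes types = GetPredictionTypes(request, *segments);
//   if (types != NO_PREDICTION) {
//     AggregateUnigramPrediction(types, request, *segments, &results);
//     AggregateBigramPrediction(types, request, *segments, &results);
//     AggregateRealtimeConversion(types, request, segments, &results);
//     // ... SUFFIX / ENGLISH / TYPING_CORRECTION aggregation, then scoring
//     // via SetCost() and filtering via RemovePrediction().
//   }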
void SetCost(const ConversionRequest &request,
const Segments &segments, vector<Result> *results) const;
// Removes predictions by setting the result type to NO_PREDICTION where necessary.
void RemovePrediction(const ConversionRequest &request,
const Segments &segments,
vector<Result> *results) const;
// Adds prediction results from history key and value.
void AddBigramResultsFromHistory(const string &history_key,
const string &history_value,
const ConversionRequest &request,
const Segments &segments,
vector<Result> *results) const;
// Changes the prediction type for an irrelevant bigram candidate.
void CheckBigramResult(const Token &history_token,
const Util::ScriptType history_ctype,
const Util::ScriptType last_history_ctype,
Result *result) const;
void GetPredictiveResults(const DictionaryInterface &dictionary,
const string &history_key,
const ConversionRequest &request,
const Segments &segments,
PredictionTypes types,
size_t lookup_limit,
vector<Result> *results) const;
void GetPredictiveResultsForBigram(const DictionaryInterface &dictionary,
const string &history_key,
const string &history_value,
const ConversionRequest &request,
const Segments &segments,
PredictionTypes types,
size_t lookup_limit,
vector<Result> *results) const;
// Performs a custom lookup for English words, where case conversion might be
// applied to the lookup key and/or the output results.
void GetPredictiveResultsForEnglish(const DictionaryInterface &dictionary,
const string &history_key,
const ConversionRequest &request,
const Segments &segments,
PredictionTypes types,
size_t lookup_limit,
vector<Result> *results) const;
// Performs lookups using type-corrected queries from the composer. This
// usually involves multiple dictionary lookups.
void GetPredictiveResultsUsingTypingCorrection(
const DictionaryInterface &dictionary,
const string &history_key,
const ConversionRequest &request,
const Segments &segments,
PredictionTypes types,
size_t lookup_limit,
vector<Result> *results) const;
// Returns the position of the misspelled character.
//
// Example1:
// key: "れみおめろん"
// value: "レミオロメン"
// returns 3
//
// Example2:
// key: "ろっぽんぎ"
// value: "六本木"
// returns 5 (charslen("ろっぽんぎ"))
size_t GetMissSpelledPosition(const string &key,
const string &value) const;
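// Illustrative example (an assumption based on the semantics described
// above): for key "あぼがど" and value "アボカド" (read "あぼかど"),
// the first differing character is at index 2, so
//   GetMissSpelledPosition("あぼがど", "アボカド")
// would be expected to return 2.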
// Returns the language model cost of |result| given its prediction type.
// |rid| is the right id of the previous word (token).
// If |rid| is unknown, pass 0 as the default value.
int GetLMCost(const Result &result, int rid) const;
// Given the results gathered by the aggregation methods above, removes
// misspelled results from |results|.
// Misspelled results are not erased directly; instead,
// result[i].types is set to NO_PREDICTION.
//
// Here's the basic step of removal:
// Case1:
// result1: "あぼがど" => "アボガド"
// result2: "あぼがど" => "アボカド" (spelling correction)
// result3: "あぼかど" => "アボカド"
// In this case, we can remove result1 and result2.
// If some other result shares result2.key and some other result shares
// result2.value, we can remove 1) the spelling correction candidate and
// 2) the candidate having the same key as the spelling correction candidate.
//
// Case2:
// result1: "あぼかど" => "アボカド"
// result2: "あぼがど" => "アボカド" (spelling correction)
// In this case, remove result2.
//
// Case3:
// result1: "あぼがど" => "アボガド"
// result2: "あぼがど" => "アボカド" (spelling correction)
// In this case,
// a) user input: あ,あぼ,あぼ => remove result1, result2
// b) user input: あぼが,あぼがど => remove result1
//
// Let |same_key_size| and |same_value_size| be the numbers of
// non-spelling-correction candidates that have the same key and the same
// value as the spelling correction candidate, respectively.
//
// if (same_key_size > 0 && same_value_size > 0) {
// remove spelling correction and candidates having the
// same key as the spelling correction.
// } else if (same_key_size == 0 && same_value_size > 0) {
// remove spelling correction
// } else {
// do nothing.
// }
void RemoveMissSpelledCandidates(size_t request_key_len,
vector<Result> *results) const;
// Scoring function that takes the prediction bonus into account.
// It basically reranks the candidates by lang_prob * (1 + remain_len).
// This algorithm is mainly used for desktop.
void SetPredictionCost(const Segments &segments,
vector<Result> *results) const;
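// Worked illustration of the reranking factor above (the query and candidate
// are hypothetical): for query "あし" and candidate key "あしたは",
// remain_len = 4 - 2 = 2, so the candidate's language-model probability is
// effectively weighted by (1 + 2) = 3 relative to a candidate whose key
// exactly matches the query.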
// Language model-based scoring function.
// This algorithm is mainly used for mobile.
void SetLMCost(const Segments &segments,
vector<Result> *results) const;
// Returns true if the suggestion is classified
// as "aggressive".
bool IsAggressiveSuggestion(
size_t query_len, size_t key_len, int cost,
bool is_suggestion, size_t total_candidates_size) const;
// Gets history key/value.
// Returns false if history segments are
// not found.
bool GetHistoryKeyAndValue(const Segments &segments,
string *key, string *value) const;
// Returns a bitfield of PredictionType.
static PredictionTypes GetPredictionTypes(const ConversionRequest &request,
const Segments &segments);
// Returns true if the realtime conversion should be used.
// TODO(hidehiko): add Config and Request instances into the arguments
// to represent the dependency explicitly.
static bool ShouldRealTimeConversionEnabled(const ConversionRequest &request,
const Segments &segments);
// Returns true if |key| consists only of the characters '0'-'9' and '-'.
static bool IsZipCodeRequest(const string &key);
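// Illustrative examples (assumed from the description above):
//   IsZipCodeRequest("1000001")  -> true
//   IsZipCodeRequest("100-0001") -> true
//   IsZipCodeRequest("10o-0001") -> false  // contains a non-digit letter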
// Returns max size of realtime candidates.
size_t GetRealtimeCandidateMaxSize(const Segments &segments,
bool mixed_conversion,
size_t max_size) const;
// Aggregates unigram candidates for non-mixed conversion.
void AggregateUnigramCandidate(const ConversionRequest &request,
const Segments &segments,
vector<Result> *results) const;
// Aggregates unigram candidates for mixed conversion.
// This reduces redundant candidates.
void AggregateUnigramCandidateForMixedConversion(
const ConversionRequest &request,
const Segments &segments,
vector<Result> *results) const;
// Returns cutoff threshold of unigram candidates.
// AggregateUnigramPrediction method does not return any candidates
// if there are too many (>= cutoff threshold) eligible candidates.
// This behavior prevents a user from seeing too many prefix-match
// candidates.
size_t GetCandidateCutoffThreshold(const Segments &segments) const;
// Generates the top conversion result from |converter_| and adds it to
// |results|.
bool PushBackTopConversionResult(const ConversionRequest &request,
const Segments &segments,
vector<Result> *results) const;
// Sets candidate description.
static void SetDescription(PredictionTypes types,
uint32 attributes,
string *description);
// Description for DEBUG mode.
static void SetDebugDescription(PredictionTypes types,
string *description);
const ConverterInterface *converter_;
const ImmutableConverterInterface *immutable_converter_;
const DictionaryInterface *dictionary_;
const DictionaryInterface *suffix_dictionary_;
const Connector *connector_;
const SegmenterInterface *segmenter_;
const SuggestionFilter *suggestion_filter_;
const uint16 counter_suffix_word_id_;
const string predictor_name_;
DISALLOW_COPY_AND_ASSIGN(DictionaryPredictor);
};
} // namespace mozc
#endif // MOZC_PREDICTION_DICTIONARY_PREDICTOR_H_