blob: b33b948e91a0ef0ee2e350341c39bb5e92c79ab3 [file] [log] [blame]
// Copyright 2010-2015, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "rewriter/user_segment_history_rewriter.h"
#include <algorithm>
#include <cctype>
#include <set>
#include <string>
#include <vector>
#include "base/compiler_specific.h"
#include "base/config_file_stream.h"
#include "base/file_util.h"
#include "base/logging.h"
#include "base/number_util.h"
#include "base/string_piece.h"
#include "base/util.h"
#include "config/character_form_manager.h"
#include "config/config.pb.h"
#include "config/config_handler.h"
#include "converter/conversion_request.h"
#include "converter/segments.h"
#include "dictionary/pos_group.h"
#include "dictionary/pos_matcher.h"
#include "rewriter/rewriter_interface.h"
#include "rewriter/variants_rewriter.h"
#include "storage/lru_storage.h"
#include "transliteration/transliteration.h"
#include "usage_stats/usage_stats.h"
namespace mozc {
using config::CharacterFormManager;
using config::Config;
using storage::LRUStorage;
namespace {
// Value size passed to LRUStorage::OpenOrCreate() in Reload().
// NOTE(review): presumably the size in bytes of one stored value; both
// FeatureValue and KeyTriggerValue are checked to be sizeof(uint32).
const uint32 kValueSize = 4;
// LRU size passed to LRUStorage::OpenOrCreate() in Reload().
const uint32 kLRUSize = 20000;
// Seed value passed to LRUStorage::OpenOrCreate() in Reload().
const uint32 kSeedValue = 0xf28defe3;
// Upper bound applied by KeyTriggerValue::set_candidates_size().  255 is the
// largest value that fits in the 8-bit candidates_size_ bit-field.
const uint32 kMaxCandidatesSize = 255;
// Size of candidates to be reranked to the top at one sorting operation.
// Note, if sorting operation is called twice, up to 10 (= 5 * 2) candidates
// could be reranked in total.
const size_t kMaxRerankSize = 5;
// Name of the storage file; resolved with ConfigFileStream::GetFileName() in
// Reload().
const char kFileName[] = "user://segment.db";
// Temporarily disable unused private field warning against
// FeatureValue::reserved_ from Clang.
// We use MOZC_CLANG_HAS_WARNING to check whether "-Wunused-private-field" is
// available, because XCode 4.4 clang (based on LLVM 3.1svn) doesn't have it.
MOZC_CLANG_PUSH_WARNING();
#if MOZC_CLANG_HAS_WARNING(unused-private-field)
MOZC_CLANG_DISABLE_WARNING(unused-private-field);
#endif
// Value stored in the LRU storage for a learned feature key (written by
// INSERT_FEATURE and RememberNumberPreference(), read by FETCH_FEATURE and
// RewriteNumber()).  The two bit-fields add up to 32 bits; the type bit is 1,
// whereas KeyTriggerValue uses 0 for the same bit.
class FeatureValue {
public:
// Creates a value with the type bit set to 1 and the reserved bits cleared.
FeatureValue() : feature_type_(1), reserved_(0) {}
// Returns true when the type bit is 1, i.e. the bits read from the storage
// look like a FeatureValue.
bool IsValid() const {
return (feature_type_ == 1);
}
private:
uint32 feature_type_ : 1; // always 1
uint32 reserved_ : 31; // this area is reserved for future
};
MOZC_CLANG_POP_WARNING();
// Returns true when |str| is exactly one of the following UTF-8 strings:
//   U+3002 / U+FF61  (ideographic full stop, full-width / half-width)
//   U+3001 / U+FF64  (ideographic comma, full-width / half-width)
//   U+FF0C / ","     (full-width comma / ASCII comma)
//   U+FF0E / "."     (full-width full stop / ASCII full stop)
bool IsPunctuationInternal(const string &str) {
  static const char *const kPunctuations[] = {
    "\xE3\x80\x82", "\xEF\xBD\xA1",
    "\xE3\x80\x81", "\xEF\xBD\xA4",
    "\xEF\xBC\x8C", ",",
    "\xEF\xBC\x8E", ".",
  };
  for (const char *punctuation : kPunctuations) {
    if (str == punctuation) {
      return true;
    }
  }
  return false;
}
// Temporarily disable unused private field warning against
// KeyTriggerValue::reserved_ from Clang.
// We use MOZC_CLANG_HAS_WARNING to check whether "-Wunused-private-field" is
// available, because XCode 4.4 clang (based on LLVM 3.1svn) doesn't have it.
MOZC_CLANG_PUSH_WARNING();
#if MOZC_CLANG_HAS_WARNING(unused-private-field)
MOZC_CLANG_DISABLE_WARNING(unused-private-field);
#endif
// Value stored in the LRU storage under a segment key by InsertTriggerKey()
// and read back by ShouldRewrite().  The three bit-fields add up to 32 bits;
// the type bit is 0, whereas FeatureValue uses 1 for the same bit.
class KeyTriggerValue {
public:
// Creates a value with the type bit, the reserved bits and the candidate
// size all set to 0.
KeyTriggerValue()
: feature_type_(0), reserved_(0), candidates_size_(0) {}
// Returns true when the type bit is 0, i.e. the bits read from the storage
// look like a KeyTriggerValue.
bool IsValid() const {
return (feature_type_ == 0);
}
// Returns the stored candidate size.
uint32 candidates_size() const {
return candidates_size_;
}
// Stores |size|, clamped to kMaxCandidatesSize so that it fits in the 8-bit
// field.
void set_candidates_size(uint32 size) {
candidates_size_ = min(size, kMaxCandidatesSize);
}
private:
uint32 feature_type_ : 1; // always 0
uint32 reserved_ : 23; // this area is reserved for future
// want to encode POS, freq etc.
uint32 candidates_size_ : 8; // candidate size
};
MOZC_CLANG_POP_WARNING();
// Strict ordering used to sort ScoreType entries: higher score first, and
// for equal scores the more recent last_access_time first.
class ScoreTypeCompare {
 public:
  bool operator() (const UserSegmentHistoryRewriter::ScoreType &a,
                   const UserSegmentHistoryRewriter::ScoreType &b) const {
    const bool same_score = (a.score == b.score);
    return same_score ? (a.last_access_time > b.last_access_time)
                      : (a.score > b.score);
  }
};
// Returns the index of the first candidate that has the BEST_CANDIDATE
// attribute.  Only the first (kMaxRerankSize + 1) candidates are examined
// because the candidate with BEST_CANDIDATE is very likely to be in that
// range (http://b/9992330).  When no such candidate is found, a warning is
// logged and 0 is returned.
inline int GetDefaultCandidateIndex(const Segment &segment) {
  const int search_limit = static_cast<int>(min(segment.candidates_size(),
                                                kMaxRerankSize + 1));
  int index = 0;
  while (index < search_limit) {
    const bool is_best = (segment.candidate(index).attributes &
                          Segment::Candidate::BEST_CANDIDATE) != 0;
    if (is_best) {
      return index;
    }
    ++index;
  }
  LOG(WARNING) << "Cannot find default candidate. "
               << "key: " << segment.key() << ", "
               << "candidates_size: " << segment.candidates_size();
  return 0;
}
// JoinStringWithTabN joins N strings with TAB delimiters ('\t') in a way
// similar to Util::JoinStrings() and/or Util::AppendStringWithDelimiter() but
// in a more efficient way. Since this module is called every key stroke and
// performs many string concatenation, we use these functions instead of ones
// from Util.
inline void JoinStringsWithTab2(
const StringPiece s1, const StringPiece s2, string *output) {
// Pre-allocate the buffer, including 1 TAB delimiter.
output->reserve(s1.size() + s2.size() + 1);
output->assign(s1.data(), s1.size()).append("\t")
.append(s2.data(), s2.size());
}
inline void JoinStringsWithTab3(
const StringPiece s1, const StringPiece s2, const StringPiece s3,
string *output) {
// Pre-allocate the buffer, including 2 TAB delimiters.
output->reserve(s1.size() + s2.size() + s3.size() + 2);
output->assign(s1.data(), s1.size()).append("\t")
.append(s2.data(), s2.size()).append("\t")
.append(s3.data(), s3.size());
}
inline void JoinStringsWithTab4(
const StringPiece s1, const StringPiece s2, const StringPiece s3,
const StringPiece s4, string *output) {
// Pre-allocate the buffer, including 3 TAB delimiters.
output->reserve(s1.size() + s2.size() + s3.size() + s4.size() + 3);
output->assign(s1.data(), s1.size()).append("\t")
.append(s2.data(), s2.size()).append("\t")
.append(s3.data(), s3.size()).append("\t")
.append(s4.data(), s4.size());
}
inline void JoinStringsWithTab5(
const StringPiece s1, const StringPiece s2, const StringPiece s3,
const StringPiece s4, const StringPiece s5, string *output) {
// Pre-allocate the buffer, including 4 TAB delimiters.
output->reserve(
s1.size() + s2.size() + s3.size() + s4.size() + s5.size() + 4);
output->assign(s1.data(), s1.size()).append("\t")
.append(s2.data(), s2.size()).append("\t")
.append(s3.data(), s3.size()).append("\t")
.append(s4.data(), s4.size()).append("\t")
.append(s5.data(), s5.size());
}
// Feature "Left Right"
inline bool GetFeatureLR(const Segments &segments, size_t i,
const string &base_key,
const string &base_value, string *value) {
DCHECK(value);
if (i + 1 >= segments.segments_size() || i <= 0) {
return false;
}
const int j1 = GetDefaultCandidateIndex(segments.segment(i - 1));
const int j2 = GetDefaultCandidateIndex(segments.segment(i + 1));
JoinStringsWithTab5(StringPiece("LR", 2),
base_key,
segments.segment(i - 1).candidate(j1).value,
base_value,
segments.segment(i + 1).candidate(j2).value,
value);
return true;
}
// Feature "Left Left"
inline bool GetFeatureLL(const Segments &segments, size_t i,
const string &base_key,
const string &base_value, string *value) {
DCHECK(value);
if (i < 2) {
return false;
}
const int j1 = GetDefaultCandidateIndex(segments.segment(i - 2));
const int j2 = GetDefaultCandidateIndex(segments.segment(i - 1));
JoinStringsWithTab5(StringPiece("LL", 2),
base_key,
segments.segment(i - 2).candidate(j1).value,
segments.segment(i - 1).candidate(j2).value,
base_value,
value);
return true;
}
// Feature "Right Right"
inline bool GetFeatureRR(const Segments &segments, size_t i,
const string &base_key,
const string &base_value, string *value) {
DCHECK(value);
if (i + 2 >= segments.segments_size()) {
return false;
}
const int j1 = GetDefaultCandidateIndex(segments.segment(i + 1));
const int j2 = GetDefaultCandidateIndex(segments.segment(i + 2));
JoinStringsWithTab5(StringPiece("RR", 2),
base_key,
base_value,
segments.segment(i + 1).candidate(j1).value,
segments.segment(i + 2).candidate(j2).value,
value);
return true;
}
// Feature "Left"
inline bool GetFeatureL(const Segments &segments, size_t i,
const string &base_key,
const string &base_value, string *value) {
DCHECK(value);
if (i < 1) {
return false;
}
const int j = GetDefaultCandidateIndex(segments.segment(i - 1));
JoinStringsWithTab4(StringPiece("L", 1),
base_key,
segments.segment(i - 1).candidate(j).value,
base_value,
value);
return true;
}
// Feature "Right"
inline bool GetFeatureR(const Segments &segments, size_t i,
const string &base_key,
const string &base_value, string *value) {
DCHECK(value);
if (i + 1 >= segments.segments_size()) {
return false;
}
const int j = GetDefaultCandidateIndex(segments.segment(i + 1));
JoinStringsWithTab4(StringPiece("R", 1),
base_key,
base_value,
segments.segment(i + 1).candidate(j).value,
value);
return true;
}
// Feature "Current"
inline bool GetFeatureC(const Segments &segments, size_t i,
const string &base_key,
const string &base_value, string *value) {
DCHECK(value);
JoinStringsWithTab3(StringPiece("C", 1), base_key, base_value, value);
return true;
}
// Feature "Single"
inline bool GetFeatureS(const Segments &segments, size_t i,
const string &base_key,
const string &base_value, string *value) {
DCHECK(value);
if (segments.segments_size() - segments.history_segments_size() != 1) {
return false;
}
JoinStringsWithTab3(StringPiece("S", 1), base_key, base_value, value);
return true;
}
// Feature "Number", used for number rewrite.
// Builds "N\t<type>" in |value|, where <type> is |type| formatted with
// NumberUtil::SimpleItoa().  Always returns true.
inline bool GetFeatureN(uint16 type, string *value) {
  DCHECK(value);
  const string type_text = NumberUtil::SimpleItoa(type);
  JoinStringsWithTab2(StringPiece("N", 1), type_text, value);
  return true;
}
bool IsNumberSegment(const Segment &seg) {
if (seg.key().empty()) {
return false;
}
bool is_number = true;
for (size_t i = 0; i < seg.key().size(); ++i) {
if (!isdigit(static_cast<unsigned char>(seg.key()[i]))) {
is_number = false;
break;
}
}
return is_number;
}
void GetValueByType(const Segment *segment,
NumberUtil::NumberString::Style style,
string *output) {
DCHECK(output);
for (size_t i = 0; i < segment->candidates_size(); ++i) {
if (segment->candidate(i).style == style) {
*output = segment->candidate(i).value;
return;
}
}
return;
}
// NormalizeCandidate using config
void NormalizeCandidate(const Segment *segment, int n,
string *normalized_value) {
const Segment::Candidate &candidate = segment->candidate(n);
// use "AS IS"
if (candidate.attributes & Segment::Candidate::NO_VARIANTS_EXPANSION) {
*normalized_value = candidate.value;
return;
}
string result = candidate.value;
switch (candidate.style) {
case NumberUtil::NumberString::DEFAULT_STYLE:
CharacterFormManager::GetCharacterFormManager()->
ConvertConversionString(candidate.value, &result);
break;
case NumberUtil::NumberString::NUMBER_SEPARATED_ARABIC_HALFWIDTH:
case NumberUtil::NumberString::NUMBER_SEPARATED_ARABIC_FULLWIDTH:
// Convert separated arabic here and don't use character form manager
// so that suppressing mixed form of candidates
// ("1,234" etc.)
// and the forms of separated arabics are learned in converter using
// style.
{
const Config::CharacterForm form =
CharacterFormManager::GetCharacterFormManager()->
GetConversionCharacterForm("0");
if (form == Config::FULL_WIDTH) {
GetValueByType(
segment,
NumberUtil::NumberString::NUMBER_SEPARATED_ARABIC_FULLWIDTH,
&result);
} else if (form == Config::HALF_WIDTH) {
GetValueByType(
segment,
NumberUtil::NumberString::NUMBER_SEPARATED_ARABIC_HALFWIDTH,
&result);
}
}
break;
default:
break;
}
*normalized_value = result;
}
// Finds the candidate of |segment| that has the same value as |candidate|.
// Regular candidates are searched first and their index is stored in
// |position|.  If none matches, meta candidates are searched and the meta
// candidate index, (-index - 1), is stored in |position|.
// Returns false, leaving |position| untouched, when nothing matches.
bool GetSameValueCandidatePosition(const Segment *segment,
                                   const Segment::Candidate *candidate,
                                   int *position) {
  DCHECK(position);
  size_t index = 0;
  for (index = 0; index < segment->candidates_size(); ++index) {
    if (segment->candidate(index).value != candidate->value) {
      continue;
    }
    *position = index;
    return true;
  }
  for (index = 0; index < segment->meta_candidates_size(); ++index) {
    if (segment->meta_candidate(index).value != candidate->value) {
      continue;
    }
    *position = -static_cast<int>(index) - 1;  // meta candidate index
    return true;
  }
  return false;
}
// Returns true when both the lid and the rid of |cand| are 0.  Such a
// candidate is regarded as a transliterated candidate.
bool IsT13NCandidate(const Segment::Candidate &cand) {
  if (cand.lid != 0) {
    return false;
  }
  return cand.rid == 0;
}
} // namespace
bool UserSegmentHistoryRewriter::SortCandidates(
const vector<ScoreType> &sorted_scores, Segment *segment) const {
const uint32 top_score = sorted_scores[0].score;
const size_t size = min(sorted_scores.size(), kMaxRerankSize);
const uint32 kScoreGap = 20; // TODO(taku): no justification
set<string> seen;
size_t next_pos = 0;
for (size_t n = 0; n < size; ++n) {
// Move candidates when the score is close to the top score.
if (kScoreGap < (top_score - sorted_scores[n].score)) {
break;
}
const Segment::Candidate *candidate = sorted_scores[n].candidate;
DCHECK(candidate);
int old_position = 0;
if (!GetSameValueCandidatePosition(segment, candidate, &old_position)) {
LOG(ERROR) << "cannot find the candidate: " << sorted_scores[0].candidate;
return false;
}
// We check character form here. If user prefers "half-width",
// Mozc always provides half-width even when user input
// full-width before.
string normalized_value;
NormalizeCandidate(segment, old_position, &normalized_value);
if (normalized_value != candidate->value) {
const Segment::Candidate *normalized_cand = NULL;
for (size_t l = 0; l < segment->candidates_size(); ++l) {
if (segment->candidate(l).value == normalized_value) {
normalized_cand = &segment->candidate(l);
break;
}
}
if (normalized_cand != NULL) {
if (seen.find(normalized_value) == seen.end()) {
const int pos = segment->indexOf(normalized_cand);
DCHECK(pos != segment->candidates_size());
segment->move_candidate(pos, next_pos);
++next_pos;
seen.insert(normalized_value);
}
} else {
// If default character form is different and
// is not found in the candidates, make a new
// candidate and push it to the top.
Segment::Candidate *new_candidate =
segment->insert_candidate(next_pos);
DCHECK(new_candidate);
*new_candidate = *candidate; // copy candidate
new_candidate->value = normalized_value;
CharacterFormManager::GetCharacterFormManager()->
ConvertConversionString(candidate->content_value,
&(new_candidate->content_value));
// Update description so it matches candidate's current value.
// This fix addresses Bug #3493644.
// (Wrong character width annotation after learning alphabet)
new_candidate->description.clear();
VariantsRewriter::SetDescriptionForCandidate(*pos_matcher_,
new_candidate);
++next_pos;
seen.insert(normalized_value);
}
} else {
if (seen.find(candidate->value) == seen.end()) {
segment->move_candidate(old_position, next_pos);
++next_pos;
seen.insert(candidate->value);
}
}
}
return true;
}
// Creates the rewriter with a fresh LRUStorage and loads the storage file
// with Reload().  |pos_matcher| and |pos_group| are stored as given and are
// dereferenced later by other member functions.
UserSegmentHistoryRewriter::UserSegmentHistoryRewriter(
    const POSMatcher *pos_matcher,
    const PosGroup *pos_group)
    : storage_(new LRUStorage),
      pos_matcher_(pos_matcher),
      pos_group_(pos_group) {
  // Both value types are written to the storage as raw 32-bit values, so a
  // size mismatch is rejected at compile time.
  static_assert(sizeof(uint32) == sizeof(FeatureValue),
                "FeatureValue must be 32-bit int size.");
  static_assert(sizeof(uint32) == sizeof(KeyTriggerValue),
                "KeyTriggerValue must be 32-bit int size.");
  Reload();
}
// Nothing to do explicitly; members are destroyed by their own destructors.
UserSegmentHistoryRewriter::~UserSegmentHistoryRewriter() {}
// Builds a feature key with |func| and, when |func| returns true, stores a
// default-constructed FeatureValue under that key in storage_.
// Insert() is used when |force_insert| is true, TryInsert() otherwise.
// The expansion refers to |segments|, |segment_index|, |feature_key| and
// |storage_| from the scope where the macro is used.
#define INSERT_FEATURE(func, base_key, base_value, force_insert) \
do { \
if (func((segments), segment_index, base_key, base_value, &feature_key)) { \
FeatureValue v; \
DCHECK(v.IsValid()); \
if (force_insert) { \
storage_->Insert(feature_key, reinterpret_cast<const char *>(&v)); \
} else { \
storage_->TryInsert(feature_key, reinterpret_cast<const char *>(&v)); \
} \
} \
} while (0)
// Builds a feature key with |func| and, when |func| returns true, looks the
// key up in storage_.  If a valid FeatureValue is found, |*score| is raised
// to at least |weight| and |*last_access_time| to at least the access time
// reported by the lookup.
// The expansion refers to |segments|, |segment_index|, |feature_key|,
// |last_access_time_result|, |score|, |last_access_time| and |storage_| from
// the scope where the macro is used.
#define FETCH_FEATURE(func, base_key, base_value, weight) \
do { \
if (func(segments, segment_index, base_key, base_value, &feature_key)) { \
const FeatureValue *v = \
reinterpret_cast<const FeatureValue *> \
(storage_->Lookup(feature_key, &last_access_time_result)); \
if (v != NULL && v->IsValid()) { \
*score = max(*score, weight); \
*last_access_time = max(*last_access_time, last_access_time_result); \
} \
} \
} while (0)
bool UserSegmentHistoryRewriter::GetScore(const Segments &segments,
size_t segment_index,
int candidate_index,
uint32 *score,
uint32 *last_access_time) const {
const size_t segments_size = segments.conversion_segments_size();
const Segment::Candidate &top_candidate =
segments.segment(segment_index).candidate(0);
const Segment::Candidate &candidate =
segments.segment(segment_index).candidate(candidate_index);
const string &all_value = candidate.value;
const string &content_value = candidate.content_value;
const string &all_key = segments.segment(segment_index).key();
const string &content_key = candidate.content_key;
// if the segments are resized by user OR
// either top/target candidate has CONTEXT_SENSITIVE flags,
// don't apply UNIGRAM model
const bool context_sensitive =
segments.resized() ||
(candidate.attributes & Segment::Candidate::CONTEXT_SENSITIVE) ||
(segments.segment(segment_index).candidate(0).attributes &
Segment::Candidate::CONTEXT_SENSITIVE);
DCHECK(score);
DCHECK(last_access_time);
*score = 0;
*last_access_time = 0;
// They are used inside FETCH_FEATURE
uint32 last_access_time_result = 0;
string feature_key;
const uint32 trigram_score = (segments_size == 3) ? 180 : 30;
const uint32 bigram_score = (segments_size == 2) ? 60 : 10;
const uint32 bigram_number_score = (segments_size == 2) ? 50 : 8;
const uint32 unigram_score = (segments_size == 1) ? 36 : 6;
const uint32 single_score = (segments_size == 1) ? 90 : 15;
FETCH_FEATURE(GetFeatureLR, all_key, all_value, trigram_score);
FETCH_FEATURE(GetFeatureLL, all_key, all_value, trigram_score);
FETCH_FEATURE(GetFeatureRR, all_key, all_value, trigram_score);
FETCH_FEATURE(GetFeatureL, all_key, all_value, bigram_score);
FETCH_FEATURE(GetFeatureR, all_key, all_value, bigram_score);
FETCH_FEATURE(GetFeatureS, all_key, all_value, single_score);
FETCH_FEATURE(GetFeatureLN, content_key, content_value, bigram_number_score);
FETCH_FEATURE(GetFeatureRN, content_key, content_value, bigram_number_score);
const bool is_replaceable = Replaceable(top_candidate, candidate);
if (!context_sensitive && is_replaceable) {
FETCH_FEATURE(GetFeatureC, all_key, all_value, unigram_score);
}
if (!is_replaceable) {
return (*score > 0);
}
FETCH_FEATURE(GetFeatureLR, content_key, content_value, trigram_score / 2);
FETCH_FEATURE(GetFeatureLL, content_key, content_value, trigram_score / 2);
FETCH_FEATURE(GetFeatureRR, content_key, content_value, trigram_score / 2);
FETCH_FEATURE(GetFeatureL, content_key, content_value, bigram_score / 2);
FETCH_FEATURE(GetFeatureR, content_key, content_value, bigram_score / 2);
FETCH_FEATURE(GetFeatureS, content_key, content_value, single_score / 2);
FETCH_FEATURE(GetFeatureLN, content_key,
content_value, bigram_number_score / 2);
FETCH_FEATURE(GetFeatureRN, content_key,
content_value, bigram_number_score / 2);
if (!context_sensitive) {
FETCH_FEATURE(GetFeatureC, content_key, content_value, unigram_score / 2);
}
return (*score > 0);
}
// Returns true if the |lhs| candidate can be replaced with |rhs|: both must
// have the same functional value, and in addition either their lids belong
// to the same POS group or at least one of them is a transliterated
// candidate (see IsT13NCandidate()).
bool UserSegmentHistoryRewriter::Replaceable(
    const Segment::Candidate &lhs, const Segment::Candidate &rhs) const {
  const bool same_functional_value =
      (lhs.functional_value() == rhs.functional_value());
  const bool same_pos_group =
      (pos_group_->GetPosGroup(lhs.lid) == pos_group_->GetPosGroup(rhs.lid));
  if (!same_functional_value) {
    return false;
  }
  if (same_pos_group) {
    return true;
  }
  return IsT13NCandidate(lhs) || IsT13NCandidate(rhs);
}
void UserSegmentHistoryRewriter::RememberNumberPreference(
const Segment &segment) {
const Segment::Candidate &candidate = segment.candidate(0);
if ((candidate.style ==
NumberUtil::NumberString::NUMBER_SEPARATED_ARABIC_HALFWIDTH) ||
(candidate.style ==
NumberUtil::NumberString::NUMBER_SEPARATED_ARABIC_FULLWIDTH)) {
// in the case of:
// 1. submit "123"
// 2. submit "一二三"
// 3. submit "1、234"
// 4. type "123"
// We want "123", not "一二三"
// So learn default before learning separated
// However, access time is count by second, so
// separated and default is learned at same time
// This problem is solved by workaround on lookup.
string default_feature_key;
GetFeatureN(NumberUtil::NumberString::DEFAULT_STYLE, &default_feature_key);
FeatureValue v;
DCHECK(v.IsValid());
storage_->Insert(default_feature_key, reinterpret_cast<const char *>(&v));
}
string feature_key;
GetFeatureN(candidate.style, &feature_key);
FeatureValue v;
DCHECK(v.IsValid());
// Always insert for numbers
storage_->Insert(feature_key, reinterpret_cast<const char *>(&v));
}
// Learns the top candidate (candidate 0) of segment |segment_index| by
// inserting its feature keys into storage_ with INSERT_FEATURE.
// Nothing is learned when the segment has at most one candidate or when the
// top candidate is a punctuation (see IsPunctuation()).
void UserSegmentHistoryRewriter::RememberFirstCandidate(
const Segments &segments,
size_t segment_index) {
const Segment &seg = segments.segment(segment_index);
if (seg.candidates_size() <= 1) {
return;
}
const Segment::Candidate &candidate = seg.candidate(0);
// http://b/issue?id=3156109
// Do not remember the preference of Punctuations
if (IsPunctuation(seg, candidate)) {
return;
}
// The unigram ("C") feature is not learned when the segments were resized
// or the candidate is CONTEXT_SENSITIVE.
const bool context_sensitive = segments.resized() ||
(candidate.attributes & Segment::Candidate::CONTEXT_SENSITIVE);
const string &all_value = candidate.value;
const string &content_value = candidate.content_value;
const string &all_key = seg.key();
const string &content_key = candidate.content_key;
// Even if the candidate was the top (default) candidate,
// RERANKED will be set when the user changes the ranking.
const bool force_insert =
((candidate.attributes & Segment::Candidate::RERANKED) != 0);
// Compare the POS group and the functional value.
// If "is_replaceable_with_top" is true, it means that the target candidate
// can "SAFELY" be replaced with the top candidate.
const int top_index = GetDefaultCandidateIndex(seg);
const bool is_replaceable_with_top =
((top_index == 0) || Replaceable(seg.candidate(top_index), candidate));
// |feature_key| is used inside INSERT_FEATURE
string feature_key;
// Features built from the whole key/value.
INSERT_FEATURE(GetFeatureLR, all_key, all_value, force_insert);
INSERT_FEATURE(GetFeatureLL, all_key, all_value, force_insert);
INSERT_FEATURE(GetFeatureRR, all_key, all_value, force_insert);
INSERT_FEATURE(GetFeatureL, all_key, all_value, force_insert);
INSERT_FEATURE(GetFeatureR, all_key, all_value, force_insert);
INSERT_FEATURE(GetFeatureLN, all_key, all_value, force_insert);
INSERT_FEATURE(GetFeatureRN, all_key, all_value, force_insert);
INSERT_FEATURE(GetFeatureS, all_key, all_value, force_insert);
if (!context_sensitive && is_replaceable_with_top) {
INSERT_FEATURE(GetFeatureC, all_key, all_value, force_insert);
}
// Save the content key/value features as well, but only when they differ
// from the whole key/value and the candidate is replaceable with the top.
if (all_value != content_value &&
all_key != content_key &&
is_replaceable_with_top) {
INSERT_FEATURE(GetFeatureLR, content_key, content_value, force_insert);
INSERT_FEATURE(GetFeatureLL, content_key, content_value, force_insert);
INSERT_FEATURE(GetFeatureRR, content_key, content_value, force_insert);
INSERT_FEATURE(GetFeatureL, content_key, content_value, force_insert);
INSERT_FEATURE(GetFeatureR, content_key, content_value, force_insert);
INSERT_FEATURE(GetFeatureLN, content_key, content_value, force_insert);
INSERT_FEATURE(GetFeatureRN, content_key, content_value, force_insert);
INSERT_FEATURE(GetFeatureS, content_key, content_value, force_insert);
if (!context_sensitive) {
INSERT_FEATURE(GetFeatureC, content_key, content_value, force_insert);
}
}
// Learn the close bracket when an open bracket is fixed: the close bracket
// key/value reported by Util::IsOpenBracket() are stored as "S" (and "C")
// features.
string close_bracket_key;
string close_bracket_value;
if (Util::IsOpenBracket(content_key, &close_bracket_key) &&
Util::IsOpenBracket(content_value, &close_bracket_value)) {
INSERT_FEATURE(GetFeatureS, close_bracket_key,
close_bracket_value, force_insert);
if (!context_sensitive) {
INSERT_FEATURE(GetFeatureC, close_bracket_key,
close_bracket_value, force_insert);
}
}
}
bool UserSegmentHistoryRewriter::IsAvailable(const Segments &segments) const {
if (GET_CONFIG(incognito_mode)) {
VLOG(2) << "incognito_mode";
return false;
}
if (!segments.user_history_enabled()) {
VLOG(2) << "!user_history_enabled";
return false;
}
if (storage_.get() == NULL) {
VLOG(2) << "storage is NULL";
return false;
}
// check that all segments have candidate
for (size_t i = 0; i < segments.segments_size(); ++i) {
if (segments.segment(i).candidates_size() == 0) {
LOG(ERROR) << "candidate size is 0";
return false;
}
}
return true;
}
void UserSegmentHistoryRewriter::Finish(const ConversionRequest &request,
Segments *segments) {
if (segments->request_type() != Segments::CONVERSION) {
return;
}
if (!IsAvailable(*segments)) {
return;
}
if (GET_CONFIG(history_learning_level) != Config::DEFAULT_HISTORY) {
VLOG(2) << "history_learning_level is not DEFAULT_HISTORY";
return;
}
for (size_t i = segments->history_segments_size();
i < segments->segments_size(); ++i) {
const Segment &segment = segments->segment(i);
if (segment.candidates_size() <= 0 ||
segment.segment_type() != Segment::FIXED_VALUE ||
segment.candidate(0).attributes &
Segment::Candidate::NO_HISTORY_LEARNING) {
continue;
}
if (IsNumberSegment(segment)) {
RememberNumberPreference(segment);
continue;
}
InsertTriggerKey(segment);
RememberFirstCandidate(*segments, i);
}
// update usage stats here
usage_stats::UsageStats::SetInteger("UserSegmentHistoryEntrySize",
static_cast<int>(storage_->used_size()));
}
bool UserSegmentHistoryRewriter::Reload() {
const string filename = ConfigFileStream::GetFileName(kFileName);
if (!storage_->OpenOrCreate(filename.c_str(),
kValueSize, kLRUSize, kSeedValue)) {
LOG(WARNING) << "cannot initialize UserSegmentHistoryRewriter";
storage_.reset(NULL);
return false;
}
const char kFileSuffix[] = ".merge_pending";
const string merge_pending_file = filename + kFileSuffix;
// merge pending file does not always exist.
if (FileUtil::FileExists(merge_pending_file)) {
storage_->Merge(merge_pending_file.c_str());
FileUtil::Unlink(merge_pending_file);
}
return true;
}
bool UserSegmentHistoryRewriter::ShouldRewrite(
const Segment &segment,
size_t *max_candidates_size) const {
if (segment.candidates_size() == 0) {
LOG(ERROR) << "candidate size is 0";
return false;
}
DCHECK(storage_.get());
const KeyTriggerValue *v1 =
reinterpret_cast<const KeyTriggerValue *>
(storage_->Lookup(segment.key()));
const KeyTriggerValue *v2 = NULL;
if (segment.key() != segment.candidate(0).content_key) {
v2 = reinterpret_cast<const KeyTriggerValue *>
(storage_->Lookup(segment.candidate(0).content_key));
}
const size_t v1_size = (v1 == NULL || !v1->IsValid()) ?
0 : v1->candidates_size();
const size_t v2_size = (v2 == NULL || !v2->IsValid()) ?
0 : v2->candidates_size();
*max_candidates_size = max(v1_size, v2_size);
return *max_candidates_size > 0;
}
void UserSegmentHistoryRewriter::InsertTriggerKey(const Segment &segment) {
if (!(segment.candidate(0).attributes & Segment::Candidate::RERANKED)) {
VLOG(2) << "InsertTriggerKey is skipped";
return;
}
DCHECK(storage_.get());
KeyTriggerValue v;
static_assert(sizeof(uint32) == sizeof(v),
"KeyTriggerValue must be 32-bit int size.");
// TODO(taku): saving segment.candidate_size() might be too heavy and
// increases the chance of hash collisions.
v.set_candidates_size(segment.candidates_size());
storage_->Insert(segment.key(), reinterpret_cast<const char *>(&v));
if (segment.key() != segment.candidate(0).content_key) {
storage_->Insert(segment.candidate(0).content_key,
reinterpret_cast<const char *>(&v));
}
string close_bracket_key;
if (Util::IsOpenBracket(segment.key(), &close_bracket_key)) {
storage_->Insert(close_bracket_key,
reinterpret_cast<const char *>(&v));
}
}
// Reranks the candidates of a number segment according to the learned number
// styles.  Every candidate and meta candidate whose "N" feature key is found
// in storage_ gets a fixed score; the scored candidates are sorted with
// ScoreTypeCompare and passed to SortCandidates().
// Returns false when no candidate has a learned style; otherwise returns the
// result of SortCandidates().
bool UserSegmentHistoryRewriter::RewriteNumber(Segment *segment) const {
  const uint32 kLearnedStyleScore = 10;
  vector<ScoreType> scores;

  const size_t total_size =
      segment->candidates_size() + segment->meta_candidates_size();
  for (size_t l = 0; l < total_size; ++l) {
    // Indices past the regular candidates are mapped onto the negative
    // meta candidate indices.
    int j = static_cast<int>(l);
    if (j >= static_cast<int>(segment->candidates_size())) {
      j -= static_cast<int>(total_size);
    }

    string feature_key;
    GetFeatureN(segment->candidate(j).style, &feature_key);
    uint32 last_access_time = 0;
    const FeatureValue *v =
        reinterpret_cast<const FeatureValue *>
        (storage_->Lookup(feature_key, &last_access_time));
    if (v == NULL || !v->IsValid()) {
      continue;
    }

    // Workaround for separated arabic.
    // Because separated arabic and normal number are learned at the
    // same time, make a time gap here so that separated arabic
    // has a higher rank after sorting the scores.
    const bool is_separated_arabic =
        (segment->candidate(j).style ==
         NumberUtil::NumberString::NUMBER_SEPARATED_ARABIC_FULLWIDTH) ||
        (segment->candidate(j).style ==
         NumberUtil::NumberString::NUMBER_SEPARATED_ARABIC_HALFWIDTH);
    if (last_access_time > 0 && !is_separated_arabic) {
      --last_access_time;
    }

    scores.push_back(ScoreType());
    scores.back().score = kLearnedStyleScore;
    scores.back().last_access_time = last_access_time;
    scores.back().candidate = segment->mutable_candidate(j);
  }

  if (scores.empty()) {
    return false;
  }

  stable_sort(scores.begin(), scores.end(), ScoreTypeCompare());
  return SortCandidates(scores, segment);
}
// Reranks the candidates of every non-history segment in |segments|
// according to the learned data.
//
// Does nothing and returns false when IsAvailable() fails or when
// history_learning_level is NO_HISTORY.  Otherwise the current top candidate
// of every segment is first marked with BEST_CANDIDATE.  FIXED_VALUE segments
// and punctuation segments are skipped, number segments are handled by
// RewriteNumber(), and the remaining segments are reranked with GetScore()
// and SortCandidates() when ShouldRewrite() finds a trigger key for them.
// Returns true when at least one SortCandidates()/RewriteNumber() call
// returned true.  |request| is not used.
bool UserSegmentHistoryRewriter::Rewrite(const ConversionRequest &request,
                                         Segments *segments) const {
  if (!IsAvailable(*segments)) {
    return false;
  }

  if (GET_CONFIG(history_learning_level) == Config::NO_HISTORY) {
    VLOG(2) << "history_learning_level is NO_HISTORY";
    return false;
  }

  // set BEST_CANDIDATE marker in advance
  for (size_t i = 0; i < segments->segments_size(); ++i) {
    Segment *segment = segments->mutable_segment(i);
    DCHECK(segment);
    DCHECK_GT(segment->candidates_size(), 0);
    segment->mutable_candidate(0)->attributes |=
        Segment::Candidate::BEST_CANDIDATE;
  }

  bool modified = false;
  for (size_t i = segments->history_segments_size();
       i < segments->segments_size(); ++i) {
    Segment *segment = segments->mutable_segment(i);
    DCHECK(segment);
    DCHECK_GT(segment->candidates_size(), 0);

    if (segment->segment_type() == Segment::FIXED_VALUE) {
      continue;
    }

    if (IsPunctuation(*segment, segment->candidate(0))) {
      continue;
    }

    if (IsNumberSegment(*segment)) {
      modified |= RewriteNumber(segment);
      continue;
    }

    size_t max_candidates_size = 0;
    if (!ShouldRewrite(*segment, &max_candidates_size)) {
      continue;
    }

    DVLOG_IF(2, (segment->candidates_size() < max_candidates_size))
        << "Cannot expand candidates. ignored. Rewrite may be failed";

    // for each all candidates expanded
    vector<ScoreType> scores;
    for (size_t l = 0;
         l < segment->candidates_size() + segment->meta_candidates_size();
         ++l) {
      int j = static_cast<int>(l);
      if (j >= static_cast<int>(segment->candidates_size())) {
        // Map the tail of the range onto the meta candidate indices
        // [-meta_candidates_size(), -1].  The offset must use the same
        // meta candidate count as the loop bound; otherwise the index falls
        // outside the meta candidates whenever the two counts differ.
        j -= static_cast<int>(segment->candidates_size() +
                              segment->meta_candidates_size());
      }

      uint32 score = 0;
      uint32 last_access_time = 0;
      if (GetScore(*segments, i, j, &score, &last_access_time)) {
        scores.push_back(ScoreType());
        scores.back().score = score;
        scores.back().last_access_time = last_access_time;
        scores.back().candidate = segment->mutable_candidate(j);
      }
    }

    if (scores.empty()) {
      continue;
    }

    stable_sort(scores.begin(), scores.end(), ScoreTypeCompare());
    modified |= SortCandidates(scores, segment);
  }
  return modified;
}
// Clears the learned data.  Does nothing when the storage does not exist.
void UserSegmentHistoryRewriter::Clear() {
  if (storage_.get() == NULL) {
    return;
  }
  VLOG(1) << "Clearing user segment data";
  storage_->Clear();
}
// Returns true when |candidate| of |seg| is a punctuation: its lid is a
// Japanese punctuation POS, its lid equals its rid, and both the segment key
// and the candidate value pass IsPunctuationInternal().
bool UserSegmentHistoryRewriter::IsPunctuation(
    const Segment &seg,
    const Segment::Candidate &candidate) const {
  if (!pos_matcher_->IsJapanesePunctuations(candidate.lid)) {
    return false;
  }
  if (candidate.lid != candidate.rid) {
    return false;
  }
  if (!IsPunctuationInternal(seg.key())) {
    return false;
  }
  return IsPunctuationInternal(candidate.value);
}
// Feature "Left Number"
bool UserSegmentHistoryRewriter::GetFeatureLN(const Segments &segments,
size_t i,
const string &base_key,
const string &base_value,
string *value) const {
DCHECK(value);
if (i < 1) {
return false;
}
const int j = GetDefaultCandidateIndex(segments.segment(i - 1));
const Segment::Candidate &candidate = segments.segment(i - 1).candidate(j);
if (pos_matcher_->IsNumber(candidate.rid) ||
pos_matcher_->IsKanjiNumber(candidate.rid) ||
Util::GetScriptType(candidate.value) == Util::NUMBER) {
JoinStringsWithTab3(StringPiece("LN", 2), base_key, base_value, value);
return true;
}
return false;
}
// Feature "Right Number"
bool UserSegmentHistoryRewriter::GetFeatureRN(const Segments &segments,
size_t i,
const string &base_key,
const string &base_value,
string *value) const {
DCHECK(value);
if (i + 1 >= segments.segments_size()) {
return false;
}
const int j = GetDefaultCandidateIndex(segments.segment(i + 1));
const Segment::Candidate &candidate = segments.segment(i + 1).candidate(j);
if (pos_matcher_->IsNumber(candidate.lid) ||
pos_matcher_->IsKanjiNumber(candidate.lid) ||
Util::GetScriptType(candidate.value) == Util::NUMBER) {
JoinStringsWithTab3(StringPiece("RN", 2), base_key, base_value, value);
return true;
}
return false;
}
} // namespace mozc