blob: 20582875c522eed5635e08f736d00c9d14e44f72 [file] [log] [blame]
// Copyright 2010-2015, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// This class is used to filter out generated candidate by its
// cost, structure and relation with previously added candidates.
#include "converter/candidate_filter.h"
#include <algorithm>
#include <climits>
#include <string>
#include <vector>
#include "base/logging.h"
#include "base/port.h"
#include "base/util.h"
#include "converter/node.h"
#include "converter/segments.h"
#include "dictionary/pos_matcher.h"
#include "dictionary/suppression_dictionary.h"
#include "prediction/suggestion_filter.h"
namespace mozc {
namespace converter {
namespace {
const size_t kMaxCandidatesSize = 200; // how many candidates we expand
// Currently, the cost (logprob) is calcurated as cost = -500 * log(prob).
// Suppose having two candidates A and B and prob(A) = C * prob(B), where
// C = 1000 (some constant variable). The word "A" appears 1000 times more
// frequently than B.
// In this case,
// cost(B) - cost(A) = -500 * [log(prob(B)) - log (C * prob(B)) ]
// = -500 * [-log(C) + log(prob(B)) - log(prob(B))]
// = 500 * log(C)
// This implies that it is more reasonable to filter candiates
// by an absolute difference of costs between cost(B) and cost(A).
//
// Here's "C" and cost-diff relation:
// C cost diff: 500 * log(C)
// 10 1151.29
// 100 2302.58
// 1000 3453.87
// 10000 4605.17
// 100000 5756.46
// 1000000 6907.75
const int kMinCost = 100;
const int kCostOffset = 6907;
const int kStructureCostOffset = 3453;
const int kMinStructureCostOffset = 1151;
const int32 kStopEnmerationCacheSize = 15;
} // anonymous namespace
CandidateFilter::CandidateFilter(
const SuppressionDictionary *suppression_dictionary,
const POSMatcher *pos_matcher,
const SuggestionFilter *suggestion_filter)
: suppression_dictionary_(suppression_dictionary),
pos_matcher_(pos_matcher),
suggestion_filter_(suggestion_filter),
top_candidate_(NULL) {
CHECK(suppression_dictionary_);
CHECK(pos_matcher_);
CHECK(suggestion_filter_);
}
CandidateFilter::~CandidateFilter() {}
void CandidateFilter::Reset() {
seen_.clear();
top_candidate_ = NULL;
}
CandidateFilter::ResultType CandidateFilter::FilterCandidateInternal(
const string &original_key,
const Segment::Candidate *candidate,
const vector<const Node *> &nodes,
Segments::RequestType request_type) {
DCHECK(candidate);
// Filtering by the suggestion filter, which is applied only for the
// PREDICTION and SUGGESTION modes.
switch (request_type) {
case Segments::PREDICTION:
// In the PREDICTION mode, the suggestion filter is not applied and the
// same filtering rule as the CONVERSION mode is used because the
// PREDICTION is triggered by user action (hitting tab keys), i.e.,
// prediction candidates are not automatically shown to users. On the
// contrary, since a user hit tab keys to run prediction, even unfavorable
// words might be what the user wants to type. Therefore, filtering rule
// is relaxed for the PREDICTION mode: we don't apply the suggestion
// filter if the user input key is exactly the same as candidate's.
if (original_key == candidate->key) {
break;
}
FALLTHROUGH_INTENDED;
case Segments::SUGGESTION:
// In contrast to the PREDICTION mode, the SUGGESTION is triggered without
// any user actions, i.e., suggestion candidates are automatically
// displayed to users. Therefore, it's better to filter unfavorable words
// in this mode.
CHECK(suggestion_filter_);
if (suggestion_filter_->IsBadSuggestion(candidate->value)) {
return BAD_CANDIDATE;
}
// TODO(noriyukit): In the implementation below, the possibility remains
// that multiple nodes constitute bad candidates. For stronger filtering,
// we may want to check all the possibilities.
for (size_t i = 0; i < nodes.size(); ++i) {
if (suggestion_filter_->IsBadSuggestion(nodes[i]->value)) {
return BAD_CANDIDATE;
}
}
break;
default:
break;
}
// In general, the cost of constrained node tends to be overestimated.
// If the top candidate has constrained node, we skip the main body
// of CandidateFilter, meaning that the node is not treated as the top
// node for CandidateFilter.
if (candidate->attributes & Segment::Candidate::CONTEXT_SENSITIVE) {
return CandidateFilter::GOOD_CANDIDATE;
}
const size_t candidate_size = seen_.size();
if (top_candidate_ == NULL || candidate_size == 0) {
top_candidate_ = candidate;
}
CHECK(top_candidate_);
// Don't remove duplications if USER_DICTIONARY.
if (candidate->attributes & Segment::Candidate::USER_DICTIONARY) {
return CandidateFilter::GOOD_CANDIDATE;
}
// too many candidates size
if (candidate_size + 1 >= kMaxCandidatesSize) {
return CandidateFilter::STOP_ENUMERATION;
}
// The candidate is already seen.
if (seen_.find(candidate->value) != seen_.end()) {
return CandidateFilter::BAD_CANDIDATE;
}
CHECK(!nodes.empty());
// "短縮よみ" must only have 1 node.
if (pos_matcher_->IsIsolatedWord(nodes[0]->lid) &&
(nodes.size() > 1 ||
nodes[0]->prev == NULL ||
nodes[0]->prev->node_type == Node::NOR_NODE ||
nodes[0]->prev->node_type == Node::CON_NODE ||
nodes[0]->next == NULL ||
nodes[0]->next->node_type == Node::NOR_NODE ||
nodes[0]->next->node_type == Node::CON_NODE)) {
return CandidateFilter::BAD_CANDIDATE;
}
// Remove "抑制単語" just in case.
if (suppression_dictionary_->SuppressEntry(candidate->key,
candidate->value) ||
(candidate->key != candidate->content_key &&
candidate->value != candidate->content_value &&
suppression_dictionary_->SuppressEntry(candidate->content_key,
candidate->content_value))) {
return CandidateFilter::BAD_CANDIDATE;
}
// Suppress "書います", "買いて"
if (Util::GetScriptType(nodes[0]->value) != Util::HIRAGANA &&
((nodes.size() >= 2 &&
pos_matcher_->IsKagyoTaConnectionVerb(nodes[0]->rid) &&
pos_matcher_->IsMasuSuffix(nodes[1]->lid)) ||
(nodes[0]->lid != nodes[0]->rid &&
pos_matcher_->IsKagyoTaConnectionVerb(nodes[0]->lid) &&
pos_matcher_->IsMasuSuffix(nodes[0]->rid)) ||
(nodes.size() >= 2 &&
pos_matcher_->IsWagyoRenyoConnectionVerb(nodes[0]->rid) &&
pos_matcher_->IsTeSuffix(nodes[1]->lid)) ||
(nodes[0]->lid != nodes[0]->rid &&
pos_matcher_->IsWagyoRenyoConnectionVerb(nodes[0]->lid) &&
pos_matcher_->IsTeSuffix(nodes[0]->rid)))) {
return CandidateFilter::BAD_CANDIDATE;
}
// The candidate consists of only one token
if (nodes.size() == 1) {
VLOG(1) << "don't filter single segment";
return CandidateFilter::GOOD_CANDIDATE;
}
// don't drop single character
if (Util::CharsLen(candidate->value) == 1) {
VLOG(1) << "don't filter single character";
return CandidateFilter::GOOD_CANDIDATE;
}
// don't drop lid/rid are the same as those
// of top candidate.
// http://b/issue?id=4285213
if (top_candidate_->structure_cost == 0 &&
candidate->lid == top_candidate_->lid &&
candidate->rid == top_candidate_->rid) {
VLOG(1) << "don't filter lid/rid are the same";
return CandidateFilter::GOOD_CANDIDATE;
}
// "好かっ|たり" vs "良かっ|たり" have same non_content_value.
// "良かっ|たり" is also a good candidate but it is not the top candidate.
if (top_candidate_ != candidate &&
top_candidate_->content_value != top_candidate_->value &&
(top_candidate_->value.compare(
top_candidate_->content_value.size(),
top_candidate_->value.size() - top_candidate_->content_value.size(),
candidate->value,
candidate->content_value.size(),
candidate->value.size() - candidate->content_value.size()) == 0)) {
VLOG(1) << "don't filter if non-content value are the same";
return CandidateFilter::GOOD_CANDIDATE;
}
// Check Katakana transliterations
// Skip this check when the conversion mode is real-time;
// otherwise this ruins the whole sentence
// that starts with alphabets.
if (!(candidate->attributes & Segment::Candidate::REALTIME_CONVERSION)) {
const bool is_top_english_t13n =
Util::IsEnglishTransliteration(nodes[0]->value);
for (size_t i = 1; i < nodes.size(); ++i) {
// EnglishT13N must be the prefix of the candidate.
if (Util::IsEnglishTransliteration(nodes[i]->value)) {
return CandidateFilter::BAD_CANDIDATE;
}
// nodes[1..] are non-functional candidates.
// In other words, the node just after KatakanaT13n candidate should
// be a functional word.
if (is_top_english_t13n && !pos_matcher_->IsFunctional(nodes[i]->lid)) {
return CandidateFilter::BAD_CANDIDATE;
}
}
}
const int top_cost = max(kMinCost, top_candidate_->cost);
const int top_structure_cost = max(kMinCost, top_candidate_->structure_cost);
// If candidate size < 3, don't filter candidate aggressively
// TOOD(taku): This is a tentative workaround for the case where
// TOP candidate is compound and the structure cost for it is "0"
// If 2nd or 3rd candidates are regular candidate but not having
// non-zero cost, they might be removed. This hack removes such case.
if (candidate_size < 3 &&
candidate->cost < top_cost + 2302 &&
candidate->structure_cost < 6907) {
return CandidateFilter::GOOD_CANDIDATE;
}
// if candidate starts with prefix "お", we'd like to demote
// the candidate if the rank of the candidate is below 1 (0-origin).
// This is a temporal workaround for fixing "おそう" => "御|総"
// TODO(taku): remove it after intorducing a word clustering for noun.
if (candidate_size >= 1 && nodes.size() > 1 &&
nodes[0]->lid == nodes[0]->rid &&
pos_matcher_->IsWeakCompoundPrefix(nodes[0]->lid)) {
VLOG(2) << "removing noisy prefix pattern";
return CandidateFilter::BAD_CANDIDATE;
}
// Don't drop personal names aggressivly.
// We have to show personal names even if they are too minor.
// We basically ignores the cost threadshould. Filter candidate
// only by structure cost.
int cost_offset = kCostOffset;
if (candidate->lid == pos_matcher_->GetLastNameId() ||
candidate->lid == pos_matcher_->GetFirstNameId()) {
cost_offset = INT_MAX - top_cost;
}
// Filters out candidates with higher cost.
if (top_cost + cost_offset < candidate->cost &&
top_structure_cost + kMinStructureCostOffset
< candidate->structure_cost) {
// Stops candidates enumeration when we see sufficiently high cost
// candidate.
VLOG(2) << "cost is invalid: "
<< "top_cost=" << top_cost
<< " cost_offset=" << cost_offset
<< " value=" << candidate->value
<< " cost=" << candidate->cost
<< " top_structure_cost=" << top_structure_cost
<< " structure_cost=" << candidate->structure_cost
<< " lid=" << candidate->lid
<< " rid=" << candidate->rid;
if (candidate_size < kStopEnmerationCacheSize) {
// Even when the current candidate is classified as bad candidate,
// we don't return STOP_ENUMERATION here.
// When the current candidate is removed only with the "structure_cost",
// there might exist valid candidates just after the current candidate.
// We don't want to miss them.
return CandidateFilter::BAD_CANDIDATE;
} else {
return CandidateFilter::STOP_ENUMERATION;
}
}
// Filters out candidates with higher cost structure.
if (max(top_structure_cost, kMinStructureCostOffset) + kStructureCostOffset <
candidate->structure_cost) {
// We don't stop enumeration here. Just drops high cost structure
// looks enough.
// |top_structure_cost| can be so small especially for compound or
// web dictionary entries.
// For avoiding over filtering, we use kMinStructureCostOffset if
// |top_structure_cost| is small.
VLOG(2) << "structure cost is invalid: "
<< candidate->value << " " << candidate->content_value << " "
<< candidate->structure_cost
<< " " << candidate->cost;
return CandidateFilter::BAD_CANDIDATE;
}
return CandidateFilter::GOOD_CANDIDATE;
}
CandidateFilter::ResultType CandidateFilter::FilterCandidate(
const string &original_key,
const Segment::Candidate *candidate,
const vector<const Node *> &nodes,
Segments::RequestType request_type) {
if (request_type == Segments::REVERSE_CONVERSION) {
// In reverse conversion, only remove duplicates because the filtering
// criteria of FilterCandidateInternal() are completely designed for
// (forward) conversion.
const bool inserted = seen_.insert(candidate->value).second;
return inserted ? GOOD_CANDIDATE : BAD_CANDIDATE;
} else {
const ResultType result = FilterCandidateInternal(original_key, candidate,
nodes, request_type);
if (result != GOOD_CANDIDATE) {
return result;
}
seen_.insert(candidate->value);
return result;
}
}
} // namespace converter
} // namespace mozc