| // Copyright 2010-2015, Google Inc. |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
// This class is used to filter out generated candidates by their
// cost, structure and relation with previously added candidates.
| |
| #include "converter/candidate_filter.h" |
| |
| #include <algorithm> |
| #include <climits> |
| #include <string> |
| #include <vector> |
| |
| #include "base/logging.h" |
| #include "base/port.h" |
| #include "base/util.h" |
| #include "converter/node.h" |
| #include "converter/segments.h" |
| #include "dictionary/pos_matcher.h" |
| #include "dictionary/suppression_dictionary.h" |
| #include "prediction/suggestion_filter.h" |
| |
| using mozc::dictionary::POSMatcher; |
| using mozc::dictionary::SuppressionDictionary; |
| |
| namespace mozc { |
| namespace converter { |
namespace {

// Hard cap on how many candidates we expand for one segment.
const size_t kMaxCandidatesSize = 200; // how many candidates we expand

// Currently, the cost (logprob) is calculated as cost = -500 * log(prob).
// Suppose having two candidates A and B and prob(A) = C * prob(B), where
// C = 1000 (some constant variable). The word "A" appears 1000 times more
// frequently than B.
// In this case,
// cost(B) - cost(A) = -500 * [log(prob(B)) - log (C * prob(B)) ]
// = -500 * [-log(C) + log(prob(B)) - log(prob(B))]
// = 500 * log(C)
// This implies that it is more reasonable to filter candidates
// by an absolute difference of costs between cost(B) and cost(A).
//
// Here's "C" and cost-diff relation:
// C cost diff: 500 * log(C)
// 10 1151.29
// 100 2302.58
// 1000 3453.87
// 10000 4605.17
// 100000 5756.46
// 1000000 6907.75
// Floor applied to the top candidate's cost/structure_cost so the
// thresholds below never degenerate when the top cost is tiny or zero.
const int kMinCost = 100;
// Cost-difference threshold, roughly C = 1,000,000 in the table above.
const int kCostOffset = 6907;
// Structure-cost threshold, roughly C = 1,000.
const int kStructureCostOffset = 3453;
// Minimum structure-cost threshold, roughly C = 10.
const int kMinStructureCostOffset = 1151;
// While fewer than this many candidates have been seen, an over-cost
// candidate is dropped as BAD_CANDIDATE instead of stopping enumeration.
// NOTE(review): "Enmeration" is a typo for "Enumeration"; the name is kept
// as-is because it is referenced elsewhere in this file.
const int32 kStopEnmerationCacheSize = 15;

} // anonymous namespace
| |
// Constructs a filter from its three dependencies. All pointers are
// externally owned and must outlive this instance; each must be non-NULL
// (enforced by CHECK, which aborts the process on violation).
CandidateFilter::CandidateFilter(
    const SuppressionDictionary *suppression_dictionary,
    const POSMatcher *pos_matcher,
    const SuggestionFilter *suggestion_filter)
    : suppression_dictionary_(suppression_dictionary),
      pos_matcher_(pos_matcher),
      suggestion_filter_(suggestion_filter),
      top_candidate_(NULL) {
  CHECK(suppression_dictionary_);
  CHECK(pos_matcher_);
  CHECK(suggestion_filter_);
}
| |
// The dictionaries and matcher are not owned, so there is nothing to free.
CandidateFilter::~CandidateFilter() {}
| |
| void CandidateFilter::Reset() { |
| seen_.clear(); |
| top_candidate_ = NULL; |
| } |
| |
// Core filtering routine shared by all forward-conversion request types.
//
// Returns:
//   GOOD_CANDIDATE   - |candidate| should be kept.
//   BAD_CANDIDATE    - |candidate| should be dropped, but enumeration of
//                      further candidates may continue.
//   STOP_ENUMERATION - no further candidates should be generated.
//
// Note: this method does NOT record |candidate->value| into |seen_|;
// FilterCandidate() does that only after a GOOD_CANDIDATE result.
CandidateFilter::ResultType CandidateFilter::FilterCandidateInternal(
    const string &original_key,
    const Segment::Candidate *candidate,
    const vector<const Node *> &nodes,
    Segments::RequestType request_type) {
  DCHECK(candidate);

  // Filtering by the suggestion filter, which is applied only for the
  // PREDICTION and SUGGESTION modes.
  switch (request_type) {
    case Segments::PREDICTION:
      // In the PREDICTION mode, the suggestion filter is not applied and the
      // same filtering rule as the CONVERSION mode is used because the
      // PREDICTION is triggered by user action (hitting tab keys), i.e.,
      // prediction candidates are not automatically shown to users. On the
      // contrary, since a user hit tab keys to run prediction, even unfavorable
      // words might be what the user wants to type. Therefore, filtering rule
      // is relaxed for the PREDICTION mode: we don't apply the suggestion
      // filter if the user input key is exactly the same as candidate's.
      if (original_key == candidate->key) {
        break;
      }
      FALLTHROUGH_INTENDED;
    case Segments::SUGGESTION:
      // In contrast to the PREDICTION mode, the SUGGESTION is triggered without
      // any user actions, i.e., suggestion candidates are automatically
      // displayed to users. Therefore, it's better to filter unfavorable words
      // in this mode.
      CHECK(suggestion_filter_);
      if (suggestion_filter_->IsBadSuggestion(candidate->value)) {
        return BAD_CANDIDATE;
      }
      // TODO(noriyukit): In the implementation below, the possibility remains
      // that multiple nodes constitute bad candidates. For stronger filtering,
      // we may want to check all the possibilities.
      for (size_t i = 0; i < nodes.size(); ++i) {
        if (suggestion_filter_->IsBadSuggestion(nodes[i]->value)) {
          return BAD_CANDIDATE;
        }
      }
      break;
    default:
      break;
  }

  // In general, the cost of constrained node tends to be overestimated.
  // If the top candidate has constrained node, we skip the main body
  // of CandidateFilter, meaning that the node is not treated as the top
  // node for CandidateFilter.
  if (candidate->attributes & Segment::Candidate::CONTEXT_SENSITIVE) {
    return CandidateFilter::GOOD_CANDIDATE;
  }

  // |seen_| only grows on accepted candidates, so its size is the number
  // of candidates accepted so far (i.e., this candidate's rank, 0-origin).
  const size_t candidate_size = seen_.size();
  // The first candidate that reaches this point becomes the baseline
  // ("top") candidate against which all cost thresholds are computed.
  if (top_candidate_ == NULL || candidate_size == 0) {
    top_candidate_ = candidate;
  }

  CHECK(top_candidate_);

  // Don't remove duplications if USER_DICTIONARY.
  if (candidate->attributes & Segment::Candidate::USER_DICTIONARY) {
    return CandidateFilter::GOOD_CANDIDATE;
  }

  // too many candidates size
  if (candidate_size + 1 >= kMaxCandidatesSize) {
    return CandidateFilter::STOP_ENUMERATION;
  }

  // The candidate is already seen.
  if (seen_.find(candidate->value) != seen_.end()) {
    return CandidateFilter::BAD_CANDIDATE;
  }

  CHECK(!nodes.empty());

  // "短縮よみ" (isolated/abbreviation words) must only have 1 node, and it
  // must be surrounded by BOS/EOS-like boundaries: reject multi-node
  // candidates and candidates whose neighboring nodes are normal or
  // constrained nodes.
  if (pos_matcher_->IsIsolatedWord(nodes[0]->lid) &&
      (nodes.size() > 1 ||
       nodes[0]->prev == NULL ||
       nodes[0]->prev->node_type == Node::NOR_NODE ||
       nodes[0]->prev->node_type == Node::CON_NODE ||
       nodes[0]->next == NULL ||
       nodes[0]->next->node_type == Node::NOR_NODE ||
       nodes[0]->next->node_type == Node::CON_NODE)) {
    return CandidateFilter::BAD_CANDIDATE;
  }

  // Remove "抑制単語" (suppression-dictionary entries) just in case.
  // Checked both for the full key/value and for the content key/value when
  // they differ (i.e., when the candidate has functional suffixes).
  if (suppression_dictionary_->SuppressEntry(candidate->key,
                                             candidate->value) ||
      (candidate->key != candidate->content_key &&
       candidate->value != candidate->content_value &&
       suppression_dictionary_->SuppressEntry(candidate->content_key,
                                              candidate->content_value))) {
    return CandidateFilter::BAD_CANDIDATE;
  }

  // Suppress "書います", "買いて" — ungrammatical verb-suffix combinations:
  // a ka-gyo ta-connection verb followed by a "ます" suffix, or a wa-gyo
  // renyo-connection verb followed by a "て" suffix, either across two
  // nodes or fused within the first node (lid != rid).
  if (Util::GetScriptType(nodes[0]->value) != Util::HIRAGANA &&
      ((nodes.size() >= 2 &&
        pos_matcher_->IsKagyoTaConnectionVerb(nodes[0]->rid) &&
        pos_matcher_->IsMasuSuffix(nodes[1]->lid)) ||
       (nodes[0]->lid != nodes[0]->rid &&
        pos_matcher_->IsKagyoTaConnectionVerb(nodes[0]->lid) &&
        pos_matcher_->IsMasuSuffix(nodes[0]->rid)) ||
       (nodes.size() >= 2 &&
        pos_matcher_->IsWagyoRenyoConnectionVerb(nodes[0]->rid) &&
        pos_matcher_->IsTeSuffix(nodes[1]->lid)) ||
       (nodes[0]->lid != nodes[0]->rid &&
        pos_matcher_->IsWagyoRenyoConnectionVerb(nodes[0]->lid) &&
        pos_matcher_->IsTeSuffix(nodes[0]->rid)))) {
    return CandidateFilter::BAD_CANDIDATE;
  }

  // The candidate consists of only one token
  if (nodes.size() == 1) {
    VLOG(1) << "don't filter single segment";
    return CandidateFilter::GOOD_CANDIDATE;
  }

  // don't drop single character
  if (Util::CharsLen(candidate->value) == 1) {
    VLOG(1) << "don't filter single character";
    return CandidateFilter::GOOD_CANDIDATE;
  }

  // don't drop lid/rid are the same as those
  // of top candidate.
  // http://b/issue?id=4285213
  if (top_candidate_->structure_cost == 0 &&
      candidate->lid == top_candidate_->lid &&
      candidate->rid == top_candidate_->rid) {
    VLOG(1) << "don't filter lid/rid are the same";
    return CandidateFilter::GOOD_CANDIDATE;
  }

  // "好かっ|たり" vs "良かっ|たり" have same non_content_value.
  // "良かっ|たり" is also a good candidate but it is not the top candidate.
  // Compare the functional suffix (value minus the content_value prefix) of
  // the top candidate against that of this candidate.
  // NOTE(review): this arithmetic assumes content_value is a prefix of
  // value (otherwise string::compare could throw out_of_range); confirm
  // that invariant holds for all candidate producers.
  if (top_candidate_ != candidate &&
      top_candidate_->content_value != top_candidate_->value &&
      (top_candidate_->value.compare(
          top_candidate_->content_value.size(),
          top_candidate_->value.size() - top_candidate_->content_value.size(),
          candidate->value,
          candidate->content_value.size(),
          candidate->value.size() - candidate->content_value.size()) == 0)) {
    VLOG(1) << "don't filter if non-content value are the same";
    return CandidateFilter::GOOD_CANDIDATE;
  }

  // Check Katakana transliterations
  // Skip this check when the conversion mode is real-time;
  // otherwise this ruins the whole sentence
  // that starts with alphabets.
  if (!(candidate->attributes & Segment::Candidate::REALTIME_CONVERSION)) {
    const bool is_top_english_t13n =
        Util::IsEnglishTransliteration(nodes[0]->value);
    for (size_t i = 1; i < nodes.size(); ++i) {
      // EnglishT13N must be the prefix of the candidate.
      if (Util::IsEnglishTransliteration(nodes[i]->value)) {
        return CandidateFilter::BAD_CANDIDATE;
      }
      // nodes[1..] are non-functional candidates.
      // In other words, the node just after KatakanaT13n candidate should
      // be a functional word.
      if (is_top_english_t13n && !pos_matcher_->IsFunctional(nodes[i]->lid)) {
        return CandidateFilter::BAD_CANDIDATE;
      }
    }
  }

  // Baselines for the cost thresholds below; kMinCost keeps them from
  // degenerating when the top candidate's costs are tiny.
  const int top_cost = max(kMinCost, top_candidate_->cost);
  const int top_structure_cost = max(kMinCost, top_candidate_->structure_cost);

  // If candidate size < 3, don't filter candidate aggressively
  // TODO(taku): This is a tentative workaround for the case where
  // TOP candidate is compound and the structure cost for it is "0"
  // If 2nd or 3rd candidates are regular candidate but not having
  // non-zero cost, they might be removed. This hack removes such case.
  if (candidate_size < 3 &&
      candidate->cost < top_cost + 2302 &&
      candidate->structure_cost < 6907) {
    return CandidateFilter::GOOD_CANDIDATE;
  }

  // if candidate starts with prefix "お", we'd like to demote
  // the candidate if the rank of the candidate is below 1 (0-origin).
  // This is a temporary workaround for fixing "おそう" => "御|総"
  // TODO(taku): remove it after introducing a word clustering for noun.
  if (candidate_size >= 1 && nodes.size() > 1 &&
      nodes[0]->lid == nodes[0]->rid &&
      pos_matcher_->IsWeakCompoundPrefix(nodes[0]->lid)) {
    VLOG(2) << "removing noisy prefix pattern";
    return CandidateFilter::BAD_CANDIDATE;
  }

  // Don't drop personal names aggressively.
  // We have to show personal names even if they are too minor.
  // We basically ignore the cost threshold. Filter candidate
  // only by structure cost.
  int cost_offset = kCostOffset;
  if (candidate->lid == pos_matcher_->GetLastNameId() ||
      candidate->lid == pos_matcher_->GetFirstNameId()) {
    // INT_MAX - top_cost makes the cost check below effectively always pass.
    cost_offset = INT_MAX - top_cost;
  }

  // Filters out candidates with higher cost.
  if (top_cost + cost_offset < candidate->cost &&
      top_structure_cost + kMinStructureCostOffset
      < candidate->structure_cost) {
    // Stops candidates enumeration when we see sufficiently high cost
    // candidate.
    VLOG(2) << "cost is invalid: "
            << "top_cost=" << top_cost
            << " cost_offset=" << cost_offset
            << " value=" << candidate->value
            << " cost=" << candidate->cost
            << " top_structure_cost=" << top_structure_cost
            << " structure_cost=" << candidate->structure_cost
            << " lid=" << candidate->lid
            << " rid=" << candidate->rid;
    if (candidate_size < kStopEnmerationCacheSize) {
      // Even when the current candidate is classified as bad candidate,
      // we don't return STOP_ENUMERATION here.
      // When the current candidate is removed only with the "structure_cost",
      // there might exist valid candidates just after the current candidate.
      // We don't want to miss them.
      return CandidateFilter::BAD_CANDIDATE;
    } else {
      return CandidateFilter::STOP_ENUMERATION;
    }
  }

  // Filters out candidates with higher cost structure.
  if (max(top_structure_cost, kMinStructureCostOffset) + kStructureCostOffset <
      candidate->structure_cost) {
    // We don't stop enumeration here. Just drops high cost structure
    // looks enough.
    // |top_structure_cost| can be so small especially for compound or
    // web dictionary entries.
    // For avoiding over filtering, we use kMinStructureCostOffset if
    // |top_structure_cost| is small.
    VLOG(2) << "structure cost is invalid: "
            << candidate->value << " " << candidate->content_value << " "
            << candidate->structure_cost
            << " " << candidate->cost;
    return CandidateFilter::BAD_CANDIDATE;
  }

  // Passed every heuristic: keep the candidate.
  return CandidateFilter::GOOD_CANDIDATE;
}
| |
| CandidateFilter::ResultType CandidateFilter::FilterCandidate( |
| const string &original_key, |
| const Segment::Candidate *candidate, |
| const vector<const Node *> &nodes, |
| Segments::RequestType request_type) { |
| if (request_type == Segments::REVERSE_CONVERSION) { |
| // In reverse conversion, only remove duplicates because the filtering |
| // criteria of FilterCandidateInternal() are completely designed for |
| // (forward) conversion. |
| const bool inserted = seen_.insert(candidate->value).second; |
| return inserted ? GOOD_CANDIDATE : BAD_CANDIDATE; |
| } else { |
| const ResultType result = FilterCandidateInternal(original_key, candidate, |
| nodes, request_type); |
| if (result != GOOD_CANDIDATE) { |
| return result; |
| } |
| seen_.insert(candidate->value); |
| return result; |
| } |
| } |
| |
| } // namespace converter |
| } // namespace mozc |