| // Copyright 2010, Google Inc. |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| // This class is used to filter out generated candidate by its |
| // cost, structure and relation with previously added candidates. |
| |
| #include "converter/candidate_filter.h" |
| |
| #include <limits.h> |
| #include <map> |
| #include <set> |
| #include <utility> |
| |
| #include "base/util.h" |
| #include "base/singleton.h" |
| #include "converter/node.h" |
| #include "converter/pos.h" |
| #include "converter/segments.h" |
| |
| namespace mozc { |
| |
| namespace { |
| const size_t kMaxCandidatesSize = 200; // how many candidates we expand |
| |
| // Currently, the cost (logprob) is calcurated as cost = -500 * log(prob). |
| // Suppose having two candidates A and B and prob(A) = C * prob(B), where |
| // C = 1000 (some constant variable). The word "A" appears 1000 times more |
| // frequently than B. |
| // In this case, |
| // cost(B) - cost(A) = -500 * [log(prob(B)) - log (C * prob(B)) ] |
| // = -500 * [-log(C) + log(prob(B)) - log(prob(B))] |
| // = 500 * log(C) |
| // This implies that it is more reasonable to filter candiates |
| // by an absolute difference of costs between cost(B) and cost(A). |
| // |
| // Here's "C" and cost-diff relation: |
| // C cost diff: 500 * log(C) |
| // 10 1151.29 |
| // 100 2302.58 |
| // 1000 3453.87 |
| // 10000 4605.17 |
| // 100000 5756.46 |
| // 1000000 6907.75 |
| const int kMinCost = 100; |
| const int kCostOffset = 6907; |
| const int kStructureCostOffset = 3453; |
| const int kMinStructureCostOffset = 1151; |
| const int32 kNoFilterRank = 3; |
| const int32 kNoFilterIfSameIdRank = 10; |
| } |
| |
| CandidateFilter::CandidateFilter() |
| : top_candidate_(NULL) {} |
| |
| CandidateFilter::~CandidateFilter() {} |
| |
| void CandidateFilter::Reset() { |
| seen_.clear(); |
| top_candidate_ = NULL; |
| } |
| |
| CandidateFilter::ResultType CandidateFilter::FilterCandidateInternal( |
| const Segment::Candidate *candidate) { |
| DCHECK(candidate); |
| const size_t candidate_size = seen_.size(); |
| if (top_candidate_ == NULL || candidate_size == 0) { |
| top_candidate_ = candidate; |
| } |
| |
| CHECK(top_candidate_); |
| |
| // too many candidates size |
| if (candidate_size + 1 >= kMaxCandidatesSize) { |
| return CandidateFilter::STOP_ENUMERATION; |
| } |
| |
| // The candidate is already seen. |
| if (seen_.find(candidate->value) != seen_.end()) { |
| return CandidateFilter::BAD_CANDIDATE; |
| } |
| |
| // The candidate consists of only one token |
| if (candidate->nodes.size() == 1) { |
| VLOG(1) << "don't filter single segment"; |
| return CandidateFilter::GOOD_CANDIDATE; |
| } |
| |
| // don't drop single character |
| if (Util::CharsLen(candidate->value) == 1) { |
| VLOG(1) << "don't filter single character"; |
| return CandidateFilter::GOOD_CANDIDATE; |
| } |
| |
| const int top_cost = max(kMinCost, top_candidate_->cost); |
| const int top_structure_cost = max(kMinCost, top_candidate_->structure_cost); |
| |
| // If candidate size < 3, don't filter candidate aggressively |
| // TOOD(taku): This is a tentative workaround for the case where |
| // TOP candidate is compound and the structure cost for it is "0" |
| // If 2nd or 3rd candidates are regular candidate but not having |
| // non-zero cost, they might be removed. This hack removes such case. |
| if (candidate_size < 3 && |
| candidate->cost < top_cost + 2302 && |
| candidate->structure_cost < 6907) { |
| return CandidateFilter::GOOD_CANDIDATE; |
| } |
| |
| // if candidate starts with prefix "お", we'd like to demote |
| // the candidate if the rank of the candidate is below 3. |
| // This is a temporal workaround for fixing "おそう" => "御|総" |
| // TODO(taku): remove it after intorducing a word clustering for noun. |
| if (candidate_size >= 3 && candidate->nodes.size() > 1 && |
| (candidate->nodes[0]->key == "\xE3\x81\x8A" || // "お|ご" |
| candidate->nodes[0]->key == "\xE3\x81\x94")) { |
| VLOG(1) << "removing noisy prefix pattern"; |
| return CandidateFilter::BAD_CANDIDATE; |
| } |
| |
| // don't drop personal names aggressivly. |
| // We have to show personal names even if they are minor enough. |
| // We basically ignores the cost threadshould. Filter candidate |
| // only with StructureCost |
| int cost_offset = kCostOffset; |
| if (candidate->lid == POS::last_name_id() || |
| candidate->lid == POS::first_name_id()) { |
| cost_offset = INT_MAX; |
| } |
| |
| // Filters out candidates with higher cost. |
| if (top_cost + cost_offset < candidate->cost && |
| top_structure_cost + kMinStructureCostOffset |
| < candidate->structure_cost) { |
| // Stops candidates enumeration when we see sufficiently high cost |
| // candidate. |
| VLOG(1) << "cost is invalid: " |
| << "top_cost " << top_cost |
| << " value " << candidate->value |
| << " cost " << candidate->cost; |
| return CandidateFilter::STOP_ENUMERATION; |
| } |
| |
| // Filters out candidates with higher cost structure. |
| if (top_structure_cost + kStructureCostOffset < candidate->structure_cost) { |
| // We don't stop enumeration here. Just drops high cost structure |
| // looks enough. |
| VLOG(1) << "structure cost is invalid: " |
| << candidate->value << " " << candidate->structure_cost |
| << " " << candidate->cost; |
| return CandidateFilter::BAD_CANDIDATE; |
| } |
| |
| return CandidateFilter::GOOD_CANDIDATE; |
| } |
| |
| CandidateFilter::ResultType CandidateFilter::FilterCandidate( |
| const Segment::Candidate *candidate) { |
| const CandidateFilter::ResultType result = FilterCandidateInternal(candidate); |
| if (result != CandidateFilter::GOOD_CANDIDATE) { |
| return result; |
| } |
| seen_.insert(candidate->value); |
| return result; |
| } |
| } // namespace mozc |