| // Copyright 2010-2015, Google Inc. |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| #include "rewriter/collocation_rewriter.h" |
| |
| #include <algorithm> |
| #include <string> |
| #include <vector> |
| |
| #include "base/logging.h" |
| #include "base/singleton.h" |
| #include "base/string_piece.h" |
| #include "base/util.h" |
| #include "converter/conversion_request.h" |
| #include "converter/segments.h" |
| #include "data_manager/data_manager_interface.h" |
| #include "dictionary/pos_matcher.h" |
| #include "rewriter/collocation_util.h" |
| #include "storage/existence_filter.h" |
| |
| DEFINE_bool(use_collocation, true, "use collocation rewrite"); |
| |
| namespace mozc { |
| |
| using mozc::storage::ExistenceFilter; |
| |
| namespace { |
// Upper bound on how many candidates of a segment are examined when looking
// for a collocation.
const size_t kCandidateSize = 12;

// Collocation lookup always works on a pair of segments; this tells which
// side of the pair a segment is on.
enum SegmentLookupType {
  LEFT = 0,
  RIGHT = 1,
};
| |
| // returns true if the given string contains number including Kanji. |
| bool ContainsNumber(const string &str) { |
| for (ConstChar32Iterator iter(str); !iter.Done(); iter.Next()) { |
| if (CollocationUtil::IsNumber(iter.Get())) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| // Returns true if value matches the pattern XXXPPPYYY, where XXX is a Kanji |
| // sequence, PPP is the given pattern, and YYY is a sequence containing at least |
| // one Kanji character. In the value matches the pattern, XXX and YYY are |
| // substituted to |first_content| and |second|, respectively. Returns false if |
| // the value isn't of the form XXXPPPYYY. |
| bool ParseCompound(const StringPiece value, const StringPiece pattern, |
| StringPiece *first_content, StringPiece *second) { |
| DCHECK(!value.empty()); |
| DCHECK(!pattern.empty()); |
| |
| // Find the |first_content| candidate and check if it consists of Kanji only. |
| StringPiece::const_iterator pattern_begin = |
| find(value.begin(), value.end(), pattern[0]); |
| if (pattern_begin == value.end()) { |
| return false; |
| } |
| first_content->set(value.data(), distance(value.begin(), pattern_begin)); |
| if (!Util::IsScriptType(*first_content, Util::KANJI)) { |
| return false; |
| } |
| |
| // Check if the middle part matches |pattern|. |
| const StringPiece remaining_value = value.substr(first_content->size()); |
| if (!Util::StartsWith(remaining_value, pattern)) { |
| return false; |
| } |
| |
| // Check if the last substring is eligible for |second|. |
| *second = remaining_value.substr(pattern.size()); |
| if (second->empty() || !Util::ContainsScriptType(*second, Util::KANJI)) { |
| return false; |
| } |
| |
| // Just verify that |value| = |first_content| + |pattern| + |second|. |
| DCHECK_EQ( |
| value, |
| first_content->as_string() + pattern.as_string() + second->as_string()); |
| return true; |
| } |
| |
| // Fast way of pushing back a string piece to a vector. |
| inline void PushBackStringPiece(const StringPiece s, vector<string> *v) { |
| v->push_back(string()); |
| v->back().assign(s.data(), s.size()); |
| } |
| |
| // Fast way of pushing back the concatenated string of two string pieces to a |
| // vector. |
| inline void PushBackJoinedStringPieces( |
| const StringPiece s1, const StringPiece s2, vector<string> *v) { |
| v->push_back(string()); |
| v->back().reserve(s1.size() + s2.size()); |
| v->back().assign(s1.data(), s1.size()).append(s2.data(), s2.size()); |
| } |
| |
| // Handles compound such as "本を読む"(one segment) |
| // we want to rewrite using it as if it was "<本|を><読む>" |
| // so that we can use collocation data like "厚い本" |
| void ResolveCompoundSegment(const string &top_value, const string &value, |
| SegmentLookupType type, |
| vector<string> *output) { |
| // "格助詞" |
| // see "http://ja.wikipedia.org/wiki/助詞" |
| static const char kPat1[] = "\xE3\x81\x8C"; // "が" |
| // "の" was not good... |
| // static const char kPat2[] = "\xE3\x81\xAE"; // "の" |
| static const char kPat3[] = "\xE3\x82\x92"; // "を" |
| static const char kPat4[] = "\xE3\x81\xAB"; // "に" |
| static const char kPat5[] = "\xE3\x81\xB8"; // "へ" |
| static const char kPat6[] = "\xE3\x81\xA8"; // "と" |
| static const char kPat7[] = "\xE3\x81\x8B\xE3\x82\x89"; // "から" |
| static const char kPat8[] = "\xE3\x82\x88\xE3\x82\x8A"; // "より" |
| static const char kPat9[] = "\xE3\x81\xA7"; // "で" |
| |
| static const struct { |
| const char *pat; |
| size_t len; |
| } kParticles[] = { |
| {kPat1, arraysize(kPat1) - 1}, |
| // {kPat2, arraysize(kPat2) - 1}, |
| {kPat3, arraysize(kPat3) - 1}, |
| {kPat4, arraysize(kPat4) - 1}, |
| {kPat5, arraysize(kPat5) - 1}, |
| {kPat6, arraysize(kPat6) - 1}, |
| {kPat7, arraysize(kPat7) - 1}, |
| {kPat8, arraysize(kPat8) - 1}, |
| {kPat9, arraysize(kPat9) - 1}, |
| {NULL, 0} |
| }; |
| |
| for (size_t i = 0; kParticles[i].pat != NULL; ++i) { |
| const StringPiece particle(kParticles[i].pat, kParticles[i].len); |
| StringPiece first_content, second; |
| if (!ParseCompound(top_value, particle, &first_content, &second)) { |
| continue; |
| } |
| if (ParseCompound(value, particle, &first_content, &second)) { |
| if (type == LEFT) { |
| PushBackStringPiece(second, output); |
| PushBackJoinedStringPieces(first_content, particle, output); |
| } else { |
| PushBackStringPiece(first_content, output); |
| } |
| return; |
| } |
| } |
| } |
| |
| bool IsNaturalContent(const Segment::Candidate &cand, |
| const Segment::Candidate &top_cand, |
| SegmentLookupType type, |
| vector<string> *output) { |
| const string &content = cand.content_value; |
| const string &value = cand.value; |
| const string &top_content = top_cand.content_value; |
| const string &top_value = top_cand.value; |
| |
| const size_t top_content_len = Util::CharsLen(top_content); |
| const size_t content_len = Util::CharsLen(content); |
| |
| if (type == RIGHT && |
| value != top_value && |
| top_content_len >= 2 && |
| content_len == 1) { |
| return false; |
| } |
| |
| if (type == LEFT) { |
| output->push_back(value); |
| } else { |
| output->push_back(content); |
| // "舞って" workaround |
| // V+"て" is often treated as one compound. |
| static const char kPat[] = "\xE3\x81\xA6"; // "て" |
| if (Util::EndsWith(content, StringPiece(kPat, arraysize(kPat) - 1))) { |
| PushBackStringPiece( |
| Util::SubStringPiece(content, 0, content_len - 1), output); |
| } |
| } |
| |
| // we don't rewrite NUMBER to others and vice versa |
| if (ContainsNumber(value) != ContainsNumber(top_value)) { |
| return false; |
| } |
| |
| const StringPiece top_aux_value = |
| Util::SubStringPiece(top_value, top_content_len, string::npos); |
| const size_t top_aux_value_len = Util::CharsLen(top_aux_value); |
| const Util::ScriptType top_value_script_type = Util::GetScriptType(top_value); |
| |
| // we don't rewrite KATAKANA segment |
| // for example, we don't rewrite "コーヒー飲みます" to "珈琲飲みます" |
| if (type == LEFT && |
| top_aux_value_len == 0 && |
| top_value != value && |
| top_value_script_type == Util::KATAKANA) { |
| return false; |
| } |
| |
| // special cases |
| if (top_content_len == 1) { |
| const char *begin = top_content.data(); |
| const char *end = top_content.data() + top_content.size(); |
| size_t mblen = 0; |
| const char32 wchar = Util::UTF8ToUCS4(begin, end, &mblen); |
| |
| switch (wchar) { |
| case 0x304a: // "お" |
| case 0x5fa1: // "御" |
| case 0x3054: // "ご" |
| return true; |
| default: |
| break; |
| } |
| } |
| |
| const StringPiece aux_value = |
| Util::SubStringPiece(value, content_len, string::npos); |
| |
| // Remove number in normalization for the left segment. |
| string aux_normalized, top_aux_normalized; |
| CollocationUtil::GetNormalizedScript( |
| aux_value, (type == LEFT), &aux_normalized); |
| CollocationUtil::GetNormalizedScript( |
| top_aux_value, (type == LEFT), &top_aux_normalized); |
| if (!aux_normalized.empty() && |
| !Util::IsScriptType(aux_normalized, Util::HIRAGANA)) { |
| if (type == RIGHT) { |
| return false; |
| } |
| if (aux_normalized != top_aux_normalized) { |
| return false; |
| } |
| } |
| |
| ResolveCompoundSegment(top_value, value, type, output); |
| |
| const size_t aux_value_len = Util::CharsLen(aux_value); |
| const size_t value_len = Util::CharsLen(value); |
| |
| // "<XXいる|>" can be rewrited to "<YY|いる>" and vice versa |
| { |
| static const char kPat[] = "\xE3\x81\x84\xE3\x82\x8B"; // "いる" |
| const StringPiece kSuffix(kPat, arraysize(kPat) - 1); |
| if (top_aux_value_len == 0 && |
| aux_value_len == 2 && |
| Util::EndsWith(top_value, kSuffix) && |
| Util::EndsWith(aux_value, kSuffix)) { |
| if (type == RIGHT) { |
| // "YYいる" in addition to "YY" |
| output->push_back(value); |
| } |
| return true; |
| } |
| if (aux_value_len == 0 && |
| top_aux_value_len == 2 && |
| Util::EndsWith(value, kSuffix) && |
| Util::EndsWith(top_aux_value, kSuffix)) { |
| if (type == RIGHT) { |
| // "YY" in addition to "YYいる" |
| PushBackStringPiece( |
| Util::SubStringPiece(value, 0, value_len - 2), output); |
| } |
| return true; |
| } |
| } |
| |
| // "<XXせる|>" can be rewrited to "<YY|せる>" and vice versa |
| { |
| const char kPat[] = "\xE3\x81\x9B\xE3\x82\x8B"; // "せる" |
| const StringPiece kSuffix(kPat, arraysize(kPat) - 1); |
| if (top_aux_value_len == 0 && |
| aux_value_len == 2 && |
| Util::EndsWith(top_value, kSuffix) && |
| Util::EndsWith(aux_value, kSuffix)) { |
| if (type == RIGHT) { |
| // "YYせる" in addition to "YY" |
| output->push_back(value); |
| } |
| return true; |
| } |
| if (aux_value_len == 0 && |
| top_aux_value_len == 2 && |
| Util::EndsWith(value, kSuffix) && |
| Util::EndsWith(top_aux_value, kSuffix)) { |
| if (type == RIGHT) { |
| // "YY" in addition to "YYせる" |
| PushBackStringPiece( |
| Util::SubStringPiece(value, 0, value_len - 2), output); |
| } |
| return true; |
| } |
| } |
| |
| const Util::ScriptType content_script_type = Util::GetScriptType(content); |
| |
| // "<XX|する>" can be rewrited using "<XXす|る>" and "<XX|する>" |
| // in "<XX|する>", XX must be single script type |
| // "評する" |
| { |
| static const char kPat[] = "\xE3\x81\x99\xE3\x82\x8B"; // "する" |
| const StringPiece kSuffix(kPat, arraysize(kPat) - 1); |
| if (aux_value_len == 2 && |
| Util::EndsWith(aux_value, kSuffix)) { |
| if (content_script_type != Util::KATAKANA && |
| content_script_type != Util::HIRAGANA && |
| content_script_type != Util::KANJI && |
| content_script_type != Util::ALPHABET) { |
| return false; |
| } |
| if (type == RIGHT) { |
| // "YYす" in addition to "YY" |
| PushBackStringPiece( |
| Util::SubStringPiece(value, 0, value_len - 1), output); |
| } |
| return true; |
| } |
| } |
| |
| // "<XXる>" can be rewrited using "<XX|る>" |
| // "まとめる", "衰える" |
| { |
| static const char kPat[] = "\xE3\x82\x8B"; // "る" |
| const StringPiece kSuffix(kPat, arraysize(kPat) - 1); |
| if (aux_value_len == 0 && |
| Util::EndsWith(value, kSuffix)) { |
| if (type == RIGHT) { |
| // "YY" in addition to "YYる" |
| PushBackStringPiece( |
| Util::SubStringPiece(value, 0, value_len - 1), output); |
| } |
| return true; |
| } |
| } |
| |
| // "<XXす>" can be rewrited using "XXする" |
| { |
| static const char kPat[] = "\xE3\x81\x99"; // "す" |
| const StringPiece kSuffix(kPat, arraysize(kPat) - 1); |
| if (Util::EndsWith(value, kSuffix) && |
| Util::IsScriptType( |
| Util::SubStringPiece(value, 0, value_len - 1), |
| Util::KANJI)) { |
| if (type == RIGHT) { |
| const char kRu[] = "\xE3\x82\x8B"; |
| // "YYする" in addition to "YY" |
| PushBackJoinedStringPieces( |
| value, StringPiece(kRu, arraysize(kRu) - 1), output); |
| } |
| return true; |
| } |
| } |
| |
| // "<XXし|た>" can be rewrited using "<XX|した>" |
| { |
| static const char kPat[] = "\xE3\x81\x97\xE3\x81\x9F"; // "した" |
| const StringPiece kShi(kPat, 3), kTa(kPat + 3, 3); |
| if (Util::EndsWith(content, kShi) && |
| aux_value == kTa && |
| Util::EndsWith(top_content, kShi) && |
| top_aux_value == kTa) { |
| if (type == RIGHT) { |
| const StringPiece val = |
| Util::SubStringPiece(content, 0, content_len - 1); |
| // XX must be KANJI |
| if (Util::IsScriptType(val, Util::KANJI)) { |
| PushBackStringPiece(val, output); |
| } |
| } |
| return true; |
| } |
| } |
| |
| const int aux_len = value_len - content_len; |
| const int top_aux_len = Util::CharsLen(top_value) - top_content_len; |
| if (aux_len != top_aux_len) { |
| return false; |
| } |
| |
| const Util::ScriptType top_content_script_type = |
| Util::GetScriptType(top_content); |
| |
| // we don't rewrite HIRAGANA to KATAKANA |
| if (top_content_script_type == Util::HIRAGANA && |
| content_script_type == Util::KATAKANA) { |
| return false; |
| } |
| |
| // we don't rewrite second KATAKANA |
| // for example, we don't rewrite "このコーヒー" to "この珈琲" |
| if (type == RIGHT && |
| top_content_script_type == Util::KATAKANA && |
| value != top_value) { |
| return false; |
| } |
| |
| if (top_content_len == 1 && |
| top_content_script_type == Util::HIRAGANA) { |
| return false; |
| } |
| |
| // suppress "<身|ています>" etc. |
| if (top_content_len == 1 && |
| content_len == 1 && |
| top_aux_value_len >= 2 && |
| aux_value_len >= 2 && |
| top_content_script_type == Util::KANJI && |
| content_script_type == Util::KANJI && |
| top_content != content) { |
| return false; |
| } |
| |
| return true; |
| } |
| |
| // Just a wrapper of IsNaturalContent for debug. |
| bool VerifyNaturalContent(const Segment::Candidate &cand, |
| const Segment::Candidate &top_cand, |
| SegmentLookupType type) { |
| vector<string> nexts; |
| return IsNaturalContent(cand, top_cand, RIGHT, &nexts); |
| } |
| |
| inline bool IsKeyUnknown(const Segment &seg) { |
| return Util::IsScriptType(seg.key(), Util::UNKNOWN_SCRIPT); |
| } |
| |
| } // namespace |
| |
| bool CollocationRewriter::RewriteCollocation(Segments *segments) const { |
| // return false if at least one segment is fixed. |
| for (size_t i = segments->history_segments_size(); |
| i < segments->segments_size(); ++i) { |
| if (segments->segment(i).segment_type() == Segment::FIXED_VALUE) { |
| return false; |
| } |
| } |
| |
| vector<bool> segs_changed(segments->segments_size(), false); |
| bool changed = false; |
| |
| for (size_t i = segments->history_segments_size(); |
| i < segments->segments_size(); ++i) { |
| bool rewrited_next = false; |
| |
| if (IsKeyUnknown(segments->segment(i))) { |
| continue; |
| } |
| |
| if (i + 1 < segments->segments_size() && |
| RewriteUsingNextSegment(segments->mutable_segment(i + 1), |
| segments->mutable_segment(i))) { |
| changed = true; |
| rewrited_next = true; |
| segs_changed[i] = true; |
| segs_changed[i + 1] = true; |
| } |
| |
| if (!segs_changed[i] && |
| !rewrited_next && |
| i > 0 && |
| RewriteFromPrevSegment(segments->segment(i - 1).candidate(0), |
| segments->mutable_segment(i))) { |
| changed = true; |
| segs_changed[i - 1] = true; |
| segs_changed[i] = true; |
| } |
| |
| const Segment::Candidate &cand = segments->segment(i).candidate(0); |
| if (i >= 2 && |
| // Cross over only adverbs |
| // Segment is adverb if; |
| // 1) lid and rid is adverb. |
| // 2) or rid is adverb suffix. |
| ((pos_matcher_->IsAdverb(segments->segment(i - 1).candidate(0).lid) && |
| pos_matcher_->IsAdverb(segments->segment(i - 1).candidate(0).rid)) || |
| pos_matcher_->IsAdverbSegmentSuffix( |
| segments->segment(i - 1).candidate(0).rid)) && |
| (cand.content_value != cand.value || |
| cand.value != "\xe3\x83\xbb")) { // "・" workaround |
| if (!segs_changed[i - 2] && |
| !segs_changed[i] && |
| RewriteUsingNextSegment(segments->mutable_segment(i), |
| segments->mutable_segment(i - 2))) { |
| changed = true; |
| segs_changed[i] = true; |
| segs_changed[i - 2] = true; |
| } else if (!segs_changed[i] && |
| RewriteFromPrevSegment( |
| segments->segment(i - 2).candidate(0), |
| segments->mutable_segment(i))) { |
| changed = true; |
| segs_changed[i] = true; |
| segs_changed[i - 2] = true; |
| } |
| } |
| } |
| |
| return changed; |
| } |
| |
| class CollocationRewriter::CollocationFilter { |
| public: |
| CollocationFilter(const char *existence_data, size_t size) |
| : filter_(ExistenceFilter::Read(existence_data, size)) { |
| } |
| ~CollocationFilter() { |
| } |
| |
| bool Exists(const string &left, const string &right) const { |
| if (left.empty() || right.empty()) { |
| return false; |
| } |
| string key; |
| key.reserve(left.size() + right.size()); |
| key.assign(left).append(right); |
| const uint64 id = Util::Fingerprint(key); |
| return filter_->Exists(id); |
| } |
| |
| private: |
| scoped_ptr<ExistenceFilter> filter_; |
| |
| DISALLOW_COPY_AND_ASSIGN(CollocationFilter); |
| }; |
| |
| class CollocationRewriter::SuppressionFilter { |
| public: |
| SuppressionFilter(const char *suppression_data, size_t size) |
| : filter_(ExistenceFilter::Read(suppression_data, size)) { |
| } |
| ~SuppressionFilter() { |
| } |
| |
| bool Exists(const Segment::Candidate &cand) const { |
| // TODO(noriyukit): We should share key generation rule with |
| // gen_collocation_suppression_data_main.cc. |
| string key; |
| key.reserve(cand.content_value.size() + 1 + cand.content_key.size()); |
| key.assign(cand.content_value).append("\t").append(cand.content_key); |
| const uint64 id = Util::Fingerprint(key); |
| return filter_->Exists(id); |
| } |
| |
| private: |
| scoped_ptr<ExistenceFilter> filter_; |
| |
| DISALLOW_COPY_AND_ASSIGN(SuppressionFilter); |
| }; |
| |
| CollocationRewriter::CollocationRewriter( |
| const DataManagerInterface *data_manager) |
| : pos_matcher_(data_manager->GetPOSMatcher()), |
| first_name_id_(pos_matcher_->GetFirstNameId()), |
| last_name_id_(pos_matcher_->GetLastNameId()) { |
| const char *data = NULL; |
| size_t size = 0; |
| |
| data_manager->GetCollocationData(&data, &size); |
| collocation_filter_.reset(new CollocationFilter(data, size)); |
| |
| data_manager->GetCollocationSuppressionData(&data, &size); |
| suppression_filter_.reset(new SuppressionFilter(data, size)); |
| } |
| |
| CollocationRewriter::~CollocationRewriter() {} |
| |
| bool CollocationRewriter::Rewrite(const ConversionRequest &request, |
| Segments *segments) const { |
| return RewriteCollocation(segments); |
| } |
| |
| bool CollocationRewriter::IsName(const Segment::Candidate &cand) const { |
| const bool ret = (cand.lid == last_name_id_ || cand.lid == first_name_id_); |
| VLOG_IF(3, ret) << cand.value << " is name sagment"; |
| return ret; |
| } |
| |
| bool CollocationRewriter::RewriteFromPrevSegment( |
| const Segment::Candidate &prev_cand, |
| Segment *seg) const { |
| string prev; |
| CollocationUtil::GetNormalizedScript(prev_cand.value, true, &prev); |
| |
| const size_t i_max = min(seg->candidates_size(), kCandidateSize); |
| |
| // Reuse |curs| and |cur| in the loop as this method is performance critical. |
| vector<string> curs; |
| string cur; |
| for (size_t i = 0; i < i_max; ++i) { |
| if (IsName(seg->candidate(i))) { |
| continue; |
| } |
| if (suppression_filter_->Exists(seg->candidate(i))) { |
| continue; |
| } |
| curs.clear(); |
| if (!IsNaturalContent(seg->candidate(i), seg->candidate(0), RIGHT, &curs)) { |
| continue; |
| } |
| |
| for (int j = 0; j < curs.size(); ++j) { |
| cur.clear(); |
| CollocationUtil::GetNormalizedScript(curs[j], false, &cur); |
| if (collocation_filter_->Exists(prev, cur)) { |
| VLOG_IF(3, i != 0) << prev << cur << " " |
| << seg->candidate(0).value << "->" |
| << seg->candidate(i).value; |
| seg->move_candidate(i, 0); |
| seg->mutable_candidate(0)->attributes |
| |= Segment::Candidate::CONTEXT_SENSITIVE; |
| return true; |
| } |
| } |
| } |
| return false; |
| } |
| |
| bool CollocationRewriter::RewriteUsingNextSegment(Segment *next_seg, |
| Segment *seg) const { |
| const size_t i_max = min(seg->candidates_size(), kCandidateSize); |
| const size_t j_max = min(next_seg->candidates_size(), kCandidateSize); |
| |
| // Cache the results for the next segment |
| vector<int> next_seg_ok(j_max); // Avoiding vector<bool> |
| vector<vector<string> > normalized_string(j_max); |
| |
| // Reuse |nexts| in the loop as this method is performance critical. |
| vector<string> nexts; |
| for (size_t j = 0; j < j_max; ++j) { |
| next_seg_ok[j] = 0; |
| |
| if (IsName(next_seg->candidate(j))) { |
| continue; |
| } |
| if (suppression_filter_->Exists(next_seg->candidate(j))) { |
| continue; |
| } |
| nexts.clear(); |
| if (!IsNaturalContent(next_seg->candidate(j), |
| next_seg->candidate(0), RIGHT, &nexts)) { |
| continue; |
| } |
| |
| next_seg_ok[j] = 1; |
| for (vector<string>::const_iterator it = nexts.begin(); |
| it != nexts.end(); ++it) { |
| normalized_string[j].push_back(string()); |
| CollocationUtil::GetNormalizedScript( |
| *it, false, &normalized_string[j].back()); |
| } |
| } |
| |
| // Reuse |curs| and |cur| in the loop as this method is performance critical. |
| vector<string> curs; |
| string cur; |
| for (size_t i = 0; i < i_max; ++i) { |
| if (IsName(seg->candidate(i))) { |
| continue; |
| } |
| if (suppression_filter_->Exists(seg->candidate(i))) { |
| continue; |
| } |
| curs.clear(); |
| if (!IsNaturalContent(seg->candidate(i), seg->candidate(0), LEFT, &curs)) { |
| continue; |
| } |
| |
| for (int k = 0; k < curs.size(); ++k) { |
| cur.clear(); |
| CollocationUtil::GetNormalizedScript(curs[k], true, &cur); |
| for (size_t j = 0; j < j_max; ++j) { |
| if (!next_seg_ok[j]) { |
| continue; |
| } |
| |
| for (int l = 0; l < normalized_string[j].size(); ++l) { |
| const string &next = normalized_string[j][l]; |
| if (collocation_filter_->Exists(cur, next)) { |
| DCHECK(VerifyNaturalContent( |
| next_seg->candidate(j), next_seg->candidate(0), RIGHT)) |
| << "IsNaturalContent() should not fail here."; |
| seg->move_candidate(i, 0); |
| seg->mutable_candidate(0)->attributes |
| |= Segment::Candidate::CONTEXT_SENSITIVE; |
| next_seg->move_candidate(j, 0); |
| next_seg->mutable_candidate(0)->attributes |
| |= Segment::Candidate::CONTEXT_SENSITIVE; |
| return true; |
| } |
| } |
| } |
| } |
| } |
| return false; |
| } |
| |
| } // namespace mozc |