| // Copyright 2010-2014, Google Inc. |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| #include "rewriter/single_kanji_rewriter.h" |
| |
| #include <algorithm> |
| #include <string> |
| #include <vector> |
| #include <set> |
| |
| #include "base/logging.h" |
| #include "base/singleton.h" |
| #include "base/util.h" |
| #include "config/config.pb.h" |
| #include "config/config_handler.h" |
| #include "converter/conversion_request.h" |
| #include "converter/segments.h" |
| #include "dictionary/pos_matcher.h" |
| #include "rewriter/embedded_dictionary.h" |
| #include "rewriter/rewriter_interface.h" |
| #include "session/commands.pb.h" |
| |
| namespace mozc { |
| |
| namespace { |
| |
| struct SingleKanjiList { |
| // reading, single_kanji_s |
| // { "あ", "亜阿有..." } |
| const char *key; |
| const char *values; |
| }; |
| |
| struct KanjiVariantItem { |
| // target, original, type_id |
| // { "亞", "亜", 0 } , |
| const char *target; |
| const char *original; |
| int type_id; |
| }; |
| |
| #include "rewriter/single_kanji_rewriter_data.h" |
| |
| // Since NounPrefixDictionary is just a tentative workaround, |
| // we copy the SingleKanji structure so that we can remove this workaround |
| // easily. Also, the logic of NounPrefix insertion is put independently from |
| // the single kanji dictionary. Ideally, we want to regenerate our |
| // language model for fixing noun-prefix issue. |
| class NounPrefixDictionary { |
| public: |
| NounPrefixDictionary() |
| : dic_(new EmbeddedDictionary(kNounPrefixData_token_data, |
| kNounPrefixData_token_size)) {} |
| |
| ~NounPrefixDictionary() {} |
| |
| EmbeddedDictionary *GetDictionary() const { |
| return dic_.get(); |
| } |
| |
| private: |
| scoped_ptr<EmbeddedDictionary> dic_; |
| }; |
| |
| struct SingleKanjiListCompare { |
| bool operator()(const SingleKanjiList &lhs, |
| const SingleKanjiList &rhs) const { |
| return (strcmp(lhs.key, rhs.key) < 0); |
| } |
| }; |
| |
| // Lookup SingleKanjiList from key (reading). |
| // Returns false if not found. |
| bool LookupKanjiList(const string &key, vector<string> *kanji_list) { |
| DCHECK(kanji_list); |
| SingleKanjiList key_item; |
| key_item.key = key.c_str(); |
| const SingleKanjiList *result = |
| lower_bound(kSingleKanjis, kSingleKanjis + arraysize(kSingleKanjis), |
| key_item, SingleKanjiListCompare()); |
| if (result == (kSingleKanjis + arraysize(kSingleKanjis)) || |
| key.compare(result->key) != 0) { |
| return false; |
| } |
| Util::SplitStringToUtf8Chars(result->values, kanji_list); |
| return true; |
| } |
| |
| struct KanjiVariantItemCompare { |
| bool operator()(const KanjiVariantItem &lhs, |
| const KanjiVariantItem &rhs) const { |
| return (strcmp(lhs.target, rhs.target) < 0); |
| } |
| }; |
| |
| // Generates kanji variant description from key. |
| // Does nothing if not found. |
| void GenerateDescription(const string &key, string *desc) { |
| DCHECK(desc); |
| KanjiVariantItem key_item; |
| key_item.target = key.c_str(); |
| const KanjiVariantItem *result = |
| lower_bound(kKanjiVariants, kKanjiVariants + arraysize(kKanjiVariants), |
| key_item, KanjiVariantItemCompare()); |
| if (result == (kKanjiVariants + arraysize(kKanjiVariants)) || |
| key.compare(result->target) != 0) { |
| return; |
| } |
| DCHECK_LT(result->type_id, arraysize(kKanjiVariantTypes)); |
| desc->assign(Util::StringPrintf( |
| // "%sの%s" |
| "%s\xe3\x81\xae%s", |
| result->original, kKanjiVariantTypes[result->type_id])); |
| } |
| |
| // Add single kanji variants description to existing candidates, |
| // because if we have candidates with same value, the lower ranked candidate |
| // will be removed. |
| void AddDescriptionForExsistingCandidates(Segment *segment) { |
| DCHECK(segment); |
| for (size_t i = 0; i < segment->candidates_size(); ++i) { |
| Segment::Candidate *cand = segment->mutable_candidate(i); |
| if (!cand->description.empty()) { |
| continue; |
| } |
| GenerateDescription(cand->value, &cand->description); |
| } |
| } |
| |
| void FillCandidate(const string &key, const string &value, |
| int cost, uint16 single_kanji_id, |
| Segment::Candidate *cand) { |
| cand->lid = single_kanji_id; |
| cand->rid = single_kanji_id; |
| cand->cost = cost; |
| cand->content_key = key; |
| cand->content_value = value; |
| cand->key = key; |
| cand->value = value; |
| cand->attributes |= Segment::Candidate::CONTEXT_SENSITIVE; |
| cand->attributes |= Segment::Candidate::NO_VARIANTS_EXPANSION; |
| GenerateDescription(value, &cand->description); |
| } |
| |
| // Insert SingleKanji into segment. |
| void InsertCandidate(bool is_single_segment, |
| uint16 single_kanji_id, |
| const vector<string> &kanji_list, |
| Segment *segment) { |
| DCHECK(segment); |
| if (segment->candidates_size() == 0) { |
| LOG(WARNING) << "candidates_size is 0"; |
| return; |
| } |
| |
| const string &candidate_key = ((!segment->key().empty()) ? |
| segment->key() : |
| segment->candidate(0).key); |
| |
| // Adding 8000 to the single kanji cost |
| // Note that this cost does not make no effect. |
| // Here we set the cost just in case. |
| const int kOffsetCost = 8000; |
| |
| // Append single-kanji |
| for (size_t i = 0; i < kanji_list.size(); ++i) { |
| Segment::Candidate *c = segment->push_back_candidate(); |
| FillCandidate(candidate_key, kanji_list[i], |
| kOffsetCost + i, single_kanji_id, c); |
| } |
| } |
| |
| // Insert Noun prefix into segment. |
| void InsertNounPrefix(const POSMatcher &pos_matcher, |
| Segment *segment, |
| const EmbeddedDictionary::Value *dict_values, |
| size_t dict_values_size) { |
| DCHECK(dict_values); |
| DCHECK_GT(dict_values_size, 0); |
| |
| if (segment->candidates_size() == 0) { |
| LOG(WARNING) << "candidates_size is 0"; |
| return; |
| } |
| |
| if (segment->segment_type() == Segment::FIXED_VALUE) { |
| return; |
| } |
| |
| const string &candidate_key = ((!segment->key().empty()) ? |
| segment->key() : |
| segment->candidate(0).key); |
| for (int i = 0; i < dict_values_size; ++i) { |
| const int insert_pos = min( |
| static_cast<int>(segment->candidates_size()), |
| static_cast<int>(dict_values[i].cost + |
| (segment->candidate(0).attributes & |
| Segment::Candidate::CONTEXT_SENSITIVE) ? 1 : 0)); |
| Segment::Candidate *c = segment->insert_candidate(insert_pos); |
| c->lid = pos_matcher.GetNounPrefixId(); |
| c->rid = pos_matcher.GetNounPrefixId(); |
| c->cost = 5000; |
| c->content_value = dict_values[i].value; |
| c->key = candidate_key; |
| c->content_key = candidate_key; |
| c->value = dict_values[i].value; |
| c->attributes |= Segment::Candidate::CONTEXT_SENSITIVE; |
| c->attributes |= Segment::Candidate::NO_VARIANTS_EXPANSION; |
| } |
| } |
| } // namespace |
| |
| SingleKanjiRewriter::SingleKanjiRewriter(const POSMatcher &pos_matcher) |
| : pos_matcher_(&pos_matcher) {} |
| |
| SingleKanjiRewriter::~SingleKanjiRewriter() {} |
| |
| int SingleKanjiRewriter::capability(const ConversionRequest &request) const { |
| if (request.request().mixed_conversion()) { |
| return RewriterInterface::ALL; |
| } |
| return RewriterInterface::CONVERSION; |
| } |
| |
| bool SingleKanjiRewriter::Rewrite(const ConversionRequest &request, |
| Segments *segments) const { |
| if (!GET_CONFIG(use_single_kanji_conversion)) { |
| VLOG(2) << "no use_single_kanji_conversion"; |
| return false; |
| } |
| |
| bool modified = false; |
| const size_t segments_size = segments->conversion_segments_size(); |
| const bool is_single_segment = (segments_size == 1); |
| for (size_t i = 0; i < segments_size; ++i) { |
| AddDescriptionForExsistingCandidates( |
| segments->mutable_conversion_segment(i)); |
| |
| const string &key = segments->conversion_segment(i).key(); |
| vector<string> kanji_list; |
| if (!LookupKanjiList(key, &kanji_list)) { |
| continue; |
| } |
| InsertCandidate(is_single_segment, |
| pos_matcher_->GetGeneralSymbolId(), |
| kanji_list, |
| segments->mutable_conversion_segment(i)); |
| |
| modified = true; |
| } |
| |
| // Tweak for noun prefix. |
| // TODO(team): Ideally, this issue can be fixed via the language model |
| // and dictionary generation. |
| for (size_t i = 0; i < segments_size; ++i) { |
| if (segments->conversion_segment(i).candidates_size() == 0) { |
| continue; |
| } |
| |
| if (i + 1 < segments_size) { |
| const Segment::Candidate &right_candidate = |
| segments->conversion_segment(i + 1).candidate(0); |
| // right segment must be a noun. |
| if (!pos_matcher_->IsContentNoun(right_candidate.lid)) { |
| continue; |
| } |
| } else if (segments_size != 1) { // also apply if segments_size == 1. |
| continue; |
| } |
| |
| const string &key = segments->conversion_segment(i).key(); |
| const EmbeddedDictionary::Token *token = |
| Singleton<NounPrefixDictionary>::get()->GetDictionary()->Lookup(key); |
| if (token == NULL) { |
| continue; |
| } |
| InsertNounPrefix(*pos_matcher_, |
| segments->mutable_conversion_segment(i), |
| token->value, token->value_size); |
| // Ignore the next noun content word. |
| ++i; |
| modified = true; |
| } |
| |
| return modified; |
| } |
| } // namespace mozc |