| // Copyright 2010-2015, Google Inc. |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| #include "rewriter/usage_rewriter.h" |
| |
| #include <string> |
| |
| #include "base/logging.h" |
| #include "base/util.h" |
| #include "config/config.pb.h" |
| #include "config/config_handler.h" |
| #include "converter/conversion_request.h" |
| #include "converter/segments.h" |
| #include "data_manager/data_manager_interface.h" |
| #include "dictionary/pos_matcher.h" |
| #include "dictionary/dictionary_interface.h" |
| |
| using mozc::dictionary::DictionaryInterface; |
| using mozc::dictionary::POSMatcher; |
| |
| namespace mozc { |
| |
| UsageRewriter::UsageRewriter(const DataManagerInterface *data_manager, |
| const DictionaryInterface *dictionary) |
| : pos_matcher_(data_manager->GetPOSMatcher()), |
| dictionary_(dictionary), |
| base_conjugation_suffix_(NULL) { |
| const ConjugationSuffix *conjugation_suffix_data = NULL; |
| const int *conjugation_suffix_data_index = NULL; |
| const UsageDictItem *usage_data_value = NULL; |
| data_manager->GetUsageRewriterData(&base_conjugation_suffix_, |
| &conjugation_suffix_data, |
| &conjugation_suffix_data_index, |
| &usage_data_value); |
| CHECK(base_conjugation_suffix_); |
| CHECK(conjugation_suffix_data); |
| CHECK(conjugation_suffix_data_index); |
| CHECK(conjugation_suffix_data_index); |
| const UsageDictItem *item = usage_data_value; |
| // TODO(taku): To reduce memory footprint, better to replace it with |
| // binary search over the conjugation_suffix_data diretly. |
| for (; item->key != NULL; ++item) { |
| for (size_t i = conjugation_suffix_data_index[item->conjugation_id]; |
| i < conjugation_suffix_data_index[item->conjugation_id + 1]; |
| ++i) { |
| StrPair key_value1( |
| string(item->key) + conjugation_suffix_data[i].key_suffix, |
| string(item->value) + conjugation_suffix_data[i].value_suffix); |
| key_value_usageitem_map_[key_value1] = item; |
| StrPair key_value2( |
| "", |
| string(item->value) + conjugation_suffix_data[i].value_suffix); |
| key_value_usageitem_map_[key_value2] = item; |
| } |
| } |
| } |
| |
| UsageRewriter::~UsageRewriter() { |
| } |
| |
| // static |
| // "合いました" => "合い" |
| string UsageRewriter::GetKanjiPrefixAndOneHiragana(const string &word) { |
| // TODO(hidehiko): Refactor more based on ConstChar32Iterator. |
| string result; |
| int pos = 0; |
| bool has_kanji = false; |
| bool has_hiragana = false; |
| for (ConstChar32Iterator iter(word); !iter.Done(); iter.Next()) { |
| const char32 w = iter.Get(); |
| const Util::ScriptType s = Util::GetScriptType(w); |
| if (pos == 0 && s != Util::KANJI) { |
| return ""; |
| } else if (pos >= 0 && pos <= 1 && s == Util::KANJI) { |
| // length of kanji <= 2. |
| has_kanji = true; |
| ++pos; |
| Util::UCS4ToUTF8Append(w, &result); |
| continue; |
| } else if (pos > 0 && s == Util::HIRAGANA) { |
| has_hiragana = true; |
| Util::UCS4ToUTF8Append(w, &result); |
| break; |
| } else { |
| return ""; |
| } |
| } |
| |
| if (has_hiragana && has_kanji) { |
| return result; |
| } |
| |
| return ""; |
| } |
| |
| const UsageDictItem* UsageRewriter::LookupUnmatchedUsageHeuristically( |
| const Segment::Candidate &candidate) const { |
| // We check Unknwon POS ("名詞,サ変接続") as well, since |
| // target verbs/adjectives may be in web dictionary. |
| if (!pos_matcher_->IsContentWordWithConjugation(candidate.lid) && |
| !pos_matcher_->IsUnknown(candidate.lid)) { |
| return NULL; |
| } |
| |
| const string value = GetKanjiPrefixAndOneHiragana(candidate.content_value); |
| if (value.empty()) { |
| return NULL; |
| } |
| |
| // key is empty; |
| StrPair key_value("", value); |
| const map<StrPair, const UsageDictItem *>::const_iterator itr = |
| key_value_usageitem_map_.find(key_value); |
| // Check result key part is a prefix of the content_key. |
| if (itr != key_value_usageitem_map_.end() && |
| Util::StartsWith(candidate.content_key, itr->second->key)) { |
| return itr->second; |
| } |
| |
| return NULL; |
| } |
| |
| const UsageDictItem* UsageRewriter::LookupUsage( |
| const Segment::Candidate &candidate) const { |
| const string &key = candidate.content_key; |
| const string &value = candidate.content_value; |
| StrPair key_value(key, value); |
| const map<StrPair, const UsageDictItem *>::const_iterator itr = |
| key_value_usageitem_map_.find(key_value); |
| if (itr != key_value_usageitem_map_.end()) { |
| return itr->second; |
| } |
| |
| return LookupUnmatchedUsageHeuristically(candidate); |
| } |
| |
| bool UsageRewriter::Rewrite(const ConversionRequest &request, |
| Segments *segments) const { |
| VLOG(2) << segments->DebugString(); |
| |
| const config::Config &config = config::ConfigHandler::GetConfig(); |
| // Default value of use_local_usage_dictionary() is true. |
| // So if information_list_config() is not available in the config, |
| // we don't need to return false here. |
| if (config.has_information_list_config() && |
| !config.information_list_config().use_local_usage_dictionary()) { |
| return false; |
| } |
| |
| bool modified = false; |
| // UsageIDs for embedded usage dictionary are generated in advance by |
| // gen_usage_rewriter_dictionary_main.cc (which are just sequential numbers). |
| // However, since user dictionary comments don't have such IDs, dynamically |
| // genereate them so that they don't conflict with those of the embedded usage |
| // dictionary. Since just the uniqueness in one Segments is sufficient, for |
| // usage from the user dictionary, we simply assign sequential numbers larger |
| // than the maximum ID of the embedded usage dictionary. |
| int32 usage_id_for_user_comment = key_value_usageitem_map_.size(); |
| string comment; |
| for (size_t i = 0; i < segments->conversion_segments_size(); ++i) { |
| Segment *segment = segments->mutable_conversion_segment(i); |
| DCHECK(segment); |
| for (size_t j = 0; j < segment->candidates_size(); ++j) { |
| ++usage_id_for_user_comment; |
| |
| // First, search the user dictionary for comment. |
| if (dictionary_ != NULL) { |
| if (dictionary_->LookupComment(segment->candidate(j).content_key, |
| segment->candidate(j).content_value, |
| &comment)) { |
| Segment::Candidate *candidate = segment->mutable_candidate(j); |
| candidate->usage_id = usage_id_for_user_comment; |
| candidate->usage_title = segment->candidate(j).content_value; |
| candidate->usage_description = comment; |
| comment.clear(); |
| modified = true; |
| continue; |
| } |
| } |
| |
| // If comment isn't in the user dictionary, search the system usage |
| // dictionary. |
| const UsageDictItem *usage = LookupUsage(segment->candidate(j)); |
| if (usage != NULL) { |
| Segment::Candidate *candidate = segment->mutable_candidate(j); |
| DCHECK(candidate); |
| candidate->usage_id = usage->id; |
| candidate->usage_title |
| .assign(usage->value) |
| .append( |
| base_conjugation_suffix_[usage->conjugation_id].value_suffix); |
| candidate->usage_description = usage->meaning; |
| VLOG(2) << i << ":" << j << ":" << |
| candidate->content_key << ":" << candidate->content_value << |
| ":" << usage->key << ":" << usage->value << |
| ":" << usage->conjugation_id << ":" << usage->meaning; |
| modified = true; |
| } |
| } |
| } |
| return modified; |
| } |
| } // namespace mozc |