// blob: eb32cffb2b9cc4dc79ca9e6dc7a14d1d8d241ff5
// Copyright 2010-2015, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "rewriter/single_kanji_rewriter.h"
#include <algorithm>
#include <string>
#include <vector>
#include <set>
#include "base/logging.h"
#include "base/singleton.h"
#include "base/util.h"
#include "config/config.pb.h"
#include "config/config_handler.h"
#include "converter/conversion_request.h"
#include "converter/segments.h"
#include "dictionary/pos_matcher.h"
#include "rewriter/embedded_dictionary.h"
#include "rewriter/rewriter_interface.h"
#include "session/commands.pb.h"
namespace mozc {
namespace {
// One row of the reading -> single-kanji table (kSingleKanjis).
// Rows are written as brace initializers, so the field order below must not
// change.
struct SingleKanjiList {
// reading, single_kanji_s
// { "あ", "亜阿有..." }
// Reading; this is the field the table is ordered and searched by (strcmp).
const char *key;
// All kanji for that reading packed into one string; LookupKanjiList()
// splits it with Util::SplitStringToUtf8Chars().
const char *values;
};
// One row of the kanji variant table (kKanjiVariants).
// Rows are written as brace initializers, so the field order below must not
// change.
struct KanjiVariantItem {
// target, original, type_id
// { "亞", "亜", 0 } ,
// Variant character; this is the field the table is ordered and searched by
// (strcmp).
const char *target;
// Character that |target| is a variant of; printed first in the description.
const char *original;
// Index into kKanjiVariantTypes selecting the text printed after |original|.
int type_id;
};
#include "rewriter/single_kanji_rewriter_data.h"
// NounPrefixDictionary is only a tentative workaround.  The SingleKanji
// structure is therefore duplicated here instead of being shared, which keeps
// the workaround easy to delete later.  For the same reason the noun-prefix
// insertion logic lives apart from the single kanji dictionary.  Ideally the
// language model would be regenerated to fix the noun-prefix issue instead.
//
// Owns an EmbeddedDictionary constructed from the kNounPrefixData_token_data /
// kNounPrefixData_token_size tables.
class NounPrefixDictionary {
 public:
  NounPrefixDictionary()
      : dic_(new EmbeddedDictionary(kNounPrefixData_token_data,
                                    kNounPrefixData_token_size)) {
  }

  ~NounPrefixDictionary() {
  }

  // Returns the wrapped dictionary.  Ownership stays with this object.
  EmbeddedDictionary *GetDictionary() const {
    return dic_.get();
  }

 private:
  scoped_ptr<EmbeddedDictionary> dic_;
};
// Orders SingleKanjiList rows by reading, in strcmp (byte-wise) order.
// Only |key| is inspected; |values| is ignored.
struct SingleKanjiListCompare {
  bool operator()(const SingleKanjiList &lhs,
                  const SingleKanjiList &rhs) const {
    const int order = strcmp(lhs.key, rhs.key);
    return order < 0;
  }
};
// Lookup SingleKanjiList from key (reading).
// Returns false if not found.
bool LookupKanjiList(const string &key, vector<string> *kanji_list) {
DCHECK(kanji_list);
SingleKanjiList key_item;
key_item.key = key.c_str();
const SingleKanjiList *result =
lower_bound(kSingleKanjis, kSingleKanjis + arraysize(kSingleKanjis),
key_item, SingleKanjiListCompare());
if (result == (kSingleKanjis + arraysize(kSingleKanjis)) ||
key.compare(result->key) != 0) {
return false;
}
Util::SplitStringToUtf8Chars(result->values, kanji_list);
return true;
}
// Orders KanjiVariantItem rows by |target|, in strcmp (byte-wise) order.
// |original| and |type_id| are ignored.
struct KanjiVariantItemCompare {
  bool operator()(const KanjiVariantItem &lhs,
                  const KanjiVariantItem &rhs) const {
    const int order = strcmp(lhs.target, rhs.target);
    return order < 0;
  }
};
// Generates kanji variant description from key.
// Does nothing if not found.
void GenerateDescription(const string &key, string *desc) {
DCHECK(desc);
KanjiVariantItem key_item;
key_item.target = key.c_str();
const KanjiVariantItem *result =
lower_bound(kKanjiVariants, kKanjiVariants + arraysize(kKanjiVariants),
key_item, KanjiVariantItemCompare());
if (result == (kKanjiVariants + arraysize(kKanjiVariants)) ||
key.compare(result->target) != 0) {
return;
}
DCHECK_LT(result->type_id, arraysize(kKanjiVariantTypes));
desc->assign(Util::StringPrintf(
// "%sの%s"
"%s\xe3\x81\xae%s",
result->original, kKanjiVariantTypes[result->type_id]));
}
// Attaches kanji variant descriptions to the candidates |segment| already
// holds.  This is needed because, when two candidates share the same value,
// the lower ranked one is removed; a description set only on a newly added
// single kanji candidate would then be lost.
// Candidates that already carry a description are left alone.
void AddDescriptionForExsistingCandidates(Segment *segment) {
  DCHECK(segment);
  for (size_t index = 0; index < segment->candidates_size(); ++index) {
    Segment::Candidate *candidate = segment->mutable_candidate(index);
    // Never overwrite a description that is already present.
    if (candidate->description.empty()) {
      GenerateDescription(candidate->value, &candidate->description);
    }
  }
}
// Populates |cand| as a single kanji candidate.
//   key             - stored as both |key| and |content_key|.
//   value           - stored as both |value| and |content_value|.
//   cost            - stored as the candidate cost.
//   single_kanji_id - stored as both the left and the right POS id.
// The candidate is additionally flagged CONTEXT_SENSITIVE and
// NO_VARIANTS_EXPANSION (existing attribute bits are kept), and receives a
// variant description when |value| is a known kanji variant.
void FillCandidate(const string &key, const string &value,
                   int cost, uint16 single_kanji_id,
                   Segment::Candidate *cand) {
  // Text fields.
  cand->content_key = key;
  cand->content_value = value;
  cand->key = key;
  cand->value = value;

  // Ranking / POS fields.
  cand->cost = cost;
  cand->lid = single_kanji_id;
  cand->rid = single_kanji_id;

  cand->attributes |= (Segment::Candidate::CONTEXT_SENSITIVE |
                       Segment::Candidate::NO_VARIANTS_EXPANSION);

  GenerateDescription(value, &cand->description);
}
// Insert SingleKanji into segment.
void InsertCandidate(bool is_single_segment,
uint16 single_kanji_id,
const vector<string> &kanji_list,
Segment *segment) {
DCHECK(segment);
if (segment->candidates_size() == 0) {
LOG(WARNING) << "candidates_size is 0";
return;
}
const string &candidate_key = ((!segment->key().empty()) ?
segment->key() :
segment->candidate(0).key);
// Adding 8000 to the single kanji cost
// Note that this cost does not make no effect.
// Here we set the cost just in case.
const int kOffsetCost = 8000;
// Append single-kanji
for (size_t i = 0; i < kanji_list.size(); ++i) {
Segment::Candidate *c = segment->push_back_candidate();
FillCandidate(candidate_key, kanji_list[i],
kOffsetCost + i, single_kanji_id, c);
}
}
// Inserts noun-prefix candidates into |segment|.
//   pos_matcher      - supplies the noun-prefix POS id used for lid/rid.
//   dict_values      - dictionary values to insert; must not be NULL.
//   dict_values_size - number of entries in |dict_values|; expected > 0.
// Nothing is inserted when |segment| has no candidates (a warning is logged)
// or when its segment type is FIXED_VALUE.
//
// Each value is inserted at index |dict_values[i].cost|, pushed down by one
// when the current top candidate is CONTEXT_SENSITIVE, and clamped to
// [0, candidates_size()].
// NOTE(review): this treats |cost| of a noun-prefix entry as a target rank
// rather than a score -- confirm against the data generator.
void InsertNounPrefix(const POSMatcher &pos_matcher,
                      Segment *segment,
                      const EmbeddedDictionary::Value *dict_values,
                      size_t dict_values_size) {
  DCHECK(dict_values);
  DCHECK_GT(dict_values_size, 0);
  if (segment->candidates_size() == 0) {
    LOG(WARNING) << "candidates_size is 0";
    return;
  }
  if (segment->segment_type() == Segment::FIXED_VALUE) {
    return;
  }
  const string &candidate_key = ((!segment->key().empty()) ?
                                 segment->key() :
                                 segment->candidate(0).key);
  for (size_t i = 0; i < dict_values_size; ++i) {
    // Re-read the top candidate on every iteration: a previous insertion at
    // index 0 changes which candidate is on top.
    const bool top_is_context_sensitive =
        (segment->candidate(0).attributes &
         Segment::Candidate::CONTEXT_SENSITIVE) != 0;

    // The conditional must be parenthesized.  Written as
    // `cost + (attributes & FLAG) ? 1 : 0` it parses as
    // `(cost + (attributes & FLAG)) ? 1 : 0`, which collapses every position
    // to 0 or 1 and discards the rank carried by |cost|.
    int wanted_pos = static_cast<int>(dict_values[i].cost) +
                     (top_is_context_sensitive ? 1 : 0);
    if (wanted_pos < 0) {
      // A negative index is not a valid insertion point.
      wanted_pos = 0;
    }
    const int insert_pos = min(
        static_cast<int>(segment->candidates_size()), wanted_pos);

    Segment::Candidate *c = segment->insert_candidate(insert_pos);
    c->lid = pos_matcher.GetNounPrefixId();
    c->rid = pos_matcher.GetNounPrefixId();
    c->cost = 5000;
    c->content_value = dict_values[i].value;
    c->key = candidate_key;
    c->content_key = candidate_key;
    c->value = dict_values[i].value;
    c->attributes |= Segment::Candidate::CONTEXT_SENSITIVE;
    c->attributes |= Segment::Candidate::NO_VARIANTS_EXPANSION;
  }
}
} // namespace
// Stores the address of |pos_matcher| without copying it, so |pos_matcher|
// has to stay alive for as long as this rewriter is used.
SingleKanjiRewriter::SingleKanjiRewriter(const POSMatcher &pos_matcher)
: pos_matcher_(&pos_matcher) {}
// Empty body: the destructor performs no explicit cleanup.
SingleKanjiRewriter::~SingleKanjiRewriter() {}
// Reports when this rewriter should run: RewriterInterface::ALL when the
// request asks for mixed conversion, RewriterInterface::CONVERSION otherwise.
int SingleKanjiRewriter::capability(const ConversionRequest &request) const {
  const bool mixed = request.request().mixed_conversion();
  return mixed ? RewriterInterface::ALL : RewriterInterface::CONVERSION;
}
bool SingleKanjiRewriter::Rewrite(const ConversionRequest &request,
Segments *segments) const {
if (!GET_CONFIG(use_single_kanji_conversion)) {
VLOG(2) << "no use_single_kanji_conversion";
return false;
}
bool modified = false;
const size_t segments_size = segments->conversion_segments_size();
const bool is_single_segment = (segments_size == 1);
for (size_t i = 0; i < segments_size; ++i) {
AddDescriptionForExsistingCandidates(
segments->mutable_conversion_segment(i));
const string &key = segments->conversion_segment(i).key();
vector<string> kanji_list;
if (!LookupKanjiList(key, &kanji_list)) {
continue;
}
InsertCandidate(is_single_segment,
pos_matcher_->GetGeneralSymbolId(),
kanji_list,
segments->mutable_conversion_segment(i));
modified = true;
}
// Tweak for noun prefix.
// TODO(team): Ideally, this issue can be fixed via the language model
// and dictionary generation.
for (size_t i = 0; i < segments_size; ++i) {
if (segments->conversion_segment(i).candidates_size() == 0) {
continue;
}
if (i + 1 < segments_size) {
const Segment::Candidate &right_candidate =
segments->conversion_segment(i + 1).candidate(0);
// right segment must be a noun.
if (!pos_matcher_->IsContentNoun(right_candidate.lid)) {
continue;
}
} else if (segments_size != 1) { // also apply if segments_size == 1.
continue;
}
const string &key = segments->conversion_segment(i).key();
const EmbeddedDictionary::Token *token =
Singleton<NounPrefixDictionary>::get()->GetDictionary()->Lookup(key);
if (token == NULL) {
continue;
}
InsertNounPrefix(*pos_matcher_,
segments->mutable_conversion_segment(i),
token->value, token->value_size);
// Ignore the next noun content word.
++i;
modified = true;
}
return modified;
}
} // namespace mozc