// blob: 8c98bf93ce6e97f2caa7e058195006615732fdde [file] [log] [blame]
// Copyright 2010-2015, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "rewriter/collocation_rewriter.h"
#include <algorithm>
#include <string>
#include <vector>
#include "base/logging.h"
#include "base/singleton.h"
#include "base/string_piece.h"
#include "base/util.h"
#include "converter/conversion_request.h"
#include "converter/segments.h"
#include "data_manager/data_manager_interface.h"
#include "dictionary/pos_matcher.h"
#include "rewriter/collocation_util.h"
#include "storage/existence_filter.h"
// Boolean flag named use_collocation, enabled (true) by default; its help
// text is "use collocation rewrite".
DEFINE_bool(use_collocation, true, "use collocation rewrite");
namespace mozc {
using mozc::storage::ExistenceFilter;
namespace {
// Upper bound on how many candidates of one segment are examined:
// RewriteFromPrevSegment() and RewriteUsingNextSegment() only look at the
// first min(candidates_size(), kCandidateSize) candidates.
const size_t kCandidateSize = 12;
// For collocation, we use two segments.  This enum tells IsNaturalContent()
// and ResolveCompoundSegment() which member of the pair the candidate under
// examination belongs to.
enum SegmentLookupType {
// The candidate provides the left (first) string of the looked-up pair.
LEFT,
// The candidate provides the right (second) string of the looked-up pair.
RIGHT,
};
// returns true if the given string contains number including Kanji.
bool ContainsNumber(const string &str) {
for (ConstChar32Iterator iter(str); !iter.Done(); iter.Next()) {
if (CollocationUtil::IsNumber(iter.Get())) {
return true;
}
}
return false;
}
// Returns true if value matches the pattern XXXPPPYYY, where XXX is a Kanji
// sequence, PPP is the given pattern, and YYY is a sequence containing at least
// one Kanji character. In the value matches the pattern, XXX and YYY are
// substituted to |first_content| and |second|, respectively. Returns false if
// the value isn't of the form XXXPPPYYY.
bool ParseCompound(const StringPiece value, const StringPiece pattern,
StringPiece *first_content, StringPiece *second) {
DCHECK(!value.empty());
DCHECK(!pattern.empty());
// Find the |first_content| candidate and check if it consists of Kanji only.
StringPiece::const_iterator pattern_begin =
find(value.begin(), value.end(), pattern[0]);
if (pattern_begin == value.end()) {
return false;
}
first_content->set(value.data(), distance(value.begin(), pattern_begin));
if (!Util::IsScriptType(*first_content, Util::KANJI)) {
return false;
}
// Check if the middle part matches |pattern|.
const StringPiece remaining_value = value.substr(first_content->size());
if (!Util::StartsWith(remaining_value, pattern)) {
return false;
}
// Check if the last substring is eligible for |second|.
*second = remaining_value.substr(pattern.size());
if (second->empty() || !Util::ContainsScriptType(*second, Util::KANJI)) {
return false;
}
// Just verify that |value| = |first_content| + |pattern| + |second|.
DCHECK_EQ(
value,
first_content->as_string() + pattern.as_string() + second->as_string());
return true;
}
// Fast way of pushing back a string piece to a vector.
inline void PushBackStringPiece(const StringPiece s, vector<string> *v) {
v->push_back(string());
v->back().assign(s.data(), s.size());
}
// Fast way of pushing back the concatenated string of two string pieces to a
// vector.
inline void PushBackJoinedStringPieces(
const StringPiece s1, const StringPiece s2, vector<string> *v) {
v->push_back(string());
v->back().reserve(s1.size() + s2.size());
v->back().assign(s1.data(), s1.size()).append(s2.data(), s2.size());
}
// Handles compound such as "本を読む"(one segment)
// we want to rewrite using it as if it was "<本|を><読む>"
// so that we can use collocation data like "厚い本"
void ResolveCompoundSegment(const string &top_value, const string &value,
SegmentLookupType type,
vector<string> *output) {
// "格助詞"
// see "http://ja.wikipedia.org/wiki/助詞"
static const char kPat1[] = "\xE3\x81\x8C"; // "が"
// "の" was not good...
// static const char kPat2[] = "\xE3\x81\xAE"; // "の"
static const char kPat3[] = "\xE3\x82\x92"; // "を"
static const char kPat4[] = "\xE3\x81\xAB"; // "に"
static const char kPat5[] = "\xE3\x81\xB8"; // "へ"
static const char kPat6[] = "\xE3\x81\xA8"; // "と"
static const char kPat7[] = "\xE3\x81\x8B\xE3\x82\x89"; // "から"
static const char kPat8[] = "\xE3\x82\x88\xE3\x82\x8A"; // "より"
static const char kPat9[] = "\xE3\x81\xA7"; // "で"
static const struct {
const char *pat;
size_t len;
} kParticles[] = {
{kPat1, arraysize(kPat1) - 1},
// {kPat2, arraysize(kPat2) - 1},
{kPat3, arraysize(kPat3) - 1},
{kPat4, arraysize(kPat4) - 1},
{kPat5, arraysize(kPat5) - 1},
{kPat6, arraysize(kPat6) - 1},
{kPat7, arraysize(kPat7) - 1},
{kPat8, arraysize(kPat8) - 1},
{kPat9, arraysize(kPat9) - 1},
{NULL, 0}
};
for (size_t i = 0; kParticles[i].pat != NULL; ++i) {
const StringPiece particle(kParticles[i].pat, kParticles[i].len);
StringPiece first_content, second;
if (!ParseCompound(top_value, particle, &first_content, &second)) {
continue;
}
if (ParseCompound(value, particle, &first_content, &second)) {
if (type == LEFT) {
PushBackStringPiece(second, output);
PushBackJoinedStringPieces(first_content, particle, output);
} else {
PushBackStringPiece(first_content, output);
}
return;
}
}
}
// Decides whether |cand| is an acceptable replacement for |top_cand| (the
// current top candidate of the same segment) in a collocation rewrite, and
// appends to |output| the strings the caller should look up for |cand|.
//
// |type| says whether the segment is the LEFT or the RIGHT member of the
// collocation pair.  For LEFT the whole value is emitted; for RIGHT the
// content value is emitted, plus variants produced by the rules below.
//
// Returns true if |cand| is acceptable.  Strings may already have been
// appended to |output| when false is returned, so |output| is only meaningful
// on a true return.
bool IsNaturalContent(const Segment::Candidate &cand,
const Segment::Candidate &top_cand,
SegmentLookupType type,
vector<string> *output) {
const string &content = cand.content_value;
const string &value = cand.value;
const string &top_content = top_cand.content_value;
const string &top_value = top_cand.value;
// All *_len values in this function are lengths in characters
// (Util::CharsLen), not in bytes.
const size_t top_content_len = Util::CharsLen(top_content);
const size_t content_len = Util::CharsLen(content);
// For RIGHT, reject a different value whose content is a single character
// when the top candidate's content has two or more characters.
if (type == RIGHT &&
value != top_value &&
top_content_len >= 2 &&
content_len == 1) {
return false;
}
// Emit the basic lookup string(s).
if (type == LEFT) {
output->push_back(value);
} else {
output->push_back(content);
// "舞って" workaround
// V+"て" is often treated as one compound, so the content without its
// trailing "て" is emitted as well.
static const char kPat[] = "\xE3\x81\xA6"; // "て"
if (Util::EndsWith(content, StringPiece(kPat, arraysize(kPat) - 1))) {
PushBackStringPiece(
Util::SubStringPiece(content, 0, content_len - 1), output);
}
}
// we don't rewrite NUMBER to others and vice versa
if (ContainsNumber(value) != ContainsNumber(top_value)) {
return false;
}
// The "aux" part of a value is whatever follows its first
// |content length| characters.
// NOTE(review): this presumably relies on the value starting with the
// content value -- confirm.
const StringPiece top_aux_value =
Util::SubStringPiece(top_value, top_content_len, string::npos);
const size_t top_aux_value_len = Util::CharsLen(top_aux_value);
const Util::ScriptType top_value_script_type = Util::GetScriptType(top_value);
// we don't rewrite KATAKANA segment
// for example, we don't rewrite "コーヒー飲みます" to "珈琲飲みます"
if (type == LEFT &&
top_aux_value_len == 0 &&
top_value != value &&
top_value_script_type == Util::KATAKANA) {
return false;
}
// special cases: a top candidate whose content is exactly one of the
// characters below is always accepted.
if (top_content_len == 1) {
const char *begin = top_content.data();
const char *end = top_content.data() + top_content.size();
size_t mblen = 0;
const char32 wchar = Util::UTF8ToUCS4(begin, end, &mblen);
switch (wchar) {
case 0x304a: // "お"
case 0x5fa1: // "御"
case 0x3054: // "ご"
return true;
default:
break;
}
}
const StringPiece aux_value =
Util::SubStringPiece(value, content_len, string::npos);
// Remove number in normalization for the left segment.
string aux_normalized, top_aux_normalized;
CollocationUtil::GetNormalizedScript(
aux_value, (type == LEFT), &aux_normalized);
CollocationUtil::GetNormalizedScript(
top_aux_value, (type == LEFT), &top_aux_normalized);
// A non-empty normalized aux part that is not pure HIRAGANA is never
// accepted for RIGHT; for LEFT it must equal the top candidate's normalized
// aux part.
if (!aux_normalized.empty() &&
!Util::IsScriptType(aux_normalized, Util::HIRAGANA)) {
if (type == RIGHT) {
return false;
}
if (aux_normalized != top_aux_normalized) {
return false;
}
}
// May append extra lookup strings when both values are particle compounds.
ResolveCompoundSegment(top_value, value, type, output);
const size_t aux_value_len = Util::CharsLen(aux_value);
const size_t value_len = Util::CharsLen(value);
// "<XXいる|>" can be rewritten to "<YY|いる>" and vice versa
{
static const char kPat[] = "\xE3\x81\x84\xE3\x82\x8B"; // "いる"
const StringPiece kSuffix(kPat, arraysize(kPat) - 1);
// Top candidate: "いる" inside the content; this candidate: "いる" as aux.
if (top_aux_value_len == 0 &&
aux_value_len == 2 &&
Util::EndsWith(top_value, kSuffix) &&
Util::EndsWith(aux_value, kSuffix)) {
if (type == RIGHT) {
// "YYいる" in addition to "YY"
output->push_back(value);
}
return true;
}
// The reverse: this candidate has "いる" inside the content, the top
// candidate has it as aux.
if (aux_value_len == 0 &&
top_aux_value_len == 2 &&
Util::EndsWith(value, kSuffix) &&
Util::EndsWith(top_aux_value, kSuffix)) {
if (type == RIGHT) {
// "YY" in addition to "YYいる"
PushBackStringPiece(
Util::SubStringPiece(value, 0, value_len - 2), output);
}
return true;
}
}
// "<XXせる|>" can be rewritten to "<YY|せる>" and vice versa
// (same two checks as for "いる" above).
{
const char kPat[] = "\xE3\x81\x9B\xE3\x82\x8B"; // "せる"
const StringPiece kSuffix(kPat, arraysize(kPat) - 1);
if (top_aux_value_len == 0 &&
aux_value_len == 2 &&
Util::EndsWith(top_value, kSuffix) &&
Util::EndsWith(aux_value, kSuffix)) {
if (type == RIGHT) {
// "YYせる" in addition to "YY"
output->push_back(value);
}
return true;
}
if (aux_value_len == 0 &&
top_aux_value_len == 2 &&
Util::EndsWith(value, kSuffix) &&
Util::EndsWith(top_aux_value, kSuffix)) {
if (type == RIGHT) {
// "YY" in addition to "YYせる"
PushBackStringPiece(
Util::SubStringPiece(value, 0, value_len - 2), output);
}
return true;
}
}
const Util::ScriptType content_script_type = Util::GetScriptType(content);
// "<XX|する>" can be rewritten using "<XXす|る>" and "<XX|する>"
// in "<XX|する>", XX must be single script type
// "評する"
{
static const char kPat[] = "\xE3\x81\x99\xE3\x82\x8B"; // "する"
const StringPiece kSuffix(kPat, arraysize(kPat) - 1);
if (aux_value_len == 2 &&
Util::EndsWith(aux_value, kSuffix)) {
// Reject unless the content's script type is one of KATAKANA, HIRAGANA,
// KANJI or ALPHABET.
if (content_script_type != Util::KATAKANA &&
content_script_type != Util::HIRAGANA &&
content_script_type != Util::KANJI &&
content_script_type != Util::ALPHABET) {
return false;
}
if (type == RIGHT) {
// "YYす" in addition to "YY"
PushBackStringPiece(
Util::SubStringPiece(value, 0, value_len - 1), output);
}
return true;
}
}
// "<XXる>" can be rewritten using "<XX|る>"
// "まとめる", "衰える"
{
static const char kPat[] = "\xE3\x82\x8B"; // "る"
const StringPiece kSuffix(kPat, arraysize(kPat) - 1);
if (aux_value_len == 0 &&
Util::EndsWith(value, kSuffix)) {
if (type == RIGHT) {
// "YY" in addition to "YYる"
PushBackStringPiece(
Util::SubStringPiece(value, 0, value_len - 1), output);
}
return true;
}
}
// "<XXす>" can be rewritten using "XXする"
// (everything before the trailing "す" must be KANJI)
{
static const char kPat[] = "\xE3\x81\x99"; // "す"
const StringPiece kSuffix(kPat, arraysize(kPat) - 1);
if (Util::EndsWith(value, kSuffix) &&
Util::IsScriptType(
Util::SubStringPiece(value, 0, value_len - 1),
Util::KANJI)) {
if (type == RIGHT) {
const char kRu[] = "\xE3\x82\x8B"; // "る"
// "YYする" (the value followed by "る") as an additional lookup string
PushBackJoinedStringPieces(
value, StringPiece(kRu, arraysize(kRu) - 1), output);
}
return true;
}
}
// "<XXし|た>" can be rewritten using "<XX|した>"
{
static const char kPat[] = "\xE3\x81\x97\xE3\x81\x9F"; // "した"
// Each of the two characters in kPat is 3 bytes long.
const StringPiece kShi(kPat, 3), kTa(kPat + 3, 3);
if (Util::EndsWith(content, kShi) &&
aux_value == kTa &&
Util::EndsWith(top_content, kShi) &&
top_aux_value == kTa) {
if (type == RIGHT) {
// The content without its trailing "し".
const StringPiece val =
Util::SubStringPiece(content, 0, content_len - 1);
// XX must be KANJI
if (Util::IsScriptType(val, Util::KANJI)) {
PushBackStringPiece(val, output);
}
}
return true;
}
}
// None of the suffix rules applied.  From here on the candidate is only
// accepted if its aux part has the same character length as the top
// candidate's aux part.
const int aux_len = value_len - content_len;
const int top_aux_len = Util::CharsLen(top_value) - top_content_len;
if (aux_len != top_aux_len) {
return false;
}
const Util::ScriptType top_content_script_type =
Util::GetScriptType(top_content);
// we don't rewrite HIRAGANA to KATAKANA
if (top_content_script_type == Util::HIRAGANA &&
content_script_type == Util::KATAKANA) {
return false;
}
// we don't rewrite second KATAKANA
// for example, we don't rewrite "このコーヒー" to "この珈琲"
if (type == RIGHT &&
top_content_script_type == Util::KATAKANA &&
value != top_value) {
return false;
}
// A top candidate whose content is a single HIRAGANA character is not
// rewritten.
if (top_content_len == 1 &&
top_content_script_type == Util::HIRAGANA) {
return false;
}
// suppress "<身|ています>" etc.: don't swap one single-KANJI content for a
// different single-KANJI content when both aux parts have two or more
// characters.
if (top_content_len == 1 &&
content_len == 1 &&
top_aux_value_len >= 2 &&
aux_value_len >= 2 &&
top_content_script_type == Util::KANJI &&
content_script_type == Util::KANJI &&
top_content != content) {
return false;
}
return true;
}
// Just a wrapper of IsNaturalContent for debug (it is used inside DCHECK).
// Runs IsNaturalContent() for |cand| against |top_cand| with the given lookup
// |type|, discards the collected lookup strings and returns only the verdict.
bool VerifyNaturalContent(const Segment::Candidate &cand,
                          const Segment::Candidate &top_cand,
                          SegmentLookupType type) {
  vector<string> nexts;
  // Forward |type| rather than a hard-coded RIGHT so that the check really
  // verifies the side the caller asked about.
  return IsNaturalContent(cand, top_cand, type, &nexts);
}
// Returns true if the key of |seg| is classified as Util::UNKNOWN_SCRIPT by
// Util::IsScriptType().
inline bool IsKeyUnknown(const Segment &seg) {
  const bool is_unknown =
      Util::IsScriptType(seg.key(), Util::UNKNOWN_SCRIPT);
  return is_unknown;
}
} // namespace
// Walks over the conversion (non-history) segments and tries to promote
// candidates that form a known collocation with a neighboring segment.
// Returns true if at least one segment was rewritten.  Nothing is rewritten,
// and false is returned, if any conversion segment is FIXED_VALUE.
bool CollocationRewriter::RewriteCollocation(Segments *segments) const {
  const size_t first = segments->history_segments_size();
  const size_t total = segments->segments_size();

  // return false if at least one segment is fixed.
  for (size_t i = first; i < total; ++i) {
    if (segments->segment(i).segment_type() == Segment::FIXED_VALUE) {
      return false;
    }
  }

  // touched[i] is set once segment i has taken part in a successful rewrite.
  vector<bool> touched(total, false);
  bool any_rewrite = false;

  for (size_t i = first; i < total; ++i) {
    if (IsKeyUnknown(segments->segment(i))) {
      continue;
    }

    // First try to pair segment i with its right neighbor.  Only if that
    // fails, and segment i is still untouched, try its left neighbor's top
    // candidate.
    if (i + 1 < total &&
        RewriteUsingNextSegment(segments->mutable_segment(i + 1),
                                segments->mutable_segment(i))) {
      any_rewrite = true;
      touched[i] = true;
      touched[i + 1] = true;
    } else if (!touched[i] &&
               i > 0 &&
               RewriteFromPrevSegment(segments->segment(i - 1).candidate(0),
                                      segments->mutable_segment(i))) {
      any_rewrite = true;
      touched[i - 1] = true;
      touched[i] = true;
    }

    const Segment::Candidate &top = segments->segment(i).candidate(0);
    if (i < 2) {
      continue;
    }

    // Cross over only adverbs
    // Segment is adverb if;
    // 1) lid and rid is adverb.
    // 2) or rid is adverb suffix.
    const Segment::Candidate &skipped = segments->segment(i - 1).candidate(0);
    const bool skipped_is_adverb =
        (pos_matcher_->IsAdverb(skipped.lid) &&
         pos_matcher_->IsAdverb(skipped.rid)) ||
        pos_matcher_->IsAdverbSegmentSuffix(skipped.rid);
    if (!skipped_is_adverb) {
      continue;
    }
    // "・" workaround
    if (top.content_value == top.value && top.value == "\xe3\x83\xbb") {
      continue;
    }
    if (touched[i]) {
      continue;
    }

    // Pair segment i with segment i - 2, skipping the adverb in between.
    // The two-sided rewrite is only attempted while segment i - 2 is
    // untouched; otherwise (or if it fails) fall back to the one-sided one.
    const bool crossed =
        (!touched[i - 2] &&
         RewriteUsingNextSegment(segments->mutable_segment(i),
                                 segments->mutable_segment(i - 2))) ||
        RewriteFromPrevSegment(segments->segment(i - 2).candidate(0),
                               segments->mutable_segment(i));
    if (crossed) {
      any_rewrite = true;
      touched[i] = true;
      touched[i - 2] = true;
    }
  }
  return any_rewrite;
}
// Wraps the ExistenceFilter built from the collocation data and answers
// whether a (left, right) string pair is registered in it.
class CollocationRewriter::CollocationFilter {
 public:
  // Builds the filter via ExistenceFilter::Read() from |size| bytes at
  // |existence_data|.
  CollocationFilter(const char *existence_data, size_t size)
      : filter_(ExistenceFilter::Read(existence_data, size)) {}

  ~CollocationFilter() {}

  // Returns whether the filter reports the fingerprint of |left| + |right| as
  // existing.  Always false if either string is empty.
  bool Exists(const string &left, const string &right) const {
    const bool both_present = !left.empty() && !right.empty();
    if (!both_present) {
      return false;
    }
    string joined;
    joined.reserve(left.size() + right.size());
    joined.append(left);
    joined.append(right);
    const uint64 fingerprint = Util::Fingerprint(joined);
    return filter_->Exists(fingerprint);
  }

 private:
  scoped_ptr<ExistenceFilter> filter_;

  DISALLOW_COPY_AND_ASSIGN(CollocationFilter);
};
// Wraps the ExistenceFilter built from the collocation suppression data and
// answers whether a candidate is registered in it.
class CollocationRewriter::SuppressionFilter {
 public:
  // Builds the filter via ExistenceFilter::Read() from |size| bytes at
  // |suppression_data|.
  SuppressionFilter(const char *suppression_data, size_t size)
      : filter_(ExistenceFilter::Read(suppression_data, size)) {}

  ~SuppressionFilter() {}

  // Returns whether the filter reports the fingerprint of
  // content_value + "\t" + content_key of |cand| as existing.
  bool Exists(const Segment::Candidate &cand) const {
    // TODO(noriyukit): We should share key generation rule with
    // gen_collocation_suppression_data_main.cc.
    const string &content_value = cand.content_value;
    const string &content_key = cand.content_key;
    string lookup_key;
    lookup_key.reserve(content_value.size() + 1 + content_key.size());
    lookup_key.append(content_value);
    lookup_key.append("\t");
    lookup_key.append(content_key);
    const uint64 fingerprint = Util::Fingerprint(lookup_key);
    return filter_->Exists(fingerprint);
  }

 private:
  scoped_ptr<ExistenceFilter> filter_;

  DISALLOW_COPY_AND_ASSIGN(SuppressionFilter);
};
// Takes the POS matcher and the first/last name POS ids from |data_manager|,
// then builds the collocation filter and the suppression filter from the
// data blobs it provides.
CollocationRewriter::CollocationRewriter(
    const DataManagerInterface *data_manager)
    : pos_matcher_(data_manager->GetPOSMatcher()),
      first_name_id_(pos_matcher_->GetFirstNameId()),
      last_name_id_(pos_matcher_->GetLastNameId()) {
  // The same pair of out-parameters is used for both queries, one after the
  // other.
  const char *blob = NULL;
  size_t blob_size = 0;

  data_manager->GetCollocationData(&blob, &blob_size);
  collocation_filter_.reset(new CollocationFilter(blob, blob_size));

  data_manager->GetCollocationSuppressionData(&blob, &blob_size);
  suppression_filter_.reset(new SuppressionFilter(blob, blob_size));
}
// Destructor with an empty body: nothing is released explicitly here.
CollocationRewriter::~CollocationRewriter() {}
bool CollocationRewriter::Rewrite(const ConversionRequest &request,
Segments *segments) const {
return RewriteCollocation(segments);
}
// Returns true if the left POS id (lid) of |cand| equals the last-name or the
// first-name POS id obtained from the POS matcher at construction time.
// Logs the candidate value at verbosity level 3 when it is a name.
bool CollocationRewriter::IsName(const Segment::Candidate &cand) const {
  const bool ret = (cand.lid == last_name_id_ || cand.lid == first_name_id_);
  VLOG_IF(3, ret) << cand.value << " is name segment";
  return ret;
}
// Tries to promote a candidate of |seg| that forms a known collocation with
// |prev_cand|, the top candidate of a preceding segment.
//
// Only the first min(candidates_size(), kCandidateSize) candidates of |seg|
// are examined, in order.  Name candidates, suppressed candidates and
// candidates rejected by IsNaturalContent() are skipped.  The first candidate
// with a lookup string that the collocation filter accepts together with the
// normalized |prev_cand| value is moved to the top and marked
// CONTEXT_SENSITIVE.
//
// Returns true if such a candidate was found (even if it already was the top
// candidate), false otherwise.
bool CollocationRewriter::RewriteFromPrevSegment(
    const Segment::Candidate &prev_cand,
    Segment *seg) const {
  // Left-hand side of the lookup: normalized with the "remove number" option
  // turned on.
  string prev;
  CollocationUtil::GetNormalizedScript(prev_cand.value, true, &prev);

  const size_t i_max = min(seg->candidates_size(), kCandidateSize);

  // Reuse |curs| and |cur| in the loop as this method is performance critical.
  vector<string> curs;
  string cur;
  for (size_t i = 0; i < i_max; ++i) {
    if (IsName(seg->candidate(i))) {
      continue;
    }
    if (suppression_filter_->Exists(seg->candidate(i))) {
      continue;
    }
    curs.clear();
    if (!IsNaturalContent(seg->candidate(i), seg->candidate(0), RIGHT, &curs)) {
      continue;
    }

    // size_t index: curs.size() is unsigned, so avoid a signed/unsigned
    // comparison.
    for (size_t j = 0; j < curs.size(); ++j) {
      cur.clear();
      CollocationUtil::GetNormalizedScript(curs[j], false, &cur);
      if (collocation_filter_->Exists(prev, cur)) {
        VLOG_IF(3, i != 0) << prev << cur << " "
                           << seg->candidate(0).value << "->"
                           << seg->candidate(i).value;
        seg->move_candidate(i, 0);
        seg->mutable_candidate(0)->attributes
            |= Segment::Candidate::CONTEXT_SENSITIVE;
        return true;
      }
    }
  }
  return false;
}
// Tries to find a candidate of |seg| (left) and a candidate of |next_seg|
// (right) that together form a known collocation.
//
// Only the first min(candidates_size(), kCandidateSize) candidates of each
// segment are examined.  Name candidates, suppressed candidates and
// candidates rejected by IsNaturalContent() are skipped on both sides.  On
// the first hit, both candidates are moved to the top of their segments and
// marked CONTEXT_SENSITIVE.
//
// Returns true if such a pair was found, false otherwise.
bool CollocationRewriter::RewriteUsingNextSegment(Segment *next_seg,
                                                  Segment *seg) const {
  const size_t i_max = min(seg->candidates_size(), kCandidateSize);
  const size_t j_max = min(next_seg->candidates_size(), kCandidateSize);

  // Cache the results for the next segment.  Every entry of |next_seg_ok|
  // starts out as 0 (not usable) and is set to 1 once the candidate passes
  // all checks.
  vector<int> next_seg_ok(j_max);  // Avoiding vector<bool>
  vector<vector<string> > normalized_string(j_max);

  // Reuse |nexts| in the loop as this method is performance critical.
  vector<string> nexts;
  for (size_t j = 0; j < j_max; ++j) {
    if (IsName(next_seg->candidate(j))) {
      continue;
    }
    if (suppression_filter_->Exists(next_seg->candidate(j))) {
      continue;
    }
    nexts.clear();
    if (!IsNaturalContent(next_seg->candidate(j),
                          next_seg->candidate(0), RIGHT, &nexts)) {
      continue;
    }
    next_seg_ok[j] = 1;
    for (vector<string>::const_iterator it = nexts.begin();
         it != nexts.end(); ++it) {
      normalized_string[j].push_back(string());
      CollocationUtil::GetNormalizedScript(
          *it, false, &normalized_string[j].back());
    }
  }

  // Reuse |curs| and |cur| in the loop as this method is performance critical.
  vector<string> curs;
  string cur;
  for (size_t i = 0; i < i_max; ++i) {
    if (IsName(seg->candidate(i))) {
      continue;
    }
    if (suppression_filter_->Exists(seg->candidate(i))) {
      continue;
    }
    curs.clear();
    if (!IsNaturalContent(seg->candidate(i), seg->candidate(0), LEFT, &curs)) {
      continue;
    }

    // The inner indices are size_t because the container sizes are unsigned;
    // this avoids signed/unsigned comparisons.
    for (size_t k = 0; k < curs.size(); ++k) {
      cur.clear();
      // Left-hand side: normalized with the "remove number" option turned on.
      CollocationUtil::GetNormalizedScript(curs[k], true, &cur);
      for (size_t j = 0; j < j_max; ++j) {
        if (!next_seg_ok[j]) {
          continue;
        }
        const vector<string> &next_strings = normalized_string[j];
        for (size_t l = 0; l < next_strings.size(); ++l) {
          const string &next = next_strings[l];
          if (collocation_filter_->Exists(cur, next)) {
            DCHECK(VerifyNaturalContent(
                next_seg->candidate(j), next_seg->candidate(0), RIGHT))
                << "IsNaturalContent() should not fail here.";
            seg->move_candidate(i, 0);
            seg->mutable_candidate(0)->attributes
                |= Segment::Candidate::CONTEXT_SENSITIVE;
            next_seg->move_candidate(j, 0);
            next_seg->mutable_candidate(0)->attributes
                |= Segment::Candidate::CONTEXT_SENSITIVE;
            return true;
          }
        }
      }
    }
  }
  return false;
}
} // namespace mozc