blob: 7f3074571ac55634a51b3857be1864640472b7b5 [file] [log] [blame]
// Copyright 2010-2014, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "rewriter/focus_candidate_rewriter.h"
#include <string>
#include "base/logging.h"
#include "base/number_util.h"
#include "base/util.h"
#include "converter/segments.h"
#include "data_manager/data_manager_interface.h"
#include "dictionary/pos_matcher.h"
#include "rewriter/number_compound_util.h"
namespace mozc {
namespace {
enum {
NUMBER = 1,
SUFFIX = 2,
CONNECTOR = 4
};
// TODO(taku): See POS and increase the coverage.
bool IsConnectorSegment(const Segment &segment) {
return (segment.key() == "\xE3\x81\xA8" ||
segment.key() == "\xE3\x82\x84");
}
// Finds value from the candidates list and move the canidate to the top.
bool RewriteCandidate(Segment *segment, const string &value) {
for (int i = 0; i < segment->candidates_size(); ++i) {
if (segment->candidate(i).content_value == value) {
segment->move_candidate(i, 0); // move to top
return true;
}
}
// Find value from meta candidates.
for (int i = 0; i < segment->meta_candidates_size(); ++i) {
if (segment->meta_candidate(i).content_value == value) {
segment->move_candidate(-i - 1, 0); // copy to top
return true;
}
}
return false;
}
// Returns true if the segment is valid.
bool IsValidSegment(const Segment &segment) {
return (segment.segment_type() == Segment::FREE ||
segment.segment_type() == Segment::FIXED_BOUNDARY ||
segment.segment_type() == Segment::FIXED_VALUE);
}
bool IsNumberCandidate(const Segment::Candidate &candidate) {
return (candidate.style != NumberUtil::NumberString::DEFAULT_STYLE ||
Util::GetScriptType(candidate.value) == Util::NUMBER);
}
bool IsNumberSegment(const Segment &segment) {
return (segment.candidates_size() > 0 &&
IsNumberCandidate(segment.candidate(0)));
}
// Returns true if two candidates have the same number form.
bool IsSameNumberType(const Segment::Candidate &candidate1,
const Segment::Candidate &candidate2) {
if (candidate1.style == candidate2.style) {
if (candidate1.style == NumberUtil::NumberString::DEFAULT_STYLE) {
if (IsNumberCandidate(candidate1) && IsNumberCandidate(candidate2) &&
Util::GetFormType(candidate1.value) ==
Util::GetFormType(candidate2.value)) {
return true;
}
} else {
return true;
}
}
return false;
}
bool RewriteNumber(Segment *segment, const Segment::Candidate &candidate) {
for (int i = 0; i < segment->candidates_size(); ++i) {
if (IsSameNumberType(candidate, segment->candidate(i))) {
segment->move_candidate(i, 0); // move to top
return true;
}
}
// Find value from meta candidates.
for (int i = 0; i < segment->meta_candidates_size(); ++i) {
if (IsSameNumberType(candidate, segment->meta_candidate(i))) {
segment->move_candidate(-i - 1, 0); // copy to top
return true;
}
}
return false;
}
} // namespace
FocusCandidateRewriter::FocusCandidateRewriter(
const DataManagerInterface *data_manager) {
data_manager->GetCounterSuffixSortedArray(&suffix_array_,
&suffix_array_size_);
pos_matcher_ = data_manager->GetPOSMatcher();
}
FocusCandidateRewriter::~FocusCandidateRewriter() {}
bool FocusCandidateRewriter::Focus(Segments *segments,
size_t segment_index,
int candidate_index) const {
if (segments == NULL) {
LOG(ERROR) << "Segments is NULL";
return false;
}
if (segment_index >= segments->segments_size()) {
LOG(WARNING) << "Segment index out of range";
return false;
}
const Segment &seg = segments->segment(segment_index);
// segment_type must be FREE or FIXED_BOUNDARY
if (!IsValidSegment(seg)) {
LOG(WARNING) << "Segment is not valid";
return false;
}
// invalid candidate index
if ((candidate_index < 0 &&
-candidate_index - 1 >= static_cast<int>(seg.meta_candidates_size())) ||
candidate_index >= static_cast<int>(seg.candidates_size())) {
LOG(WARNING) << "Candidate index out of range: "
<< candidate_index << " " << seg.candidates_size();
return false;
}
// left to right
{
const string &left_value = seg.candidate(candidate_index).content_value;
string right_value;
if (Util::IsOpenBracket(left_value, &right_value)) {
int num_nest = 1;
for (size_t i = segment_index + 1; i < segments->segments_size(); ++i) {
Segment *target_right_seg = segments->mutable_segment(i);
if (target_right_seg == NULL ||
target_right_seg->candidates_size() <= 0) {
LOG(WARNING) << "target right seg is not valid";
return false;
}
if (!IsValidSegment(*target_right_seg)) {
continue;
}
const string &target_right_value =
target_right_seg->candidate(0).content_value;
string tmp;
if (Util::IsOpenBracket(target_right_value, &tmp)) {
++num_nest;
} else if (Util::IsCloseBracket(target_right_value, &tmp)) {
--num_nest;
}
if (num_nest == 0 && RewriteCandidate(target_right_seg, right_value)) {
return true;
}
}
VLOG(1) << "could not find close bracket";
return false;
}
}
// right to left
{
const string &right_value = seg.candidate(candidate_index).content_value;
string left_value;
if (Util::IsCloseBracket(right_value, &left_value)) {
int num_nest = 1;
for (int i = segment_index - 1; i >= 0; --i) {
Segment *target_left_seg = segments->mutable_segment(i);
if (target_left_seg == NULL ||
target_left_seg->candidates_size() <= 0) {
LOG(WARNING) << "target left seg is not valid";
return false;
}
if (!IsValidSegment(*target_left_seg)) {
continue;
}
const string &target_left_value =
target_left_seg->candidate(0).content_value;
string tmp;
if (Util::IsCloseBracket(target_left_value, &tmp)) {
++num_nest;
} else if (Util::IsOpenBracket(target_left_value, &tmp)) {
--num_nest;
}
if (num_nest == 0 && RewriteCandidate(target_left_seg, left_value)) {
return true;
}
}
VLOG(1) << "could not find open bracket";
return false;
}
}
{
if (IsNumberCandidate(seg.candidate(candidate_index))) {
bool modified = 0;
int distance = 0;
for (size_t i = segment_index + 1; i < segments->segments_size(); ++i) {
Segment *target_right_seg = segments->mutable_segment(i);
if (target_right_seg == NULL ||
target_right_seg->candidates_size() <= 0) {
LOG(WARNING) << "target right seg is not valid";
return false;
}
if (!IsValidSegment(*target_right_seg)) {
continue;
}
// Make sure the first candidate of the segment is number.
if (IsNumberSegment(*target_right_seg) &&
RewriteNumber(target_right_seg,
seg.candidate(candidate_index))) {
modified = true;
distance = 0;
} else {
++distance;
}
// more than two segments between the target numbers
if (distance >= 2) {
break;
}
}
return modified;
}
}
{
// <Number><Suffix><Connector>?<Number><Suffix><Connector>?
if (segment_index > 0 &&
IsNumberSegment(segments->segment(segment_index - 1)) &&
seg.candidates_size() > 0 &&
seg.candidate(0).content_key ==
seg.candidate(candidate_index).content_key) {
int next_stat = CONNECTOR | NUMBER;
bool modified = false;
for (size_t i = segment_index + 1; i < segments->segments_size(); ++i) {
if (next_stat == (CONNECTOR | NUMBER)) {
if (IsConnectorSegment(segments->segment(i))) {
next_stat = NUMBER;
} else if (IsNumberSegment(segments->segment(i))) {
next_stat = SUFFIX;
} else {
break;
}
} else if (next_stat == NUMBER &&
IsNumberSegment(segments->segment(i))) {
next_stat = SUFFIX;
} else if (next_stat == SUFFIX &&
segments->segment(i).candidates_size() > 0 &&
segments->segment(i).candidate(0).content_key ==
seg.candidate(0).content_key) {
if (!IsValidSegment(segments->segment(i))) {
continue;
}
modified |= RewriteCandidate(
segments->mutable_segment(i),
seg.candidate(candidate_index).content_value);
next_stat = CONNECTOR | NUMBER;
} else {
break;
}
}
return modified;
}
}
return RerankNumberCandidates(segments, segment_index, candidate_index);
}
bool FocusCandidateRewriter::RerankNumberCandidates(Segments *segments,
size_t segment_index,
int candidate_index) const {
// Check if the focused candidate is a number compound.
StringPiece number, suffix;
uint32 number_script_type = 0;
const Segment &seg = segments->segment(segment_index);
if (!ParseNumberCandidate(seg.candidate(candidate_index), &number, &suffix,
&number_script_type)) {
return false;
}
if (number.empty()) {
return false;
}
// Try reranking top candidates of subsequent segments using the number
// compound style of the focused candidate.
bool modified = false;
int distance = 0;
for (size_t i = segment_index + 1; i < segments->segments_size(); ++i) {
Segment *seg = segments->mutable_segment(i);
const int index = FindMatchingCandidates(*seg, number_script_type, suffix);
if (index == -1) {
// If there's no appropriate candidate having the same style, we increment
// the distance not to modify segments far from the focused one.
if (++distance > 2) {
break;
}
continue;
}
// Move the target candidate to the top. We don't need to move if the
// target is already at top (i.g., the case where index == 0).
if (index > 0) {
seg->move_candidate(index, 0);
modified = true;
distance = 0;
}
}
return modified;
}
int FocusCandidateRewriter::FindMatchingCandidates(
const Segment &seg, uint32 ref_script_type, StringPiece ref_suffix) const {
// Only segments whose top candidate is a number compound are target of
// reranking.
const Segment::Candidate &cand = seg.candidate(0);
StringPiece number, suffix;
uint32 script_type = 0;
if (!ParseNumberCandidate(cand, &number, &suffix, &script_type)) {
return -1;
}
// Top candidate matches the style.
if (script_type == ref_script_type && suffix == ref_suffix) {
return 0;
}
// Check only top 10 candidates because, when the top candidate is a number
// candidate, other number compounds likely to appear near the top candidate.
const size_t max_size = min(seg.candidates_size(), static_cast<size_t>(10));
for (size_t i = 1; i < max_size; ++i) {
if (!ParseNumberCandidate(seg.candidate(i), &number, &suffix,
&script_type)) {
continue;
}
if (script_type == ref_script_type && suffix == ref_suffix) {
return i;
}
}
return -1;
}
bool FocusCandidateRewriter::ParseNumberCandidate(
const Segment::Candidate &cand, StringPiece* number,
StringPiece* suffix, uint32 *script_type) const {
// If the lengths of content value and value are different, particles may be
// appended to value. In such cases, we only accept parallel markers.
// Otherwise, the following wrong rewrite will occur.
// Example: "一階へは | 二回 | 行った -> 一階へは | 二階 | 行った"
if (cand.content_value.size() != cand.value.size()) {
if (!pos_matcher_->IsParallelMarker(cand.rid)) {
return false;
}
}
return number_compound_util::SplitStringIntoNumberAndCounterSuffix(
suffix_array_, suffix_array_size_,
cand.content_value, number, suffix, script_type);
}
} // namespace mozc