blob: f6bf98a658155742bd2ba5b062facdc257e9d7ca [file] [log] [blame]
// Copyright 2010-2015, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "rewriter/language_aware_rewriter.h"
#include <string>
#include "base/logging.h"
#include "base/util.h"
#include "composer/composer.h"
#include "config/config.pb.h"
#include "config/config_handler.h"
#include "converter/conversion_request.h"
#include "converter/segments.h"
#include "dictionary/dictionary_interface.h"
#include "dictionary/pos_matcher.h"
#include "session/commands.pb.h"
#include "usage_stats/usage_stats.h"
namespace mozc {
LanguageAwareRewriter::LanguageAwareRewriter(
const POSMatcher &pos_matcher,
const DictionaryInterface *dictionary)
: unknown_id_(pos_matcher.GetUnknownId()),
dictionary_(dictionary) {}
LanguageAwareRewriter::~LanguageAwareRewriter() {}
namespace {
bool IsEnabled(const mozc::commands::Request &request) {
// The current default value of language_aware_input is
// NO_LANGUAGE_AWARE_INPUT and only unittests set LANGUAGE_AWARE_SUGGESTION
// at this moment. Thus, FillRawText is not performed in the productions
// yet.
if (request.language_aware_input() ==
mozc::commands::Request::NO_LANGUAGE_AWARE_INPUT) {
return false;
} else if (request.language_aware_input() ==
mozc::commands::Request::LANGUAGE_AWARE_SUGGESTION) {
return true;
}
DCHECK_EQ(mozc::commands::Request::DEFAULT_LANGUAGE_AWARE_BEHAVIOR,
request.language_aware_input());
if (!GET_CONFIG(use_spelling_correction)) {
return false;
}
#ifdef OS_ANDROID
return false;
#else // OS_ANDROID
return true;
#endif // OS_ANDROID
}
} // namespace
int LanguageAwareRewriter::capability(
const ConversionRequest &request) const {
// Language aware input is performed only on suggestion or prediction.
if (!IsEnabled(request.request())) {
return RewriterInterface::NOT_AVAILABLE;
}
return (RewriterInterface::SUGGESTION | RewriterInterface::PREDICTION);
}
namespace {
bool IsRawQuery(const composer::Composer &composer,
const DictionaryInterface *dictionary,
int *rank) {
string raw_text;
composer.GetRawString(&raw_text);
// Check if the length of text is less than or equal to three.
// For example, "cat" is not treated as a raw query so far to avoid
// false negative cases.
if (raw_text.size() <= 3) {
return false;
}
// If the composition string is same with the raw_text, there is no
// need to add the candidate to suggestions.
string composition;
composer.GetStringForPreedit(&composition);
if (composition == raw_text) {
return false;
}
// If alphabet characters are in the middle of the composition, it is
// probably a raw query. For example, "えぁmpぇ" (example) contains
// "m" and "p" in the middle. So it is treated as a raw query. On the
// other hand, "くえry" (query) contains alphabet characters, but they
// are at the end of the string, so it cannot be determined here.
//
// Note, GetQueryForPrediction omits the trailing alphabet characters of
// the composition string and returns it.
string key;
composer.GetQueryForPrediction(&key);
if (Util::ContainsScriptType(key, Util::ALPHABET)) {
*rank = 0;
return true;
}
// If the composition is storead as a key in the dictionary like
// "はな" (hana), "たけ" (take), the query is not handled as a raw query.
// It is a little conservative, but a safer way.
if (dictionary->HasKey(key)) {
return false;
}
// If the input text is stored in the dictionary, it is perhaps a raw query.
// For example, the input characters of "れもヴぇ" (remove) is in the
// dictionary, so it is treated as a raw text.
if (dictionary->HasValue(raw_text)) {
*rank = 2;
return true;
}
return false;
}
// Get T13n candidate ids from existing candidates.
void GetAlphabetIds(const Segment &segment, uint16 *lid, uint16 *rid) {
DCHECK(lid);
DCHECK(rid);
for (int i = 0; i < segment.candidates_size(); ++i) {
const Segment::Candidate &candidate = segment.candidate(i);
const Util::ScriptType type = Util::GetScriptType(candidate.value);
if (type == Util::ALPHABET) {
*lid = candidate.lid;
*rid = candidate.rid;
return;
}
}
}
} // namespace
// Note: This function seemed slow, but the benchmark tests
// resulted that it was only less than 0.1% point penalty.
// = session_handler_benchmark_test
// BM_PerformanceForRandomKeyEvents: 891944807 -> 892740748 (1.00089)
// = converter_benchmark_test
// BM_DesktopAnthyCorpusConversion 25062440090 -> 25101542382 (1.002)
// BM_DesktopStationPredictionCorpusPrediction 8695341697 -> 8672187681 (0.997)
// BM_DesktopStationPredictionCorpusSuggestion 6149502840 -> 6152393270 (1.000)
bool LanguageAwareRewriter::FillRawText(
const ConversionRequest &request, Segments *segments) const {
if (segments->conversion_segments_size() != 1 || !request.has_composer()) {
return false;
}
int rank = 0;
if (!IsRawQuery(request.composer(), dictionary_, &rank)) {
return false;
}
Segment *segment = segments->mutable_conversion_segment(0);
string raw_string;
request.composer().GetRawString(&raw_string);
uint16 lid = unknown_id_;
uint16 rid = unknown_id_;
GetAlphabetIds(*segment, &lid, &rid);
// Create a candidate.
if (rank > segment->candidates_size()) {
rank = segment->candidates_size();
}
Segment::Candidate *candidate = segment->insert_candidate(rank);
candidate->Init();
candidate->value = raw_string;
candidate->key = raw_string;
candidate->content_value = raw_string;
candidate->content_key = raw_string;
candidate->lid = lid;
candidate->rid = rid;
candidate->attributes |= (Segment::Candidate::NO_VARIANTS_EXPANSION |
Segment::Candidate::NO_EXTRA_DESCRIPTION);
candidate->prefix = "\xE2\x86\x92 "; // "→ "
candidate->description =
// "もしかして"
"\xE3\x82\x82\xE3\x81\x97\xE3\x81\x8B\xE3\x81\x97\xE3\x81\xA6";
// Set usage stats
usage_stats::UsageStats::IncrementCount("LanguageAwareSuggestionTriggered");
return true;
}
bool LanguageAwareRewriter::Rewrite(
const ConversionRequest &request, Segments *segments) const {
if (!IsEnabled(request.request())) {
return false;
}
return FillRawText(request, segments);
}
namespace {
bool IsLangaugeAwareInputCandidate(const composer::Composer &composer,
const Segment::Candidate &candidate) {
// Check candidate.prefix to filter if the candidate is probably generated
// from LanguangeAwareInput or not.
//
// "→ "
if (candidate.prefix != "\xE2\x86\x92 ") {
return false;
}
string raw_string;
composer.GetRawString(&raw_string);
if (raw_string != candidate.value) {
return false;
}
return true;
}
} // namespace
void LanguageAwareRewriter::Finish(const ConversionRequest &request,
Segments *segments) {
if (request.request().language_aware_input() !=
mozc::commands::Request::LANGUAGE_AWARE_SUGGESTION) {
return;
}
if (segments->conversion_segments_size() != 1 || !request.has_composer()) {
return;
}
// Update usage stats
const Segment &segment = segments->conversion_segment(0);
// Ignores segments which are not converted or not committed.
if (segment.candidates_size() == 0 ||
segment.segment_type() != Segment::FIXED_VALUE) {
return;
}
if (IsLangaugeAwareInputCandidate(request.composer(),
segment.candidate(0))) {
usage_stats::UsageStats::IncrementCount("LanguageAwareSuggestionCommitted");
}
}
} // namespace mozc