// blob: 139e50afb7c26302ce6abdfc480ddb2b9c3d18fc [file] [log] [blame]
// Copyright 2010-2015, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "converter/key_corrector.h"
#include <cstring>
#include "base/logging.h"
#include "base/port.h"
#include "base/util.h"
namespace mozc {
namespace {
// Upper bound (in bytes) on the key length KeyCorrector handles.
// CorrectKey() rejects any key whose size is kMaxSize or more, so no key
// correction is attempted for overly long keys.
const size_t kMaxSize = 128;
// Sentinel stored in the alignment tables (and returned by the position
// lookups) for a byte offset that has no counterpart in the other key.
const size_t kInvalidPos = static_cast<size_t>(-1);
// "ん" (few "n" pettern)
// "んあ" -> "んな"
// "んい" -> "んに"
// "んう" -> "んぬ"
// "んえ" -> "んね"
// "んお" -> "んの"
bool RewriteNN(size_t key_pos,
const char *begin, const char *end,
size_t *mblen, string *output) {
if (key_pos == 0) {
*mblen = 0;
return false;
}
const char32 ucs4 = Util::UTF8ToUCS4(begin, end, mblen);
if (ucs4 != 0x3093) { // "ん"
*mblen = 0;
return false;
}
if (begin + *mblen >= end) {
*mblen = 0;
return false;
}
size_t mblen2 = 0;
const uint16 next_ucs4 = Util::UTF8ToUCS4(begin + *mblen, end, &mblen2);
uint16 output_ucs4 = 0x0000;
switch (next_ucs4) {
case 0x3042: // "あ"
output_ucs4 = 0x306A; // "な"
break;
case 0x3044: // "い"
output_ucs4 = 0x306B; // "に"
break;
case 0x3046: // "う"
output_ucs4 = 0x306C; // "ぬ"
break;
case 0x3048: // "え"
output_ucs4 = 0x306D; // "ね"
break;
case 0x304A: // "お"
output_ucs4 = 0x306E; // "の"
break;
default:
break;
}
if (output_ucs4 != 0x0000) { // "ん[あいうえお]"
Util::UCS4ToUTF8Append(ucs4, output);
Util::UCS4ToUTF8Append(output_ucs4, output);
*mblen += mblen2;
return true;
} else { // others
*mblen = 0;
return false;
}
return false;
}
// "んん" (many "n" pattern)
// "([^ん])んん[ん]" -> ignore
// "([^ん])んん[あいうえお]" -> $1 and leave "ん[あいうえお]"
// "([^ん])んん[^あいうえお]" -> $1"ん" and leave "[^あいうえお]"
bool RewriteDoubleNN(size_t key_pos,
const char *begin, const char *end,
size_t *mblen, string *output) {
// 0x3093: "ん"
static const uint16 kPattern[] = { 0x0000, 0x3093, 0x3093 };
*mblen = 0;
uint16 first_char = 0x0000;
size_t first_mblen = 0;
for (size_t i = 0; i < arraysize(kPattern); ++i) {
if (begin >= end) {
*mblen = 0;
return false;
}
size_t mblen2 = 0;
const char32 ucs4 = Util::UTF8ToUCS4(begin, end, &mblen2);
if ((kPattern[i] == 0x0000 && ucs4 != 0x3093 &&
Util::GetScriptType(ucs4) == Util::HIRAGANA) ||
(kPattern[i] == 0x3093 && ucs4 == 0x3093)) {
if (i == 0) {
first_mblen = mblen2;
first_char = ucs4;
}
} else {
*mblen = 0;
return false;
}
begin += mblen2;
*mblen += mblen2;
}
if (begin >= end) {
*mblen = 0;
return false;
}
if (first_char == 0x0000 || first_mblen == 0) {
*mblen = 0;
return false;
}
size_t mblen2 = 0;
const char32 ucs4 = Util::UTF8ToUCS4(begin, end, &mblen2);
if (ucs4 == 0x3093) { // "ん" ignore
*mblen = 0;
return false;
} else if (ucs4 == 0x3042 || // "[あいうえお]"
ucs4 == 0x3044 ||
ucs4 == 0x3046 ||
ucs4 == 0x3048 ||
ucs4 == 0x304A) {
// drop first "ん" and leave "ん[あいうえお]"
// remained part will be handled by RewriteNN(), e.g, "んあ" -> "んな"
Util::UCS4ToUTF8Append(first_char, output);
// Skip one Hiragana character in UTF-8, which is 3 bytes.
*mblen = first_mblen + 3;
return true;
} else { // "[^あいうえお]"
Util::UCS4ToUTF8Append(first_char, output);
Util::UCS4ToUTF8Append(0x3093, output); // "ん"
return true;
}
return false;
}
// "に" pattern
// "にゃ" -> "んや"
// "にゅ" -> "んゆ"
// "にょ" -> "んよ"
bool RewriteNI(size_t key_pos,
const char *begin, const char *end,
size_t *mblen, string *output) {
const char32 ucs4 = Util::UTF8ToUCS4(begin, end, mblen);
if (ucs4 != 0x306B) { // "に"
*mblen = 0;
return false;
}
if (begin + *mblen >= end) {
*mblen = 0;
return false;
}
size_t mblen2 = 0;
const uint16 next_ucs4 = Util::UTF8ToUCS4(begin + *mblen, end, &mblen2);
uint16 output_ucs4 = 0x0000;
switch (next_ucs4) {
case 0x3083: // "ゃ"
output_ucs4 = 0x3084; // "や"
break;
case 0x3085: // "ゅ"
output_ucs4 = 0x3086; // "ゆ"
break;
case 0x3087: // "ょ"
output_ucs4 = 0x3088; // "よ"
break;
default:
output_ucs4 = 0x0000;
break;
}
if (output_ucs4 != 0x0000) {
Util::UCS4ToUTF8Append(0x3093, output); // "ん"
Util::UCS4ToUTF8Append(output_ucs4, output);
*mblen += mblen2;
return true;
} else {
*mblen = 0;
return false;
}
return false;
}
// "m" Pattern (not BOS)
// "m[ばびぶべぼぱぴぷぺぽ]" -> "ん[ばびぶべぼぱぴぷぺぽ]"
// Rewrites a stray "m" that precedes a voiced or semi-voiced "は"-row
// character ("ばびぶべぼぱぴぷぺぽ") into "ん".
//
// key_pos: zero while the scan is still at the very start of the key; the
//          rewrite is never applied there.
// begin, end: UTF-8 range [begin, end) to examine.
// mblen: on success receives the number of input bytes consumed ("m" plus
//        the following character); set to 0 on failure.
// output: "ん" and the following character are appended on success;
//         untouched on failure.
// Returns true iff the rewrite was applied.
bool RewriteM(size_t key_pos,
              const char *begin, const char *end,
              size_t *mblen, string *output) {
  if (key_pos == 0) {
    *mblen = 0;
    return false;
  }

  const char32 ucs4 = Util::UTF8ToUCS4(begin, end, mblen);
  // ASCII "m" (U+006D) or full-width "ｍ" (U+FF4D). Capital letters are not
  // taken, as "M" might not be a misspelling.
  if (ucs4 != 0x006D && ucs4 != 0xFF4D) {
    *mblen = 0;
    return false;
  }

  if (begin + *mblen >= end) {
    *mblen = 0;
    return false;
  }

  size_t mblen2 = 0;
  // Keep the full 32-bit code point: this value is written back to |output|,
  // so narrowing it would both mis-match and corrupt a supplementary-plane
  // character.
  const char32 next_ucs4 = Util::UTF8ToUCS4(begin + *mblen, end, &mblen2);

  // "[はばぱひびぴふぶぷへべぺほぼぽ]" occupy [0x306F .. 0x307D] in that
  // order. 0x306F is a multiple of 3, so within this range exactly the plain
  // "はひふへほ" are multiples of 3; everything else is voiced/semi-voiced.
  const bool in_ha_row = (next_ucs4 >= 0x306F && next_ucs4 <= 0x307D);
  const bool is_plain_ha = (next_ucs4 % 3 == 0);
  if (!in_ha_row || is_plain_ha) {
    *mblen = 0;
    return false;
  }

  Util::UCS4ToUTF8Append(0x3093, output);  // "ん"
  Util::UCS4ToUTF8Append(next_ucs4, output);
  *mblen += mblen2;
  return true;
}
// "きっって" ->" きって"
// replace "([^っ])っっ([^っ])" => "$1っ$2"
// Runs of three or more "っ" are not touched,
// e.g., "かっっった" -> "かっっった"
bool RewriteSmallTSU(size_t key_pos,
const char *begin, const char *end,
size_t *mblen, string *output) {
// 0x0000 is a place holder for "[^っ]"
// "っ": 0x3063
static const uint16 kPattern[] = { 0x0000, 0x3063, 0x3063, 0x0000 };
uint16 first_char = 0x0000;
uint16 last_char = 0x0000;
for (size_t i = 0; i < arraysize(kPattern); ++i) {
if (begin >= end) {
*mblen = 0;
return false;
}
size_t mblen2 = 0;
const char32 ucs4 = Util::UTF8ToUCS4(begin, end, &mblen2);
if ((kPattern[i] == 0x0000 && ucs4 != 0x3063 &&
Util::GetScriptType(ucs4) == Util::HIRAGANA) ||
(kPattern[i] == 0x3063 && ucs4 == 0x3063)) {
if (i == 0) {
first_char = ucs4;
} else if (i == arraysize(kPattern) - 1) {
last_char = ucs4;
}
} else {
*mblen = 0;
return false;
}
begin += mblen2;
*mblen += mblen2;
}
if (first_char == 0x0000 || last_char == 0x0000) {
*mblen = 0;
return false;
}
Util::UCS4ToUTF8Append(first_char, output);
Util::UCS4ToUTF8Append(0x3063, output); // "っ"
Util::UCS4ToUTF8Append(last_char, output);
return true;
}
// "[consonant][ゃゅょ][^う]" pattern.
// Only the "ゅ" forms listed below are handled by RewriteYu(); the "ゃ" and
// "ょ" forms are not implemented, as they look minor.
// "きゅ[^う] -> きゅう"
// "しゅ[^う] -> しゅう"
// "ちゅ[^う] -> ちゅう"
// "にゅ[^う] -> にゅう"
// "ひゅ[^う] -> ひゅう"
// "りゅ[^う] -> りゅう"
bool RewriteYu(size_t key_pos,
const char *begin, const char *end,
size_t *mblen, string *output) {
const char32 first_char = Util::UTF8ToUCS4(begin, end, mblen);
if (first_char != 0x304D && first_char != 0x3057 &&
first_char != 0x3061 && first_char != 0x306B &&
first_char != 0x3072 && first_char != 0x308A) { // !"きしちにひり"
*mblen = 0;
return false;
}
if (begin + *mblen >= end) {
*mblen = 0;
return false;
}
size_t mblen2 = 0;
const char32 next_char = Util::UTF8ToUCS4(begin + *mblen, end, &mblen2);
if (next_char != 0x3085) { // "ゅ"
*mblen = 0;
return false;
}
if (begin + *mblen + mblen2 >= end) {
*mblen = 0;
return false;
}
size_t mblen3 = 0;
const char32 last_char = Util::UTF8ToUCS4(begin + *mblen + mblen2,
end, &mblen3);
if (last_char == 0x3046) { // "う"
*mblen = 0;
return false;
}
// OK, rewrite
*mblen += mblen2;
Util::UCS4ToUTF8Append(first_char, output);
Util::UCS4ToUTF8Append(next_char, output); // "ゅ"
Util::UCS4ToUTF8Append(0x3046, output); // "う"
return true;
}
} // namespace
// Constructs a corrector for |key| and immediately runs CorrectKey() on it.
// The return value of CorrectKey() is discarded here; IsAvailable() reports
// afterwards whether the correction succeeded.
KeyCorrector::KeyCorrector(const string &key, InputMode mode,
size_t history_size)
: available_(false), mode_(mode) {
CorrectKey(key, mode, history_size);
}
// Constructs an empty, unavailable corrector in ROMAN mode. No key is
// processed until CorrectKey() is called.
KeyCorrector::KeyCorrector()
: available_(false), mode_(ROMAN) {}
// Nothing to release explicitly; the destructor body is empty.
KeyCorrector::~KeyCorrector() {}
// Returns the input mode currently stored in this corrector (mode_).
KeyCorrector::InputMode KeyCorrector::mode() const {
return mode_;
}
// Returns true once CorrectKey() has completed successfully. Clear() (and
// therefore every new CorrectKey() call) resets it to false.
bool KeyCorrector::IsAvailable() const {
return available_;
}
// Returns the corrected key built by CorrectKey(). It is empty after
// Clear().
const string &KeyCorrector::corrected_key() const {
return corrected_key_;
}
// Returns the key that was passed to CorrectKey(). It is empty after
// Clear().
const string &KeyCorrector::original_key() const {
return original_key_;
}
// Maps a byte offset in the original key to the matching byte offset in the
// corrected key. Returns kInvalidPos when |original_key_pos| is outside the
// alignment table; the table entry itself may also be kInvalidPos.
size_t KeyCorrector::GetCorrectedPosition(const size_t original_key_pos) const {
  if (original_key_pos >= alignment_.size()) {
    return kInvalidPos;
  }
  return alignment_[original_key_pos];
}
// Maps a byte offset in the corrected key back to the matching byte offset
// in the original key. Returns kInvalidPos when |corrected_key_pos| is
// outside the reverse alignment table; the table entry itself may also be
// kInvalidPos.
size_t KeyCorrector::GetOriginalPosition(const size_t corrected_key_pos) const {
  if (corrected_key_pos >= rev_alignment_.size()) {
    return kInvalidPos;
  }
  return rev_alignment_[corrected_key_pos];
}
// Resets the corrector to the "nothing corrected" state: both alignment
// tables and both keys are emptied and the corrector is marked unavailable.
// The stored input mode is not touched.
void KeyCorrector::Clear() {
  // Alignment tables.
  rev_alignment_.clear();
  alignment_.clear();
  // Keys.
  corrected_key_.clear();
  original_key_.clear();
  // Availability flag.
  available_ = false;
}
bool KeyCorrector::CorrectKey(const string &key, InputMode mode,
size_t history_size) {
Clear();
// TOOD(taku) support KANA
if (mode == KANA) {
return false;
}
if (key.size() == 0 || key.size() >= kMaxSize) {
VLOG(1) << "invalid key length";
return false;
}
original_key_ = key;
const char *begin = key.data();
const char *end = key.data() + key.size();
const char *input_begin = key.data() + history_size;
size_t key_pos = 0;
while (begin < end) {
size_t mblen = 0;
const size_t org_len = corrected_key_.size();
if (begin < input_begin ||
(!RewriteDoubleNN(key_pos, begin, end, &mblen, &corrected_key_) &&
!RewriteNN(key_pos, begin, end, &mblen, &corrected_key_) &&
!RewriteYu(key_pos, begin, end, &mblen, &corrected_key_) &&
!RewriteNI(key_pos, begin, end, &mblen, &corrected_key_) &&
!RewriteSmallTSU(key_pos, begin, end, &mblen, &corrected_key_) &&
!RewriteM(key_pos, begin, end, &mblen, &corrected_key_))) {
const char32 ucs4 = Util::UTF8ToUCS4(begin, end, &mblen);
Util::UCS4ToUTF8Append(ucs4, &corrected_key_);
}
const size_t corrected_mblen = corrected_key_.size() - org_len;
if (corrected_mblen <= 0 && mblen <= 0) {
LOG(ERROR) << "Invalid pattern: " << key;
Clear();
return false;
}
// one to one mapping
if (mblen == corrected_mblen) {
const size_t len = static_cast<size_t>(begin - key.data());
for (size_t i = 0; i < mblen; ++i) {
alignment_.push_back(org_len + i);
rev_alignment_.push_back(len + i);
}
} else {
// NOT a one to one maping, we take fist/last alignment only
alignment_.push_back(org_len);
for (size_t i = 1; i < mblen; ++i) {
alignment_.push_back(kInvalidPos);
}
rev_alignment_.push_back(static_cast<size_t>(begin - key.data()));
for (size_t i = 1; i < corrected_mblen; ++i) {
rev_alignment_.push_back(kInvalidPos);
}
}
begin += mblen;
++key_pos;
}
DCHECK_EQ(original_key_.size(), alignment_.size());
DCHECK_EQ(corrected_key_.size(), rev_alignment_.size());
available_ = true;
return true;
}
const char *KeyCorrector::GetCorrectedPrefix(const size_t original_key_pos,
size_t *length) const {
if (!IsAvailable()) {
*length = 0;
return NULL;
}
if (mode_ == KANA) {
*length = 0;
return NULL;
}
const size_t corrected_key_pos = GetCorrectedPosition(original_key_pos);
if (!IsValidPosition(corrected_key_pos)) {
*length = 0;
return NULL;
}
const char *corrected_substr = corrected_key_.data() + corrected_key_pos;
const size_t corrected_length = corrected_key_.size() - corrected_key_pos;
const char *original_substr = original_key_.data() + original_key_pos;
const size_t original_length = original_key_.size() - original_key_pos;
// substrs are different
if (original_length != corrected_length ||
memcmp(original_substr, corrected_substr, original_length) != 0) {
*length = corrected_length;
return corrected_substr;
}
*length = 0;
return NULL;
}
// Converts a byte length measured in the corrected key into the matching
// byte length in the original key.
//
// original_key_pos: start offset in the original key.
// new_key_offset:   byte length in the corrected key, measured from the
//                   corrected position that |original_key_pos| maps to.
//
// Returns kInvalidPos when the corrector is not available, when the mode is
// KANA, when either end of the span has no valid alignment, or when the
// mapped end would lie before |original_key_pos|.
size_t KeyCorrector::GetOriginalOffset(const size_t original_key_pos,
                                       const size_t new_key_offset) const {
  if (!IsAvailable() || mode_ == KANA) {
    return kInvalidPos;
  }

  const size_t corrected_begin = GetCorrectedPosition(original_key_pos);
  if (!IsValidPosition(corrected_begin)) {
    return kInvalidPos;
  }

  const size_t corrected_end = corrected_begin + new_key_offset;
  if (corrected_end == rev_alignment_.size()) {
    // The span reaches the end of the corrected key, which has no entry in
    // the reverse table: measure against the end of the original key.
    return alignment_.size() - GetOriginalPosition(corrected_begin);
  }

  const size_t original_end = GetOriginalPosition(corrected_end);
  if (!IsValidPosition(original_end)) {
    return kInvalidPos;
  }
  // An end that maps before the start is rejected.
  if (original_end < original_key_pos) {
    return kInvalidPos;
  }
  return static_cast<size_t>(original_end - original_key_pos);
}
// static
// Returns true unless |pos| is the kInvalidPos sentinel.
bool KeyCorrector::IsValidPosition(size_t pos) {
return (pos != kInvalidPos);
}
// static
// Returns true iff |pos| is the kInvalidPos sentinel.
bool KeyCorrector::IsInvalidPosition(size_t pos) {
return (pos == kInvalidPos);
}
// static
// Returns the sentinel value (kInvalidPos) used for positions that have no
// alignment.
size_t KeyCorrector::InvalidPosition() {
return kInvalidPos;
}
// static
// Returns the cost added to a word that was found through a corrected key.
// Keys containing "んん" or "っっ" are treated as certain misspellings and
// get no penalty; every other key gets 3000 added to the original word cost.
int KeyCorrector::GetCorrectedCostPenalty(const string &key) {
  // UTF-8 byte sequences of "んん" and "っっ".
  const char *const kDoubleN = "\xE3\x82\x93\xE3\x82\x93";
  const char *const kDoubleSmallTsu = "\xE3\x81\xA3\xE3\x81\xA3";
  const int kCorrectedCostPenalty = 3000;

  const bool is_certain_misspelling =
      key.find(kDoubleN) != string::npos ||
      key.find(kDoubleSmallTsu) != string::npos;
  return is_certain_misspelling ? 0 : kCorrectedCostPenalty;
}
} // namespace mozc