// blob: 436647b56b9fea413e1d4e77f505929429774c08 [file] [log] [blame]
// Copyright 2010-2015, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "rewriter/unicode_rewriter.h"
#include <cstddef>
#include <cstdlib>
#include <string>
#include "base/port.h"
#include "base/system_util.h"
#include "base/util.h"
#include "composer/composer.h"
#include "config/config.pb.h"
#include "config/config_handler.h"
#include "converter/conversion_request.h"
#include "converter/segments.h"
#include "engine/engine_interface.h"
#include "engine/mock_data_engine_factory.h"
#include "session/commands.pb.h"
#include "testing/base/public/gunit.h"
// Declares the test_tmpdir string flag (FLAGS_test_tmpdir). The fixture's
// SetUp() passes it to SystemUtil::SetUserProfileDirectory().
DECLARE_string(test_tmpdir);
namespace mozc {
namespace {
void AddSegment(const string &key, const string &value, Segments *segments) {
Segment *seg = segments->add_segment();
Segment::Candidate *candidate = seg->add_candidate();
seg->set_key(key);
candidate->content_key = key;
candidate->value = value;
candidate->content_value = value;
}
void InitSegments(const string &key, const string &value, Segments *segments) {
segments->Clear();
AddSegment(key, value, segments);
}
bool ContainCandidate(const Segments &segments, const string &candidate) {
const Segment &segment = segments.segment(0);
for (size_t i = 0; i < segment.candidates_size(); ++i) {
if (candidate == segment.candidate(i).value) {
return true;
}
}
return false;
}
} // namespace
class UnicodeRewriterTest : public testing::Test {
protected:
// Workaround for C2512 error (no default appropriate constructor) on MSVS.
UnicodeRewriterTest() {}
virtual ~UnicodeRewriterTest() {}
virtual void SetUp() {
SystemUtil::SetUserProfileDirectory(FLAGS_test_tmpdir);
config::Config config;
config::ConfigHandler::GetDefaultConfig(&config);
config::ConfigHandler::SetConfig(config);
engine_.reset(MockDataEngineFactory::Create());
}
scoped_ptr<EngineInterface> engine_;
const commands::Request &default_request() const {
return default_request_;
}
private:
const commands::Request default_request_;
};
// Checks rewriting from the "U+XXXX" code point notation to the character it
// denotes:
//  - every printable ASCII code point (0x20-0x7E) is rewritten,
//  - every entry of kUcs4Utf8Data yields a candidate with the listed UTF-8
//    bytes,
//  - the code points listed as unsupported are rejected,
//  - input that is not in the expected "U+XXXX" style is rejected.
TEST_F(UnicodeRewriterTest, UnicodeConvertionTest) {
  Segments segments;
  UnicodeRewriter rewriter(engine_->GetConverter());
  const ConversionRequest request;

  // Pair of a "U+XXXX" expression and the UTF-8 byte sequence expected for it.
  struct UCS4UTF8Data {
    const char *ucs4;
    const char *utf8;
  };

  const UCS4UTF8Data kUcs4Utf8Data[] = {
    // Hiragana
    { "U+3042", "\xE3\x81\x82" },  // "あ"
    { "U+3044", "\xE3\x81\x84" },  // "い"
    { "U+3046", "\xE3\x81\x86" },  // "う"
    { "U+3048", "\xE3\x81\x88" },  // "え"
    { "U+304A", "\xE3\x81\x8A" },  // "お"
    // Katakana
    { "U+30A2", "\xE3\x82\xA2" },  // "ア"
    { "U+30A4", "\xE3\x82\xA4" },  // "イ"
    { "U+30A6", "\xE3\x82\xA6" },  // "ウ"
    { "U+30A8", "\xE3\x82\xA8" },  // "エ"
    { "U+30AA", "\xE3\x82\xAA" },  // "オ"
    // half-Katakana
    { "U+FF71", "\xEF\xBD\xB1" },  // "ｱ"
    { "U+FF72", "\xEF\xBD\xB2" },  // "ｲ"
    { "U+FF73", "\xEF\xBD\xB3" },  // "ｳ"
    { "U+FF74", "\xEF\xBD\xB4" },  // "ｴ"
    { "U+FF75", "\xEF\xBD\xB5" },  // "ｵ"
    // CJK
    { "U+611B", "\xE6\x84\x9B" },  // "愛"
    { "U+690D", "\xE6\xA4\x8D" },  // "植"
    { "U+7537", "\xE7\x94\xB7" },  // "男"
    // Other types (Oriya script)
    { "U+0B00", "\xE0\xAC\x80" },  // "଀"
    { "U+0B01", "\xE0\xAC\x81" },  // "ଁ"
    { "U+0B02", "\xE0\xAC\x82" },  // "ଂ"
    // Other types (Arabic)
    { "U+0600", "\xD8\x80" },  // "؀"
    { "U+0601", "\xD8\x81" },  // "؁"
    { "U+0602", "\xD8\x82" },  // "؂"
    // Latin-1 support
    { "U+00A0", "\xC2\xA0" },  // NO-BREAK SPACE
    { "U+00A1", "\xC2\xA1" },  // "¡"
  };

  // "U+XXXX" expressions the rewriter is expected to reject.
  const char *const kUnsupportedUcs4[] = {
    // Control characters
    "U+0000", "U+001F", "U+007F", "U+0080", "U+009F",
    // Out of Unicode
    "U+110000",
    // Bidirectional text
    "U+200E", "U+202D",
  };

  // Every printable ASCII code point is accepted.
  for (uint32 ascii = 0x20; ascii < 0x7F; ++ascii) {
    const string ucs4 = Util::StringPrintf("U+00%02X", ascii);
    InitSegments(ucs4, ucs4, &segments);
    EXPECT_TRUE(rewriter.Rewrite(request, &segments));
    // Compare char against char: |ascii| is unsigned while at(0) yields a
    // char, and handing both to EXPECT_EQ is a mixed-type comparison.
    // |ascii| is below 0x7F here, so the cast never loses information.
    EXPECT_EQ(static_cast<char>(ascii),
              segments.segment(0).candidate(0).value.at(0));
  }

  // The rewriter accepts the characters listed in kUcs4Utf8Data.
  for (size_t i = 0; i < arraysize(kUcs4Utf8Data); ++i) {
    InitSegments(kUcs4Utf8Data[i].ucs4, kUcs4Utf8Data[i].ucs4, &segments);
    EXPECT_TRUE(rewriter.Rewrite(request, &segments));
    EXPECT_TRUE(ContainCandidate(segments, kUcs4Utf8Data[i].utf8));
  }

  // The rewriter does not accept the unsupported code points.
  for (size_t i = 0; i < arraysize(kUnsupportedUcs4); ++i) {
    InitSegments(kUnsupportedUcs4[i], kUnsupportedUcs4[i], &segments);
    EXPECT_FALSE(rewriter.Rewrite(request, &segments));
  }

  // Invalid style input. Key and value are kept identical, as in every other
  // case of this test.
  InitSegments("U+1234567", "U+1234567", &segments);
  EXPECT_FALSE(rewriter.Rewrite(request, &segments));
  InitSegments("U+XYZ", "U+XYZ", &segments);
  EXPECT_FALSE(rewriter.Rewrite(request, &segments));
  InitSegments("12345", "12345", &segments);
  EXPECT_FALSE(rewriter.Rewrite(request, &segments));
  InitSegments("U12345", "U12345", &segments);
  EXPECT_FALSE(rewriter.Rewrite(request, &segments));
}
// Exercises the rewriter with input that is spread over several segments.
TEST_F(UnicodeRewriterTest, MultipleSegment) {
  Segments segs;
  UnicodeRewriter unicode_rewriter(engine_->GetConverter());
  const ConversionRequest conversion_request;

  // Case 1: "U+0" + "02" + "0" is split over three segments. The rewrite
  // succeeds, leaves one conversion segment, and its first candidate starts
  // with a space.
  InitSegments("U+0", "U+0", &segs);
  AddSegment("02", "02", &segs);
  AddSegment("0", "0", &segs);
  EXPECT_TRUE(unicode_rewriter.Rewrite(conversion_request, &segs));
  EXPECT_EQ(1, segs.conversion_segments_size());
  EXPECT_EQ(' ', segs.conversion_segment(0).candidate(0).value.at(0));

  // Case 2: the segments are flagged as already resized, so the rewrite is
  // expected to return false.
  InitSegments("U+0020", "U+0020", &segs);
  AddSegment("U+0020", "U+0020", &segs);
  segs.set_resized(true);
  EXPECT_FALSE(unicode_rewriter.Rewrite(conversion_request, &segs));

  // Case 3: same setup, but the first segment is marked HISTORY. History
  // segments have to be ignored, so the rewrite is expected to return true.
  InitSegments("U+0020", "U+0020", &segs);
  AddSegment("U+0020", "U+0020", &segs);
  segs.set_resized(true);
  Segment *history_segment = segs.mutable_segment(0);
  history_segment->set_segment_type(Segment::HISTORY);
  EXPECT_TRUE(unicode_rewriter.Rewrite(conversion_request, &segs));
  EXPECT_EQ(' ', segs.conversion_segment(0).candidate(0).value.at(0));
}
// Exercises the opposite direction: the composer's source text is a
// character and the rewriter is expected to offer its "U+XXXX" notation as a
// candidate.
TEST_F(UnicodeRewriterTest, RewriteToUnicodeCharFormat) {
  UnicodeRewriter unicode_rewriter(engine_->GetConverter());

  {
    // Typical case: a single ASCII character as the source text.
    composer::Composer source_composer(NULL, &default_request());
    source_composer.set_source_text("A");
    ConversionRequest conversion_request(&source_composer, &default_request());
    Segments segs;
    AddSegment("A", "A", &segs);
    EXPECT_TRUE(unicode_rewriter.Rewrite(conversion_request, &segs));
    EXPECT_TRUE(ContainCandidate(segs, "U+0041"));
  }

  {
    // Without a source text on the composer the rewrite is not triggered.
    composer::Composer source_composer(NULL, &default_request());
    ConversionRequest conversion_request(&source_composer, &default_request());
    Segments segs;
    AddSegment("A", "A", &segs);
    EXPECT_FALSE(unicode_rewriter.Rewrite(conversion_request, &segs));
    EXPECT_FALSE(ContainCandidate(segs, "U+0041"));
  }

  {
    // A source text longer than one character does not trigger the rewrite
    // either.
    composer::Composer source_composer(NULL, &default_request());
    source_composer.set_source_text("AB");
    ConversionRequest conversion_request(&source_composer, &default_request());
    Segments segs;
    AddSegment("AB", "AB", &segs);
    EXPECT_FALSE(unicode_rewriter.Rewrite(conversion_request, &segs));
  }

  {
    // A multibyte character is supported as well.
    composer::Composer source_composer(NULL, &default_request());
    source_composer.set_source_text("\xE6\x84\x9B");  // "愛"
    ConversionRequest conversion_request(&source_composer, &default_request());
    Segments segs;
    AddSegment("\xE3\x81\x82\xE3\x81\x84", "\xE6\x84\x9B", &segs);
    EXPECT_TRUE(unicode_rewriter.Rewrite(conversion_request, &segs));
    EXPECT_TRUE(ContainCandidate(segs, "U+611B"));
  }
}
} // namespace mozc