| // Copyright 2010-2015, Google Inc. |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| #include "dictionary/system/system_dictionary.h" |
| |
| #include <cstdlib> |
| #include <string> |
| #include <utility> |
| #include <vector> |
| |
| #include "base/file_util.h" |
| #include "base/logging.h" |
| #include "base/port.h" |
| #include "base/stl_util.h" |
| #include "base/system_util.h" |
| #include "base/util.h" |
| #include "data_manager/user_pos_manager.h" |
| #include "dictionary/dictionary_test_util.h" |
| #include "dictionary/dictionary_token.h" |
| #include "dictionary/pos_matcher.h" |
| #include "dictionary/system/codec_interface.h" |
| #include "dictionary/system/system_dictionary_builder.h" |
| #include "dictionary/text_dictionary_loader.h" |
| #include "testing/base/public/googletest.h" |
| #include "testing/base/public/gunit.h" |
| |
| using mozc::dictionary::CollectTokenCallback; |
| |
namespace {
// The reverse lookup test size depends on the build mode (DEBUG builds are
// slower), but #ifdef cannot appear inside a DEFINE_int32 invocation, so the
// default value is selected here instead.
#ifdef DEBUG
const uint32 kDefaultReverseLookupTestSize = 1000;
#else
const uint32 kDefaultReverseLookupTestSize = 10000;
#endif
}  // namespace
| |
| // TODO(noriyukit): Ideally, the copy rule of dictionary_oss/dictionary00.txt |
| // can be shared with one in |
| // data_manager/dictionary_oss/oss_data_manager_test.gyp. However, to avoid |
| // conflict of copy destination name, the copy destination here is changed from |
| // the original one. See also comments in system_dictionary_test.gyp. |
DEFINE_string(
    dictionary_source,
    "data/system_dictionary_test/dictionary00.txt",
    "source dictionary file to run test");

// Upper bound on the number of tokens loaded from |dictionary_source|.
DEFINE_int32(dictionary_test_size, 100000,
             "Dictionary size for this test.");
DEFINE_int32(dictionary_reverse_lookup_test_size, kDefaultReverseLookupTestSize,
             "Number of tokens to run reverse lookup test.");
// Flags defined in other libraries (test runner and dictionary builder).
DECLARE_string(test_srcdir);
DECLARE_string(test_tmpdir);
DECLARE_int32(min_key_length_to_use_small_cost_encoding);
| |
| namespace mozc { |
| namespace dictionary { |
| |
namespace {

// Self-documenting aliases for the kana-modifier-insensitive-lookup flag
// passed to the lookup methods.  (Note: "Modifer" spelling is kept as-is;
// renaming would break uses elsewhere in this file.)
const bool kEnableKanaModiferInsensitiveLookup = true;
const bool kDisableKanaModiferInsensitiveLookup = false;

}  // namespace
| |
| class SystemDictionaryTest : public testing::Test { |
| protected: |
| SystemDictionaryTest() |
| : text_dict_(new TextDictionaryLoader( |
| *UserPosManager::GetUserPosManager()->GetPOSMatcher())), |
| dic_fn_(FLAGS_test_tmpdir + "/mozc.dic") { |
| const string dic_path = FileUtil::JoinPath(FLAGS_test_srcdir, |
| FLAGS_dictionary_source); |
| text_dict_->LoadWithLineLimit(dic_path, "", FLAGS_dictionary_test_size); |
| } |
| |
| virtual void SetUp() { |
| SystemUtil::SetUserProfileDirectory(FLAGS_test_tmpdir); |
| |
| // Don't use small cost encoding by default. |
| original_flags_min_key_length_to_use_small_cost_encoding_ = |
| FLAGS_min_key_length_to_use_small_cost_encoding; |
| FLAGS_min_key_length_to_use_small_cost_encoding = kint32max; |
| } |
| |
| virtual void TearDown() { |
| FLAGS_min_key_length_to_use_small_cost_encoding = |
| original_flags_min_key_length_to_use_small_cost_encoding_; |
| } |
| |
| void BuildSystemDictionary(const vector <Token *>& tokens, |
| int num_tokens); |
| Token* CreateToken(const string& key, const string& value) const; |
| bool CompareTokensForLookup(const Token &a, const Token &b, |
| bool reverse) const; |
| |
| scoped_ptr<TextDictionaryLoader> text_dict_; |
| const string dic_fn_; |
| int original_flags_min_key_length_to_use_small_cost_encoding_; |
| }; |
| |
| void SystemDictionaryTest::BuildSystemDictionary(const vector<Token *>& source, |
| int num_tokens) { |
| SystemDictionaryBuilder builder; |
| vector<Token *> tokens; |
| // Picks up first tokens. |
| for (vector<Token *>::const_iterator it = source.begin(); |
| tokens.size() < num_tokens && it != source.end(); ++it) { |
| tokens.push_back(*it); |
| } |
| builder.BuildFromTokens(tokens); |
| builder.WriteToFile(dic_fn_); |
| } |
| |
| Token* SystemDictionaryTest::CreateToken(const string& key, |
| const string& value) const { |
| Token* t = new Token; |
| t->key = key; |
| t->value = value; |
| t->cost = 0; |
| t->lid = 0; |
| t->rid = 0; |
| return t; |
| } |
| |
| // Returns true if they seem to be same |
| bool SystemDictionaryTest::CompareTokensForLookup( |
| const Token &a, const Token &b, bool reverse) const { |
| const bool key_value_check = reverse ? |
| (a.key == b.value && a.value == b.key) : |
| (a.key == b.key && a.value == b.value); |
| if (!key_value_check) { |
| return false; |
| } |
| const bool comp_cost = a.cost == b.cost; |
| if (!comp_cost) { |
| return false; |
| } |
| const bool spelling_match = |
| (a.attributes & Token::SPELLING_CORRECTION) == |
| (b.attributes & Token::SPELLING_CORRECTION); |
| if (!spelling_match) { |
| return false; |
| } |
| const bool id_match = (a.lid == b.lid) && (a.rid == b.rid); |
| if (!id_match) { |
| return false; |
| } |
| return true; |
| } |
| |
| TEST_F(SystemDictionaryTest, HasValue) { |
| vector<Token *> tokens; |
| for (int i = 0; i < 4; ++i) { |
| Token *token = new Token; |
| // "きー%d" |
| token->key = Util::StringPrintf("\xE3\x81\x8D\xE3\x83\xBC%d", i); |
| // "バリュー%d" |
| token->value = Util::StringPrintf( |
| "\xE3\x83\x90\xE3\x83\xAA\xE3\x83\xA5\xE3\x83\xBC%d", i); |
| tokens.push_back(token); |
| } |
| |
| { // Alphabet |
| Token *token = new Token; |
| token->key = "Mozc"; |
| token->value = "Mozc"; |
| tokens.push_back(token); |
| } |
| |
| { // Alphabet upper case |
| Token *token = new Token; |
| token->key = "upper"; |
| token->value = "UPPER"; |
| tokens.push_back(token); |
| } |
| |
| // "full" |
| const string kFull = "\xEF\xBD\x86\xEF\xBD\x95\xEF\xBD\x8C\xEF\xBD\x8C"; |
| // "ひらがな" |
| const string kHiragana = "\xE3\x81\xB2\xE3\x82\x89\xE3\x81\x8C\xE3\x81\xAA"; |
| // "かたかな" |
| const string kKatakanaKey = |
| "\xE3\x81\x8B\xE3\x81\x9F\xE3\x81\x8B\xE3\x81\xAA"; |
| // "カタカナ" |
| const string kKatakanaValue = |
| "\xE3\x82\xAB\xE3\x82\xBF\xE3\x82\xAB\xE3\x83\x8A"; |
| |
| { // Alphabet full width |
| Token *token = new Token; |
| token->key = "full"; |
| token->value = kFull; // "full" |
| tokens.push_back(token); |
| } |
| |
| { // Hiragana |
| Token *token = new Token; |
| token->key = kHiragana; // "ひらがな" |
| token->value = kHiragana; // "ひらがな" |
| tokens.push_back(token); |
| } |
| |
| { // Katakana |
| Token *token = new Token; |
| token->key = kKatakanaKey; // "かたかな" |
| token->value = kKatakanaValue; // "カタカナ" |
| tokens.push_back(token); |
| } |
| |
| BuildSystemDictionary(tokens, tokens.size()); |
| |
| scoped_ptr<SystemDictionary> system_dic( |
| SystemDictionary::Builder(dic_fn_).Build()); |
| ASSERT_TRUE(system_dic.get() != NULL) |
| << "Failed to open dictionary source:" << dic_fn_; |
| |
| EXPECT_TRUE(system_dic->HasValue( |
| // "バリュー0" |
| "\xE3\x83\x90\xE3\x83\xAA\xE3\x83\xA5\xE3\x83\xBC\x30")); |
| EXPECT_TRUE(system_dic->HasValue( |
| // "バリュー1" |
| "\xE3\x83\x90\xE3\x83\xAA\xE3\x83\xA5\xE3\x83\xBC\x31")); |
| EXPECT_TRUE(system_dic->HasValue( |
| // "バリュー2" |
| "\xE3\x83\x90\xE3\x83\xAA\xE3\x83\xA5\xE3\x83\xBC\x32")); |
| EXPECT_TRUE(system_dic->HasValue( |
| // "バリュー3" |
| "\xE3\x83\x90\xE3\x83\xAA\xE3\x83\xA5\xE3\x83\xBC\x33")); |
| EXPECT_FALSE(system_dic->HasValue( |
| // "バリュー4" |
| "\xE3\x83\x90\xE3\x83\xAA\xE3\x83\xA5\xE3\x83\xBC\x34")); |
| EXPECT_FALSE(system_dic->HasValue( |
| // "バリュー5" |
| "\xE3\x83\x90\xE3\x83\xAA\xE3\x83\xA5\xE3\x83\xBC\x35")); |
| EXPECT_FALSE(system_dic->HasValue( |
| // "バリュー6" |
| "\xE3\x83\x90\xE3\x83\xAA\xE3\x83\xA5\xE3\x83\xBC\x36")); |
| |
| EXPECT_TRUE(system_dic->HasValue("Mozc")); |
| EXPECT_FALSE(system_dic->HasValue("mozc")); |
| |
| EXPECT_TRUE(system_dic->HasValue("UPPER")); |
| EXPECT_FALSE(system_dic->HasValue("upper")); |
| |
| EXPECT_TRUE(system_dic->HasValue(kFull)); // "full" |
| EXPECT_FALSE(system_dic->HasValue("full")); |
| |
| EXPECT_TRUE(system_dic->HasValue(kHiragana)); //"ひらがな" |
| EXPECT_FALSE(system_dic->HasValue( |
| "\xE3\x83\x92\xE3\x83\xA9\xE3\x82\xAC\xE3\x83\x8A\x0A")); // "ヒラガナ" |
| |
| EXPECT_TRUE(system_dic->HasValue(kKatakanaValue)); // "カタカナ" |
| EXPECT_FALSE(system_dic->HasValue(kKatakanaKey)); // "かたかな" |
| |
| STLDeleteElements(&tokens); |
| } |
| |
| TEST_F(SystemDictionaryTest, NormalWord) { |
| vector<Token *> source_tokens; |
| scoped_ptr<Token> t0(new Token); |
| // "あ" |
| t0->key = "\xe3\x81\x82"; |
| // "亜" |
| t0->value = "\xe4\xba\x9c"; |
| t0->cost = 100; |
| t0->lid = 50; |
| t0->rid = 70; |
| source_tokens.push_back(t0.get()); |
| BuildSystemDictionary(source_tokens, FLAGS_dictionary_test_size); |
| |
| scoped_ptr<SystemDictionary> system_dic( |
| SystemDictionary::Builder(dic_fn_).Build()); |
| ASSERT_TRUE(system_dic.get() != NULL) |
| << "Failed to open dictionary source:" << dic_fn_; |
| |
| CollectTokenCallback callback; |
| |
| // Look up by exact key. |
| system_dic->LookupPrefix(t0->key, false, &callback); |
| ASSERT_EQ(1, callback.tokens().size()); |
| EXPECT_TOKEN_EQ(*t0, callback.tokens().front()); |
| |
| // Look up by prefix. |
| callback.Clear(); |
| system_dic->LookupPrefix( |
| "\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86", // "あいう" |
| false, &callback); |
| ASSERT_EQ(1, callback.tokens().size()); |
| EXPECT_TOKEN_EQ(*t0, callback.tokens().front()); |
| |
| // Nothing should be looked up. |
| callback.Clear(); |
| system_dic->LookupPrefix( |
| "\xE3\x81\x8B\xE3\x81\x8D\xE3\x81\x8F", // "かきく" |
| false, &callback); |
| EXPECT_TRUE(callback.tokens().empty()); |
| } |
| |
// Multiple tokens sharing the same key (and even the same key/value pair,
// differing only in cost or POS ids) must all survive the build and all be
// returned by a prefix lookup of that key.
TEST_F(SystemDictionaryTest, SameWord) {
  vector<Token> tokens(4);

  // Same key/value as tokens[1] and tokens[3]; differs in cost and ids.
  tokens[0].key = "\xe3\x81\x82";    // "あ"
  tokens[0].value = "\xe4\xba\x9c";  // "亜"
  tokens[0].cost = 100;
  tokens[0].lid = 50;
  tokens[0].rid = 70;

  tokens[1].key = "\xe3\x81\x82";    // "あ"
  tokens[1].value = "\xe4\xba\x9c";  // "亜"
  tokens[1].cost = 150;
  tokens[1].lid = 100;
  tokens[1].rid = 200;

  // Same key but a different value (key == value here).
  tokens[2].key = "\xe3\x81\x82";    // "あ"
  tokens[2].value = "\xe3\x81\x82";  // "あ"
  tokens[2].cost = 100;
  tokens[2].lid = 1000;
  tokens[2].rid = 2000;

  tokens[3].key = "\xe3\x81\x82";    // "あ"
  tokens[3].value = "\xe4\xba\x9c";  // "亜"
  tokens[3].cost = 1000;
  tokens[3].lid = 2000;
  tokens[3].rid = 3000;

  vector<Token *> source_tokens;
  for (size_t i = 0; i < tokens.size(); ++i) {
    source_tokens.push_back(&tokens[i]);
  }
  BuildSystemDictionary(source_tokens, FLAGS_dictionary_test_size);

  scoped_ptr<SystemDictionary> system_dic(
      SystemDictionary::Builder(dic_fn_).Build());
  ASSERT_TRUE(system_dic.get() != NULL)
      << "Failed to open dictionary source:" << dic_fn_;

  // All the tokens should be looked up, in any order.
  CollectTokenCallback callback;
  system_dic->LookupPrefix("\xe3\x81\x82",  // "あ"
                           false, &callback);
  EXPECT_TOKENS_EQ_UNORDERED(source_tokens, callback.tokens());
}
| |
| TEST_F(SystemDictionaryTest, LookupAllWords) { |
| const vector<Token *> &source_tokens = text_dict_->tokens(); |
| BuildSystemDictionary(source_tokens, FLAGS_dictionary_test_size); |
| |
| scoped_ptr<SystemDictionary> system_dic( |
| SystemDictionary::Builder(dic_fn_).Build()); |
| ASSERT_TRUE(system_dic.get() != NULL) |
| << "Failed to open dictionary source:" << dic_fn_; |
| |
| // All the tokens should be looked up. |
| for (size_t i = 0; i < source_tokens.size(); ++i) { |
| CheckTokenExistenceCallback callback(source_tokens[i]); |
| system_dic->LookupPrefix(source_tokens[i]->key, false, &callback); |
| EXPECT_TRUE(callback.found()) |
| << "Token was not found: " << PrintToken(*source_tokens[i]); |
| } |
| } |
| |
| TEST_F(SystemDictionaryTest, SimpleLookupPrefix) { |
| // "は" |
| const string k0 = "\xe3\x81\xaf"; |
| // "はひふへほ" |
| const string k1 = "\xe3\x81\xaf\xe3\x81\xb2\xe3\x81\xb5\xe3\x81\xb8\xe3\x81" |
| "\xbb"; |
| scoped_ptr<Token> t0(CreateToken(k0, "aa")); |
| scoped_ptr<Token> t1(CreateToken(k1, "bb")); |
| |
| vector<Token *> source_tokens; |
| source_tokens.push_back(t0.get()); |
| source_tokens.push_back(t1.get()); |
| text_dict_->CollectTokens(&source_tokens); |
| BuildSystemDictionary(source_tokens, 100); |
| |
| scoped_ptr<SystemDictionary> system_dic( |
| SystemDictionary::Builder(dic_fn_).Build()); |
| ASSERT_TRUE(system_dic.get() != NULL) |
| << "Failed to open dictionary source:" << dic_fn_; |
| |
| // |t0| should be looked up from |k1|. |
| CheckTokenExistenceCallback callback(t0.get()); |
| system_dic->LookupPrefix(k1, false, &callback); |
| EXPECT_TRUE(callback.found()); |
| } |
| |
| namespace { |
| |
// Callback used by the LookupPrefix test below.  It collects every
// (key, value) pair it sees into |result_|, while exercising the traversal
// control protocol through OnKey():
//   - "かき" -> TRAVERSE_CULL      (skip this key and its whole subtree)
//   - "さ"   -> TRAVERSE_NEXT_KEY  (skip tokens of this key only)
//   - "た"   -> TRAVERSE_DONE      (stop the entire traversal)
class LookupPrefixTestCallback : public SystemDictionary::Callback {
 public:
  virtual ResultType OnKey(StringPiece key) {
    if (key == "\xE3\x81\x8B\xE3\x81\x8D") {  // key == "かき"
      return TRAVERSE_CULL;
    } else if (key == "\xE3\x81\x95") {  // key == "さ"
      return TRAVERSE_NEXT_KEY;
    } else if (key == "\xE3\x81\x9F") {  // key == "た"
      return TRAVERSE_DONE;
    }
    return TRAVERSE_CONTINUE;
  }

  virtual ResultType OnToken(StringPiece key, StringPiece actual_key,
                             const Token &token) {
    // Record the token's own key/value (not the lookup key), then continue.
    result_.insert(make_pair(token.key, token.value));
    return TRAVERSE_CONTINUE;
  }

  const set<pair<string, string> > &result() const {
    return result_;
  }

 private:
  set<pair<string, string> > result_;  // All (key, value) pairs seen so far.
};
| |
| } // namespace |
| |
// Exercises LookupPrefix() against a fixed key/value table, covering plain
// prefix lookup, the TRAVERSE_CULL / TRAVERSE_NEXT_KEY / TRAVERSE_DONE
// traversal controls (see LookupPrefixTestCallback::OnKey above), and kana
// modifier insensitive key expansion ("は" also matches "ば").
TEST_F(SystemDictionaryTest, LookupPrefix) {
  // Set up a test dictionary.
  struct {
    const char *key;
    const char *value;
  } kKeyValues[] = {
      // "あ", "亜"
      { "\xE3\x81\x82", "\xE4\xBA\x9C" },
      // "あ", "安"
      { "\xE3\x81\x82", "\xE5\xAE\x89" },
      // "あ", "在"
      { "\xE3\x81\x82", "\xE5\x9C\xA8" },
      // "あい", "愛"
      { "\xE3\x81\x82\xE3\x81\x84", "\xE6\x84\x9B" },
      // "あい", "藍"
      { "\xE3\x81\x82\xE3\x81\x84", "\xE8\x97\x8D" },
      // "あいう", "藍雨"
      { "\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86", "\xE8\x97\x8D\xE9\x9B\xA8" },
      // "か", "可"
      { "\xE3\x81\x8B", "\xE5\x8F\xAF" },
      // "かき", "牡蠣"
      { "\xE3\x81\x8B\xE3\x81\x8D", "\xE7\x89\xA1\xE8\xA0\xA3" },
      // "かき", "夏季"
      { "\xE3\x81\x8B\xE3\x81\x8D", "\xE5\xA4\x8F\xE5\xAD\xA3" },
      // "かきく", "柿久"
      { "\xE3\x81\x8B\xE3\x81\x8D\xE3\x81\x8F", "\xE6\x9F\xBF\xE4\xB9\x85" },
      // "さ", "差"
      { "\xE3\x81\x95", "\xE5\xB7\xAE" },
      // "さ", "左"
      { "\xE3\x81\x95", "\xE5\xB7\xA6" },
      // "さし", "刺"
      { "\xE3\x81\x95\xE3\x81\x97", "\xE5\x88\xBA" },
      // "た", "田"
      { "\xE3\x81\x9F", "\xE7\x94\xB0" },
      // "た", "多"
      { "\xE3\x81\x9F", "\xE5\xA4\x9A" },
      // "たち", "多値"
      { "\xE3\x81\x9F\xE3\x81\xA1", "\xE5\xA4\x9A\xE5\x80\xA4" },
      // "たちつ", "タチツ"
      { "\xE3\x81\x9F\xE3\x81\xA1\xE3\x81\xA4",
        "\xE3\x82\xBF\xE3\x83\x81\xE3\x83\x84" },
      // "は", "葉"
      { "\xE3\x81\xAF", "\xE8\x91\x89" },
      // "は", "歯"
      { "\xE3\x81\xAF", "\xE6\xAD\xAF" },
      // "はひ", "ハヒ"
      { "\xE3\x81\xAF\xE3\x81\xB2", "\xE3\x83\x8F\xE3\x83\x92" },
      // "ば", "場"
      { "\xE3\x81\xB0", "\xE5\xA0\xB4" },
      // "はび", "波美"
      { "\xE3\x81\xAF\xE3\x81\xB3", "\xE6\xB3\xA2\xE7\xBE\x8E" },
      // "ばび", "馬尾"
      { "\xE3\x81\xB0\xE3\x81\xB3", "\xE9\xA6\xAC\xE5\xB0\xBE" },
      // "ばびぶ", "バビブ"
      { "\xE3\x81\xB0\xE3\x81\xB3\xE3\x81\xB6",
        "\xE3\x83\x90\xE3\x83\x93\xE3\x83\x96" },
  };
  const size_t kKeyValuesSize = arraysize(kKeyValues);
  scoped_ptr<Token> tokens[kKeyValuesSize];
  vector<Token *> source_tokens(kKeyValuesSize);
  for (size_t i = 0; i < kKeyValuesSize; ++i) {
    tokens[i].reset(CreateToken(kKeyValues[i].key, kKeyValues[i].value));
    source_tokens[i] = tokens[i].get();
  }
  text_dict_->CollectTokens(&source_tokens);
  BuildSystemDictionary(source_tokens, kKeyValuesSize);
  scoped_ptr<SystemDictionary> system_dic(
      SystemDictionary::Builder(dic_fn_).Build());
  ASSERT_TRUE(system_dic.get() != NULL)
      << "Failed to open dictionary source:" << dic_fn_;

  // Test for normal prefix lookup without key expansion.
  {
    LookupPrefixTestCallback callback;
    system_dic->LookupPrefix("\xE3\x81\x82\xE3\x81\x84",  // "あい"
                             false, &callback);
    const set<pair<string, string> > &result = callback.result();
    // "あ" -- "あい" should be found (table entries 0 through 4).
    for (size_t i = 0; i < 5; ++i) {
      const pair<string, string> entry(
          kKeyValues[i].key, kKeyValues[i].value);
      EXPECT_TRUE(result.end() != result.find(entry));
    }
    // The others should not be found.
    for (size_t i = 5; i < arraysize(kKeyValues); ++i) {
      const pair<string, string> entry(
          kKeyValues[i].key, kKeyValues[i].value);
      EXPECT_TRUE(result.end() == result.find(entry));
    }
  }

  // Test for normal prefix lookup without key expansion, but with culling
  // feature.
  {
    LookupPrefixTestCallback callback;
    system_dic->LookupPrefix(
        "\xE3\x81\x8B\xE3\x81\x8D\xE3\x81\x8F",  // "かきく"
        false,
        &callback);
    const set<pair<string, string> > &result = callback.result();
    // Only "か" should be found as the callback doesn't traverse the subtree of
    // "かき" due to culling request from LookupPrefixTestCallback::OnKey().
    for (size_t i = 0; i < kKeyValuesSize; ++i) {
      const pair<string, string> entry(
          kKeyValues[i].key, kKeyValues[i].value);
      EXPECT_EQ(entry.first == "\xE3\x81\x8B",  // "か"
                result.find(entry) != result.end());
    }
  }

  // Test for TRAVERSE_NEXT_KEY.
  {
    LookupPrefixTestCallback callback;
    system_dic->LookupPrefix(
        "\xE3\x81\x95\xE3\x81\x97\xE3\x81\x99",  // "さしす"
        false,
        &callback);
    const set<pair<string, string> > &result = callback.result();
    // Only "さし" should be found as tokens for "さ" is skipped (see
    // LookupPrefixTestCallback::OnKey()).
    for (size_t i = 0; i < kKeyValuesSize; ++i) {
      const pair<string, string> entry(
          kKeyValues[i].key, kKeyValues[i].value);
      EXPECT_EQ(entry.first == "\xE3\x81\x95\xE3\x81\x97",  // "さし"
                result.find(entry) != result.end());
    }
  }

  // Test for TRAVERSE_DONE.
  {
    LookupPrefixTestCallback callback;
    system_dic->LookupPrefix(
        "\xE3\x81\x9F\xE3\x81\xA1\xE3\x81\xA4",  // "たちつ"
        false,
        &callback);
    const set<pair<string, string> > &result = callback.result();
    // Nothing should be found as the traversal is immediately done after seeing
    // "た"; see LookupPrefixTestCallback::OnKey().
    EXPECT_TRUE(result.empty());
  }

  // Test for prefix lookup with key expansion.
  {
    LookupPrefixTestCallback callback;
    system_dic->LookupPrefix(
        "\xE3\x81\xAF\xE3\x81\xB2",  // "はひ"
        true,  // Use kana modifier insensitive lookup
        &callback);
    const set<pair<string, string> > &result = callback.result();
    // With expansion, voiced variants ("ば", "び") of the looked-up key's
    // characters also match.
    const char *kExpectedKeys[] = {
        "\xE3\x81\xAF",              // "は"
        "\xE3\x81\xB0",              // "ば"
        "\xE3\x81\xAF\xE3\x81\xB2",  // "はひ"
        "\xE3\x81\xB0\xE3\x81\xB2",  // "ばひ"
        "\xE3\x81\xAF\xE3\x81\xB3",  // "はび"
        "\xE3\x81\xB0\xE3\x81\xB3",  // "ばび"
    };
    const set<string> expected(kExpectedKeys,
                               kExpectedKeys + arraysize(kExpectedKeys));
    for (size_t i = 0; i < kKeyValuesSize; ++i) {
      const bool to_be_found =
          expected.find(kKeyValues[i].key) != expected.end();
      const pair<string, string> entry(
          kKeyValues[i].key, kKeyValues[i].value);
      EXPECT_EQ(to_be_found, result.find(entry) != result.end());
    }
  }
}
| |
// Predictive lookup of a key must return every registered token whose key
// starts with it (here, two tokens sharing the prefix "まみむめも").
TEST_F(SystemDictionaryTest, LookupPredictive) {
  vector<Token *> tokens;
  ScopedElementsDeleter<vector<Token *> > deleter(&tokens);

  // "まみむめもや" -> "value0"
  tokens.push_back(CreateToken("\xe3\x81\xbe\xe3\x81\xbf\xe3\x82\x80"
                               "\xe3\x82\x81\xe3\x82\x82\xe3\x82\x84",
                               "value0"));
  // "まみむめもやゆよ" -> "value1"
  tokens.push_back(CreateToken("\xe3\x81\xbe\xe3\x81\xbf\xe3\x82\x80"
                               "\xe3\x82\x81\xe3\x82\x82\xe3\x82\x84"
                               "\xe3\x82\x86\xe3\x82\x88",
                               "value1"));
  // Build a dictionary with the above two tokens plus those from test data.
  {
    vector<Token *> source_tokens = tokens;
    text_dict_->CollectTokens(&source_tokens);  // Load test data.
    BuildSystemDictionary(source_tokens, 10000);
  }
  scoped_ptr<SystemDictionary> system_dic(
      SystemDictionary::Builder(dic_fn_).Build());
  ASSERT_TRUE(system_dic.get() != NULL)
      << "Failed to open dictionary source: " << dic_fn_;

  // All the tokens in |tokens| should be looked up by "まみむめも".
  const char *kMamimumemo =
      "\xe3\x81\xbe\xe3\x81\xbf\xe3\x82\x80\xe3\x82\x81\xe3\x82\x82";
  CheckMultiTokensExistenceCallback callback(tokens);
  system_dic->LookupPredictive(kMamimumemo, false, &callback);
  EXPECT_TRUE(callback.AreAllFound());
}
| |
// Predictive lookup of "かつこう" matches "かっこう"/"がっこう" entries only
// when the kana modifier insensitive flag is on; with the flag off, the key
// must match literally and nothing is returned.
TEST_F(SystemDictionaryTest, LookupPredictive_KanaModifierInsensitiveLookup) {
  vector<Token *> tokens;
  ScopedElementsDeleter<vector<Token *> > deleter(&tokens);

  // "がっこう" -> "学校"
  tokens.push_back(CreateToken(
      "\xE3\x81\x8C\xE3\x81\xA3\xE3\x81\x93\xE3\x81\x86",
      "\xE5\xAD\xA6\xE6\xA0\xA1"));
  // "かっこう" -> "格好"
  tokens.push_back(CreateToken(
      "\xE3\x81\x8B\xE3\x81\xA3\xE3\x81\x93\xE3\x81\x86",
      "\xE6\xA0\xBC\xE5\xA5\xBD"));

  BuildSystemDictionary(tokens, 100);
  scoped_ptr<SystemDictionary> system_dic(
      SystemDictionary::Builder(dic_fn_).Build());
  ASSERT_TRUE(system_dic.get() != NULL)
      << "Failed to open dictionary source: " << dic_fn_;

  // "かつこう"
  const string kKey = "\xE3\x81\x8B\xE3\x81\xA4\xE3\x81\x93\xE3\x81\x86";

  // Without Kana modifier insensitive lookup flag, nothing is looked up.
  CollectTokenCallback callback;
  system_dic->LookupPredictive(kKey, false, &callback);
  EXPECT_TRUE(callback.tokens().empty());

  // With Kana modifier insensitive lookup flag, every token is looked up.
  callback.Clear();
  system_dic->LookupPredictive(kKey, true, &callback);
  EXPECT_TOKENS_EQ_UNORDERED(tokens, callback.tokens());
}
| |
// Predictive lookup limits traversal depth when a prefix has many entries
// (BFS-like cut-off): short completions are still returned while long ones
// may be dropped.
TEST_F(SystemDictionaryTest, LookupPredictive_CutOffEmulatingBFS) {
  vector<Token *> tokens;
  ScopedElementsDeleter<vector<Token *> > deleter(&tokens);

  // "あい" -> "ai"
  tokens.push_back(CreateToken("\xe3\x81\x82\xe3\x81\x84", "ai"));
  // "あいうえお" -> "aiueo"
  tokens.push_back(CreateToken(
      "\xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a",
      "aiueo"));
  // Build a dictionary with the above two tokens plus those from test data.
  {
    vector<Token *> source_tokens = tokens;
    text_dict_->CollectTokens(&source_tokens);  // Load test data.
    BuildSystemDictionary(source_tokens, 10000);
  }
  scoped_ptr<SystemDictionary> system_dic(
      SystemDictionary::Builder(dic_fn_).Build());
  ASSERT_TRUE(system_dic.get() != NULL)
      << "Failed to open dictionary source: " << dic_fn_;

  // Since there are many entries starting with "あ" in test dictionary, it's
  // expected that "あいうえお" is not looked up because of longer key cut-off
  // mechanism. However, "あい" is looked up as it's short.
  CheckMultiTokensExistenceCallback callback(tokens);
  system_dic->LookupPredictive("\xe3\x81\x82",  // "あ"
                               false, &callback);
  EXPECT_TRUE(callback.IsFound(tokens[0]));
  EXPECT_FALSE(callback.IsFound(tokens[1]));
}
| |
| TEST_F(SystemDictionaryTest, LookupExact) { |
| vector<Token *> source_tokens; |
| |
| // "は" |
| const string k0 = "\xe3\x81\xaf"; |
| // "はひふへほ" |
| const string k1 = "\xe3\x81\xaf\xe3\x81\xb2\xe3\x81\xb5\xe3\x81\xb8\xe3\x81" |
| "\xbb"; |
| |
| scoped_ptr<Token> t0(CreateToken(k0, "aa")); |
| scoped_ptr<Token> t1(CreateToken(k1, "bb")); |
| source_tokens.push_back(t0.get()); |
| source_tokens.push_back(t1.get()); |
| text_dict_->CollectTokens(&source_tokens); |
| BuildSystemDictionary(source_tokens, 100); |
| |
| scoped_ptr<SystemDictionary> system_dic( |
| SystemDictionary::Builder(dic_fn_).Build()); |
| ASSERT_TRUE(system_dic.get() != NULL) |
| << "Failed to open dictionary source:" << dic_fn_; |
| |
| // |t0| should not be looked up from |k1|. |
| CheckTokenExistenceCallback callback0(t0.get()); |
| system_dic->LookupExact(k1, &callback0); |
| EXPECT_FALSE(callback0.found()); |
| // But |t1| should be found. |
| CheckTokenExistenceCallback callback1(t1.get()); |
| system_dic->LookupExact(k1, &callback1); |
| EXPECT_TRUE(callback1.found()); |
| |
| // Nothing should be found from "hoge". |
| CollectTokenCallback callback_hoge; |
| system_dic->LookupExact("hoge", &callback_hoge); |
| EXPECT_TRUE(callback_hoge.tokens().empty()); |
| } |
| |
// Reverse lookup (value -> token) tests: every registered token must be
// retrievable by its value except SPELLING_CORRECTION tokens, no returned
// key may be longer than the queried value, and a value followed by extra
// characters still hits the token via the transliterated index.
TEST_F(SystemDictionaryTest, LookupReverse) {
  scoped_ptr<Token> t0(new Token);
  // "ど"
  t0->key = "\xe3\x81\xa9";
  // "ド"
  t0->value = "\xe3\x83\x89";
  t0->cost = 1;
  t0->lid = 2;
  t0->rid = 3;
  scoped_ptr<Token> t1(new Token);
  // "どらえもん"
  t1->key = "\xe3\x81\xa9\xe3\x82\x89\xe3\x81\x88\xe3\x82\x82\xe3\x82\x93";
  // "ドラえもん"
  t1->value = "\xe3\x83\x89\xe3\x83\xa9\xe3\x81\x88\xe3\x82\x82\xe3\x82\x93";
  t1->cost = 1;
  t1->lid = 2;
  t1->rid = 3;
  scoped_ptr<Token> t2(new Token);
  // "といざらす®" (value contains a non-kana symbol character)
  t2->key = "\xe3\x81\xa8\xe3\x81\x84\xe3\x81\x96\xe3\x82\x89\xe3\x81\x99\xc2"
      "\xae";
  // "トイザらス®"
  t2->value = "\xe3\x83\x88\xe3\x82\xa4\xe3\x82\xb6\xe3\x82\x89\xe3\x82\xb9\xc2"
      "\xae";
  t2->cost = 1;
  t2->lid = 2;
  t2->rid = 3;
  scoped_ptr<Token> t3(new Token);
  // "ああああああ"
  // Both t3 and t4 will be encoded into 3 bytes.
  t3->key = "\xe3\x81\x82\xe3\x81\x82\xe3\x81\x82"
      "\xe3\x81\x82\xe3\x81\x82\xe3\x81\x82";
  t3->value = t3->key;
  t3->cost = 32000;
  t3->lid = 1;
  t3->rid = 1;
  scoped_ptr<Token> t4(new Token);
  // Identical to t3 except for rid; exercises duplicate compact encodings.
  *t4 = *t3;
  t4->lid = 1;
  t4->rid = 2;
  scoped_ptr<Token> t5(new Token);
  // "いいいいいい"
  // t5 will be encoded into 3 bytes.
  t5->key = "\xe3\x81\x84\xe3\x81\x84\xe3\x81\x84"
      "\xe3\x81\x84\xe3\x81\x84\xe3\x81\x84";
  t5->value = t5->key;
  t5->cost = 32000;
  t5->lid = 1;
  t5->rid = 1;
  // spelling correction token should not be retrieved by reverse lookup.
  scoped_ptr<Token> t6(new Token);
  // "どらえもん"
  t6->key = "\xe3\x81\xa9\xe3\x82\x89\xe3\x81\x88\xe3\x82\x82\xe3\x82\x93";
  // "ドラえもん"
  t6->value = "\xe3\x83\x89\xe3\x83\xa9\xe3\x81\x88\xe3\x82\x82\xe3\x82\x93";
  t6->cost = 1;
  t6->lid = 2;
  t6->rid = 3;
  t6->attributes = Token::SPELLING_CORRECTION;
  scoped_ptr<Token> t7(new Token);
  // "こんさーと" (used later for the non-exact transliterated index check)
  t7->key = "\xe3\x81\x93\xe3\x82\x93\xe3\x81\x95\xe3\x83\xbc\xe3\x81\xa8";
  // "コンサート"
  t7->value = "\xe3\x82\xb3\xe3\x83\xb3\xe3\x82\xb5\xe3\x83\xbc\xe3\x83\x88";
  t7->cost = 1;
  t7->lid = 1;
  t7->rid = 1;
  // "バージョン" should not return a result with the key "ヴァージョン".
  scoped_ptr<Token> t8(new Token);
  // "ばーじょん"
  t8->key = "\xE3\x81\xB0\xE3\x83\xBC\xE3\x81\x98\xE3\x82\x87\xE3\x82\x93";
  // "バージョン"
  t8->value = "\xE3\x83\x90\xE3\x83\xBC\xE3\x82\xB8\xE3\x83\xA7\xE3\x83\xB3";
  t8->cost = 1;
  t8->lid = 1;
  t8->rid = 1;

  vector<Token *> source_tokens;
  source_tokens.push_back(t0.get());
  source_tokens.push_back(t1.get());
  source_tokens.push_back(t2.get());
  source_tokens.push_back(t3.get());
  source_tokens.push_back(t4.get());
  source_tokens.push_back(t5.get());
  source_tokens.push_back(t6.get());
  source_tokens.push_back(t7.get());
  source_tokens.push_back(t8.get());

  text_dict_->CollectTokens(&source_tokens);
  BuildSystemDictionary(source_tokens, source_tokens.size());

  scoped_ptr<SystemDictionary> system_dic(
      SystemDictionary::Builder(dic_fn_).Build());
  ASSERT_TRUE(system_dic.get() != NULL)
      << "Failed to open dictionary source:" << dic_fn_;
  // Cap the number of tokens checked so DEBUG builds stay fast.
  const size_t test_size = min(
      static_cast<size_t>(FLAGS_dictionary_reverse_lookup_test_size),
      source_tokens.size());
  for (size_t source_index = 0; source_index < test_size; ++source_index) {
    const Token &source_token = *source_tokens[source_index];
    CollectTokenCallback callback;
    system_dic->LookupReverse(source_token.value, &callback);
    const vector<Token> &tokens = callback.tokens();

    bool found = false;
    for (size_t i = 0; i < tokens.size(); ++i) {
      const Token &token = tokens[i];
      // Make sure any of the key lengths of the lookup results
      // doesn't exceed the original key length.
      // It happened once
      // when called with "バージョン", returning "ヴァージョン".
      EXPECT_LE(token.key.size(), source_token.value.size())
          << string(token.key) << ":" << string(token.value)
          << "\t" << string(source_token.value);
      if (CompareTokensForLookup(source_token, token, true)) {
        found = true;
      }
    }

    if ((source_token.attributes & Token::SPELLING_CORRECTION) ==
        Token::SPELLING_CORRECTION) {
      // Spelling-correction tokens must be invisible to reverse lookup.
      EXPECT_FALSE(found) << "Spelling correction token was retrieved:"
                          << PrintToken(source_token);
      if (found) {
        return;
      }
    } else {
      EXPECT_TRUE(found)
          << "Failed to find " << source_token.key << ":" << source_token.value;
      if (!found) {
        return;
      }
    }
  }

  {
    // test for non exact transliterated index string.
    // append "が"
    const string key = t7->value + "\xe3\x81\x8c";
    CollectTokenCallback callback;
    system_dic->LookupReverse(key, &callback);
    const vector<Token> &tokens = callback.tokens();
    bool found = false;
    for (size_t i = 0; i < tokens.size(); ++i) {
      if (CompareTokensForLookup(*t7, tokens[i], true)) {
        found = true;
      }
    }
    EXPECT_TRUE(found)
        << "Missed token for non exact transliterated index " << key;
  }
}
| |
// The optional reverse lookup index is a pure optimization: lookups with
// ENABLE_REVERSE_LOOKUP_INDEX must return exactly the same tokens, in the
// same order, as lookups without it.
TEST_F(SystemDictionaryTest, LookupReverseIndex) {
  const vector<Token *> &source_tokens = text_dict_->tokens();
  BuildSystemDictionary(source_tokens, FLAGS_dictionary_test_size);

  scoped_ptr<SystemDictionary> system_dic_without_index(
      SystemDictionary::Builder(dic_fn_)
      .SetOptions(SystemDictionary::NONE)
      .Build());
  ASSERT_TRUE(system_dic_without_index.get() != NULL)
      << "Failed to open dictionary source:" << dic_fn_;
  scoped_ptr<SystemDictionary> system_dic_with_index(
      SystemDictionary::Builder(dic_fn_)
      .SetOptions(SystemDictionary::ENABLE_REVERSE_LOOKUP_INDEX)
      .Build());
  ASSERT_TRUE(system_dic_with_index.get() != NULL)
      << "Failed to open dictionary source:" << dic_fn_;

  // Compare results for at most FLAGS_dictionary_reverse_lookup_test_size
  // tokens (the full data set would be too slow in DEBUG builds).
  vector<Token *>::const_iterator it;
  int size = FLAGS_dictionary_reverse_lookup_test_size;
  for (it = source_tokens.begin();
       size > 0 && it != source_tokens.end(); ++it, --size) {
    const Token &t = **it;
    CollectTokenCallback callback1, callback2;
    system_dic_without_index->LookupReverse(t.value, &callback1);
    system_dic_with_index->LookupReverse(t.value, &callback2);

    const vector<Token> &tokens1 = callback1.tokens();
    const vector<Token> &tokens2 = callback2.tokens();
    ASSERT_EQ(tokens1.size(), tokens2.size());
    for (size_t i = 0; i < tokens1.size(); ++i) {
      EXPECT_TOKEN_EQ(tokens1[i], tokens2[i]);
    }
  }
}
| |
| TEST_F(SystemDictionaryTest, LookupReverseWithCache) { |
| const string kDoraemon = |
| "\xe3\x83\x89\xe3\x83\xa9\xe3\x81\x88\xe3\x82\x82\xe3\x82\x93"; |
| |
| Token source_token; |
| // "どらえもん" |
| source_token.key = |
| "\xe3\x81\xa9\xe3\x82\x89\xe3\x81\x88\xe3\x82\x82\xe3\x82\x93"; |
| // "ドラえもん" |
| source_token.value = kDoraemon; |
| source_token.cost = 1; |
| source_token.lid = 2; |
| source_token.rid = 3; |
| vector<Token *> source_tokens; |
| source_tokens.push_back(&source_token); |
| text_dict_->CollectTokens(&source_tokens); |
| BuildSystemDictionary(source_tokens, source_tokens.size()); |
| |
| Token target_token = source_token; |
| target_token.key.swap(target_token.value); |
| |
| scoped_ptr<SystemDictionary> system_dic( |
| SystemDictionary::Builder(dic_fn_).Build()); |
| ASSERT_TRUE(system_dic.get() != NULL) |
| << "Failed to open dictionary source:" << dic_fn_; |
| system_dic->PopulateReverseLookupCache(kDoraemon); |
| CheckTokenExistenceCallback callback(&target_token); |
| system_dic->LookupReverse(kDoraemon, &callback); |
| EXPECT_TRUE(callback.found()) |
| << "Could not find " << PrintToken(source_token); |
| system_dic->ClearReverseLookupCache(); |
| } |
| |
| TEST_F(SystemDictionaryTest, SpellingCorrectionTokens) { |
| vector<Token> tokens(3); |
| |
| // "あぼがど" |
| tokens[0].key = "\xe3\x81\x82\xe3\x81\xbc\xe3\x81\x8c\xe3\x81\xa9"; |
| // "アボカド" |
| tokens[0].value = "\xe3\x82\xa2\xe3\x83\x9c\xe3\x82\xab\xe3\x83\x89"; |
| tokens[0].cost = 1; |
| tokens[0].lid = 0; |
| tokens[0].rid = 2; |
| tokens[0].attributes = Token::SPELLING_CORRECTION; |
| |
| // "しゅみれーしょん" |
| tokens[1].key = |
| "\xe3\x81\x97\xe3\x82\x85\xe3\x81\xbf\xe3\x82\x8c" |
| "\xe3\x83\xbc\xe3\x81\x97\xe3\x82\x87\xe3\x82\x93"; |
| // "シミュレーション" |
| tokens[1].value = |
| "\xe3\x82\xb7\xe3\x83\x9f\xe3\x83\xa5\xe3\x83\xac" |
| "\xe3\x83\xbc\xe3\x82\xb7\xe3\x83\xa7\xe3\x83\xb3"; |
| tokens[1].cost = 1; |
| tokens[1].lid = 100; |
| tokens[1].rid = 3; |
| tokens[1].attributes = Token::SPELLING_CORRECTION; |
| |
| // "あきはばら" |
| tokens[2].key = |
| "\xe3\x81\x82\xe3\x81\x8d\xe3\x81\xaf\xe3\x81\xb0\xe3\x82\x89"; |
| // "秋葉原" |
| tokens[2].value = "\xe7\xa7\x8b\xe8\x91\x89\xe5\x8e\x9f"; |
| tokens[2].cost = 1000; |
| tokens[2].lid = 1; |
| tokens[2].rid = 2; |
| |
| vector<Token *> source_tokens; |
| for (size_t i = 0; i < tokens.size(); ++i) { |
| source_tokens.push_back(&tokens[i]); |
| } |
| BuildSystemDictionary(source_tokens, source_tokens.size()); |
| |
| scoped_ptr<SystemDictionary> system_dic( |
| SystemDictionary::Builder(dic_fn_).Build()); |
| ASSERT_TRUE(system_dic.get() != NULL) |
| << "Failed to open dictionary source:" << dic_fn_; |
| |
| for (size_t i = 0; i < source_tokens.size(); ++i) { |
| CheckTokenExistenceCallback callback(source_tokens[i]); |
| system_dic->LookupPrefix(source_tokens[i]->key, false, &callback); |
| EXPECT_TRUE(callback.found()) |
| << "Token " << i << " was not found: " << PrintToken(*source_tokens[i]); |
| } |
| } |
| |
| TEST_F(SystemDictionaryTest, EnableNoModifierTargetWithLoudsTrie) { |
| // "かつ" |
| const string k0 = "\xE3\x81\x8B\xE3\x81\xA4"; |
| // "かっこ" |
| const string k1 = "\xE3\x81\x8B\xE3\x81\xA3\xE3\x81\x93"; |
| // "かつこう" |
| const string k2 = "\xE3\x81\x8B\xE3\x81\xA4\xE3\x81\x93\xE3\x81\x86"; |
| // "かっこう" |
| const string k3 = "\xE3\x81\x8B\xE3\x81\xA3\xE3\x81\x93\xE3\x81\x86"; |
| // "がっこう" |
| const string k4 = "\xE3\x81\x8C\xE3\x81\xA3\xE3\x81\x93\xE3\x81\x86"; |
| |
| scoped_ptr<Token> tokens[5]; |
| tokens[0].reset(CreateToken(k0, "aa")); |
| tokens[1].reset(CreateToken(k1, "bb")); |
| tokens[2].reset(CreateToken(k2, "cc")); |
| tokens[3].reset(CreateToken(k3, "dd")); |
| tokens[4].reset(CreateToken(k4, "ee")); |
| |
| vector<Token *> source_tokens; |
| for (size_t i = 0; i < arraysize(tokens); ++i) { |
| source_tokens.push_back(tokens[i].get()); |
| } |
| text_dict_->CollectTokens(&source_tokens); |
| BuildSystemDictionary(source_tokens, 100); |
| |
| scoped_ptr<SystemDictionary> system_dic( |
| SystemDictionary::Builder(dic_fn_).Build()); |
| ASSERT_TRUE(system_dic.get() != NULL) |
| << "Failed to open dictionary source:" << dic_fn_; |
| |
| // Prefix search |
| for (size_t i = 0; i < arraysize(tokens); ++i) { |
| CheckTokenExistenceCallback callback(tokens[i].get()); |
| // "かつこう" -> "かつ", "かっこ", "かつこう", "かっこう" and "がっこう" |
| system_dic->LookupPrefix( |
| k2, kEnableKanaModiferInsensitiveLookup, &callback); |
| EXPECT_TRUE(callback.found()) |
| << "Token " << i << " was not found: " << PrintToken(*tokens[i]); |
| } |
| |
| // Predictive searches |
| { |
| // "かつ" -> "かつ", "かっこ", "かつこう", "かっこう" and "がっこう" |
| vector<Token *> expected; |
| for (size_t i = 0; i < arraysize(tokens); ++i) { |
| expected.push_back(tokens[i].get()); |
| } |
| CheckMultiTokensExistenceCallback callback(expected); |
| system_dic->LookupPredictive( |
| k0, kEnableKanaModiferInsensitiveLookup, &callback); |
| EXPECT_TRUE(callback.AreAllFound()); |
| } |
| { |
| // "かっこ" -> "かっこ", "かっこう" and "がっこう" |
| vector<Token *> expected; |
| expected.push_back(tokens[1].get()); |
| expected.push_back(tokens[3].get()); |
| expected.push_back(tokens[4].get()); |
| CheckMultiTokensExistenceCallback callback(expected); |
| system_dic->LookupPredictive( |
| k1, kEnableKanaModiferInsensitiveLookup, &callback); |
| EXPECT_TRUE(callback.AreAllFound()); |
| } |
| } |
| |
| TEST_F(SystemDictionaryTest, NoModifierForKanaEntries) { |
| // "ていすてぃんぐ", "テイスティング" |
| scoped_ptr<Token> t0(CreateToken( |
| "\xe3\x81\xa6\xe3\x81\x84\xe3\x81\x99\xe3\x81\xa6" |
| "\xe3\x81\x83\xe3\x82\x93\xe3\x81\x90", |
| "\xe3\x83\x86\xe3\x82\xa4\xe3\x82\xb9\xe3\x83\x86" |
| "\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0")); |
| // "てすとです", "てすとです" |
| scoped_ptr<Token> t1(CreateToken( |
| "\xe3\x81\xa6\xe3\x81\x99\xe3\x81\xa8\xe3\x81\xa7\xe3\x81\x99", |
| "\xe3\x81\xa6\xe3\x81\x99\xe3\x81\xa8\xe3\x81\xa7\xe3\x81\x99")); |
| |
| vector<Token *> source_tokens; |
| source_tokens.push_back(t0.get()); |
| source_tokens.push_back(t1.get()); |
| |
| text_dict_->CollectTokens(&source_tokens); |
| BuildSystemDictionary(source_tokens, 100); |
| |
| scoped_ptr<SystemDictionary> system_dic( |
| SystemDictionary::Builder(dic_fn_).Build()); |
| ASSERT_TRUE(system_dic.get() != NULL) |
| << "Failed to open dictionary source:" << dic_fn_; |
| |
| // Lookup |t0| from "ていすていんぐ" |
| const string k = "\xe3\x81\xa6\xe3\x81\x84\xe3\x81\x99\xe3\x81\xa6" |
| "\xe3\x81\x84\xe3\x82\x93\xe3\x81\x90"; |
| CheckTokenExistenceCallback callback(t0.get()); |
| system_dic->LookupPrefix(k, kEnableKanaModiferInsensitiveLookup, |
| &callback); |
| EXPECT_TRUE(callback.found()) << "Not found: " << PrintToken(*t0); |
| } |
| |
| TEST_F(SystemDictionaryTest, DoNotReturnNoModifierTargetWithLoudsTrie) { |
| // "かつ" |
| const string k0 = "\xE3\x81\x8B\xE3\x81\xA4"; |
| // "かっこ" |
| const string k1 = "\xE3\x81\x8B\xE3\x81\xA3\xE3\x81\x93"; |
| // "かつこう" |
| const string k2 = "\xE3\x81\x8B\xE3\x81\xA4\xE3\x81\x93\xE3\x81\x86"; |
| // "かっこう" |
| const string k3 = "\xE3\x81\x8B\xE3\x81\xA3\xE3\x81\x93\xE3\x81\x86"; |
| // "がっこう" |
| const string k4 = "\xE3\x81\x8C\xE3\x81\xA3\xE3\x81\x93\xE3\x81\x86"; |
| |
| scoped_ptr<Token> t0(CreateToken(k0, "aa")); |
| scoped_ptr<Token> t1(CreateToken(k1, "bb")); |
| scoped_ptr<Token> t2(CreateToken(k2, "cc")); |
| scoped_ptr<Token> t3(CreateToken(k3, "dd")); |
| scoped_ptr<Token> t4(CreateToken(k4, "ee")); |
| |
| vector<Token *> source_tokens; |
| source_tokens.push_back(t0.get()); |
| source_tokens.push_back(t1.get()); |
| source_tokens.push_back(t2.get()); |
| source_tokens.push_back(t3.get()); |
| source_tokens.push_back(t4.get()); |
| |
| text_dict_->CollectTokens(&source_tokens); |
| BuildSystemDictionary(source_tokens, 100); |
| |
| scoped_ptr<SystemDictionary> system_dic( |
| SystemDictionary::Builder(dic_fn_).Build()); |
| ASSERT_TRUE(system_dic.get() != NULL) |
| << "Failed to open dictionary source:" << dic_fn_; |
| |
| // Prefix search |
| { |
| // "かっこう" (k3) -> "かっこ" (k1) and "かっこう" (k3) |
| // Make sure "がっこう" is not in the results when searched by "かっこう" |
| vector<Token *> to_be_looked_up, not_to_be_looked_up; |
| to_be_looked_up.push_back(t1.get()); |
| to_be_looked_up.push_back(t3.get()); |
| not_to_be_looked_up.push_back(t0.get()); |
| not_to_be_looked_up.push_back(t2.get()); |
| not_to_be_looked_up.push_back(t4.get()); |
| for (size_t i = 0; i < to_be_looked_up.size(); ++i) { |
| CheckTokenExistenceCallback callback(to_be_looked_up[i]); |
| system_dic->LookupPrefix( |
| k3, kDisableKanaModiferInsensitiveLookup, &callback); |
| EXPECT_TRUE(callback.found()) |
| << "Token is not found: " << PrintToken(*to_be_looked_up[i]); |
| } |
| for (size_t i = 0; i < not_to_be_looked_up.size(); ++i) { |
| CheckTokenExistenceCallback callback(not_to_be_looked_up[i]); |
| system_dic->LookupPrefix( |
| k3, kDisableKanaModiferInsensitiveLookup, &callback); |
| EXPECT_FALSE(callback.found()) |
| << "Token should not be found: " |
| << PrintToken(*not_to_be_looked_up[i]); |
| } |
| } |
| |
| // Predictive search |
| { |
| // "かっこ" -> "かっこ" and "かっこう" |
| // Make sure "がっこう" is not in the results when searched by "かっこ" |
| vector<Token *> to_be_looked_up, not_to_be_looked_up; |
| to_be_looked_up.push_back(t1.get()); |
| to_be_looked_up.push_back(t3.get()); |
| not_to_be_looked_up.push_back(t0.get()); |
| not_to_be_looked_up.push_back(t2.get()); |
| not_to_be_looked_up.push_back(t4.get()); |
| for (size_t i = 0; i < to_be_looked_up.size(); ++i) { |
| CheckTokenExistenceCallback callback(to_be_looked_up[i]); |
| system_dic->LookupPredictive( |
| k1, kDisableKanaModiferInsensitiveLookup, &callback); |
| EXPECT_TRUE(callback.found()) |
| << "Token is not found: " << PrintToken(*to_be_looked_up[i]); |
| } |
| for (size_t i = 0; i < not_to_be_looked_up.size(); ++i) { |
| CheckTokenExistenceCallback callback(not_to_be_looked_up[i]); |
| system_dic->LookupPredictive( |
| k3, kDisableKanaModiferInsensitiveLookup, &callback); |
| EXPECT_FALSE(callback.found()) |
| << "Token should not be found: " |
| << PrintToken(*not_to_be_looked_up[i]); |
| } |
| } |
| } |
| |
| } // namespace dictionary |
| } // namespace mozc |