Implement rule based zero query suggestion.
BUG=none
TEST=unittest
diff --git a/src/data/zero_query/zero_query.def b/src/data/zero_query/zero_query.def
new file mode 100644
index 0000000..af4b034
--- /dev/null
+++ b/src/data/zero_query/zero_query.def
@@ -0,0 +1,8 @@
+# Rules for triggering zero query suggestion/prediction.
+# File format:
+# trigger<TAB>candidate_1,candidate_2,...,candidate_n
+# ...
+# Note that '#' is the special character for comment lines, so it cannot be
+# placed at the beginning of a rule line.
+
+@ gmail.com
diff --git a/src/mozc_version_template.txt b/src/mozc_version_template.txt
index c2a5339..7556748 100644
--- a/src/mozc_version_template.txt
+++ b/src/mozc_version_template.txt
@@ -1,6 +1,6 @@
MAJOR=2
MINOR=17
-BUILD=2079
+BUILD=2080
REVISION=102
# NACL_DICTIONARY_VERSION is the target version of the system dictionary to be
# downloaded by NaCl Mozc.
diff --git a/src/prediction/dictionary_predictor.cc b/src/prediction/dictionary_predictor.cc
index fd01cde..75dffe5 100644
--- a/src/prediction/dictionary_predictor.cc
+++ b/src/prediction/dictionary_predictor.cc
@@ -58,6 +58,7 @@
#include "dictionary/pos_matcher.h"
#include "prediction/predictor_interface.h"
#include "prediction/suggestion_filter.h"
+#include "prediction/zero_query_data.h"
#include "prediction/zero_query_number_data.h"
#include "session/commands.pb.h"
@@ -98,22 +99,22 @@
int default_num = -1;
int suffix_num = -1;
- for (int i = 0; ZeroQueryNum[i]; ++i) {
- if (default_str == ZeroQueryNum[i][0]) {
+ for (int i = 0; i < kZeroQueryNum_size; ++i) {
+ if (default_str == kZeroQueryNum_data[i][0]) {
default_num = i;
- } else if (history_input == ZeroQueryNum[i][0]) {
+ } else if (history_input == kZeroQueryNum_data[i][0]) {
suffix_num = i;
}
}
DCHECK_GE(default_num, 0);
if (suffix_num != -1) {
- for (int j = 1; ZeroQueryNum[suffix_num][j]; ++j) {
- suffixes->push_back(ZeroQueryNum[suffix_num][j]);
+ for (int j = 1; kZeroQueryNum_data[suffix_num][j]; ++j) {
+ suffixes->push_back(kZeroQueryNum_data[suffix_num][j]);
}
}
- for (int j = 1; ZeroQueryNum[default_num][j]; ++j) {
- suffixes->push_back(ZeroQueryNum[default_num][j]);
+ for (int j = 1; kZeroQueryNum_data[default_num][j]; ++j) {
+ suffixes->push_back(kZeroQueryNum_data[default_num][j]);
}
}
@@ -163,6 +164,11 @@
FLAGS_enable_typing_correction;
}
+struct ZeroQueryRuleCompare {
+ bool operator()(const char **lhs, const char **rhs) const {
+ return (strcmp(lhs[0], rhs[0]) < 0);
+ }
+};
} // namespace
class DictionaryPredictor::PredictiveLookupCallback :
@@ -1617,6 +1623,88 @@
}
}
+// Returns true if we add zero query result.
+bool DictionaryPredictor::AggregateNumberZeroQueryPrediction(
+ const Segments &segments, vector<Result> *results) const {
+ string number_key;
+ if (!GetNumberHistory(segments, &number_key)) {
+ return false;
+ }
+
+ // Use number suffixes and do not add normal zero query.
+ vector<string> suffixes;
+ GetNumberSuffixArray(number_key, &suffixes);
+ DCHECK_GT(suffixes.size(), 0);
+ int cost = 0;
+
+ for (size_t i = 0; i < suffixes.size(); ++i) {
+ const auto &suffix = suffixes[i];
+ // Increment cost to show the candidates in order.
+ const int kSuffixPenalty = 10;
+
+ results->push_back(Result());
+ Result *result = &results->back();
+ result->SetTypesAndTokenAttributes(SUFFIX, Token::NONE);
+ result->key = suffix;
+ result->value = suffix;
+ result->wcost = cost;
+ result->lid = counter_suffix_word_id_;
+ result->rid = counter_suffix_word_id_;
+
+ cost += kSuffixPenalty;
+ }
+ return true;
+}
+
+// Returns true if we add zero query result.
+bool DictionaryPredictor::AggregateZeroQueryPrediction(
+ const Segments &segments, vector<Result> *results) const {
+ const size_t history_size = segments.history_segments_size();
+ if (history_size <= 0) {
+ return false;
+ }
+
+ const Segment &last_segment = segments.history_segment(history_size - 1);
+ DCHECK_GT(last_segment.candidates_size(), 0);
+ const string &history_value = last_segment.candidate(0).value;
+
+ const char *key_item[] = {history_value.c_str(), 0};
+ const char **key = key_item;
+ // kZeroQueryData_data is a 2-dimensional string array and
+ // sorted by the first string.
+ // For each string array, the first item is a key for zero query prediction,
+ // the remaining items are candidates, and the last item is 0.
+ const char ***result_rule =
+ lower_bound(
+ kZeroQueryData_data, kZeroQueryData_data + kZeroQueryData_size,
+ key, ZeroQueryRuleCompare());
+ if (result_rule == (kZeroQueryData_data + kZeroQueryData_size) ||
+ history_value != (*result_rule)[0]) {
+ return false;
+ }
+
+ int cost = 0;
+ for (int i = 1; (*result_rule)[i]; ++i) {
+ string candidate = (*result_rule)[i];
+
+ // Increment cost to show the candidates in order.
+ const int kPenalty = 10;
+
+ results->push_back(Result());
+ Result *result = &results->back();
+
+ result->SetTypesAndTokenAttributes(SUFFIX, Token::NONE);
+ result->key = candidate;
+ result->value = candidate;
+ result->wcost = cost;
+ result->lid = 0; // EOS
+ result->rid = 0; // EOS
+
+ cost += kPenalty;
+ }
+ return true;
+}
+
void DictionaryPredictor::AggregateSuffixPrediction(
PredictionTypes types,
const ConversionRequest &request,
@@ -1630,30 +1718,10 @@
const bool is_zero_query = segments.conversion_segment(0).key().empty();
if (is_zero_query) {
- string number_key;
- if (GetNumberHistory(segments, &number_key)) {
- // Use number suffixes and do not add normal zero query.
- vector<string> suffixes;
- GetNumberSuffixArray(number_key, &suffixes);
- DCHECK_GT(suffixes.size(), 0);
- int cost = 0;
-
- for (vector<string>::const_iterator it = suffixes.begin();
- it != suffixes.end(); ++it) {
- // Increment cost to show the candidates in order.
- const int kSuffixPenalty = 10;
-
- results->push_back(Result());
- Result *result = &results->back();
- result->SetTypesAndTokenAttributes(SUFFIX, Token::NONE);
- result->key = *it;
- result->value = *it;
- result->wcost = cost;
- result->lid = counter_suffix_word_id_;
- result->rid = counter_suffix_word_id_;
-
- cost += kSuffixPenalty;
- }
+ if (AggregateNumberZeroQueryPrediction(segments, results)) {
+ return;
+ }
+ if (AggregateZeroQueryPrediction(segments, results)) {
return;
}
// Fall through
diff --git a/src/prediction/dictionary_predictor.h b/src/prediction/dictionary_predictor.h
index 9c2a0e2..8c7ae69 100644
--- a/src/prediction/dictionary_predictor.h
+++ b/src/prediction/dictionary_predictor.h
@@ -176,6 +176,12 @@
const Segments &segments,
vector<Result> *results) const;
+ bool AggregateNumberZeroQueryPrediction(const Segments &segments,
+ vector<Result> *results) const;
+
+ bool AggregateZeroQueryPrediction(const Segments &segments,
+ vector<Result> *result) const;
+
void ApplyPenaltyForKeyExpansion(const Segments &segments,
vector<Result> *results) const;
@@ -200,6 +206,7 @@
FRIEND_TEST(DictionaryPredictorTest, AggregateSuffixPrediction);
FRIEND_TEST(DictionaryPredictorTest, ZeroQuerySuggestionAfterNumbers);
FRIEND_TEST(DictionaryPredictorTest, TriggerNumberZeroQuerySuggestion);
+ FRIEND_TEST(DictionaryPredictorTest, TriggerZeroQuerySuggestion);
FRIEND_TEST(DictionaryPredictorTest, GetHistoryKeyAndValue);
FRIEND_TEST(DictionaryPredictorTest, RealtimeConversionStartingWithAlphabets);
FRIEND_TEST(DictionaryPredictorTest, IsAggressiveSuggestion);
diff --git a/src/prediction/dictionary_predictor_test.cc b/src/prediction/dictionary_predictor_test.cc
index 9eed0c4..995a8e1 100644
--- a/src/prediction/dictionary_predictor_test.cc
+++ b/src/prediction/dictionary_predictor_test.cc
@@ -2097,7 +2097,54 @@
break;
}
}
- EXPECT_EQ(test_case.expected_result, found);
+ EXPECT_EQ(test_case.expected_result, found) << test_case.history_value;
+ }
+}
+
+TEST_F(DictionaryPredictorTest, TriggerZeroQuerySuggestion) {
+ scoped_ptr<MockDataAndPredictor> data_and_predictor(
+ CreateDictionaryPredictorWithMockData());
+ const DictionaryPredictor *predictor =
+ data_and_predictor->dictionary_predictor();
+ const ConversionRequest conversion_request;
+
+ const struct TestCase {
+ const char *history_key;
+ const char *history_value;
+ const char *find_value;
+ bool expected_result;
+ } kTestCases[] = {
+ { "@", "@",
+ "gmail.com", true },
+ { "!", "!",
+ "?", false },
+ };
+
+ for (size_t i = 0; i < arraysize(kTestCases); ++i) {
+ Segments segments;
+ MakeSegmentsForSuggestion("", &segments);
+
+ const TestCase &test_case = kTestCases[i];
+ PrependHistorySegments(
+ test_case.history_key, test_case.history_value, &segments);
+ vector<DictionaryPredictor::Result> results;
+ predictor->AggregateSuffixPrediction(
+ DictionaryPredictor::SUFFIX,
+ conversion_request, segments, &results);
+ EXPECT_FALSE(results.empty());
+
+ bool found = false;
+ for (vector<DictionaryPredictor::Result>::const_iterator it =
+ results.begin();
+ it != results.end(); ++it) {
+ EXPECT_EQ(it->types, DictionaryPredictor::SUFFIX);
+ if (it->value == test_case.find_value &&
+ it->lid == 0 /* EOS */) {
+ found = true;
+ break;
+ }
+ }
+ EXPECT_EQ(test_case.expected_result, found) << test_case.history_value;
}
}
diff --git a/src/prediction/gen_embedded_string_array_for_zero_query.py b/src/prediction/gen_embedded_string_array_for_zero_query.py
new file mode 100644
index 0000000..f149f37
--- /dev/null
+++ b/src/prediction/gen_embedded_string_array_for_zero_query.py
@@ -0,0 +1,164 @@
+# -*- coding: utf-8 -*-
+# Copyright 2010-2015, Google Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""Generate header file of a string array for zero query suggestion.
+
+Usage:
+gen_embedded_string_array_for_zero_query.py --input input.def \
+ --output /path/to/output/zero_query_hoge.h --var_name ZeroQueryHoge
+
+Input format:
+<key> <TAB> <candidate_1>,<candidate_2>,..,<candidate_n>
+...
+For more details, please refer to definition files under mozc/data/zero_query/
+
+Output format:
+const char *Var0[] = {"Key0", "Cand00", "Cand01", .., 0};
+const char *Var1[] = {"Key1", "Cand10", "Cand11", .., 0};
+
+const char **Var_data[] = {Var0, Var1, .., VarN};
+
+Here, (Key0, Key1, ...) are sorted so that we can use binary search.
+"""
+
+__author__ = "toshiyuki"
+
+import optparse
+import os
+
+
+_MOZC_DIR_FOR_DEFINE_GUARD = 'MOZC'
+
+
+def EscapeString(string):
+ """Escapes string."""
+ return '"' + string.encode('string_escape') + '"'
+
+
+def GetDefineGuardSymbol(file_name):
+ """Returns define guard symbol for .h file.
+
+ For example, returns 'SOME_EXAMPLE_H' for '/path/to/some_example.h'
+
+ Args:
+ file_name: a string indicating output file path.
+ Returns:
+ A string for define guard.
+ """
+ return os.path.basename(file_name).upper().replace('.', '_')
+
+
+def GetDefineGuardHeaderLines(output_file_name):
+ """Returns define guard header for .h file."""
+ result = []
+ result.append(
+ '#ifndef %s_PREDICTION_%s_' %(_MOZC_DIR_FOR_DEFINE_GUARD,
+ GetDefineGuardSymbol(output_file_name)))
+ result.append(
+ '#define %s_PREDICTION_%s_' %(_MOZC_DIR_FOR_DEFINE_GUARD,
+ GetDefineGuardSymbol(output_file_name)))
+ return result
+
+
+def GetDefineGuardFooterLines(output_file_name):
+ """Returns define guard footer for .h file."""
+ return [
+ '#endif // %s_PREDICTION_%s_' %(_MOZC_DIR_FOR_DEFINE_GUARD,
+ GetDefineGuardSymbol(output_file_name))]
+
+
+def GetZeroQueryRules(input_file_name):
+ """Returns zero query triggering rules. The list is sorted by key."""
+ rules = []
+ with open(input_file_name, 'r') as input_file:
+ for line in input_file:
+ if line.startswith('#'):
+ continue
+ line = line.rstrip('\r\n')
+ if not line:
+ continue
+
+ tokens = line.split('\t')
+ key = tokens[0]
+ values = tokens[1].split(',')
+
+ rules.append((key, values))
+ rules.sort(lambda x, y: cmp(x[0], y[0])) # For binary search
+ return rules
+
+
+def GetHeaderContents(input_file_name, var_name, output_file_name):
+ """Returns contents for header file that contains a string array."""
+ zero_query_rules = GetZeroQueryRules(input_file_name)
+
+ result = []
+ result.extend(GetDefineGuardHeaderLines(output_file_name))
+ result.append('namespace mozc {')
+ result.append('namespace {')
+
+ for i, rule in enumerate(zero_query_rules):
+ result.append('const char *%s%d[] = {' % (var_name, i))
+ result.append(' ' + ', '.join(
+ [EscapeString(s) for s in [rule[0]] + rule[1]] + ['0']))
+ result.append('};')
+
+ result.append('} // namespace')
+
+ result.append('const char **%s_data[] = {' % var_name)
+ result.append(' ' + ', '.join(
+ ['%s%d' % (var_name, c) for c in range(len(zero_query_rules))]))
+ result.append('};')
+ result.append(
+ 'const size_t %s_size = %d;' % (var_name, len(zero_query_rules)))
+
+ result.append('} // namespace mozc')
+ result.extend(GetDefineGuardFooterLines(output_file_name))
+ return result
+
+
+def ParseOption():
+ """Parses command line options."""
+ parser = optparse.OptionParser()
+ parser.add_option('--input', dest='input', help='Input file path')
+ parser.add_option('--output', dest='output', help='Output file path')
+ parser.add_option(
+ '--var_name', dest='var_name', help='Var name for the array')
+ return parser.parse_args()[0]
+
+
+def main():
+ options = ParseOption()
+ lines = GetHeaderContents(options.input, options.var_name, options.output)
+ with open(options.output, 'w') as out_file:
+ out_file.write('\n'.join(lines))
+
+
+if __name__ == '__main__':
+ main()
diff --git a/src/prediction/gen_zero_query_number_data.py b/src/prediction/gen_zero_query_number_data.py
deleted file mode 100644
index 724b177..0000000
--- a/src/prediction/gen_zero_query_number_data.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2010-2015, Google Inc.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following disclaimer
-# in the documentation and/or other materials provided with the
-# distribution.
-# * Neither the name of Google Inc. nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-__author__ = "manabe"
-
-import sys
-
-
-def EscapeString(result):
- return '"' + result.encode('string_escape') + '"'
-
-
-def main():
- print "#ifndef MOZC_PREDICTION_ZERO_QUERY_NUM_DATA_H_"
- print "#define MOZC_PREDICTION_ZERO_QUERY_NUM_DATA_H_"
- print "namespace mozc {"
- print "namespace {"
-
- count = 0
- for line in open(sys.argv[1], "r"):
- if line.startswith("#"):
- continue
-
- line = line.rstrip("\r\n")
- if line == "":
- continue
-
- fields = line.split("\t")
- key = fields[0]
- values = [key] + fields[1].split(",")
- print "const char *ZeroQueryNum%d[] = {" % count
- print " " + ", ".join([EscapeString(s) for s in values] + ["0"])
- print "};"
- count += 1
-
- print "} // namespace"
- print "const char **ZeroQueryNum[] = {"
- print " " + ", ".join(["ZeroQueryNum%d" % c for c in range(count)] + ["0"])
- print "};"
- print "} // namespace mozc"
- print "#endif // MOZC_PREDICTION_ZERO_QUERY_NUM_DATA_H_"
-
-
-if __name__ == "__main__":
- main()
diff --git a/src/prediction/prediction.gyp b/src/prediction/prediction.gyp
index a9761d1..21badd8 100644
--- a/src/prediction/prediction.gyp
+++ b/src/prediction/prediction.gyp
@@ -58,6 +58,7 @@
'../session/session_base.gyp:session_protocol',
'../storage/storage.gyp:storage',
'../usage_stats/usage_stats_base.gyp:usage_stats',
+ 'gen_zero_query_data#host',
'gen_zero_query_number_data#host',
'prediction_base.gyp:suggestion_filter',
'prediction_protocol',
@@ -76,23 +77,52 @@
],
},
'inputs': [
- 'gen_zero_query_number_data.py',
+ 'gen_embedded_string_array_for_zero_query.py',
'<@(input_files)',
],
'outputs': [
'<(gen_out_dir)/zero_query_number_data.h',
],
'action': [
- 'python', '../build_tools/redirect.py',
- '<(gen_out_dir)/zero_query_number_data.h',
- 'gen_zero_query_number_data.py',
- '<@(input_files)',
+ 'python', 'gen_embedded_string_array_for_zero_query.py',
+ '--input=<@(input_files)',
+ '--var_name=kZeroQueryNum',
+ '--output=<(gen_out_dir)/zero_query_number_data.h',
],
'message': 'Generating <(gen_out_dir)/zero_query_number_data.h',
},
],
},
{
+ 'target_name': 'gen_zero_query_data',
+ 'type': 'none',
+ 'toolsets': ['host'],
+ 'actions': [
+ {
+ 'action_name': 'gen_zero_query_data',
+ 'variables': {
+ 'input_files': [
+ '../data/zero_query/zero_query.def',
+ ],
+ },
+ 'inputs': [
+ 'gen_embedded_string_array_for_zero_query.py',
+ '<@(input_files)',
+ ],
+ 'outputs': [
+ '<(gen_out_dir)/zero_query_data.h',
+ ],
+ 'action': [
+ 'python', 'gen_embedded_string_array_for_zero_query.py',
+ '--input=<@(input_files)',
+ '--var_name=kZeroQueryData',
+ '--output=<(gen_out_dir)/zero_query_data.h',
+ ],
+ 'message': 'Generating <(gen_out_dir)/zero_query_data.h',
+ },
+ ],
+ },
+ {
'target_name': 'genproto_prediction',
'type': 'none',
'toolsets': ['host'],