Make the conditions to learn candidates with a punctuation mark more strict. Learning a candidate with a punctuation mark was originally introduced for desktop users so that sentence-like suggestions such as "いつもお世話になっております" can be learned with punctuation marks like "。". However, especially on mobile, where zero query suggestions are suggested aggressively, the current algorithm is sometimes too aggressive in practice. To reduce the risk of polluting history entries, this CL imposes some conditions on the history candidate to check whether it is a sentence or not. Currently the candidate is handled as "sentence-like" when all the following conditions are met. - the reading character length is 8 or greater. - the word ends with a Hiragana character. Closes Issue 275. BUG=Issue mozc:275 TEST=unittest git-svn-id: https://mozc.googlecode.com/svn/trunk@501 a6090854-d499-a067-5803-1114d4e51264

commit: aa0abb94ff46e28b12245f75c35f7b248e173c88 [log] [tgz]
author: Noriyuki Takahashi <noriyukit@google.com> Sun Jan 25 06:37:36 2015 +0000
committer: Yohei Yukawa <yukawa@google.com> Sun Jan 25 06:37:36 2015 +0000
tree: 10455fe8750f2fac359635af1cf5762562c96926
parent: 0f8b0e76e7a59b78e2033bfcc1941d0055e2ec1e [diff]
diff --git a/src/mozc_version_template.txt b/src/mozc_version_template.txt
index 030ba22..1ddd71c 100644
--- a/src/mozc_version_template.txt
+++ b/src/mozc_version_template.txt

@@ -1,6 +1,6 @@
 MAJOR=2
 MINOR=16
-BUILD=2027
+BUILD=2028
 REVISION=102
 # NACL_DICTIONARY_VERSION is the target version of the system dictionary to be
 # downloaded by NaCl Mozc.

diff --git a/src/prediction/user_history_predictor.cc b/src/prediction/user_history_predictor.cc
index 57394da..bacf17e 100644
--- a/src/prediction/user_history_predictor.cc
+++ b/src/prediction/user_history_predictor.cc

@@ -132,6 +132,19 @@
           value == "\xEF\xBC\x8C" || value == "\xEF\xBC\x8E");
 }
 
+bool IsSentenceLikeCandidate(const Segment::Candidate &candidate) {
+  // A sentence should have a long reading (8 or more characters).  The length
+  // check uses the key (reading) rather than the value so that spelling
+  // variations of the value, e.g., "〜ください" and "〜下さい", count the same.
+  if (candidate.value.empty() || Util::CharsLen(candidate.key) < 8) {
+    return false;
+  }
+  // Target sentences end with Hiragana (e.g., "〜ます"); test the last character.
+  const ConstChar32ReverseIterator iter(candidate.value);
+  bool ret = Util::GetScriptType(iter.Get()) == Util::HIRAGANA;
+  return ret;
+}
+
 // Return romanaized string.
 string ToRoman(const string &str) {
   string result;
@@ -1636,7 +1649,9 @@
       // Check if the previous value looks like a sentence.
       segments->history_segments_size() > 0 &&
       segments->history_segment(
-          segments->history_segments_size() - 1).candidates_size() > 0) {
+          segments->history_segments_size() - 1).candidates_size() > 0 &&
+      IsSentenceLikeCandidate(segments->history_segment(
+          segments->history_segments_size() - 1).candidate(0))) {
     const Entry *entry = &(dic_->Head()->value);
     DCHECK(entry);
     const string &last_value =
@@ -1798,6 +1813,12 @@
             segments->history_segments_size() - 1);
     const SegmentForLearning &conversion_segment =
         learning_segments.conversion_segment(0);
+    // Don't learn a link from/to a punctuation.  Note that another piece of
+    // code handles learning for (sentence + punctuation) form; see Finish().
+    if (IsPunctuation(history_segment.value) ||
+        IsPunctuation(conversion_segment.value)) {
+      return;
+    }
     Entry *history_entry = dic_->MutableLookupWithoutInsert(
         LearningSegmentFingerprint(history_segment));
     NextEntry next_entry;

diff --git a/src/prediction/user_history_predictor.h b/src/prediction/user_history_predictor.h
index 145a429..f0b22ca 100644
--- a/src/prediction/user_history_predictor.h
+++ b/src/prediction/user_history_predictor.h

@@ -215,6 +215,7 @@
   FRIEND_TEST(UserHistoryPredictorTest, UserHistoryPredictorClearTest);
   FRIEND_TEST(UserHistoryPredictorTest,
               UserHistoryPredictorTrailingPunctuation);
+  FRIEND_TEST(UserHistoryPredictorTest, HistoryToPunctuation);
   FRIEND_TEST(UserHistoryPredictorTest,
               UserHistoryPredictorPreceedingPunctuation);
   FRIEND_TEST(UserHistoryPredictorTest, StartsWithPunctuations);

diff --git a/src/prediction/user_history_predictor_test.cc b/src/prediction/user_history_predictor_test.cc
index 0d642d4..a7756bd 100644
--- a/src/prediction/user_history_predictor_test.cc
+++ b/src/prediction/user_history_predictor_test.cc

@@ -1001,6 +1001,92 @@
             segments.segment(0).candidate(1).value);
 }
 
+TEST_F(UserHistoryPredictorTest, HistoryToPunctuation) {
+  UserHistoryPredictor *predictor = GetUserHistoryPredictor();
+  predictor->WaitForSyncer();
+  predictor->ClearAllHistory();
+  predictor->WaitForSyncer();
+
+  Segments segments;
+
+  // Scenario 1: A user has committed "亜" by prediction and then commits "。".
+  // Then, the unigram "亜" is learned but the bigram "亜。" shouldn't be.
+  // "あ"
+  MakeSegmentsForPrediction("\xE3\x81\x82", &segments);
+  // "亜"
+  AddCandidate(0, "\xE4\xBA\x9C", &segments);
+  predictor->Finish(&segments);
+  segments.mutable_segment(0)->set_segment_type(Segment::HISTORY);
+
+  // "。"
+  MakeSegmentsForPrediction("\xE3\x80\x82", &segments);
+  AddCandidate(1, "\xE3\x80\x82", &segments);
+  predictor->Finish(&segments);
+
+  segments.Clear();
+  MakeSegmentsForPrediction("\xE3\x81\x82", &segments);  // "あ"
+  ASSERT_TRUE(predictor->Predict(&segments)) << segments.DebugString();
+  // "亜"
+  EXPECT_EQ("\xE4\xBA\x9C", segments.segment(0).candidate(0).value);
+
+  segments.Clear();
+
+  // Scenario 2: the opposite case to Scenario 1, i.e., "。亜".  Nothing is
+  // suggested from symbol "。".
+  // "。"
+  MakeSegmentsForPrediction("\xE3\x80\x82", &segments);
+  // "。"
+  AddCandidate(0, "\xE3\x80\x82", &segments);
+  predictor->Finish(&segments);
+  segments.mutable_segment(0)->set_segment_type(Segment::HISTORY);
+
+  // "あ"
+  MakeSegmentsForPrediction("\xE3\x81\x82", &segments);
+  // "亜"
+  AddCandidate(1, "\xE4\xBA\x9C", &segments);
+  predictor->Finish(&segments);
+
+  segments.Clear();
+  MakeSegmentsForPrediction("\xE3\x80\x82", &segments);  // "。"
+  EXPECT_FALSE(predictor->Predict(&segments)) << segments.DebugString();
+
+  segments.Clear();
+
+  // Scenario 3: If the history segment looks like a sentence and the committed
+  // value is a punctuation mark, the concatenated entry is also learned.
+  MakeSegmentsForPrediction(
+      // "おつかれさまです"
+      "\xE3\x81\x8A\xE3\x81\xA4\xE3\x81\x8B\xE3\x82\x8C\xE3\x81\x95"
+      "\xE3\x81\xBE\xE3\x81\xA7\xE3\x81\x99",
+      &segments);
+  AddCandidate(0,
+               // "お疲れ様です"
+               "\xE3\x81\x8A\xE7\x96\xB2\xE3\x82\x8C\xE6\xA7\x98"
+               "\xE3\x81\xA7\xE3\x81\x99",
+               &segments);
+  predictor->Finish(&segments);
+  segments.mutable_segment(0)->set_segment_type(Segment::HISTORY);
+
+  // "。"
+  MakeSegmentsForPrediction("\xE3\x80\x82", &segments);
+  AddCandidate(1, "\xE3\x80\x82", &segments);
+  predictor->Finish(&segments);
+
+  segments.Clear();
+  // "おつかれ"
+  MakeSegmentsForPrediction("\xE3\x81\x8A\xE3\x81\xA4\xE3\x81\x8B\xE3\x82\x8C",
+                            &segments);
+  ASSERT_TRUE(predictor->Predict(&segments)) << segments.DebugString();
+  // "お疲れ様です"
+  EXPECT_EQ("\xE3\x81\x8A\xE7\x96\xB2\xE3\x82\x8C\xE6\xA7\x98"
+            "\xE3\x81\xA7\xE3\x81\x99",
+            segments.segment(0).candidate(0).value);
+  // "お疲れ様です。"
+  EXPECT_EQ("\xE3\x81\x8A\xE7\x96\xB2\xE3\x82\x8C\xE6\xA7\x98"
+            "\xE3\x81\xA7\xE3\x81\x99\xE3\x80\x82",
+            segments.segment(0).candidate(1).value);
+}
+
 TEST_F(UserHistoryPredictorTest, UserHistoryPredictorPreceedingPunctuation) {
   UserHistoryPredictor *predictor = GetUserHistoryPredictor();
   predictor->WaitForSyncer();
commit	aa0abb94ff46e28b12245f75c35f7b248e173c88	[log] [tgz]
author	Noriyuki Takahashi <noriyukit@google.com>	Sun Jan 25 06:37:36 2015 +0000
committer	Yohei Yukawa <yukawa@google.com>	Sun Jan 25 06:37:36 2015 +0000
tree	10455fe8750f2fac359635af1cf5762562c96926
parent	0f8b0e76e7a59b78e2033bfcc1941d0055e2ec1e [diff]