Make the conditions to learn candidates with a punctuation mark more strict

Learning a candidate with a punctuation mark was originally introduced for desktop users so that sentence-like suggestions such as "いつもお世話になっております" can be learned together with punctuation marks like "。".  However, especially on mobile, where zero query suggestions are shown aggressively, the current algorithm is sometimes too aggressive in practice.

To reduce the risk of polluting history entries, this CL imposes some conditions on the history candidate to check whether it is a sentence or not.  Currently the candidate is handled as "sentence-like" when all of the following conditions are met.
- the reading character length is 8 or greater.
- the word ends with a Hiragana character.

Closes Issue 275.

BUG=Issue mozc:275
TEST=unittest

git-svn-id: https://mozc.googlecode.com/svn/trunk@501 a6090854-d499-a067-5803-1114d4e51264
diff --git a/src/mozc_version_template.txt b/src/mozc_version_template.txt
index 030ba22..1ddd71c 100644
--- a/src/mozc_version_template.txt
+++ b/src/mozc_version_template.txt
@@ -1,6 +1,6 @@
 MAJOR=2
 MINOR=16
-BUILD=2027
+BUILD=2028
 REVISION=102
 # NACL_DICTIONARY_VERSION is the target version of the system dictionary to be
 # downloaded by NaCl Mozc.
diff --git a/src/prediction/user_history_predictor.cc b/src/prediction/user_history_predictor.cc
index 57394da..bacf17e 100644
--- a/src/prediction/user_history_predictor.cc
+++ b/src/prediction/user_history_predictor.cc
@@ -132,6 +132,19 @@
           value == "\xEF\xBC\x8C" || value == "\xEF\xBC\x8E");
 }
 
+bool IsSentenceLikeCandidate(const Segment::Candidate &candidate) {
+  // Heuristic: reject an empty value or a reading shorter than 8 characters.
+  // The length is measured on the key (reading) rather than the value to
+  // absorb length differences between value variants, e.g.,
+  // "〜ください" and "〜下さい".
+  if (candidate.value.empty() || Util::CharsLen(candidate.key) < 8) {
+    return false;
+  }
+  // The value must also end with a Hiragana character, e.g., "〜ます".
+  const ConstChar32ReverseIterator iter(candidate.value);
+  bool ret = Util::GetScriptType(iter.Get()) == Util::HIRAGANA;
+  return ret;
+}
+
 // Return romanaized string.
 string ToRoman(const string &str) {
   string result;
@@ -1636,7 +1649,9 @@
       // Check if the previous value looks like a sentence.
       segments->history_segments_size() > 0 &&
       segments->history_segment(
-          segments->history_segments_size() - 1).candidates_size() > 0) {
+          segments->history_segments_size() - 1).candidates_size() > 0 &&
+      IsSentenceLikeCandidate(segments->history_segment(
+          segments->history_segments_size() - 1).candidate(0))) {
     const Entry *entry = &(dic_->Head()->value);
     DCHECK(entry);
     const string &last_value =
@@ -1798,6 +1813,12 @@
             segments->history_segments_size() - 1);
     const SegmentForLearning &conversion_segment =
         learning_segments.conversion_segment(0);
+    // Don't learn a link from/to a punctuation.  Note that another piece of
+    // code handles learning for (sentence + punctuation) form; see Finish().
+    if (IsPunctuation(history_segment.value) ||
+        IsPunctuation(conversion_segment.value)) {
+      return;
+    }
     Entry *history_entry = dic_->MutableLookupWithoutInsert(
         LearningSegmentFingerprint(history_segment));
     NextEntry next_entry;
diff --git a/src/prediction/user_history_predictor.h b/src/prediction/user_history_predictor.h
index 145a429..f0b22ca 100644
--- a/src/prediction/user_history_predictor.h
+++ b/src/prediction/user_history_predictor.h
@@ -215,6 +215,7 @@
   FRIEND_TEST(UserHistoryPredictorTest, UserHistoryPredictorClearTest);
   FRIEND_TEST(UserHistoryPredictorTest,
               UserHistoryPredictorTrailingPunctuation);
+  FRIEND_TEST(UserHistoryPredictorTest, HistoryToPunctuation);
   FRIEND_TEST(UserHistoryPredictorTest,
               UserHistoryPredictorPreceedingPunctuation);
   FRIEND_TEST(UserHistoryPredictorTest, StartsWithPunctuations);
diff --git a/src/prediction/user_history_predictor_test.cc b/src/prediction/user_history_predictor_test.cc
index 0d642d4..a7756bd 100644
--- a/src/prediction/user_history_predictor_test.cc
+++ b/src/prediction/user_history_predictor_test.cc
@@ -1001,6 +1001,92 @@
             segments.segment(0).candidate(1).value);
 }
 
+TEST_F(UserHistoryPredictorTest, HistoryToPunctuation) {
+  UserHistoryPredictor *predictor = GetUserHistoryPredictor();
+  predictor->WaitForSyncer();
+  predictor->ClearAllHistory();
+  predictor->WaitForSyncer();
+
+  Segments segments;
+
+  // Scenario 1: A user commits "亜" by prediction and then commits "。".
+  // Then, the unigram "亜" is learned but the bigram "亜。" shouldn't be.
+  // "あ"
+  MakeSegmentsForPrediction("\xE3\x81\x82", &segments);
+  // "亜"
+  AddCandidate(0, "\xE4\xBA\x9C", &segments);
+  predictor->Finish(&segments);
+  segments.mutable_segment(0)->set_segment_type(Segment::HISTORY);
+
+  // "。"
+  MakeSegmentsForPrediction("\xE3\x80\x82", &segments);
+  AddCandidate(1, "\xE3\x80\x82", &segments);
+  predictor->Finish(&segments);
+
+  segments.Clear();
+  MakeSegmentsForPrediction("\xE3\x81\x82", &segments);  // "あ"
+  ASSERT_TRUE(predictor->Predict(&segments)) << segments.DebugString();
+  // "亜"
+  EXPECT_EQ("\xE4\xBA\x9C", segments.segment(0).candidate(0).value);
+
+  segments.Clear();
+
+  // Scenario 2: the opposite of Scenario 1, i.e., "。亜".  Nothing should be
+  // suggested from the symbol "。".
+  // "。"
+  MakeSegmentsForPrediction("\xE3\x80\x82", &segments);
+  // "。"
+  AddCandidate(0, "\xE3\x80\x82", &segments);
+  predictor->Finish(&segments);
+  segments.mutable_segment(0)->set_segment_type(Segment::HISTORY);
+
+  // "あ"
+  MakeSegmentsForPrediction("\xE3\x81\x82", &segments);
+  // "亜"
+  AddCandidate(1, "\xE4\xBA\x9C", &segments);
+  predictor->Finish(&segments);
+
+  segments.Clear();
+  MakeSegmentsForPrediction("\xE3\x80\x82", &segments);  // "。"
+  EXPECT_FALSE(predictor->Predict(&segments)) << segments.DebugString();
+
+  segments.Clear();
+
+  // Scenario 3: If the history segment looks like a sentence and the committed
+  // value is a punctuation mark, the concatenated entry is also learned.
+  MakeSegmentsForPrediction(
+      // "おつかれさまです"
+      "\xE3\x81\x8A\xE3\x81\xA4\xE3\x81\x8B\xE3\x82\x8C\xE3\x81\x95"
+      "\xE3\x81\xBE\xE3\x81\xA7\xE3\x81\x99",
+      &segments);
+  AddCandidate(0,
+               // "お疲れ様です"
+               "\xE3\x81\x8A\xE7\x96\xB2\xE3\x82\x8C\xE6\xA7\x98"
+               "\xE3\x81\xA7\xE3\x81\x99",
+               &segments);
+  predictor->Finish(&segments);
+  segments.mutable_segment(0)->set_segment_type(Segment::HISTORY);
+
+  // "。"
+  MakeSegmentsForPrediction("\xE3\x80\x82", &segments);
+  AddCandidate(1, "\xE3\x80\x82", &segments);
+  predictor->Finish(&segments);
+
+  segments.Clear();
+  // "おつかれ"
+  MakeSegmentsForPrediction("\xE3\x81\x8A\xE3\x81\xA4\xE3\x81\x8B\xE3\x82\x8C",
+                            &segments);
+  ASSERT_TRUE(predictor->Predict(&segments)) << segments.DebugString();
+  // "お疲れ様です"
+  EXPECT_EQ("\xE3\x81\x8A\xE7\x96\xB2\xE3\x82\x8C\xE6\xA7\x98"
+            "\xE3\x81\xA7\xE3\x81\x99",
+            segments.segment(0).candidate(0).value);
+  // "お疲れ様です。"
+  EXPECT_EQ("\xE3\x81\x8A\xE7\x96\xB2\xE3\x82\x8C\xE6\xA7\x98"
+            "\xE3\x81\xA7\xE3\x81\x99\xE3\x80\x82",
+            segments.segment(0).candidate(1).value);
+}
+
 TEST_F(UserHistoryPredictorTest, UserHistoryPredictorPreceedingPunctuation) {
   UserHistoryPredictor *predictor = GetUserHistoryPredictor();
   predictor->WaitForSyncer();