Learn content word for real time conversion in the mobile mode

To propagate the segmentation information in mobile use, this CL extends Segment::Candidate::inner_segment_boundary to also include the content key and content value lengths.  This extra information is used by UserHistoryPredictor to reconstruct content words.

Motivation:
In mobile mode, it would be useful if content words could be learned.  Here are some examples.

Case 1:
1. Type がっこうに
2. Select 学校に
Currently 学校に (which is one segment) is learned, but 学校 (the content word) is not.  Learning 学校 makes sense on mobile because particles are offered by zero query suggestion in the mobile mode.

Case 2:
1. Type とうきょうかなごやにいきたい
2. Select 東京か名古屋に行きたい
This is similar to case 1, but here we can learn both 東京 and 名古屋.

Note that this CL should not change anything in the desktop mode.

BUG=none
TEST=unittest

git-svn-id: https://mozc.googlecode.com/svn/trunk@449 a6090854-d499-a067-5803-1114d4e51264
diff --git a/src/converter/converter_main.cc b/src/converter/converter_main.cc
index 26c0219..7a60513 100644
--- a/src/converter/converter_main.cc
+++ b/src/converter/converter_main.cc
@@ -168,20 +168,22 @@
   if (cand.inner_segment_boundary.empty()) {
     return "";
   }
-  vector<StringPiece> pieces;
-  const char *boundary_begin = cand.value.data();
-  const char *boundary_end = boundary_begin;
-  for (size_t i = 0; i < cand.inner_segment_boundary.size(); ++i) {
-    for (int j = 0; j < cand.inner_segment_boundary[i].second; ++j) {
-      boundary_end += Util::OneCharLen(boundary_end);
-    }
-    pieces.push_back(StringPiece(boundary_begin,
-                                 boundary_end - boundary_begin));
-    boundary_begin = boundary_end;
+  vector<string> pieces;
+  for (Segment::Candidate::InnerSegmentIterator iter(&cand);
+       !iter.Done(); iter.Next()) {
+    string s = "<";
+    iter.GetKey().AppendToString(&s);
+    s.append(", ");
+    iter.GetValue().AppendToString(&s);
+    s.append(", ");
+    iter.GetContentKey().AppendToString(&s);
+    s.append(", ");
+    iter.GetContentValue().AppendToString(&s);
+    s.append(1, '>');
+    pieces.push_back(s);
   }
-  CHECK_EQ(cand.value.data() + cand.value.size(), boundary_begin);
   string s;
-  Util::JoinStringPieces(pieces, " | ", &s);
+  Util::JoinStrings(pieces, " | ", &s);
   return s;
 }
 
diff --git a/src/converter/converter_test.cc b/src/converter/converter_test.cc
index 5898407..b64c1fc 100644
--- a/src/converter/converter_test.cc
+++ b/src/converter/converter_test.cc
@@ -238,7 +238,7 @@
   EngineInterface *CreateEngineWithMobilePredictor() {
     Engine *engine = new Engine;
     testing::MockDataManager data_manager;
-    engine->Init(&data_manager, MobilePredictor::CreateMobilePredictor);
+    engine->Init(&data_manager, MobilePredictor::CreateMobilePredictor, true);
     return engine;
   }
 
@@ -1308,7 +1308,8 @@
                           suggegstion_filter.get()),
                       new UserHistoryPredictor(dictionary.get(),
                                                data_manager.GetPOSMatcher(),
-                                               suppression_dictionary.get())),
+                                               suppression_dictionary.get(),
+                                               false)),
                   new RewriterImpl(converter.get(),
                                    &data_manager,
                                    pos_group.get(),
diff --git a/src/converter/immutable_converter.h b/src/converter/immutable_converter.h
index b28303e..ed255a8 100644
--- a/src/converter/immutable_converter.h
+++ b/src/converter/immutable_converter.h
@@ -78,6 +78,7 @@
   friend class NBestGeneratorTest;
   FRIEND_TEST(NBestGeneratorTest, MultiSegmentConnectionTest);
   FRIEND_TEST(NBestGeneratorTest, SingleSegmentConnectionTest);
+  FRIEND_TEST(NBestGeneratorTest, InnerSegmentBoundary);
 
   enum InsertCandidatesType {
     MULTI_SEGMENTS,  // Normal conversion ("私の|名前は|中野です")
diff --git a/src/converter/immutable_converter_test.cc b/src/converter/immutable_converter_test.cc
index aa2a446..5b982eb 100644
--- a/src/converter/immutable_converter_test.cc
+++ b/src/converter/immutable_converter_test.cc
@@ -228,8 +228,8 @@
   // "てすと"
   SetCandidate("\xE3\x81\xA6\xE3\x81\x99\xE3\x81\xA8", "test", &segment);
   Segment::Candidate *c = segment.mutable_candidate(0);
-  c->inner_segment_boundary.push_back(pair<int, int>(1, 2));
-  c->inner_segment_boundary.push_back(pair<int, int>(2, 2));
+  c->PushBackInnerSegmentBoundary(3, 2, 3, 2);
+  c->PushBackInnerSegmentBoundary(6, 2, 6, 2);
   EXPECT_TRUE(c->IsValid());
 
   data_and_converter->GetConverter()->InsertDummyCandidates(&segment, 10);
@@ -374,14 +374,38 @@
 
   // Result will be, "私の|名前は|中ノです" with mock dictionary.
   const Segment::Candidate &cand = segments.segment(0).candidate(0);
-  ASSERT_EQ(3, cand.inner_segment_boundary.size());
-  EXPECT_EQ(4, cand.inner_segment_boundary[0].first);
-  EXPECT_EQ(4, cand.inner_segment_boundary[1].first);
-  EXPECT_EQ(5, cand.inner_segment_boundary[2].first);
+  vector<StringPiece> keys, values, content_keys, content_values;
+  for (Segment::Candidate::InnerSegmentIterator iter(&cand);
+       !iter.Done(); iter.Next()) {
+    keys.push_back(iter.GetKey());
+    values.push_back(iter.GetValue());
+    content_keys.push_back(iter.GetContentKey());
+    content_values.push_back(iter.GetContentValue());
+  }
+  ASSERT_EQ(3, keys.size());
+  // "わたしの" | "なまえは" | "なかのです"
+  EXPECT_EQ("\xe3\x82\x8f\xe3\x81\x9f\xe3\x81\x97\xe3\x81\xae", keys[0]);
+  EXPECT_EQ("\xe3\x81\xaa\xe3\x81\xbe\xe3\x81\x88\xe3\x81\xaf", keys[1]);
+  EXPECT_EQ("\xe3\x81\xaa\xe3\x81\x8b\xe3\x81\xae\xe3\x81\xa7\xe3\x81\x99",
+            keys[2]);
 
-  EXPECT_EQ(2, cand.inner_segment_boundary[0].second);
-  EXPECT_EQ(3, cand.inner_segment_boundary[1].second);
-  EXPECT_EQ(4, cand.inner_segment_boundary[2].second);
+  // "私の" | "名前は" | "中ノです"
+  ASSERT_EQ(3, values.size());
+  EXPECT_EQ("\xe7\xa7\x81\xe3\x81\xae", values[0]);
+  EXPECT_EQ("\xe5\x90\x8d\xe5\x89\x8d\xe3\x81\xaf", values[1]);
+  EXPECT_EQ("\xe4\xb8\xad\xe3\x83\x8e\xe3\x81\xa7\xe3\x81\x99", values[2]);
+
+  ASSERT_EQ(3, content_keys.size());
+  // "わたし" | "なまえ" | "なかの"
+  EXPECT_EQ("\xe3\x82\x8f\xe3\x81\x9f\xe3\x81\x97", content_keys[0]);
+  EXPECT_EQ("\xe3\x81\xaa\xe3\x81\xbe\xe3\x81\x88", content_keys[1]);
+  EXPECT_EQ("\xe3\x81\xaa\xe3\x81\x8b\xe3\x81\xae", content_keys[2]);
+
+  // "私" | "名前" | "中ノ"
+  ASSERT_EQ(3, content_values.size());
+  EXPECT_EQ("\xe7\xa7\x81", content_values[0]);
+  EXPECT_EQ("\xe5\x90\x8d\xe5\x89\x8d", content_values[1]);
+  EXPECT_EQ("\xe4\xb8\xad\xe3\x83\x8e", content_values[2]);
 }
 
 TEST_F(ImmutableConverterTest, NoInnerSegmenBoundaryForConversion) {
diff --git a/src/converter/nbest_generator.cc b/src/converter/nbest_generator.cc
index 0fe1dc8..ec37a34 100644
--- a/src/converter/nbest_generator.cc
+++ b/src/converter/nbest_generator.cc
@@ -223,27 +223,49 @@
 
   candidate->inner_segment_boundary.clear();
   if (check_mode_ == ONLY_EDGE) {
-    // For realtime conversion.
-    // Set inner segment boundary for user history prediction from
-    // realtime conversion result.
-    int key_len, value_len;
-    key_len = Util::CharsLen(nodes[0]->key);
-    value_len = Util::CharsLen(nodes[0]->value);
+    // For realtime conversion.  Set inner segment boundary for user history
+    // prediction from realtime conversion result.
+    size_t key_len = nodes[0]->key.size(), value_len = nodes[0]->value.size();
+    size_t content_key_len = key_len, content_value_len = value_len;
+    bool is_content_boundary = false;
+    if (pos_matcher_->IsFunctional(nodes[0]->rid)) {
+      is_content_boundary = true;
+      content_key_len = 0;
+      content_value_len = 0;
+    }
     for (size_t i = 1; i < nodes.size(); ++i) {
       const Node *lnode = nodes[i - 1];
       const Node *rnode = nodes[i];
       const bool kMultipleSegments = false;
       if (segmenter_->IsBoundary(lnode, rnode, kMultipleSegments)) {
-        candidate->inner_segment_boundary.push_back(
-            pair<int, int>(key_len, value_len));
+        candidate->PushBackInnerSegmentBoundary(
+            key_len, value_len, content_key_len, content_value_len);
         key_len = 0;
         value_len = 0;
+        content_key_len = 0;
+        content_value_len = 0;
+        is_content_boundary = false;
       }
-      key_len += Util::CharsLen(rnode->key);
-      value_len += Util::CharsLen(rnode->value);
+      key_len += rnode->key.size();
+      value_len += rnode->value.size();
+      if (is_content_boundary) {
+        continue;
+      }
+      // Set boundary only after content nouns or pronouns.  For example,
+      // "走った" is formed as
+      //     "走っ" (content word) + "た" (functional).
+      // Since the content word is incomplete, we don't want to learn "走っ".
+      if ((pos_matcher_->IsContentNoun(lnode->rid) ||
+           pos_matcher_->IsPronoun(lnode->rid)) &&
+          pos_matcher_->IsFunctional(rnode->lid)) {
+        is_content_boundary = true;
+      } else {
+        content_key_len += rnode->key.size();
+        content_value_len += rnode->value.size();
+      }
     }
-    candidate->inner_segment_boundary.push_back(
-        pair<int, int>(key_len, value_len));
+    candidate->PushBackInnerSegmentBoundary(
+        key_len, value_len, content_key_len, content_value_len);
   }
 }
 
diff --git a/src/converter/nbest_generator_test.cc b/src/converter/nbest_generator_test.cc
index 7104e99..6517f1d 100644
--- a/src/converter/nbest_generator_test.cc
+++ b/src/converter/nbest_generator_test.cc
@@ -331,4 +331,105 @@
               result_segment.candidate(0).value);
   }
 }
+
+TEST_F(NBestGeneratorTest, InnerSegmentBoundary) {
+  scoped_ptr<MockDataAndImmutableConverter> data_and_converter(
+      new MockDataAndImmutableConverter);
+  ImmutableConverterImpl *converter = data_and_converter->GetConverter();
+
+  Segments segments;
+  segments.set_request_type(Segments::PREDICTION);
+  // "とうきょうかなごやにいきたい"
+  const string kInput =
+      "\xe3\x81\xa8\xe3\x81\x86\xe3\x81\x8d\xe3\x82\x87\xe3\x81\x86"
+      "\xe3\x81\x8b\xe3\x81\xaa\xe3\x81\x94\xe3\x82\x84\xe3\x81\xab"
+      "\xe3\x81\x84\xe3\x81\x8d\xe3\x81\x9f\xe3\x81\x84";
+  {
+    Segment *segment = segments.add_segment();
+    segment->set_segment_type(Segment::FREE);
+    segment->set_key(kInput);
+  }
+
+  Lattice lattice;
+  lattice.SetKey(kInput);
+  const ConversionRequest request;
+  converter->MakeLattice(request, &segments, &lattice);
+
+  vector<uint16> group;
+  converter->MakeGroup(segments, &group);
+  converter->Viterbi(segments, &lattice);
+
+  scoped_ptr<NBestGenerator> nbest_generator(
+      data_and_converter->CreateNBestGenerator(&lattice));
+
+  const bool kSingleSegment = true;  // For realtime conversion
+  const Node *begin_node = lattice.bos_nodes();
+  const Node *end_node = GetEndNode(
+      *converter, segments, *begin_node, group, kSingleSegment);
+
+  nbest_generator->Reset(begin_node, end_node, NBestGenerator::ONLY_EDGE);
+  Segment result_segment;
+  GatherCandidates(
+      10, Segments::PREDICTION, nbest_generator.get(), &result_segment);
+  ASSERT_LE(1, result_segment.candidates_size());
+
+  const Segment::Candidate &top_cand = result_segment.candidate(0);
+  EXPECT_EQ(kInput, top_cand.key);
+  // "東京か名古屋に行きたい"
+  EXPECT_EQ("\xe6\x9d\xb1\xe4\xba\xac\xe3\x81\x8b\xe5\x90\x8d\xe5\x8f\xa4"
+            "\xe5\xb1\x8b\xe3\x81\xab\xe8\xa1\x8c\xe3\x81\x8d\xe3\x81\x9f"
+            "\xe3\x81\x84",
+            top_cand.value);
+
+  vector<StringPiece> keys, values, content_keys, content_values;
+  for (Segment::Candidate::InnerSegmentIterator iter(&top_cand);
+       !iter.Done(); iter.Next()) {
+    keys.push_back(iter.GetKey());
+    values.push_back(iter.GetValue());
+    content_keys.push_back(iter.GetContentKey());
+    content_values.push_back(iter.GetContentValue());
+  }
+  ASSERT_EQ(3, keys.size());
+  ASSERT_EQ(3, values.size());
+  ASSERT_EQ(3, content_keys.size());
+  ASSERT_EQ(3, content_values.size());
+
+  // Inner segment 0
+  // "とうきょうか"
+  EXPECT_EQ("\xe3\x81\xa8\xe3\x81\x86\xe3\x81\x8d\xe3\x82\x87\xe3\x81\x86"
+            "\xe3\x81\x8b", keys[0]);
+  // "東京か"
+  EXPECT_EQ("\xe6\x9d\xb1\xe4\xba\xac\xe3\x81\x8b", values[0]);
+  // "とうきょう"
+  EXPECT_EQ("\xe3\x81\xa8\xe3\x81\x86\xe3\x81\x8d\xe3\x82\x87\xe3\x81\x86",
+            content_keys[0]);
+  // "東京"
+  EXPECT_EQ("\xe6\x9d\xb1\xe4\xba\xac", content_values[0]);
+
+  // Inner segment 1
+  // "なごやに"
+  EXPECT_EQ("\xe3\x81\xaa\xe3\x81\x94\xe3\x82\x84\xe3\x81\xab", keys[1]);
+  // "名古屋に"
+  EXPECT_EQ("\xe5\x90\x8d\xe5\x8f\xa4\xe5\xb1\x8b\xe3\x81\xab", values[1]);
+  // "なごや"
+  EXPECT_EQ("\xe3\x81\xaa\xe3\x81\x94\xe3\x82\x84", content_keys[1]);
+  // "名古屋"
+  EXPECT_EQ("\xe5\x90\x8d\xe5\x8f\xa4\xe5\xb1\x8b", content_values[1]);
+
+  // Inner segment 2: In the original segment, "行きたい" has the form
+  // "行き" (content word) + "たい" (functional).  However, since "行き" is
+  // Yougen, our rule for inner segment boundary doesn't handle it as a content
+  // value.  Thus, "行きたい" becomes the content value.
+  // "いきたい"
+  EXPECT_EQ("\xe3\x81\x84\xe3\x81\x8d\xe3\x81\x9f\xe3\x81\x84", keys[2]);
+  // "行きたい"
+  EXPECT_EQ("\xe8\xa1\x8c\xe3\x81\x8d\xe3\x81\x9f\xe3\x81\x84", values[2]);
+  // "いきたい"
+  EXPECT_EQ("\xe3\x81\x84\xe3\x81\x8d\xe3\x81\x9f\xe3\x81\x84",
+            content_keys[2]);
+  // "行きたい"
+  EXPECT_EQ("\xe8\xa1\x8c\xe3\x81\x8d\xe3\x81\x9f\xe3\x81\x84",
+            content_values[2]);
+}
+
 }  // namespace mozc
diff --git a/src/converter/segments.cc b/src/converter/segments.cc
index dc035d9..af2e764 100644
--- a/src/converter/segments.cc
+++ b/src/converter/segments.cc
@@ -91,18 +91,43 @@
     // The sums of the lengths of key and value components must coincide with
     // those of key and value, respectively.
     size_t sum_key_len = 0, sum_value_len = 0;
-    for (size_t i = 0; i < inner_segment_boundary.size(); ++i) {
-      sum_key_len += inner_segment_boundary[i].first;
-      sum_value_len += inner_segment_boundary[i].second;
+    for (InnerSegmentIterator iter(this); !iter.Done(); iter.Next()) {
+      sum_key_len += iter.GetKey().size();
+      sum_value_len += iter.GetValue().size();
     }
-    if (sum_key_len != Util::CharsLen(key) ||
-        sum_value_len != Util::CharsLen(value)) {
+    if (sum_key_len != key.size() || sum_value_len != value.size()) {
       return false;
     }
   }
   return true;
 }
 
+bool Segment::Candidate::EncodeLengths(
+    size_t key_len, size_t value_len,
+    size_t content_key_len, size_t content_value_len, uint32 *result) {
+  if (key_len > kuint8max || value_len > kuint8max ||
+      content_key_len > kuint8max || content_value_len > kuint8max) {
+    return false;
+  }
+  *result = (static_cast<uint32>(key_len) << 24) |
+            (static_cast<uint32>(value_len) << 16) |
+            (static_cast<uint32>(content_key_len) << 8) |
+            static_cast<uint32>(content_value_len);
+  return true;
+}
+
+bool Segment::Candidate::PushBackInnerSegmentBoundary(
+    size_t key_len, size_t value_len,
+    size_t content_key_len, size_t content_value_len) {
+  uint32 encoded;
+  if (EncodeLengths(key_len, value_len, content_key_len, content_value_len,
+                    &encoded)) {
+    inner_segment_boundary.push_back(encoded);
+    return true;
+  }
+  return false;
+}
+
 string Segment::Candidate::DebugString() const {
   stringstream os;
   os << "(key=" << key
@@ -128,15 +153,50 @@
   if (!inner_segment_boundary.empty()) {
     os << " segbdd=";
     for (size_t i = 0; i < inner_segment_boundary.size(); ++i) {
-      os << Util::StringPrintf("<%d,%d>",
-                               inner_segment_boundary[i].first,
-                               inner_segment_boundary[i].second);
+      const uint32 encoded_lengths = inner_segment_boundary[i];
+      const int key_len = encoded_lengths >> 24;
+      const int value_len = (encoded_lengths >> 16) & 0xff;
+      const int content_key_len = (encoded_lengths >> 8) & 0xff;
+      const int content_value_len = encoded_lengths & 0xff;
+      os << Util::StringPrintf("<%d,%d,%d,%d>", key_len, value_len,
+                               content_key_len, content_value_len);
     }
   }
   os << ")" << endl;
   return os.str();
 }
 
+void Segment::Candidate::InnerSegmentIterator::Next() {
+  DCHECK_LT(index_, candidate_->inner_segment_boundary.size());
+  const uint32 encoded_lengths = candidate_->inner_segment_boundary[index_++];
+  key_offset_ += encoded_lengths >> 24;
+  value_offset_ += (encoded_lengths >> 16) & 0xff;
+}
+
+StringPiece Segment::Candidate::InnerSegmentIterator::GetKey() const {
+  DCHECK_LT(index_, candidate_->inner_segment_boundary.size());
+  const uint32 encoded_lengths = candidate_->inner_segment_boundary[index_];
+  return StringPiece(key_offset_, encoded_lengths >> 24);
+}
+
+StringPiece Segment::Candidate::InnerSegmentIterator::GetValue() const {
+  DCHECK_LT(index_, candidate_->inner_segment_boundary.size());
+  const uint32 encoded_lengths = candidate_->inner_segment_boundary[index_];
+  return StringPiece(value_offset_, (encoded_lengths >> 16) & 0xff);
+}
+
+StringPiece Segment::Candidate::InnerSegmentIterator::GetContentKey() const {
+  DCHECK_LT(index_, candidate_->inner_segment_boundary.size());
+  const uint32 encoded_lengths = candidate_->inner_segment_boundary[index_];
+  return StringPiece(key_offset_, (encoded_lengths >> 8) & 0xff);
+}
+
+StringPiece Segment::Candidate::InnerSegmentIterator::GetContentValue() const {
+  DCHECK_LT(index_, candidate_->inner_segment_boundary.size());
+  const uint32 encoded_lengths = candidate_->inner_segment_boundary[index_];
+  return StringPiece(value_offset_, encoded_lengths & 0xff);
+}
+
 Segment::Segment()
     : segment_type_(FREE),
       pool_(new ObjectPool<Candidate>(16)) {}
diff --git a/src/converter/segments.h b/src/converter/segments.h
index c78913f..2c87628 100644
--- a/src/converter/segments.h
+++ b/src/converter/segments.h
@@ -170,13 +170,60 @@
     // The style is defined in enum |Command|.
     Command command;
 
-    // Boundary information for realtime conversion.
-    // This will be set only for realtime conversion result candidates.
-    // This contains inner segment size for key and value.
-    // If the candidate key and value are
-    // "わたしの|なまえは|なかのです", " 私の|名前は|中野です",
-    // |inner_segment_boundary| have [(4,2), (4, 3), (5, 4)].
-    vector<pair<int, int> > inner_segment_boundary;
+    // Boundary information for realtime conversion.  This will be set only for
+    // realtime conversion result candidates.  Each element is the encoded
+    // lengths of key, value, content key and content value.
+    vector<uint32> inner_segment_boundary;
+
+    static bool EncodeLengths(size_t key_len, size_t value_len,
+                              size_t content_key_len,
+                              size_t content_value_len,
+                              uint32 *result);
+
+    // Ignores errors; returns 0 when any length exceeds 255.
+    static uint32 EncodeLengths(size_t key_len, size_t value_len,
+                                size_t content_key_len,
+                                size_t content_value_len) {
+      uint32 result = 0;  // Stays 0 if the encoding below fails.
+      EncodeLengths(key_len, value_len, content_key_len, content_value_len,
+                    &result);
+      return result;
+    }
+
+    // Inserts a new element to |inner_segment_boundary|.  If one of four
+    // lengths is longer than 255, this method returns false.
+    bool PushBackInnerSegmentBoundary(size_t key_len, size_t value_len,
+                                      size_t content_key_len,
+                                      size_t content_value_len);
+
+    // Iterates inner segments.  Usage example:
+    // for (InnerSegmentIterator iter(&cand); !iter.Done(); iter.Next()) {
+    //   StringPiece s = iter.GetContentKey();
+    //   ...
+    // }
+    class InnerSegmentIterator {
+     public:
+      explicit InnerSegmentIterator(const Candidate *candidate)
+          : candidate_(candidate), key_offset_(candidate->key.data()),
+            value_offset_(candidate->value.data()),
+            index_(0) {}
+
+      bool Done() const {
+        return index_ == candidate_->inner_segment_boundary.size();
+      }
+
+      void Next();
+      StringPiece GetKey() const;
+      StringPiece GetValue() const;
+      StringPiece GetContentKey() const;
+      StringPiece GetContentValue() const;
+
+     private:
+      const Candidate *candidate_;
+      const char *key_offset_;
+      const char *value_offset_;
+      size_t index_;
+    };
 
     void Init() {
       key.clear();
diff --git a/src/converter/segments_test.cc b/src/converter/segments_test.cc
index 2cf4f1d..fce5aa1 100644
--- a/src/converter/segments_test.cc
+++ b/src/converter/segments_test.cc
@@ -290,7 +290,7 @@
   src.attributes = 6;
   src.style = NumberUtil::NumberString::NUMBER_CIRCLED;
   src.command = Segment::Candidate::DISABLE_PRESENTATION_MODE;
-  src.inner_segment_boundary.push_back(pair<int, int>(1, 3));
+  src.PushBackInnerSegmentBoundary(1, 3, 5, 7);
 
   dest.CopyFrom(src);
 
@@ -339,15 +339,20 @@
   EXPECT_TRUE(c.IsValid());  // Empty inner_segment_boundary
 
   // Valid inner_segment_boundary.
-  c.inner_segment_boundary.push_back(pair<int, int>(1, 3));
-  c.inner_segment_boundary.push_back(pair<int, int>(2, 2));
+  c.inner_segment_boundary.push_back(
+      Segment::Candidate::EncodeLengths(1, 3, 8, 8));
+  c.inner_segment_boundary.push_back(
+      Segment::Candidate::EncodeLengths(2, 2, 3, 3));
   EXPECT_TRUE(c.IsValid());
 
   // Invalid inner_segment_boundary.
   c.inner_segment_boundary.clear();
-  c.inner_segment_boundary.push_back(pair<int, int>(1, 1));
-  c.inner_segment_boundary.push_back(pair<int, int>(2, 2));
-  c.inner_segment_boundary.push_back(pair<int, int>(3, 3));
+  c.inner_segment_boundary.push_back(
+      Segment::Candidate::EncodeLengths(1, 1, 2, 2));
+  c.inner_segment_boundary.push_back(
+      Segment::Candidate::EncodeLengths(2, 2, 3, 3));
+  c.inner_segment_boundary.push_back(
+      Segment::Candidate::EncodeLengths(3, 3, 4, 4));
   EXPECT_FALSE(c.IsValid());
 }
 
@@ -512,6 +517,54 @@
   EXPECT_EQ("", candidate.functional_value());
 }
 
+TEST_F(CandidateTest, InnerSegmentIterator) {
+  {
+    // For empty inner_segment_boundary, the initial state is done.
+    Segment::Candidate candidate;
+    candidate.Init();
+    candidate.key = "testfoobar";
+    candidate.value = "redgreenblue";
+    Segment::Candidate::InnerSegmentIterator iter(&candidate);
+    EXPECT_TRUE(iter.Done());
+  }
+  {
+    //           key: test | foobar
+    //         value:  red | greenblue
+    //   content key: test | foo
+    // content value:  red | green
+    Segment::Candidate candidate;
+    candidate.Init();
+    candidate.key = "testfoobar";
+    candidate.value = "redgreenblue";
+    candidate.PushBackInnerSegmentBoundary(4, 3, 4, 3);
+    candidate.PushBackInnerSegmentBoundary(6, 9, 3, 5);
+    vector<StringPiece> keys, values, content_keys, content_values;
+    for (Segment::Candidate::InnerSegmentIterator iter(&candidate);
+         !iter.Done(); iter.Next()) {
+      keys.push_back(iter.GetKey());
+      values.push_back(iter.GetValue());
+      content_keys.push_back(iter.GetContentKey());
+      content_values.push_back(iter.GetContentValue());
+    }
+
+    ASSERT_EQ(2, keys.size());
+    EXPECT_EQ("test", keys[0]);
+    EXPECT_EQ("foobar", keys[1]);
+
+    ASSERT_EQ(2, values.size());
+    EXPECT_EQ("red", values[0]);
+    EXPECT_EQ("greenblue", values[1]);
+
+    ASSERT_EQ(2, content_keys.size());
+    EXPECT_EQ("test", content_keys[0]);
+    EXPECT_EQ("foo", content_keys[1]);
+
+    ASSERT_EQ(2, content_values.size());
+    EXPECT_EQ("red", content_values[0]);
+    EXPECT_EQ("green", content_values[1]);
+  }
+}
+
 TEST_F(SegmentTest, CopyFrom) {
   Segment src, dest;
 
diff --git a/src/data/rules/pos_matcher_rule.def b/src/data/rules/pos_matcher_rule.def
index d5cc6b7..1e60577 100644
--- a/src/data/rules/pos_matcher_rule.def
+++ b/src/data/rules/pos_matcher_rule.def
@@ -68,6 +68,9 @@
 # 一般名詞
 GeneralNoun ^名詞,一般,*,*,*,*,*$
 
+# 代名詞
+Pronoun ^名詞,代名詞,
+
 # Content Noun
 ContentNoun ^名詞,(一般|固有名詞|副詞可能|サ変接続),
 
diff --git a/src/engine/chromeos_engine_factory.cc b/src/engine/chromeos_engine_factory.cc
index 8539788..024a9b6 100644
--- a/src/engine/chromeos_engine_factory.cc
+++ b/src/engine/chromeos_engine_factory.cc
@@ -63,7 +63,8 @@
   Engine *engine = new Engine;
   DCHECK(engine);
   ScopedDataManager data_manager(chromeos::CreateDataManager());
-  engine->Init(data_manager.Get(), DefaultPredictor::CreateDefaultPredictor);
+  engine->Init(data_manager.Get(), DefaultPredictor::CreateDefaultPredictor,
+               false);
   return engine;
 }
 
diff --git a/src/engine/engine.cc b/src/engine/engine.cc
index 188f804..232ee8f 100644
--- a/src/engine/engine.cc
+++ b/src/engine/engine.cc
@@ -136,7 +136,8 @@
 void Engine::Init(
     const DataManagerInterface *data_manager,
     PredictorInterface *(*predictor_factory)(PredictorInterface *,
-                                             PredictorInterface *)) {
+                                             PredictorInterface *),
+    bool enable_content_word_learning) {
   CHECK(data_manager);
   CHECK(predictor_factory);
 
@@ -225,7 +226,8 @@
     PredictorInterface *user_history_predictor =
         new UserHistoryPredictor(dictionary_.get(),
                                  data_manager->GetPOSMatcher(),
-                                 suppression_dictionary_.get());
+                                 suppression_dictionary_.get(),
+                                 enable_content_word_learning);
     CHECK(user_history_predictor);
 
     predictor_ = (*predictor_factory)(dictionary_predictor,
diff --git a/src/engine/engine.h b/src/engine/engine.h
index fb82fe1..b8083b8 100644
--- a/src/engine/engine.h
+++ b/src/engine/engine.h
@@ -61,7 +61,8 @@
   // Predictor factory is used to select DefaultPredictor and MobilePredictor.
   void Init(const DataManagerInterface *data_manager,
             PredictorInterface *(*predictor_factory)(PredictorInterface *,
-                                                     PredictorInterface *));
+                                                     PredictorInterface *),
+            bool enable_content_word_learning);
 
   virtual ConverterInterface *GetConverter() const { return converter_.get(); }
   virtual PredictorInterface *GetPredictor() const { return predictor_; }
diff --git a/src/engine/mock_data_engine_factory.cc b/src/engine/mock_data_engine_factory.cc
index 111b9ff..0520a0f 100644
--- a/src/engine/mock_data_engine_factory.cc
+++ b/src/engine/mock_data_engine_factory.cc
@@ -40,7 +40,7 @@
   Engine *engine = new Engine;
   DCHECK(engine);
   const testing::MockDataManager data_manager;
-  engine->Init(&data_manager, DefaultPredictor::CreateDefaultPredictor);
+  engine->Init(&data_manager, DefaultPredictor::CreateDefaultPredictor, false);
   return engine;
 }
 
diff --git a/src/engine/oss_engine_factory.cc b/src/engine/oss_engine_factory.cc
index 63aea32..3925fa6 100644
--- a/src/engine/oss_engine_factory.cc
+++ b/src/engine/oss_engine_factory.cc
@@ -41,9 +41,9 @@
   DCHECK(engine);
   const oss::OssDataManager data_manager;
 #ifdef OS_ANDROID
-  engine->Init(&data_manager, MobilePredictor::CreateMobilePredictor);
+  engine->Init(&data_manager, MobilePredictor::CreateMobilePredictor, true);
 #else  // OS_ANDROID
-  engine->Init(&data_manager, DefaultPredictor::CreateDefaultPredictor);
+  engine->Init(&data_manager, DefaultPredictor::CreateDefaultPredictor, false);
 #endif  // OS_ANDROID
   return engine;
 }
diff --git a/src/engine/packed_engine_factory.cc b/src/engine/packed_engine_factory.cc
index e6858fe..4cd7b89 100644
--- a/src/engine/packed_engine_factory.cc
+++ b/src/engine/packed_engine_factory.cc
@@ -40,7 +40,8 @@
   Engine *engine = new Engine;
   DCHECK(engine);
   engine->Init(packed::GetPackedDataManager(),
-               DefaultPredictor::CreateDefaultPredictor);
+               DefaultPredictor::CreateDefaultPredictor,
+               false);
   return engine;
 }
 
diff --git a/src/mozc_version_template.txt b/src/mozc_version_template.txt
index 3128d50..db1324f 100644
--- a/src/mozc_version_template.txt
+++ b/src/mozc_version_template.txt
@@ -1,6 +1,6 @@
 MAJOR=2
 MINOR=16
-BUILD=1988
+BUILD=1989
 REVISION=102
 # NACL_DICTIONARY_VERSION is the target version of the system dictionary to be
 # downloaded by NaCl Mozc.
diff --git a/src/prediction/dictionary_predictor.cc b/src/prediction/dictionary_predictor.cc
index 8b29f4e..37948bd 100644
--- a/src/prediction/dictionary_predictor.cc
+++ b/src/prediction/dictionary_predictor.cc
@@ -1063,16 +1063,29 @@
   // construct it manually here.
   // TODO(noriyukit): This is code duplicate in converter/nbest_generator.cc and
   // we should refactor code after finding more good design.
+  bool inner_segment_boundary_success = true;
   for (size_t i = 0; i < tmp_segments.conversion_segments_size(); ++i) {
     const Segment &segment = tmp_segments.conversion_segment(i);
     const Segment::Candidate &candidate = segment.candidate(0);
     result->value.append(candidate.value);
     result->wcost += candidate.cost;
-    result->inner_segment_boundary.push_back(
-        make_pair(Util::CharsLen(candidate.key),
-                  Util::CharsLen(candidate.value)));
-  }
 
+    uint32 encoded_lengths;
+    if (inner_segment_boundary_success &&
+        Segment::Candidate::EncodeLengths(candidate.key.size(),
+                                          candidate.value.size(),
+                                          candidate.content_key.size(),
+                                          candidate.content_value.size(),
+                                          &encoded_lengths)) {
+      result->inner_segment_boundary.push_back(encoded_lengths);
+    } else {
+      inner_segment_boundary_success = false;
+    }
+  }
+  if (!inner_segment_boundary_success) {
+    LOG(WARNING) << "Failed to construct inner segment boundary";
+    result->inner_segment_boundary.clear();
+  }
   return true;
 }
 
diff --git a/src/prediction/dictionary_predictor.h b/src/prediction/dictionary_predictor.h
index 9c666ee..2256160 100644
--- a/src/prediction/dictionary_predictor.h
+++ b/src/prediction/dictionary_predictor.h
@@ -129,7 +129,7 @@
     // If the candidate key and value are
     // "わたしの|なまえは|なかのです", " 私の|名前は|中野です",
     // |inner_segment_boundary| have [(4,2), (4, 3), (5, 4)].
-    vector<pair<int, int> > inner_segment_boundary;
+    vector<uint32> inner_segment_boundary;
     uint32 candidate_attributes;
     size_t consumed_key_size;
   };
diff --git a/src/prediction/dictionary_predictor_test.cc b/src/prediction/dictionary_predictor_test.cc
index a7eae6f..5aa83ed 100644
--- a/src/prediction/dictionary_predictor_test.cc
+++ b/src/prediction/dictionary_predictor_test.cc
@@ -128,12 +128,12 @@
                       "\xe3\x81\xaa\xe3\x81\xbe\xe3\x81\x88\xe3\x81\xaf"
                       "\xe3\x81\xaa\xe3\x81\x8b\xe3\x81\xae\xe3\x81\xa7"
                       "\xe3\x81\x99");
-    // "わたしの, 私の"
-    candidate->inner_segment_boundary.push_back(pair<int, int>(4, 2));
-    // "なまえは, 名前は"
-    candidate->inner_segment_boundary.push_back(pair<int, int>(4, 3));
-    // "なかのです, 中野です"
-    candidate->inner_segment_boundary.push_back(pair<int, int>(5, 4));
+    // "わたしの, 私の", "わたし, 私"
+    candidate->PushBackInnerSegmentBoundary(12, 6, 9, 3);
+    // "なまえは, 名前は", "なまえ, 名前"
+    candidate->PushBackInnerSegmentBoundary(12, 9, 9, 6);
+    // "なかのです, 中野です", "なかの, 中野"
+    candidate->PushBackInnerSegmentBoundary(15, 12, 9, 6);
   }
 
   virtual bool ConvertForRequest(
diff --git a/src/prediction/predictor_test.cc b/src/prediction/predictor_test.cc
index 13c812e..49016b0 100644
--- a/src/prediction/predictor_test.cc
+++ b/src/prediction/predictor_test.cc
@@ -293,7 +293,8 @@
           new UserHistoryPredictor(
               &dictionary_mock,
               UserPosManager::GetUserPosManager()->GetPOSMatcher(),
-              Singleton<SuppressionDictionary>::get())));
+              Singleton<SuppressionDictionary>::get(),
+              true)));
   Segments segments;
   {
     segments.set_request_type(Segments::PARTIAL_PREDICTION);
diff --git a/src/prediction/user_history_predictor.cc b/src/prediction/user_history_predictor.cc
index 13481fc..cd54313 100644
--- a/src/prediction/user_history_predictor.cc
+++ b/src/prediction/user_history_predictor.cc
@@ -379,11 +379,13 @@
 UserHistoryPredictor::UserHistoryPredictor(
     const DictionaryInterface *dictionary,
     const POSMatcher *pos_matcher,
-    const SuppressionDictionary *suppression_dictionary)
+    const SuppressionDictionary *suppression_dictionary,
+    bool enable_content_word_learning)
     : dictionary_(dictionary),
       pos_matcher_(pos_matcher),
       suppression_dictionary_(suppression_dictionary),
       predictor_name_("UserHistoryPredictor"),
+      content_word_learning_enabled_(enable_content_word_learning),
       updated_(false),
       dic_(new DicCache(UserHistoryPredictor::cache_size())) {
   AsyncLoad();  // non-blocking
@@ -1695,6 +1697,8 @@
     DCHECK_LE(1, segment.candidates_size());
     learning_segment.key = segment.candidate(0).key;
     learning_segment.value = segment.candidate(0).value;
+    learning_segment.content_key = segment.candidate(0).content_key;
+    learning_segment.content_value = segment.candidate(0).content_value;
     learning_segment.description = GetDescription(segment.candidate(0));
     learning_segments->push_back_history_segment(learning_segment);
   }
@@ -1705,24 +1709,20 @@
       SegmentForLearning learning_segment;
       learning_segment.key = candidate.key;
       learning_segment.value = candidate.value;
+      learning_segment.content_key = candidate.content_key;
+      learning_segment.content_value = candidate.content_value;
       learning_segment.description = GetDescription(candidate);
       learning_segments->push_back_conversion_segment(learning_segment);
     } else {
-      int key_start_pos = 0, value_start_pos = 0;
-      for (size_t j = 0; j < candidate.inner_segment_boundary.size(); ++j) {
-        const int key_len = candidate.inner_segment_boundary[j].first;
-        const int value_len = candidate.inner_segment_boundary[j].second;
-        SegmentForLearning learning_segment;
-        Util::SubString(candidate.key, key_start_pos,
-                        key_len, &learning_segment.key);
-        Util::SubString(candidate.value, value_start_pos,
-                        value_len, &learning_segment.value);
+      SegmentForLearning learning_segment;
+      for (Segment::Candidate::InnerSegmentIterator iter(&candidate);
+           !iter.Done(); iter.Next()) {
+        iter.GetKey().CopyToString(&learning_segment.key);
+        iter.GetValue().CopyToString(&learning_segment.value);
+        iter.GetContentKey().CopyToString(&learning_segment.content_key);
+        iter.GetContentValue().CopyToString(&learning_segment.content_value);
         learning_segments->push_back_conversion_segment(learning_segment);
-        key_start_pos += key_len;
-        value_start_pos += value_len;
       }
-      DCHECK_EQ(key_start_pos, Util::CharsLen(candidate.key));
-      DCHECK_EQ(value_start_pos, Util::CharsLen(candidate.value));
     }
   }
 }
@@ -1770,6 +1770,15 @@
            segment.description,
            is_suggestion_selected, next_fp_to_set,
            last_access_time, segments);
+    if (content_word_learning_enabled_ &&
+        segment.content_key != segment.key &&
+        segment.content_value != segment.value) {
+      Insert(segment.content_key,
+             segment.content_value,
+             segment.description,
+             is_suggestion_selected, 0,
+             last_access_time, segments);
+    }
   }
 
   // Insert all_key/all_value
diff --git a/src/prediction/user_history_predictor.h b/src/prediction/user_history_predictor.h
index 33442ac..ee2e1ac 100644
--- a/src/prediction/user_history_predictor.h
+++ b/src/prediction/user_history_predictor.h
@@ -85,9 +85,14 @@
  public:
   UserHistoryPredictor(const DictionaryInterface *dictionary,
                        const POSMatcher *pos_matcher,
-                       const SuppressionDictionary *suppression_dictionary);
+                       const SuppressionDictionary *suppression_dictionary,
+                       bool enable_content_word_learning);
   virtual ~UserHistoryPredictor();
 
+  void set_content_word_learning_enabled(bool value) {
+    content_word_learning_enabled_ = value;  // Replaces the constructor flag.
+  }
+
   virtual bool Predict(Segments *segments) const;
   virtual bool PredictForRequest(const ConversionRequest &request,
                                  Segments *segments) const;
@@ -145,6 +150,8 @@
   struct SegmentForLearning {
     string key;
     string value;
+    string content_key;    // Key of the segment's content word.
+    string content_value;  // Value of the content word, e.g. "学校" in "学校に".
     string description;
   };
   static uint32 LearningSegmentFingerprint(const SegmentForLearning &segment);
@@ -489,6 +496,7 @@
   const SuppressionDictionary *suppression_dictionary_;
   const string predictor_name_;
 
+  bool content_word_learning_enabled_;
   bool updated_;
   scoped_ptr<DicCache> dic_;
   mutable scoped_ptr<UserHistoryPredictorSyncer> syncer_;
diff --git a/src/prediction/user_history_predictor_test.cc b/src/prediction/user_history_predictor_test.cc
index ec4c8f3..9e181bf 100644
--- a/src/prediction/user_history_predictor_test.cc
+++ b/src/prediction/user_history_predictor_test.cc
@@ -299,7 +299,8 @@
     ret->predictor.reset(
         new UserHistoryPredictor(ret->dictionary.get(),
                                  data_manager.GetPOSMatcher(),
-                                 ret->suppression_dictionary.get()));
+                                 ret->suppression_dictionary.get(),
+                                 false));
     return ret;
   }
 
@@ -3508,12 +3509,12 @@
     candidate->content_value = kValue;
     candidate->key = kKey;
     candidate->content_key = kKey;
-    // "わたしの, 私の"
-    candidate->inner_segment_boundary.push_back(pair<int, int>(4, 2));
-    // "なまえは, 名前は"
-    candidate->inner_segment_boundary.push_back(pair<int, int>(4, 3));
-    // "なかのです, 中野です"
-    candidate->inner_segment_boundary.push_back(pair<int, int>(5, 4));
+    // "わたしの, 私の", "わたし, 私"
+    candidate->PushBackInnerSegmentBoundary(12, 6, 9, 3);
+    // "なまえは, 名前は", "なまえ, 名前"
+    candidate->PushBackInnerSegmentBoundary(12, 9, 9, 6);
+    // "なかのです, 中野です", "なかの, 中野"
+    candidate->PushBackInnerSegmentBoundary(15, 12, 9, 6);
   }
   predictor->Finish(&segments);
   segments.Clear();
@@ -3562,12 +3563,12 @@
     candidate->content_value = kValue;
     candidate->key = kKey;
     candidate->content_key = kKey;
-    // "わたしの, 私の"
-    candidate->inner_segment_boundary.push_back(pair<int, int>(4, 2));
-    // "なまえは, 名前は"
-    candidate->inner_segment_boundary.push_back(pair<int, int>(4, 3));
-    // "なかのです, 中野です"
-    candidate->inner_segment_boundary.push_back(pair<int, int>(5, 4));
+    // "わたしの, 私の", "わたし, 私"
+    candidate->PushBackInnerSegmentBoundary(12, 6, 9, 3);
+    // "なまえは, 名前は", "なまえ, 名前"
+    candidate->PushBackInnerSegmentBoundary(12, 9, 9, 6);
+    // "なかのです, 中野です", "なかの, 中野"
+    candidate->PushBackInnerSegmentBoundary(15, 12, 9, 6);
   }
   predictor->Finish(&segments);
   segments.Clear();
@@ -4342,4 +4343,67 @@
       "\xE3\x81\x84\xE5\xA4\xA9\xE6\xB0\x97\x21"));
 }
 
+TEST_F(UserHistoryPredictorTest, ContentWordLearningFromInnerSegmentBoundary) {
+  UserHistoryPredictor *predictor = GetUserHistoryPredictorWithClearedHistory();
+  predictor->set_content_word_learning_enabled(true);
+
+  Segments segments;
+  {  // Learn one candidate that is split into three inner segments.
+    // Key: "とうきょうかなごやにいきたい"
+    const char kKey[] =
+        "\xE3\x81\xA8\xE3\x81\x86\xE3\x81\x8D\xE3\x82\x87\xE3\x81\x86"
+        "\xE3\x81\x8B\xE3\x81\xAA\xE3\x81\x94\xE3\x82\x84\xE3\x81\xAB"
+        "\xE3\x81\x84\xE3\x81\x8D\xE3\x81\x9F\xE3\x81\x84";
+    // Value: "東京か名古屋に行きたい"
+    const char kValue[] =
+        "\xE6\x9D\xB1\xE4\xBA\xAC\xE3\x81\x8B\xE5\x90\x8D\xE5\x8F\xA4"
+        "\xE5\xB1\x8B\xE3\x81\xAB\xE8\xA1\x8C\xE3\x81\x8D\xE3\x81\x9F"
+        "\xE3\x81\x84";
+    MakeSegmentsForPrediction(kKey, &segments);
+    Segment::Candidate *candidate =
+        segments.mutable_segment(0)->add_candidate();
+    candidate->Init();
+    candidate->key = kKey;
+    candidate->value = kValue;
+    candidate->content_key = kKey;
+    candidate->content_value = kValue;
+    // UTF-8 byte lengths of "とうきょうか", "東京か", "とうきょう", "東京"
+    candidate->PushBackInnerSegmentBoundary(18, 9, 15, 6);
+    // UTF-8 byte lengths of "なごやに", "名古屋に", "なごや", "名古屋"
+    candidate->PushBackInnerSegmentBoundary(12, 12, 9, 9);
+    // "いきたい", "行きたい", "いきたい", "行きたい" (content == whole segment)
+    candidate->PushBackInnerSegmentBoundary(12, 12, 12, 12);
+    predictor->Finish(&segments);
+  }
+
+  segments.Clear();
+  // "と": expect both the content word and the whole inner segment.
+  MakeSegmentsForPrediction("\xE3\x81\xA8", &segments);
+  EXPECT_TRUE(predictor->Predict(&segments));
+  // "東京" (content word)
+  EXPECT_TRUE(FindCandidateByValue("\xE6\x9D\xB1\xE4\xBA\xAC", segments));
+  // "東京か" (whole inner segment)
+  EXPECT_TRUE(FindCandidateByValue("\xE6\x9D\xB1\xE4\xBA\xAC\xE3\x81\x8B",
+                                   segments));
+
+  segments.Clear();
+  // "な": expect both the content word and the whole inner segment.
+  MakeSegmentsForPrediction("\xE3\x81\xAA", &segments);
+  EXPECT_TRUE(predictor->Predict(&segments));
+  // "名古屋" (content word)
+  EXPECT_TRUE(FindCandidateByValue("\xE5\x90\x8D\xE5\x8F\xA4\xE5\xB1\x8B",
+                                   segments));
+  // "名古屋に" (whole inner segment)
+  EXPECT_TRUE(FindCandidateByValue(
+      "\xE5\x90\x8D\xE5\x8F\xA4\xE5\xB1\x8B\xE3\x81\xAB", segments));
+
+  segments.Clear();
+  // "い": content word and whole segment are the same string here.
+  MakeSegmentsForPrediction("\xE3\x81\x84", &segments);
+  EXPECT_TRUE(predictor->Predict(&segments));
+  // "行きたい"
+  EXPECT_TRUE(FindCandidateByValue(
+      "\xE8\xA1\x8C\xE3\x81\x8D\xE3\x81\x9F\xE3\x81\x84", segments));
+}
+
 }  // namespace mozc