Improve the efficiency of bytecode generated for CharClass.

Change-Id: Ibf00ad9575ae800eedfc1c0f0a440fae261dce53
Reviewed-on: https://code-review.googlesource.com/4040
Reviewed-by: Russ Cox <rsc@swtch.com>
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/compile.cc b/re2/compile.cc
index 5037524..9882fef 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc
@@ -113,7 +113,7 @@
 // Input encodings.
 enum Encoding {
   kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
-  kEncodingLatin1,    // Latin1 (0-FF)
+  kEncodingLatin1,    // Latin-1 (0-FF)
 };
 
 class Compiler : public Regexp::Walker<Frag> {
@@ -193,12 +193,28 @@
   void Add_80_10ffff();
 
   // New suffix that matches the byte range lo-hi, then goes to next.
-  int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
   int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
+  int CachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
+
+  // Returns true iff the suffix is cached.
+  bool IsCachedRuneByteSuffix(int id);
 
   // Adds a suffix to alternation.
   void AddSuffix(int id);
 
+  // Adds a suffix to the trie starting from the given root node.
+  // Returns zero iff allocating an instruction fails. Otherwise, returns
+  // the current root node, which might be different from what was given.
+  int AddSuffixRecursive(int root, int id);
+
+  // Finds the trie node for the given suffix. Returns a Frag in order to
+  // distinguish between pointing at the root node directly (end.p == 0)
+  // and pointing at an Alt's out1 or out (end.p&1 == 1 or 0, respectively).
+  Frag FindByteRange(int root, int id);
+
+  // Compares two ByteRanges and returns true iff they are equal.
+  bool ByteRangeEqual(int id1, int id2);
+
   // Returns the alternation of all the added suffixes.
   Frag EndRange();
 
@@ -496,21 +512,17 @@
   return f.begin;
 }
 
-int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
-  // In Latin1 mode, there's no point in caching.
-  // In forward UTF-8 mode, only need to cache continuation bytes.
-  if (encoding_ == kEncodingLatin1 ||
-      (encoding_ == kEncodingUTF8 &&
-       !reversed_ &&
-       !(0x80 <= lo && hi <= 0xbf))) {
-    return UncachedRuneByteSuffix(lo, hi, foldcase, next);
-  }
+static uint64 MakeRuneCacheKey(uint8 lo, uint8 hi, bool foldcase, int next) {
+  return (uint64)next << 17 |
+         (uint64)lo   <<  9 |
+         (uint64)hi   <<  1 |
+         (uint64)foldcase;
+}
 
-  uint64 key = (uint64)next << 17 |
-               (uint64)lo   <<  9 |
-               (uint64)hi   <<  1 |
-               (uint64)foldcase;
-  map<uint64, int>::iterator it = rune_cache_.find(key);
+int Compiler::CachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase,
+                                   int next) {
+  uint64 key = MakeRuneCacheKey(lo, hi, foldcase, next);
+  map<uint64, int>::const_iterator it = rune_cache_.find(key);
   if (it != rune_cache_.end())
     return it->second;
   int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
@@ -518,12 +530,28 @@
   return id;
 }
 
+bool Compiler::IsCachedRuneByteSuffix(int id) {
+  uint8 lo = inst_[id].lo_;
+  uint8 hi = inst_[id].hi_;
+  bool foldcase = inst_[id].foldcase() != 0;
+  int next = inst_[id].out();
+
+  uint64 key = MakeRuneCacheKey(lo, hi, foldcase, next);
+  return rune_cache_.find(key) != rune_cache_.end();
+}
+
 void Compiler::AddSuffix(int id) {
   if (rune_range_.begin == 0) {
     rune_range_.begin = id;
     return;
   }
 
+  if (encoding_ == kEncodingUTF8) {
+    // Build a trie in order to reduce fanout.
+    rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id);
+    return;
+  }
+
   int alt = AllocInst(1);
   if (alt < 0) {
     rune_range_.begin = 0;
@@ -533,6 +561,105 @@
   rune_range_.begin = alt;
 }
 
+int Compiler::AddSuffixRecursive(int root, int id) {
+  DCHECK(inst_[root].opcode() == kInstAlt ||
+         inst_[root].opcode() == kInstByteRange);
+
+  Frag f = FindByteRange(root, id);
+  if (IsNoMatch(f)) {
+    int alt = AllocInst(1);
+    if (alt < 0)
+      return 0;
+    inst_[alt].InitAlt(root, id);
+    return alt;
+  }
+
+  int br;
+  if (f.end.p == 0)
+    br = root;
+  else if (f.end.p&1)
+    br = inst_[f.begin].out1();
+  else
+    br = inst_[f.begin].out();
+
+  if (IsCachedRuneByteSuffix(br)) {
+    // We can't fiddle with cached suffixes, so make a clone of the head.
+    int byterange = AllocInst(1);
+    if (byterange < 0)
+      return 0;
+    inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(),
+                                   inst_[br].foldcase(), inst_[br].out());
+
+    // Ensure that the parent points to the clone, not to the original.
+    // Note that this could leave the head unreachable except via the cache.
+    br = byterange;
+    if (f.end.p == 0)
+      root = br;
+    else if (f.end.p&1)
+      inst_[f.begin].out1_ = br;
+    else
+      inst_[f.begin].set_out(br);
+  }
+
+  // We just saved one ByteRange instruction. :)
+  prog_->byte_inst_count_--;
+
+  int out = inst_[id].out();
+  if (!IsCachedRuneByteSuffix(id)) {
+    // The head should be the instruction most recently allocated, so free it
+    // instead of leaving it unreachable.
+    DCHECK_EQ(id, inst_len_-1);
+    inst_[id].out_opcode_ = 0;
+    inst_[id].out1_ = 0;
+    inst_len_--;
+  }
+
+  out = AddSuffixRecursive(inst_[br].out(), out);
+  if (out == 0)
+    return 0;
+
+  inst_[br].set_out(out);
+  return root;
+}
+
+bool Compiler::ByteRangeEqual(int id1, int id2) {
+  return inst_[id1].lo() == inst_[id2].lo() &&
+         inst_[id1].hi() == inst_[id2].hi() &&
+         inst_[id1].foldcase() == inst_[id2].foldcase();
+}
+
+Frag Compiler::FindByteRange(int root, int id) {
+  if (inst_[root].opcode() == kInstByteRange) {
+    if (ByteRangeEqual(root, id))
+      return Frag(root, nullPatchList);
+    else
+      return NoMatch();
+  }
+
+  while (inst_[root].opcode() == kInstAlt) {
+    int out1 = inst_[root].out1();
+    if (ByteRangeEqual(out1, id))
+      return Frag(root, PatchList::Mk((root << 1) | 1));
+
+    // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
+    // what we're looking for, then we can stop immediately. Unfortunately, we
+    // can't short-circuit the search in reverse mode.
+    if (!reversed_)
+      return NoMatch();
+
+    int out = inst_[root].out();
+    if (inst_[out].opcode() == kInstAlt)
+      root = out;
+    else if (ByteRangeEqual(out, id))
+      return Frag(root, PatchList::Mk(root << 1));
+    else
+      return NoMatch();
+  }
+
+  LOG(DFATAL) << "should never happen";
+  return NoMatch();
+}
+
 Frag Compiler::EndRange() {
   return rune_range_;
 }
@@ -556,13 +683,13 @@
 }
 
 void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
-  // Latin1 is easy: runes *are* bytes.
+  // Latin-1 is easy: runes *are* bytes.
   if (lo > hi || lo > 0xFF)
     return;
   if (hi > 0xFF)
     hi = 0xFF;
-  AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi),
-                           foldcase, 0));
+  AddSuffix(UncachedRuneByteSuffix(static_cast<uint8>(lo),
+                                   static_cast<uint8>(hi), foldcase, 0));
 }
 
 // Table describing how to make a UTF-8 matching machine
@@ -633,8 +760,8 @@
 
   // ASCII range is always a special case.
   if (hi < Runeself) {
-    AddSuffix(RuneByteSuffix(static_cast<uint8>(lo), static_cast<uint8>(hi),
-                             foldcase, 0));
+    AddSuffix(UncachedRuneByteSuffix(static_cast<uint8>(lo),
+                                     static_cast<uint8>(hi), foldcase, 0));
     return;
   }
 
@@ -662,13 +789,49 @@
   (void)m;  // USED(m)
   DCHECK_EQ(n, m);
 
+  // The logic below encodes this thinking:
+  //
+  // 1. When we have built the whole suffix, we know that it cannot
+  // possibly be a suffix of anything longer: in forward mode, nothing
+  // else can occur before the leading byte; in reverse mode, nothing
+  // else can occur after the last continuation byte or else the leading
+  // byte would have to change. Thus, there is no benefit to caching
+  // the first byte of the suffix whereas there is a cost involved in
+  // cloning it if it begins a common prefix, which is fairly likely.
+  //
+  // 2. Conversely, the last byte of the suffix cannot possibly be a
+  // prefix of anything because next == 0, so we will never want to
+  // clone it, but it is fairly likely to be a common suffix. Perhaps
+  // more so in reverse mode than in forward mode because the former is
+  // "converging" towards lower entropy, but caching is still worthwhile
+  // for the latter in cases such as 80-BF.
+  //
+  // 3. Handling the bytes between the first and the last is less
+  // straightforward and, again, the approach depends on whether we are
+  // "converging" towards lower entropy: in forward mode, a single byte
+  // is unlikely to be part of a common suffix whereas a byte range
+  // is more likely so; in reverse mode, a byte range is unlikely to
+  // be part of a common suffix whereas a single byte is more likely
+  // so. The same benefit versus cost argument applies here.
   int id = 0;
   if (reversed_) {
-    for (int i = 0; i < n; i++)
-      id = RuneByteSuffix(ulo[i], uhi[i], false, id);
+    for (int i = 0; i < n; i++) {
+      // In reverse UTF-8 mode: cache the leading byte; don't cache the last
+      // continuation byte; cache anything else iff it's a single byte (XX-XX).
+      if (i == 0 || (ulo[i] == uhi[i] && i != n-1))
+        id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
+      else
+        id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
+    }
   } else {
-    for (int i = n-1; i >= 0; i--)
-      id = RuneByteSuffix(ulo[i], uhi[i], false, id);
+    for (int i = n-1; i >= 0; i--) {
+      // In forward UTF-8 mode: don't cache the leading byte; cache the last
+      // continuation byte; cache anything else iff it's a byte range (XX-YY).
+      if (i == n-1 || (ulo[i] < uhi[i] && i != 0))
+        id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
+      else
+        id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
+    }
   }
   AddSuffix(id);
 }
diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc
index d438b19..dee90a3 100644
--- a/re2/testing/compile_test.cc
+++ b/re2/testing/compile_test.cc
@@ -172,4 +172,90 @@
   re->Decref();
 }
 
+static void Dump(StringPiece pattern, Regexp::ParseFlags flags,
+                 string* forward, string* reverse) {
+  Regexp* re = Regexp::Parse(pattern, flags, NULL);
+  EXPECT_TRUE(re != NULL);
+
+  if (forward != NULL) {
+    Prog* prog = re->CompileToProg(0);
+    EXPECT_TRUE(prog != NULL);
+    *forward = prog->Dump();
+    delete prog;
+  }
+
+  if (reverse != NULL) {
+    Prog* prog = re->CompileToReverseProg(0);
+    EXPECT_TRUE(prog != NULL);
+    *reverse = prog->Dump();
+    delete prog;
+  }
+
+  re->Decref();
+}
+
+TEST(TestCompile, Bug26705922) {
+  // Bug in the compiler caused inefficient bytecode to be generated for Unicode
+  // groups: common suffixes were cached, but common prefixes were not factored.
+
+  string forward, reverse;
+
+  Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse);
+  EXPECT_EQ("4. byte [f0-f0] -> 3\n"
+            "3. byte [90-90] -> 2\n"
+            "2. byte [80-80] -> 6\n"
+            "6. alt -> 1 | 5\n"
+            "1. byte [80-80] -> 7\n"
+            "5. byte [90-90] -> 7\n"
+            "7. match! 0\n",
+            forward);
+  EXPECT_EQ("6. alt -> 4 | 5\n"
+            "4. byte [80-80] -> 3\n"
+            "5. byte [90-90] -> 3\n"
+            "3. byte [80-80] -> 2\n"
+            "2. byte [90-90] -> 1\n"
+            "1. byte [f0-f0] -> 7\n"
+            "7. match! 0\n",
+            reverse);
+
+  Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse);
+  EXPECT_EQ("6. alt -> 3 | 5\n"
+            "3. byte [e8-ef] -> 2\n"
+            "5. byte [f0-f0] -> 4\n"
+            "2. byte [80-bf] -> 1\n"
+            "4. byte [90-90] -> 2\n"
+            "1. byte [80-bf] -> 7\n"
+            "7. match! 0\n",
+            forward);
+  EXPECT_EQ("3. byte [80-bf] -> 2\n"
+            "2. byte [80-bf] -> 6\n"
+            "6. alt -> 1 | 5\n"
+            "1. byte [e8-ef] -> 7\n"
+            "5. byte [90-90] -> 4\n"
+            "7. match! 0\n"
+            "4. byte [f0-f0] -> 7\n",
+            reverse);
+
+  Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, NULL, &reverse);
+  EXPECT_EQ("2. byte [80-bf] -> 8\n"
+            "8. alt -> 5 | 7\n"
+            "5. alt -> 1 | 4\n"
+            "7. byte [80-bf] -> 17\n"
+            "1. byte [c2-df] -> 18\n"
+            "4. byte [a0-bf] -> 3\n"
+            "17. alt -> 14 | 16\n"
+            "18. match! 0\n"
+            "3. byte [e0-e0] -> 18\n"
+            "14. alt -> 11 | 13\n"
+            "16. byte [80-8f] -> 15\n"
+            "11. alt -> 6 | 10\n"
+            "13. byte [80-bf] -> 12\n"
+            "15. byte [f4-f4] -> 18\n"
+            "6. byte [e1-ef] -> 18\n"
+            "10. byte [90-bf] -> 9\n"
+            "12. byte [f1-f3] -> 18\n"
+            "9. byte [f0-f0] -> 18\n",
+            reverse);
+}
+
 }  // namespace re2