Simplify the bytecode for the 80-10FFFF rune range. Change-Id: I9185a44413375b207b5a050dda8d4c601c501eff Reviewed-on: https://code-review.googlesource.com/c/re2/+/49270 Reviewed-by: Paul Wankadia <junyer@google.com>

commit: 6a86f6b3f4a6d797886cd4cf6bca73ed25b2d00a [log] [tgz]
author: Paul Wankadia <junyer@google.com> Mon Dec 09 01:43:59 2019 -0800
committer: Paul Wankadia <junyer@google.com> Mon Dec 09 10:05:45 2019 +0000
tree: 6d4f2c54e46e8ddce6761c0a523ce2eba5ce5b68
parent: bb8e777557ddbdeabdedea4f23613c5021ffd7b1 [diff]
diff --git a/re2/compile.cc b/re2/compile.cc
index 7848dfc..1b7b66c 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc

@@ -662,48 +662,43 @@
                                    static_cast<uint8_t>(hi), foldcase, 0));
 }
 
-// Table describing how to make a UTF-8 matching machine
-// for the rune range 80-10FFFF (Runeself-Runemax).
-// This range happens frequently enough (for example /./ and /[^a-z]/)
-// and the rune_cache_ map is slow enough that this is worth
-// special handling.  Makes compilation of a small expression
-// with a dot in it about 10% faster.
-// The * in the comments below mark whole sequences.
-static struct ByteRangeProg {
-  int next;
-  int lo;
-  int hi;
-} prog_80_10ffff[] = {
-  // Two-byte
-  { -1, 0x80, 0xBF, },  // 0:  80-BF
-  {  0, 0xC2, 0xDF, },  // 1:  C2-DF 80-BF*
-
-  // Three-byte
-  {  0, 0xA0, 0xBF, },  // 2:  A0-BF 80-BF
-  {  2, 0xE0, 0xE0, },  // 3:  E0 A0-BF 80-BF*
-  {  0, 0x80, 0xBF, },  // 4:  80-BF 80-BF
-  {  4, 0xE1, 0xEF, },  // 5:  E1-EF 80-BF 80-BF*
-
-  // Four-byte
-  {  4, 0x90, 0xBF, },  // 6:  90-BF 80-BF 80-BF
-  {  6, 0xF0, 0xF0, },  // 7:  F0 90-BF 80-BF 80-BF*
-  {  4, 0x80, 0xBF, },  // 8:  80-BF 80-BF 80-BF
-  {  8, 0xF1, 0xF3, },  // 9: F1-F3 80-BF 80-BF 80-BF*
-  {  4, 0x80, 0x8F, },  // 10: 80-8F 80-BF 80-BF
-  { 10, 0xF4, 0xF4, },  // 11: F4 80-8F 80-BF 80-BF*
-};
-
 void Compiler::Add_80_10ffff() {
-  int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning
-  for (size_t i = 0; i < arraysize(prog_80_10ffff); i++) {
-    const ByteRangeProg& p = prog_80_10ffff[i];
-    int next = 0;
-    if (p.next >= 0)
-      next = inst[p.next];
-    inst[i] = UncachedRuneByteSuffix(static_cast<uint8_t>(p.lo),
-                                     static_cast<uint8_t>(p.hi), false, next);
-    if ((p.lo & 0xC0) != 0x80)
-      AddSuffix(inst[i]);
+  // The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough
+  // (for example, for /./ and /[^a-z]/) that it is worth simplifying: by
+  // permitting overlong encodings in E0 and F0 sequences and code points
+  // over 10FFFF in F4 sequences, the size of the bytecode and the number
+  // of equivalence classes are reduced significantly.
+  int id;
+  if (reversed_) {
+    // Prefix factoring matters, but we don't have to handle it here
+    // because the rune range trie logic takes care of that already.
+    id = UncachedRuneByteSuffix(0xC2, 0xDF, false, 0);
+    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
+    AddSuffix(id);
+
+    id = UncachedRuneByteSuffix(0xE0, 0xEF, false, 0);
+    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
+    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
+    AddSuffix(id);
+
+    id = UncachedRuneByteSuffix(0xF0, 0xF4, false, 0);
+    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
+    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
+    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
+    AddSuffix(id);
+  } else {
+    // Suffix factoring matters - and we do have to handle it here.
+    int cont1 = UncachedRuneByteSuffix(0x80, 0xBF, false, 0);
+    id = UncachedRuneByteSuffix(0xC2, 0xDF, false, cont1);
+    AddSuffix(id);
+
+    int cont2 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont1);
+    id = UncachedRuneByteSuffix(0xE0, 0xEF, false, cont2);
+    AddSuffix(id);
+
+    int cont3 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont2);
+    id = UncachedRuneByteSuffix(0xF0, 0xF4, false, cont3);
+    AddSuffix(id);
   }
 }
 
@@ -711,9 +706,8 @@
   if (lo > hi)
     return;
 
-  // Pick off 80-10FFFF as a common special case
-  // that can bypass the slow rune_cache_.
-  if (lo == 0x80 && hi == 0x10ffff && !reversed_) {
+  // Pick off 80-10FFFF as a common special case.
+  if (lo == 0x80 && hi == 0x10ffff) {
     Add_80_10ffff();
     return;
   }

diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc
index 6b77cf9..4598aa6 100644
--- a/re2/testing/compile_test.cc
+++ b/re2/testing/compile_test.cc

@@ -147,10 +147,19 @@
   Regexp* re = Regexp::Parse(pattern, flags, NULL);
   EXPECT_TRUE(re != NULL);
 
-  Prog* prog = re->CompileToProg(0);
-  EXPECT_TRUE(prog != NULL);
-  *bytemap = prog->DumpByteMap();
-  delete prog;
+  {
+    Prog* prog = re->CompileToProg(0);
+    EXPECT_TRUE(prog != NULL);
+    *bytemap = prog->DumpByteMap();
+    delete prog;
+  }
+
+  {
+    Prog* prog = re->CompileToReverseProg(0);
+    EXPECT_TRUE(prog != NULL);
+    EXPECT_EQ(*bytemap, prog->DumpByteMap());
+    delete prog;
+  }
 
   re->Decref();
 }
@@ -213,16 +222,11 @@
   EXPECT_EQ("[00-09] -> 0\n"
             "[0a-0a] -> 1\n"
             "[0b-7f] -> 0\n"
-            "[80-8f] -> 2\n"
-            "[90-9f] -> 3\n"
-            "[a0-bf] -> 4\n"
+            "[80-bf] -> 2\n"
             "[c0-c1] -> 1\n"
-            "[c2-df] -> 5\n"
-            "[e0-e0] -> 6\n"
-            "[e1-ef] -> 7\n"
-            "[f0-f0] -> 8\n"
-            "[f1-f3] -> 9\n"
-            "[f4-f4] -> 10\n"
+            "[c2-df] -> 3\n"
+            "[e0-ef] -> 4\n"
+            "[f0-f4] -> 5\n"
             "[f5-ff] -> 1\n",
             bytemap);
 }
@@ -299,20 +303,22 @@
             "8. byte [f0-f0] 0 -> 7\n",
             reverse);
 
-  Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, NULL, &reverse);
-  EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
-            "4+ byte [c2-df] 0 -> 7\n"
-            "5+ byte [a0-bf] 1 -> 8\n"
-            "6. byte [80-bf] 0 -> 9\n"
+  Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, &forward, &reverse);
+  EXPECT_EQ("3+ byte [c2-df] 0 -> 6\n"
+            "4+ byte [e0-ef] 0 -> 8\n"
+            "5. byte [f0-f4] 0 -> 9\n"
+            "6. byte [80-bf] 0 -> 7\n"
             "7. match! 0\n"
-            "8. byte [e0-e0] 0 -> 7\n"
-            "9+ byte [e1-ef] 0 -> 7\n"
-            "10+ byte [90-bf] 1 -> 13\n"
-            "11+ byte [80-bf] 1 -> 14\n"
-            "12. byte [80-8f] 0 -> 15\n"
-            "13. byte [f0-f0] 0 -> 7\n"
-            "14. byte [f1-f3] 0 -> 7\n"
-            "15. byte [f4-f4] 0 -> 7\n",
+            "8. byte [80-bf] 0 -> 6\n"
+            "9. byte [80-bf] 0 -> 8\n",
+            forward);
+  EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
+            "4+ byte [c2-df] 0 -> 6\n"
+            "5. byte [80-bf] 0 -> 7\n"
+            "6. match! 0\n"
+            "7+ byte [e0-ef] 0 -> 6\n"
+            "8. byte [80-bf] 0 -> 9\n"
+            "9. byte [f0-f4] 0 -> 6\n",
             reverse);
 }
 

diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
index 2f4b90c..e8165bf 100644
--- a/re2/testing/re2_test.cc
+++ b/re2/testing/re2_test.cc

@@ -479,22 +479,21 @@
   ASSERT_EQ(3, re1.ProgramFanout(&histogram));
   ASSERT_EQ(1, histogram[3]);
 
-  // 7 is the largest non-empty bucket and has 10 elements.
-  ASSERT_EQ(7, re10.ProgramFanout(&histogram));
-  ASSERT_EQ(10, histogram[7]);
+  // 6 is the largest non-empty bucket and has 10 elements.
+  ASSERT_EQ(6, re10.ProgramFanout(&histogram));
+  ASSERT_EQ(10, histogram[6]);
 
-  // 10 is the largest non-empty bucket and has 100 elements.
-  ASSERT_EQ(10, re100.ProgramFanout(&histogram));
-  ASSERT_EQ(100, histogram[10]);
+  // 9 is the largest non-empty bucket and has 100 elements.
+  ASSERT_EQ(9, re100.ProgramFanout(&histogram));
+  ASSERT_EQ(100, histogram[9]);
 
   // 13 is the largest non-empty bucket and has 1000 elements.
   ASSERT_EQ(13, re1000.ProgramFanout(&histogram));
   ASSERT_EQ(1000, histogram[13]);
 
-  // 2 is the largest non-empty bucket and has 3 elements.
-  // This differs from the others due to how reverse `.' works.
+  // 2 is the largest non-empty bucket and has 1 element.
   ASSERT_EQ(2, re1.ReverseProgramFanout(&histogram));
-  ASSERT_EQ(3, histogram[2]);
+  ASSERT_EQ(1, histogram[2]);
 
   // 5 is the largest non-empty bucket and has 10 elements.
   ASSERT_EQ(5, re10.ReverseProgramFanout(&histogram));
commit	6a86f6b3f4a6d797886cd4cf6bca73ed25b2d00a	[log] [tgz]
author	Paul Wankadia <junyer@google.com>	Mon Dec 09 01:43:59 2019 -0800
committer	Paul Wankadia <junyer@google.com>	Mon Dec 09 10:05:45 2019 +0000
tree	6d4f2c54e46e8ddce6761c0a523ce2eba5ce5b68
parent	bb8e777557ddbdeabdedea4f23613c5021ffd7b1 [diff]