Simplify the bytecode for the 80-10FFFF rune range.
Change-Id: I9185a44413375b207b5a050dda8d4c601c501eff
Reviewed-on: https://code-review.googlesource.com/c/re2/+/49270
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/compile.cc b/re2/compile.cc
index 7848dfc..1b7b66c 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc
@@ -662,48 +662,43 @@
static_cast<uint8_t>(hi), foldcase, 0));
}
-// Table describing how to make a UTF-8 matching machine
-// for the rune range 80-10FFFF (Runeself-Runemax).
-// This range happens frequently enough (for example /./ and /[^a-z]/)
-// and the rune_cache_ map is slow enough that this is worth
-// special handling. Makes compilation of a small expression
-// with a dot in it about 10% faster.
-// The * in the comments below mark whole sequences.
-static struct ByteRangeProg {
- int next;
- int lo;
- int hi;
-} prog_80_10ffff[] = {
- // Two-byte
- { -1, 0x80, 0xBF, }, // 0: 80-BF
- { 0, 0xC2, 0xDF, }, // 1: C2-DF 80-BF*
-
- // Three-byte
- { 0, 0xA0, 0xBF, }, // 2: A0-BF 80-BF
- { 2, 0xE0, 0xE0, }, // 3: E0 A0-BF 80-BF*
- { 0, 0x80, 0xBF, }, // 4: 80-BF 80-BF
- { 4, 0xE1, 0xEF, }, // 5: E1-EF 80-BF 80-BF*
-
- // Four-byte
- { 4, 0x90, 0xBF, }, // 6: 90-BF 80-BF 80-BF
- { 6, 0xF0, 0xF0, }, // 7: F0 90-BF 80-BF 80-BF*
- { 4, 0x80, 0xBF, }, // 8: 80-BF 80-BF 80-BF
- { 8, 0xF1, 0xF3, }, // 9: F1-F3 80-BF 80-BF 80-BF*
- { 4, 0x80, 0x8F, }, // 10: 80-8F 80-BF 80-BF
- { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF*
-};
-
void Compiler::Add_80_10ffff() {
- int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning
- for (size_t i = 0; i < arraysize(prog_80_10ffff); i++) {
- const ByteRangeProg& p = prog_80_10ffff[i];
- int next = 0;
- if (p.next >= 0)
- next = inst[p.next];
- inst[i] = UncachedRuneByteSuffix(static_cast<uint8_t>(p.lo),
- static_cast<uint8_t>(p.hi), false, next);
- if ((p.lo & 0xC0) != 0x80)
- AddSuffix(inst[i]);
+ // The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough
+ // (for example, for /./ and /[^a-z]/) that it is worth simplifying: by
+ // permitting overlong encodings in E0 and F0 sequences and code points
+ // over 10FFFF in F4 sequences, the size of the bytecode and the number
+ // of equivalence classes are reduced significantly.
+ int id;
+ if (reversed_) {
+ // Prefix factoring matters, but we don't have to handle it here
+ // because the rune range trie logic takes care of that already.
+ id = UncachedRuneByteSuffix(0xC2, 0xDF, false, 0);
+ id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
+ AddSuffix(id);
+
+ id = UncachedRuneByteSuffix(0xE0, 0xEF, false, 0);
+ id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
+ id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
+ AddSuffix(id);
+
+ id = UncachedRuneByteSuffix(0xF0, 0xF4, false, 0);
+ id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
+ id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
+ id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
+ AddSuffix(id);
+ } else {
+ // Suffix factoring matters - and we do have to handle it here.
+ int cont1 = UncachedRuneByteSuffix(0x80, 0xBF, false, 0);
+ id = UncachedRuneByteSuffix(0xC2, 0xDF, false, cont1);
+ AddSuffix(id);
+
+ int cont2 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont1);
+ id = UncachedRuneByteSuffix(0xE0, 0xEF, false, cont2);
+ AddSuffix(id);
+
+ int cont3 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont2);
+ id = UncachedRuneByteSuffix(0xF0, 0xF4, false, cont3);
+ AddSuffix(id);
}
}
@@ -711,9 +706,8 @@
if (lo > hi)
return;
- // Pick off 80-10FFFF as a common special case
- // that can bypass the slow rune_cache_.
- if (lo == 0x80 && hi == 0x10ffff && !reversed_) {
+ // Pick off 80-10FFFF as a common special case.
+ if (lo == 0x80 && hi == 0x10ffff) {
Add_80_10ffff();
return;
}
diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc
index 6b77cf9..4598aa6 100644
--- a/re2/testing/compile_test.cc
+++ b/re2/testing/compile_test.cc
@@ -147,10 +147,19 @@
Regexp* re = Regexp::Parse(pattern, flags, NULL);
EXPECT_TRUE(re != NULL);
- Prog* prog = re->CompileToProg(0);
- EXPECT_TRUE(prog != NULL);
- *bytemap = prog->DumpByteMap();
- delete prog;
+ {
+ Prog* prog = re->CompileToProg(0);
+ EXPECT_TRUE(prog != NULL);
+ *bytemap = prog->DumpByteMap();
+ delete prog;
+ }
+
+ {
+ Prog* prog = re->CompileToReverseProg(0);
+ EXPECT_TRUE(prog != NULL);
+ EXPECT_EQ(*bytemap, prog->DumpByteMap());
+ delete prog;
+ }
re->Decref();
}
@@ -213,16 +222,11 @@
EXPECT_EQ("[00-09] -> 0\n"
"[0a-0a] -> 1\n"
"[0b-7f] -> 0\n"
- "[80-8f] -> 2\n"
- "[90-9f] -> 3\n"
- "[a0-bf] -> 4\n"
+ "[80-bf] -> 2\n"
"[c0-c1] -> 1\n"
- "[c2-df] -> 5\n"
- "[e0-e0] -> 6\n"
- "[e1-ef] -> 7\n"
- "[f0-f0] -> 8\n"
- "[f1-f3] -> 9\n"
- "[f4-f4] -> 10\n"
+ "[c2-df] -> 3\n"
+ "[e0-ef] -> 4\n"
+ "[f0-f4] -> 5\n"
"[f5-ff] -> 1\n",
bytemap);
}
@@ -299,20 +303,22 @@
"8. byte [f0-f0] 0 -> 7\n",
reverse);
- Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, NULL, &reverse);
- EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
- "4+ byte [c2-df] 0 -> 7\n"
- "5+ byte [a0-bf] 1 -> 8\n"
- "6. byte [80-bf] 0 -> 9\n"
+ Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, &forward, &reverse);
+ EXPECT_EQ("3+ byte [c2-df] 0 -> 6\n"
+ "4+ byte [e0-ef] 0 -> 8\n"
+ "5. byte [f0-f4] 0 -> 9\n"
+ "6. byte [80-bf] 0 -> 7\n"
"7. match! 0\n"
- "8. byte [e0-e0] 0 -> 7\n"
- "9+ byte [e1-ef] 0 -> 7\n"
- "10+ byte [90-bf] 1 -> 13\n"
- "11+ byte [80-bf] 1 -> 14\n"
- "12. byte [80-8f] 0 -> 15\n"
- "13. byte [f0-f0] 0 -> 7\n"
- "14. byte [f1-f3] 0 -> 7\n"
- "15. byte [f4-f4] 0 -> 7\n",
+ "8. byte [80-bf] 0 -> 6\n"
+ "9. byte [80-bf] 0 -> 8\n",
+ forward);
+ EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
+ "4+ byte [c2-df] 0 -> 6\n"
+ "5. byte [80-bf] 0 -> 7\n"
+ "6. match! 0\n"
+ "7+ byte [e0-ef] 0 -> 6\n"
+ "8. byte [80-bf] 0 -> 9\n"
+ "9. byte [f0-f4] 0 -> 6\n",
reverse);
}
diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
index 2f4b90c..e8165bf 100644
--- a/re2/testing/re2_test.cc
+++ b/re2/testing/re2_test.cc
@@ -479,22 +479,21 @@
ASSERT_EQ(3, re1.ProgramFanout(&histogram));
ASSERT_EQ(1, histogram[3]);
- // 7 is the largest non-empty bucket and has 10 elements.
- ASSERT_EQ(7, re10.ProgramFanout(&histogram));
- ASSERT_EQ(10, histogram[7]);
+ // 6 is the largest non-empty bucket and has 10 elements.
+ ASSERT_EQ(6, re10.ProgramFanout(&histogram));
+ ASSERT_EQ(10, histogram[6]);
- // 10 is the largest non-empty bucket and has 100 elements.
- ASSERT_EQ(10, re100.ProgramFanout(&histogram));
- ASSERT_EQ(100, histogram[10]);
+ // 9 is the largest non-empty bucket and has 100 elements.
+ ASSERT_EQ(9, re100.ProgramFanout(&histogram));
+ ASSERT_EQ(100, histogram[9]);
// 13 is the largest non-empty bucket and has 1000 elements.
ASSERT_EQ(13, re1000.ProgramFanout(&histogram));
ASSERT_EQ(1000, histogram[13]);
- // 2 is the largest non-empty bucket and has 3 elements.
- // This differs from the others due to how reverse `.' works.
+ // 2 is the largest non-empty bucket and has 1 element.
ASSERT_EQ(2, re1.ReverseProgramFanout(&histogram));
- ASSERT_EQ(3, histogram[2]);
+ ASSERT_EQ(1, histogram[2]);
// 5 is the largest non-empty bucket and has 10 elements.
ASSERT_EQ(5, re10.ReverseProgramFanout(&histogram));