(|a)* shouldn't match more text than (|a)+ does! Fixes #310. Change-Id: I16c500e4b4c67a9f01dc9403773f3a40356ecc4d Reviewed-on: https://code-review.googlesource.com/c/re2/+/58590 Reviewed-by: Paul Wankadia <junyer@google.com>

commit: bc423653fdf28618554da96e1532662d1e33eaca [log] [tgz]
author: Paul Wankadia <junyer@google.com> Wed May 12 08:59:23 2021 -0700
committer: Paul Wankadia <junyer@google.com> Wed May 12 16:08:31 2021 +0000
tree: c9980319e300ec3e47ec505d96bcf9c373dad785
parent: aa4563370444e604995ff5fba93e25858f266b79 [diff]
diff --git a/re2/compile.cc b/re2/compile.cc
index 7a9de07..f3624b9 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc

@@ -79,9 +79,11 @@
 struct Frag {
   uint32_t begin;
   PatchList end;
+  bool nullable;
 
-  Frag() : begin(0) { end.head = 0; }  // needed so Frag can go in vector
-  Frag(uint32_t begin, PatchList end) : begin(begin), end(end) {}
+  Frag() : begin(0), end(kNullPatchList), nullable(false) {}
+  Frag(uint32_t begin, PatchList end, bool nullable)
+      : begin(begin), end(end), nullable(nullable) {}
 };
 
 // Input encodings.
@@ -264,7 +266,7 @@
 
 // Returns an unmatchable fragment.
 Frag Compiler::NoMatch() {
-  return Frag(0, kNullPatchList);
+  return Frag();
 }
 
 // Is a an unmatchable fragment?
@@ -290,11 +292,11 @@
   // To run backward over string, reverse all concatenations.
   if (reversed_) {
     PatchList::Patch(inst_.data(), b.end, a.begin);
-    return Frag(b.begin, a.end);
+    return Frag(b.begin, a.end, b.nullable && a.nullable);
   }
 
   PatchList::Patch(inst_.data(), a.end, b.begin);
-  return Frag(a.begin, b.end);
+  return Frag(a.begin, b.end, a.nullable && b.nullable);
 }
 
 // Given fragments for a and b, returns fragment for a|b.
@@ -310,7 +312,8 @@
     return NoMatch();
 
   inst_[id].InitAlt(a.begin, b.begin);
-  return Frag(id, PatchList::Append(inst_.data(), a.end, b.end));
+  return Frag(id, PatchList::Append(inst_.data(), a.end, b.end),
+              a.nullable || b.nullable);
 }
 
 // When capturing submatches in like-Perl mode, a kOpAlt Inst
@@ -320,27 +323,44 @@
 // then the operator is greedy.  If out1_ is the repetition
 // (and out_ moves forward), then the operator is non-greedy.
 
-// Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
-Frag Compiler::Star(Frag a, bool nongreedy) {
+// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
+Frag Compiler::Plus(Frag a, bool nongreedy) {
   int id = AllocInst(1);
   if (id < 0)
     return NoMatch();
-  inst_[id].InitAlt(0, 0);
-  PatchList::Patch(inst_.data(), a.end, id);
+  PatchList pl;
   if (nongreedy) {
-    inst_[id].out1_ = a.begin;
-    return Frag(id, PatchList::Mk(id << 1));
+    inst_[id].InitAlt(0, a.begin);
+    pl = PatchList::Mk(id << 1);
   } else {
-    inst_[id].set_out(a.begin);
-    return Frag(id, PatchList::Mk((id << 1) | 1));
+    inst_[id].InitAlt(a.begin, 0);
+    pl = PatchList::Mk((id << 1) | 1);
   }
+  PatchList::Patch(inst_.data(), a.end, id);
+  return Frag(a.begin, pl, a.nullable);
 }
 
-// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
-Frag Compiler::Plus(Frag a, bool nongreedy) {
-  // a+ is just a* with a different entry point.
-  Frag f = Star(a, nongreedy);
-  return Frag(a.begin, f.end);
+// Given a fragment for a, returns a fragment for a* or a*? (if nongreedy)
+Frag Compiler::Star(Frag a, bool nongreedy) {
+  // When the subexpression is nullable, one Alt isn't enough to guarantee
+  // correct priority ordering within the transitive closure. The simplest
+  // solution is to handle it as (a+)? instead, which adds the second Alt.
+  if (a.nullable)
+    return Quest(Plus(a, nongreedy), nongreedy);
+
+  int id = AllocInst(1);
+  if (id < 0)
+    return NoMatch();
+  PatchList pl;
+  if (nongreedy) {
+    inst_[id].InitAlt(0, a.begin);
+    pl = PatchList::Mk(id << 1);
+  } else {
+    inst_[id].InitAlt(a.begin, 0);
+    pl = PatchList::Mk((id << 1) | 1);
+  }
+  PatchList::Patch(inst_.data(), a.end, id);
+  return Frag(id, pl, true);
 }
 
 // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
@@ -358,7 +378,7 @@
     inst_[id].InitAlt(a.begin, 0);
     pl = PatchList::Mk((id << 1) | 1);
   }
-  return Frag(id, PatchList::Append(inst_.data(), pl, a.end));
+  return Frag(id, PatchList::Append(inst_.data(), pl, a.end), true);
 }
 
 // Returns a fragment for the byte range lo-hi.
@@ -367,7 +387,7 @@
   if (id < 0)
     return NoMatch();
   inst_[id].InitByteRange(lo, hi, foldcase, 0);
-  return Frag(id, PatchList::Mk(id << 1));
+  return Frag(id, PatchList::Mk(id << 1), false);
 }
 
 // Returns a no-op fragment.  Sometimes unavoidable.
@@ -376,7 +396,7 @@
   if (id < 0)
     return NoMatch();
   inst_[id].InitNop(0);
-  return Frag(id, PatchList::Mk(id << 1));
+  return Frag(id, PatchList::Mk(id << 1), true);
 }
 
 // Returns a fragment that signals a match.
@@ -385,7 +405,7 @@
   if (id < 0)
     return NoMatch();
   inst_[id].InitMatch(match_id);
-  return Frag(id, kNullPatchList);
+  return Frag(id, kNullPatchList, false);
 }
 
 // Returns a fragment matching a particular empty-width op (like ^ or $)
@@ -394,7 +414,7 @@
   if (id < 0)
     return NoMatch();
   inst_[id].InitEmptyWidth(empty, 0);
-  return Frag(id, PatchList::Mk(id << 1));
+  return Frag(id, PatchList::Mk(id << 1), true);
 }
 
 // Given a fragment a, returns a fragment with capturing parens around a.
@@ -408,7 +428,7 @@
   inst_[id+1].InitCapture(2*n+1, 0);
   PatchList::Patch(inst_.data(), a.end, id+1);
 
-  return Frag(id, PatchList::Mk((id+1) << 1));
+  return Frag(id, PatchList::Mk((id+1) << 1), a.nullable);
 }
 
 // A Rune is a name for a Unicode code point.
@@ -567,7 +587,7 @@
 Frag Compiler::FindByteRange(int root, int id) {
   if (inst_[root].opcode() == kInstByteRange) {
     if (ByteRangeEqual(root, id))
-      return Frag(root, kNullPatchList);
+      return Frag(root, kNullPatchList, false);
     else
       return NoMatch();
   }
@@ -575,7 +595,7 @@
   while (inst_[root].opcode() == kInstAlt) {
     int out1 = inst_[root].out1();
     if (ByteRangeEqual(out1, id))
-      return Frag(root, PatchList::Mk((root << 1) | 1));
+      return Frag(root, PatchList::Mk((root << 1) | 1), false);
 
     // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
     // what we're looking for, then we can stop immediately. Unfortunately, we
@@ -587,7 +607,7 @@
     if (inst_[out].opcode() == kInstAlt)
       root = out;
     else if (ByteRangeEqual(out, id))
-      return Frag(root, PatchList::Mk(root << 1));
+      return Frag(root, PatchList::Mk(root << 1), false);
     else
       return NoMatch();
   }

diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc
index 2096e2f..4718830 100644
--- a/re2/testing/compile_test.cc
+++ b/re2/testing/compile_test.cc

@@ -109,6 +109,20 @@
   { "[[-`]",
     "3. byte [5b-60] 0 -> 4\n"
     "4. match! 0\n" },
+  // Issue 310
+  { "(?:|a)*",
+    "3+ nop -> 7\n"
+    "4. nop -> 9\n"
+    "5+ nop -> 7\n"
+    "6. nop -> 9\n"
+    "7+ nop -> 5\n"
+    "8. byte [61-61] 0 -> 5\n"
+    "9. match! 0\n" },
+  { "(?:|a)+",
+    "3+ nop -> 5\n"
+    "4. byte [61-61] 0 -> 5\n"
+    "5+ nop -> 3\n"
+    "6. match! 0\n" },
 };
 
 TEST(TestRegexpCompileToProg, Simple) {
@@ -338,27 +352,37 @@
             forward);
 
   Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
-  EXPECT_EQ("3+ nop -> 6\n"
-            "4+ nop -> 8\n"
-            "5. nop -> 21\n"
-            "6+ byte [61-61] 1 -> 6\n"
-            "7. nop -> 3\n"
-            "8+ byte [62-62] 1 -> 8\n"
-            "9. nop -> 3\n"
-            "10+ byte [61-61] 1 -> 10\n"
-            "11. nop -> 21\n"
-            "12+ byte [62-62] 1 -> 12\n"
-            "13. nop -> 21\n"
-            "14+ byte [61-61] 1 -> 14\n"
-            "15. nop -> 18\n"
-            "16+ byte [62-62] 1 -> 16\n"
-            "17. nop -> 18\n"
-            "18+ nop -> 14\n"
-            "19+ nop -> 16\n"
-            "20. match! 0\n"
-            "21+ nop -> 10\n"
-            "22+ nop -> 12\n"
-            "23. nop -> 18\n",
+  EXPECT_EQ("3+ nop -> 28\n"
+            "4. nop -> 30\n"
+            "5+ byte [61-61] 1 -> 5\n"
+            "6. nop -> 32\n"
+            "7+ byte [61-61] 1 -> 7\n"
+            "8. nop -> 26\n"
+            "9+ byte [61-61] 1 -> 9\n"
+            "10. nop -> 20\n"
+            "11+ byte [62-62] 1 -> 11\n"
+            "12. nop -> 20\n"
+            "13+ byte [62-62] 1 -> 13\n"
+            "14. nop -> 26\n"
+            "15+ byte [62-62] 1 -> 15\n"
+            "16. nop -> 32\n"
+            "17+ nop -> 9\n"
+            "18. nop -> 11\n"
+            "19. match! 0\n"
+            "20+ nop -> 17\n"
+            "21. nop -> 19\n"
+            "22+ nop -> 7\n"
+            "23. nop -> 13\n"
+            "24+ nop -> 17\n"
+            "25. nop -> 19\n"
+            "26+ nop -> 22\n"
+            "27. nop -> 24\n"
+            "28+ nop -> 5\n"
+            "29. nop -> 15\n"
+            "30+ nop -> 22\n"
+            "31. nop -> 24\n"
+            "32+ nop -> 28\n"
+            "33. nop -> 30\n",
       forward);
 
   Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);

diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
index 41fccf6..b1f7d73 100644
--- a/re2/testing/re2_test.cc
+++ b/re2/testing/re2_test.cc

@@ -485,37 +485,37 @@
 
   std::vector<int> histogram;
 
-  // 3 is the largest non-empty bucket and has 1 element.
+  // 3 is the largest non-empty bucket and has 2 element.
   ASSERT_EQ(3, re1.ProgramFanout(&histogram));
-  ASSERT_EQ(1, histogram[3]);
+  ASSERT_EQ(2, histogram[3]);
 
-  // 6 is the largest non-empty bucket and has 10 elements.
+  // 6 is the largest non-empty bucket and has 11 elements.
   ASSERT_EQ(6, re10.ProgramFanout(&histogram));
-  ASSERT_EQ(10, histogram[6]);
+  ASSERT_EQ(11, histogram[6]);
 
-  // 9 is the largest non-empty bucket and has 100 elements.
+  // 9 is the largest non-empty bucket and has 101 elements.
   ASSERT_EQ(9, re100.ProgramFanout(&histogram));
-  ASSERT_EQ(100, histogram[9]);
+  ASSERT_EQ(101, histogram[9]);
 
-  // 13 is the largest non-empty bucket and has 1000 elements.
+  // 13 is the largest non-empty bucket and has 1001 elements.
   ASSERT_EQ(13, re1000.ProgramFanout(&histogram));
-  ASSERT_EQ(1000, histogram[13]);
+  ASSERT_EQ(1001, histogram[13]);
 
-  // 2 is the largest non-empty bucket and has 1 element.
+  // 2 is the largest non-empty bucket and has 2 element.
   ASSERT_EQ(2, re1.ReverseProgramFanout(&histogram));
-  ASSERT_EQ(1, histogram[2]);
+  ASSERT_EQ(2, histogram[2]);
 
-  // 5 is the largest non-empty bucket and has 10 elements.
+  // 5 is the largest non-empty bucket and has 11 elements.
   ASSERT_EQ(5, re10.ReverseProgramFanout(&histogram));
-  ASSERT_EQ(10, histogram[5]);
+  ASSERT_EQ(11, histogram[5]);
 
-  // 9 is the largest non-empty bucket and has 100 elements.
+  // 9 is the largest non-empty bucket and has 101 elements.
   ASSERT_EQ(9, re100.ReverseProgramFanout(&histogram));
-  ASSERT_EQ(100, histogram[9]);
+  ASSERT_EQ(101, histogram[9]);
 
-  // 12 is the largest non-empty bucket and has 1000 elements.
+  // 12 is the largest non-empty bucket and has 1001 elements.
   ASSERT_EQ(12, re1000.ReverseProgramFanout(&histogram));
-  ASSERT_EQ(1000, histogram[12]);
+  ASSERT_EQ(1001, histogram[12]);
 }
 
 // Issue 956519: handling empty character sets was
@@ -1641,4 +1641,19 @@
   ASSERT_EQ("小人小类小", s);
 }
 
+TEST(RE2, Issue310) {
+  // (?:|a)* matched more text than (?:|a)+ did.
+
+  std::string s = "aaa";
+  StringPiece m;
+
+  RE2 star("(?:|a)*");
+  ASSERT_TRUE(star.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1));
+  ASSERT_EQ(m, "") << " got m='" << m << "', want ''";
+
+  RE2 plus("(?:|a)+");
+  ASSERT_TRUE(plus.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1));
+  ASSERT_EQ(m, "") << " got m='" << m << "', want ''";
+}
+
 }  // namespace re2

diff --git a/re2/testing/search_test.cc b/re2/testing/search_test.cc
index c20f501..5d86dbf 100644
--- a/re2/testing/search_test.cc
+++ b/re2/testing/search_test.cc

@@ -308,6 +308,8 @@
   // Former bugs.
   { "a\\C*|ba\\C", "baba" },
   { "\\w*I\\w*", "Inc." },
+  { "(?:|a)*", "aaa" },
+  { "(?:|a)+", "aaa" },
 };
 
 TEST(Regexp, SearchTests) {
commit	bc423653fdf28618554da96e1532662d1e33eaca	[log] [tgz]
author	Paul Wankadia <junyer@google.com>	Wed May 12 08:59:23 2021 -0700
committer	Paul Wankadia <junyer@google.com>	Wed May 12 16:08:31 2021 +0000
tree	c9980319e300ec3e47ec505d96bcf9c373dad785
parent	aa4563370444e604995ff5fba93e25858f266b79 [diff]