Fix SimplifyStringSet() for really reals. And the test.
Change-Id: I4c211f8f4eea0a5b2afcbb85319a8467b7f29398
Reviewed-on: https://code-review.googlesource.com/c/re2/+/42518
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/prefilter.cc b/re2/prefilter.cc
index 2ad8c60..f61d54b 100644
--- a/re2/prefilter.cc
+++ b/re2/prefilter.cc
@@ -147,14 +147,11 @@
// is because, when we are performing a string search to filter
// regexps, matching "ab" will already allow this regexp to be a
// candidate for match, so further matching "abc" is redundant.
- // Note that we must erase "" because find() would find it at the
+ // Note that we must ignore "" because find() would find it at the
// start of everything and thus we would end up erasing everything.
- SSIter i = ss->begin();
- while (i != ss->end()) {
- if (i->empty()) {
- i = ss->erase(i);
+ for (SSIter i = ss->begin(); i != ss->end(); ++i) {
+ if (i->empty())
continue;
- }
SSIter j = i;
++j;
while (j != ss->end()) {
@@ -164,7 +161,6 @@
}
++j;
}
- ++i;
}
}
diff --git a/re2/testing/filtered_re2_test.cc b/re2/testing/filtered_re2_test.cc
index 647ddeb..e54caf7 100644
--- a/re2/testing/filtered_re2_test.cc
+++ b/re2/testing/filtered_re2_test.cc
@@ -103,11 +103,9 @@
}, {
// Test to make sure that any atoms that have another atom as a
// substring in an OR are removed; that is, only the shortest
- // substring is kept. The (|... tests that "" does not cause the
- // removal of every other atom in the OR, which used to happen
- // for two separate reasons.
+ // substring is kept.
"SubstrAtomRemovesSuperStrInOr", {
- "(|abc123|abc|ghi789|abc1234).*[x-z]+",
+ "(abc123|abc|ghi789|abc1234).*[x-z]+",
"abcd..yyy..yyyzzz",
"mnmnpp[a-z]+PPP"
}, {
@@ -279,4 +277,18 @@
EXPECT_EQ(2, matching_regexps.size());
}
+TEST(FilteredRE2Test, EmptyStringInStringSetBug) {
+ // Bug due to find() finding "" at the start of everything in a string
+ // set and thus SimplifyStringSet() would end up erasing everything.
+ // In order to test this, we have to keep PrefilterTree from discarding
+ // the OR entirely, so we have to make the minimum atom length zero.
+
+ FilterTestVars v(0); // override the minimum atom length
+ const char* regexps[] = {"-R.+(|ADD=;AA){12}}"};
+ const char* atoms[] = {"", "-r", "add=;aa", "}"};
+ AddRegexpsAndCompile(regexps, arraysize(regexps), &v);
+ EXPECT_TRUE(CheckExpectedAtoms(atoms, arraysize(atoms),
+ "EmptyStringInStringSetBug", &v));
+}
+
} // namespace re2