Fix SimplifyStringSet() properly. Ensure test coverage.
Change-Id: Ifca0d469c2c0858afdf899381bb27563774cc6a8
Reviewed-on: https://code-review.googlesource.com/c/re2/+/42514
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/prefilter.cc b/re2/prefilter.cc
index f0d9264..ba78ced 100644
--- a/re2/prefilter.cc
+++ b/re2/prefilter.cc
@@ -140,27 +140,31 @@
return AndOr(OR, a, b);
}
-static void SimplifyStringSet(std::set<std::string> *ss) {
+static void SimplifyStringSet(std::set<std::string>* ss) {
// Now make sure that the strings aren't redundant. For example, if
// we know "ab" is a required string, then it doesn't help at all to
// know that "abc" is also a required string, so delete "abc". This
// is because, when we are performing a string search to filter
// regexps, matching "ab" will already allow this regexp to be a
// candidate for match, so further matching "abc" is redundant.
- // Note that we must ignore "" because find() would find it at the
+ // Note that we must erase "" because find() would find it at the
// start of everything and thus we would end up erasing everything.
- for (SSIter i = ss->begin(); i != ss->end(); ++i) {
- if (i->empty())
+ SSIter i = ss->begin();
+ while (i != ss->end()) {
+ if (i->empty()) {
+ i = ss->erase(i);
continue;
+ }
SSIter j = i;
++j;
while (j != ss->end()) {
- // Increment j early so that we can erase the element it points to.
- SSIter old_j = j;
+ if (j->find(*i) != std::string::npos) {
+ j = ss->erase(j);
+ continue;
+ }
++j;
- if (old_j->find(*i) != std::string::npos)
- ss->erase(old_j);
}
+ ++i;
}
}
diff --git a/re2/testing/filtered_re2_test.cc b/re2/testing/filtered_re2_test.cc
index 835ebcf..647ddeb 100644
--- a/re2/testing/filtered_re2_test.cc
+++ b/re2/testing/filtered_re2_test.cc
@@ -103,9 +103,11 @@
}, {
// Test to make sure that any atoms that have another atom as a
// substring in an OR are removed; that is, only the shortest
- // substring is kept.
+ // substring is kept. The (|... tests that "" does not cause the
+ // removal of every other atom in the OR, which used to happen
+ // for two separate reasons.
"SubstrAtomRemovesSuperStrInOr", {
- "(abc123|abc|ghi789|abc1234).*[x-z]+",
+ "(|abc123|abc|ghi789|abc1234).*[x-z]+",
"abcd..yyy..yyyzzz",
"mnmnpp[a-z]+PPP"
}, {