Fix SimplifyStringSet() properly. Ensure test coverage.

Change-Id: Ifca0d469c2c0858afdf899381bb27563774cc6a8
Reviewed-on: https://code-review.googlesource.com/c/re2/+/42514
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/prefilter.cc b/re2/prefilter.cc
index f0d9264..ba78ced 100644
--- a/re2/prefilter.cc
+++ b/re2/prefilter.cc
@@ -140,27 +140,31 @@
   return AndOr(OR, a, b);
 }
 
-static void SimplifyStringSet(std::set<std::string> *ss) {
+static void SimplifyStringSet(std::set<std::string>* ss) {
   // Now make sure that the strings aren't redundant.  For example, if
   // we know "ab" is a required string, then it doesn't help at all to
   // know that "abc" is also a required string, so delete "abc". This
   // is because, when we are performing a string search to filter
   // regexps, matching "ab" will already allow this regexp to be a
   // candidate for match, so further matching "abc" is redundant.
-  // Note that we must ignore "" because find() would find it at the
+  // Note that we must erase "" because find() would find it at the
   // start of everything and thus we would end up erasing everything.
-  for (SSIter i = ss->begin(); i != ss->end(); ++i) {
-    if (i->empty())
+  SSIter i = ss->begin();
+  while (i != ss->end()) {
+    if (i->empty()) {
+      i = ss->erase(i);
       continue;
+    }
     SSIter j = i;
     ++j;
     while (j != ss->end()) {
-      // Increment j early so that we can erase the element it points to.
-      SSIter old_j = j;
+      if (j->find(*i) != std::string::npos) {
+        j = ss->erase(j);
+        continue;
+      }
       ++j;
-      if (old_j->find(*i) != std::string::npos)
-        ss->erase(old_j);
     }
+    ++i;
   }
 }
 
diff --git a/re2/testing/filtered_re2_test.cc b/re2/testing/filtered_re2_test.cc
index 835ebcf..647ddeb 100644
--- a/re2/testing/filtered_re2_test.cc
+++ b/re2/testing/filtered_re2_test.cc
@@ -103,9 +103,11 @@
   }, {
     // Test to make sure that any atoms that have another atom as a
     // substring in an OR are removed; that is, only the shortest
-    // substring is kept.
+    // substring is kept. The (|... tests that "" does not cause the
+    // removal of every other atom in the OR, which used to happen
+    // for two separate reasons.
     "SubstrAtomRemovesSuperStrInOr", {
-      "(abc123|abc|ghi789|abc1234).*[x-z]+",
+      "(|abc123|abc|ghi789|abc1234).*[x-z]+",
       "abcd..yyy..yyyzzz",
       "mnmnpp[a-z]+PPP"
     }, {