Inspect substrings with a Walker<> instead of Prefilter.

For reasons related to its Boolean operations, Prefilter doesn't
necessarily surface all substrings. As a result, a lot of large
substrings were flying under the radar.

Change-Id: I92eca70108df4ed64b72e0211d3c485a717ce50a
Reviewed-on: https://code-review.googlesource.com/c/re2/+/59270
Reviewed-by: Perry Lorier <perryl@google.com>
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/fuzzing/re2_fuzzer.cc b/re2/fuzzing/re2_fuzzer.cc
index af1129f..effce50 100644
--- a/re2/fuzzing/re2_fuzzer.cc
+++ b/re2/fuzzing/re2_fuzzer.cc
@@ -5,20 +5,62 @@
 #include <fuzzer/FuzzedDataProvider.h>
 #include <stddef.h>
 #include <stdint.h>
-#include <memory>
-#include <queue>
+#include <algorithm>
 #include <string>
 #include <vector>
 
-#include "re2/prefilter.h"
 #include "re2/re2.h"
 #include "re2/regexp.h"
+#include "re2/walker-inl.h"
 
 using re2::StringPiece;
 
 // NOT static, NOT signed.
 uint8_t dummy = 0;
 
+// Walks substrings (i.e. kRegexpLiteralString subexpressions)
+// to determine their maximum length... in runes, but avoiding
+// overheads due to UTF-8 encoding is worthwhile when fuzzing.
+class SubstringWalker : public re2::Regexp::Walker<int> {
+ public:
+  SubstringWalker() = default;
+  ~SubstringWalker() override = default;
+
+  int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
+                int* child_args, int nchild_args) override {
+    switch (re->op()) {
+      case re2::kRegexpConcat:
+      case re2::kRegexpAlternate:
+      case re2::kRegexpStar:
+      case re2::kRegexpPlus:
+      case re2::kRegexpQuest:
+      case re2::kRegexpRepeat:
+      case re2::kRegexpCapture: {
+        int max = -1;
+        for (int i = 0; i < nchild_args; i++)
+          max = std::max(max, child_args[i]);
+        return max;
+      }
+
+      case re2::kRegexpLiteralString:
+        return re->nrunes();
+
+      default:
+        break;
+    }
+    return -1;
+  }
+
+  // Should never be called: we use Walk(), not WalkExponential().
+  int ShortVisit(re2::Regexp* re, int parent_arg) override {
+    return parent_arg;
+  }
+
+ private:
+  SubstringWalker(const SubstringWalker&) = delete;
+  SubstringWalker& operator=(const SubstringWalker&) = delete;
+};
+
 void TestOneInput(StringPiece pattern, const RE2::Options& options,
                   StringPiece text) {
   // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W.
@@ -63,23 +105,9 @@
   // They can cause bug reports due to fuzzer timeouts when they
   // are repetitions (e.g. hundreds of NUL bytes) and matching is
   // unanchored. And they aren't interesting for fuzzing purposes.
-  std::unique_ptr<re2::Prefilter> prefilter(re2::Prefilter::FromRE2(&re));
-  if (prefilter == nullptr)
+  SubstringWalker w;
+  if (w.Walk(re.Regexp(), -1) > 9)
     return;
-  std::queue<re2::Prefilter*> nodes;
-  nodes.push(prefilter.get());
-  while (!nodes.empty()) {
-    re2::Prefilter* node = nodes.front();
-    nodes.pop();
-    if (node->op() == re2::Prefilter::ATOM) {
-      if (node->atom().size() > 9)
-        return;
-    } else if (node->op() == re2::Prefilter::AND ||
-               node->op() == re2::Prefilter::OR) {
-      for (re2::Prefilter* sub : *node->subs())
-        nodes.push(sub);
-    }
-  }
 
   // Don't waste time fuzzing high-size programs.
   // They can cause bug reports due to fuzzer timeouts.