Inspect substrings with a Walker<> instead of Prefilter.
For reasons related to its Boolean operations, Prefilter doesn't
necessarily surface all substrings. As a result, a lot of large
substrings were flying under the radar.
Change-Id: I92eca70108df4ed64b72e0211d3c485a717ce50a
Reviewed-on: https://code-review.googlesource.com/c/re2/+/59270
Reviewed-by: Perry Lorier <perryl@google.com>
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/fuzzing/re2_fuzzer.cc b/re2/fuzzing/re2_fuzzer.cc
index af1129f..effce50 100644
--- a/re2/fuzzing/re2_fuzzer.cc
+++ b/re2/fuzzing/re2_fuzzer.cc
@@ -5,20 +5,62 @@
#include <fuzzer/FuzzedDataProvider.h>
#include <stddef.h>
#include <stdint.h>
-#include <memory>
-#include <queue>
+#include <algorithm>
#include <string>
#include <vector>
-#include "re2/prefilter.h"
#include "re2/re2.h"
#include "re2/regexp.h"
+#include "re2/walker-inl.h"
using re2::StringPiece;
// NOT static, NOT signed.
uint8_t dummy = 0;
+// Walks substrings (i.e. kRegexpLiteralString subexpressions)
+// to determine their maximum length... in runes, but avoiding
+// overheads due to UTF-8 encoding is worthwhile when fuzzing.
+class SubstringWalker : public re2::Regexp::Walker<int> {
+ public:
+ SubstringWalker() = default;
+ ~SubstringWalker() override = default;
+
+ int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
+ int* child_args, int nchild_args) override {
+ switch (re->op()) {
+ case re2::kRegexpConcat:
+ case re2::kRegexpAlternate:
+ case re2::kRegexpStar:
+ case re2::kRegexpPlus:
+ case re2::kRegexpQuest:
+ case re2::kRegexpRepeat:
+ case re2::kRegexpCapture: {
+ int max = -1;
+ for (int i = 0; i < nchild_args; i++)
+ max = std::max(max, child_args[i]);
+ return max;
+ }
+
+ case re2::kRegexpLiteralString:
+ return re->nrunes();
+
+ default:
+ break;
+ }
+ return -1;
+ }
+
+ // Should never be called: we use Walk(), not WalkExponential().
+ int ShortVisit(re2::Regexp* re, int parent_arg) override {
+ return parent_arg;
+ }
+
+ private:
+ SubstringWalker(const SubstringWalker&) = delete;
+ SubstringWalker& operator=(const SubstringWalker&) = delete;
+};
+
void TestOneInput(StringPiece pattern, const RE2::Options& options,
StringPiece text) {
// Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W.
@@ -63,23 +105,9 @@
// They can cause bug reports due to fuzzer timeouts when they
// are repetitions (e.g. hundreds of NUL bytes) and matching is
// unanchored. And they aren't interesting for fuzzing purposes.
- std::unique_ptr<re2::Prefilter> prefilter(re2::Prefilter::FromRE2(&re));
- if (prefilter == nullptr)
+ SubstringWalker w;
+ if (w.Walk(re.Regexp(), -1) > 9)
return;
- std::queue<re2::Prefilter*> nodes;
- nodes.push(prefilter.get());
- while (!nodes.empty()) {
- re2::Prefilter* node = nodes.front();
- nodes.pop();
- if (node->op() == re2::Prefilter::ATOM) {
- if (node->atom().size() > 9)
- return;
- } else if (node->op() == re2::Prefilter::AND ||
- node->op() == re2::Prefilter::OR) {
- for (re2::Prefilter* sub : *node->subs())
- nodes.push(sub);
- }
- }
// Don't waste time fuzzing high-size programs.
// They can cause bug reports due to fuzzer timeouts.