re2/fuzzing/re2_fuzzer.cc - re2 - Git at Google

 // Copyright 2016 The RE2 Authors.  All Rights Reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 #include <stddef.h>
 #include <stdint.h>
 #include <map>
 #include <memory>
 #include <queue>
 #include <string>

 #include "re2/prefilter.h"
 #include "re2/re2.h"

 using re2::StringPiece;
 using std::string;

 // NOT static, NOT signed.
 uint8_t dummy = 0;

 void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) {
   RE2 re(pattern, options);
   if (!re.ok())
     return;

   // Don't waste time fuzzing high-size programs.
   // They can cause bug reports due to fuzzer timeouts.
   int size = re.ProgramSize();
   if (size > 9999)
     return;
   int rsize = re.ReverseProgramSize();
   if (rsize > 9999)
     return;

   // Don't waste time fuzzing high-fanout programs.
   // They can cause bug reports due to fuzzer timeouts.
   std::map<int, int> histogram;
   int fanout = re.ProgramFanout(&histogram);
   if (fanout > 9)
     return;
   int rfanout = re.ReverseProgramFanout(&histogram);
   if (rfanout > 9)
     return;

   // Don't waste time fuzzing programs with large substrings.
   // They can cause bug reports due to fuzzer timeouts when they
   // are repetitions (e.g. hundreds of NUL bytes) and matching is
   // unanchored. And they aren't interesting for fuzzing purposes.
   std::unique_ptr<re2::Prefilter> prefilter(re2::Prefilter::FromRE2(&re));
   if (prefilter == nullptr)
     return;
   std::queue<re2::Prefilter*> nodes;
   nodes.push(prefilter.get());
   while (!nodes.empty()) {
     re2::Prefilter* node = nodes.front();
     nodes.pop();
     if (node->op() == re2::Prefilter::ATOM) {
       if (node->atom().size() > 9)
         return;
     } else if (node->op() == re2::Prefilter::AND ||
                node->op() == re2::Prefilter::OR) {
       for (re2::Prefilter* sub : *node->subs())
         nodes.push(sub);
     }
   }

   if (re.NumberOfCapturingGroups() == 0) {
     // Avoid early return due to too many arguments.
     StringPiece sp = text;
     RE2::FullMatch(sp, re);
     RE2::PartialMatch(sp, re);
     RE2::Consume(&sp, re);
     sp = text;  // Reset.
     RE2::FindAndConsume(&sp, re);
   } else {
     // Okay, we have at least one capturing group...
     // Try conversion for variously typed arguments.
     StringPiece sp = text;
     short s;
     RE2::FullMatch(sp, re, &s);
     long l;
     RE2::PartialMatch(sp, re, &l);
     float f;
     RE2::Consume(&sp, re, &f);
     sp = text;  // Reset.
     double d;
     RE2::FindAndConsume(&sp, re, &d);
   }

   string s = string(text);
   RE2::Replace(&s, re, "");
   s = string(text);  // Reset.
   RE2::GlobalReplace(&s, re, "");

   string min, max;
   re.PossibleMatchRange(&min, &max, /*maxlen=*/9);

   // Exercise some other API functionality.
   dummy += re.NamedCapturingGroups().size();
   dummy += re.CapturingGroupNames().size();
   dummy += RE2::QuoteMeta(pattern).size();
 }

 // Entry point for libFuzzer.
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
   if (size == 0 || size > 999)
     return 0;

   // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W.
   // Otherwise, we will waste time on inputs that have long runs of various
   // character classes. The fuzzer has shown itself to be easily capable of
   // generating such patterns that fall within the other limits, but result
   // in timeouts nonetheless. The marginal cost is high - even more so when
   // counted repetition is involved - whereas the marginal benefit is zero.
   // TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain.
   int cc = 0;
   for (size_t i = 0; i < size; i++) {
     if (data[i] == '.')
       cc++;
     if (data[i] != '\\')
       continue;
     i++;
     if (i >= size)
       break;
     if (data[i] == 'p' || data[i] == 'P' ||
         data[i] == 'd' || data[i] == 'D' ||
         data[i] == 's' || data[i] == 'S' ||
         data[i] == 'w' || data[i] == 'W')
       cc++;
   }
   if (cc > 9)
     return 0;

   // The one-at-a-time hash by Bob Jenkins.
   uint32_t hash = 0;
   for (size_t i = 0; i < size; i++) {
     hash += data[i];
     hash += (hash << 10);
     hash ^= (hash >> 6);
   }
   hash += (hash << 3);
   hash ^= (hash >> 11);
   hash += (hash << 15);

   RE2::Options options;
   options.set_log_errors(false);
   options.set_max_mem(64 << 20);
   options.set_encoding(hash & 1 ? RE2::Options::EncodingLatin1
                                 : RE2::Options::EncodingUTF8);
   options.set_posix_syntax(hash & 2);
   options.set_longest_match(hash & 4);
   options.set_literal(hash & 8);
   options.set_never_nl(hash & 16);
   options.set_dot_nl(hash & 32);
   options.set_never_capture(hash & 64);
   options.set_case_sensitive(hash & 128);
   options.set_perl_classes(hash & 256);
   options.set_word_boundary(hash & 512);
   options.set_one_line(hash & 1024);

   const char* ptr = reinterpret_cast<const char*>(data);
   int len = static_cast<int>(size);

   StringPiece pattern(ptr, len);
   StringPiece text(ptr, len);
   Test(pattern, options, text);

   return 0;
 }
	// Copyright 2016 The RE2 Authors. All Rights Reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	#include <stddef.h>
	#include <stdint.h>
	#include <map>
	#include <memory>
	#include <queue>
	#include <string>

	#include "re2/prefilter.h"
	#include "re2/re2.h"

	using re2::StringPiece;
	using std::string;

	// NOT static, NOT signed.
	uint8_t dummy = 0;

	void Test(StringPiece pattern, const RE2::Options& options, StringPiece text) {
	RE2 re(pattern, options);
	if (!re.ok())
	return;

	// Don't waste time fuzzing high-size programs.
	// They can cause bug reports due to fuzzer timeouts.
	int size = re.ProgramSize();
	if (size > 9999)
	return;
	int rsize = re.ReverseProgramSize();
	if (rsize > 9999)
	return;

	// Don't waste time fuzzing high-fanout programs.
	// They can cause bug reports due to fuzzer timeouts.
	std::map<int, int> histogram;
	int fanout = re.ProgramFanout(&histogram);
	if (fanout > 9)
	return;
	int rfanout = re.ReverseProgramFanout(&histogram);
	if (rfanout > 9)
	return;

	// Don't waste time fuzzing programs with large substrings.
	// They can cause bug reports due to fuzzer timeouts when they
	// are repetitions (e.g. hundreds of NUL bytes) and matching is
	// unanchored. And they aren't interesting for fuzzing purposes.
	std::unique_ptr<re2::Prefilter> prefilter(re2::Prefilter::FromRE2(&re));
	if (prefilter == nullptr)
	return;
	std::queue<re2::Prefilter*> nodes;
	nodes.push(prefilter.get());
	while (!nodes.empty()) {
	re2::Prefilter* node = nodes.front();
	nodes.pop();
	if (node->op() == re2::Prefilter::ATOM) {
	if (node->atom().size() > 9)
	return;
	} else if (node->op() == re2::Prefilter::AND \|\|
	node->op() == re2::Prefilter::OR) {
	for (re2::Prefilter* sub : *node->subs())
	nodes.push(sub);
	}
	}

	if (re.NumberOfCapturingGroups() == 0) {
	// Avoid early return due to too many arguments.
	StringPiece sp = text;
	RE2::FullMatch(sp, re);
	RE2::PartialMatch(sp, re);
	RE2::Consume(&sp, re);
	sp = text; // Reset.
	RE2::FindAndConsume(&sp, re);
	} else {
	// Okay, we have at least one capturing group...
	// Try conversion for variously typed arguments.
	StringPiece sp = text;
	short s;
	RE2::FullMatch(sp, re, &s);
	long l;
	RE2::PartialMatch(sp, re, &l);
	float f;
	RE2::Consume(&sp, re, &f);
	sp = text; // Reset.
	double d;
	RE2::FindAndConsume(&sp, re, &d);
	}

	string s = string(text);
	RE2::Replace(&s, re, "");
	s = string(text); // Reset.
	RE2::GlobalReplace(&s, re, "");

	string min, max;
	re.PossibleMatchRange(&min, &max, /maxlen=/9);

	// Exercise some other API functionality.
	dummy += re.NamedCapturingGroups().size();
	dummy += re.CapturingGroupNames().size();
	dummy += RE2::QuoteMeta(pattern).size();
	}

	// Entry point for libFuzzer.
	extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
	if (size == 0 \|\| size > 999)
	return 0;

	// Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W.
	// Otherwise, we will waste time on inputs that have long runs of various
	// character classes. The fuzzer has shown itself to be easily capable of
	// generating such patterns that fall within the other limits, but result
	// in timeouts nonetheless. The marginal cost is high - even more so when
	// counted repetition is involved - whereas the marginal benefit is zero.
	// TODO(junyer): Handle [:isalnum:] et al. when they start to cause pain.
	int cc = 0;
	for (size_t i = 0; i < size; i++) {
	if (data[i] == '.')
	cc++;
	if (data[i] != '\\')
	continue;
	i++;
	if (i >= size)
	break;
	if (data[i] == 'p' \|\| data[i] == 'P' \|\|
	data[i] == 'd' \|\| data[i] == 'D' \|\|
	data[i] == 's' \|\| data[i] == 'S' \|\|
	data[i] == 'w' \|\| data[i] == 'W')
	cc++;
	}
	if (cc > 9)
	return 0;

	// The one-at-a-time hash by Bob Jenkins.
	uint32_t hash = 0;
	for (size_t i = 0; i < size; i++) {
	hash += data[i];
	hash += (hash << 10);
	hash ^= (hash >> 6);
	}
	hash += (hash << 3);
	hash ^= (hash >> 11);
	hash += (hash << 15);

	RE2::Options options;
	options.set_log_errors(false);
	options.set_max_mem(64 << 20);
	options.set_encoding(hash & 1 ? RE2::Options::EncodingLatin1
	: RE2::Options::EncodingUTF8);
	options.set_posix_syntax(hash & 2);
	options.set_longest_match(hash & 4);
	options.set_literal(hash & 8);
	options.set_never_nl(hash & 16);
	options.set_dot_nl(hash & 32);
	options.set_never_capture(hash & 64);
	options.set_case_sensitive(hash & 128);
	options.set_perl_classes(hash & 256);
	options.set_word_boundary(hash & 512);
	options.set_one_line(hash & 1024);

	const char* ptr = reinterpret_cast<const char*>(data);
	int len = static_cast<int>(size);

	StringPiece pattern(ptr, len);
	StringPiece text(ptr, len);
	Test(pattern, options, text);

	return 0;
	}