Call CharClassBuilder::RemoveAbove() consistently.
By calling it from Regexp::ParseState::PushRegexp(), we cover every use
of character classes. Previously, Unicode groups were not covered, which
meant that handling them in Latin-1 mode would have bizarre results. As
of this commit, the resulting character classes may be empty and, if so,
are simplified to NoMatch. I would argue that this is the expected behaviour
under such nonsensical circumstances.
This bug was discovered by the LLVM fuzzer.
Change-Id: I00e20027045abfbd5d5e0efc56cd33df9c9e4b4f
Reviewed-on: https://code-review.googlesource.com/2622
Reviewed-by: Russ Cox <rsc@swtch.com>
diff --git a/re2/parse.cc b/re2/parse.cc
index 4f45041..e6b27d2 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc
@@ -216,6 +216,7 @@
// analysis does better with fewer character classes.
// Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.
if (re->op_ == kRegexpCharClass) {
+ re->ccb_->RemoveAbove(rune_max_);
if (re->ccb_->size() == 1) {
Rune r = re->ccb_->begin()->lo;
re->Decref();
@@ -378,7 +379,6 @@
}
r = CycleFoldRune(r);
} while (r != r1);
- re->ccb_->RemoveAbove(rune_max_);
return PushRegexp(re);
}
@@ -1832,7 +1832,6 @@
if (negated)
re->ccb_->Negate();
- re->ccb_->RemoveAbove(rune_max_);
*out_re = re;
return true;
diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
index 8505195..ad15b34 100644
--- a/re2/testing/re2_test.cc
+++ b/re2/testing/re2_test.cc
@@ -1511,4 +1511,15 @@
CHECK(!RE2::PartialMatch((const char*)a, re, &s1));
}
+TEST(RE2, Bug21371806) {
+ // Bug in parser accepting Unicode groups in Latin-1 mode,
+ // causing compiler to fail in DCHECK in prog.cc.
+
+ RE2::Options opt;
+ opt.set_encoding(RE2::Options::EncodingLatin1);
+
+ RE2 re("g\\p{Zl}]", opt);
+ CHECK(re.ok());
+}
+
} // namespace re2