Call CharClassBuilder::RemoveAbove() consistently.

By calling it from Regexp::ParseState::PushRegexp(), we cover every use of character classes. Previously, Unicode groups were not covered, which meant that handling them in Latin-1 mode would have bizarre results.

As of this commit, the resulting character classes may be empty and, if so, simplified to NoMatch. I would argue that this is the expected behaviour under such nonsensical circumstances.

This bug was discovered by the LLVM fuzzer.

Change-Id: I00e20027045abfbd5d5e0efc56cd33df9c9e4b4f
Reviewed-on: https://code-review.googlesource.com/2622
Reviewed-by: Russ Cox <rsc@swtch.com>

commit: 7884045c7dcce1606554c6bc0fd4515d98f1f270 [log] [tgz]
author: Paul Wankadia <junyer@google.com> Fri May 22 18:42:37 2015 +1000
committer: Paul Wankadia <junyer@google.com> Fri May 29 02:13:42 2015 +0000
tree: 0797bf70f3be6881d5df436fcf8a4750fc1bf308
parent: e236c93f0bb0cae690c44534537b4c019eb8d214 [diff]
diff --git a/re2/parse.cc b/re2/parse.cc
index 4f45041..e6b27d2 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc

@@ -216,6 +216,7 @@
   // analysis does better with fewer character classes.
   // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.
   if (re->op_ == kRegexpCharClass) {
+    re->ccb_->RemoveAbove(rune_max_);
     if (re->ccb_->size() == 1) {
       Rune r = re->ccb_->begin()->lo;
       re->Decref();
@@ -378,7 +379,6 @@
       }
       r = CycleFoldRune(r);
     } while (r != r1);
-    re->ccb_->RemoveAbove(rune_max_);
     return PushRegexp(re);
   }
 
@@ -1832,7 +1832,6 @@
 
   if (negated)
     re->ccb_->Negate();
-  re->ccb_->RemoveAbove(rune_max_);
 
   *out_re = re;
   return true;

diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
index 8505195..ad15b34 100644
--- a/re2/testing/re2_test.cc
+++ b/re2/testing/re2_test.cc

@@ -1511,4 +1511,15 @@
   CHECK(!RE2::PartialMatch((const char*)a, re, &s1));
 }
 
+TEST(RE2, Bug21371806) {
+  // Bug in parser accepting Unicode groups in Latin-1 mode,
+  // causing compiler to fail in DCHECK in prog.cc.
+
+  RE2::Options opt;
+  opt.set_encoding(RE2::Options::EncodingLatin1);
+
+  RE2 re("g\\p{Zl}]", opt);
+  CHECK(re.ok());
+}
+
 }  // namespace re2
commit	7884045c7dcce1606554c6bc0fd4515d98f1f270	[log] [tgz]
author	Paul Wankadia <junyer@google.com>	Fri May 22 18:42:37 2015 +1000
committer	Paul Wankadia <junyer@google.com>	Fri May 29 02:13:42 2015 +0000
tree	0797bf70f3be6881d5df436fcf8a4750fc1bf308
parent	e236c93f0bb0cae690c44534537b4c019eb8d214 [diff]