Call CharClassBuilder::RemoveAbove() consistently.

By calling it from Regexp::ParseState::PushRegexp(), we cover every use
of character classes. Previously, Unicode groups were not covered, which
meant that handling them in Latin-1 mode would have bizarre results. As
of this commit, the resulting character classes may be empty and, if so,
are simplified to NoMatch. I would argue that this is the expected behaviour
under such nonsensical circumstances.

This bug was discovered by the LLVM fuzzer.

Change-Id: I00e20027045abfbd5d5e0efc56cd33df9c9e4b4f
Reviewed-on: https://code-review.googlesource.com/2622
Reviewed-by: Russ Cox <rsc@swtch.com>
diff --git a/re2/parse.cc b/re2/parse.cc
index 4f45041..e6b27d2 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc
@@ -216,6 +216,7 @@
   // analysis does better with fewer character classes.
   // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.
   if (re->op_ == kRegexpCharClass) {
+    re->ccb_->RemoveAbove(rune_max_);
     if (re->ccb_->size() == 1) {
       Rune r = re->ccb_->begin()->lo;
       re->Decref();
@@ -378,7 +379,6 @@
       }
       r = CycleFoldRune(r);
     } while (r != r1);
-    re->ccb_->RemoveAbove(rune_max_);
     return PushRegexp(re);
   }
 
@@ -1832,7 +1832,6 @@
 
   if (negated)
     re->ccb_->Negate();
-  re->ccb_->RemoveAbove(rune_max_);
 
   *out_re = re;
   return true;
diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
index 8505195..ad15b34 100644
--- a/re2/testing/re2_test.cc
+++ b/re2/testing/re2_test.cc
@@ -1511,4 +1511,15 @@
   CHECK(!RE2::PartialMatch((const char*)a, re, &s1));
 }
 
+TEST(RE2, Bug21371806) {
+  // Regression test: the parser accepted Unicode groups in Latin-1 mode,
+  // causing the compiler to fail a DCHECK in prog.cc.
+  // The pattern below (found by fuzzing) must be accepted: re.ok() holds.
+  RE2::Options opt;
+  opt.set_encoding(RE2::Options::EncodingLatin1);
+
+  RE2 re("g\\p{Zl}]", opt);
+  CHECK(re.ok());
+}
+
 }  // namespace re2