re2/testing/parse_test.cc - re2 - Git at Google

 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 // Test parse.cc, dump.cc, and tostring.cc.

 #include <string>

 #include "util/test.h"
 #include "util/logging.h"
 #include "re2/regexp.h"

 namespace re2 {

 // In the past, we used 1<<30 here and zeroed the bit later, but that
 // has undefined behaviour, so now we use an internal-only flag because
 // otherwise we would have to introduce a new flag value just for this.
 static const Regexp::ParseFlags TestZeroFlags = Regexp::WasDollar;

 struct Test {
   const char* regexp;
   const char* parse;
   Regexp::ParseFlags flags;
 };

 static Regexp::ParseFlags kTestFlags = Regexp::MatchNL |
                                        Regexp::PerlX |
                                        Regexp::PerlClasses |
                                        Regexp::UnicodeGroups;

 static Test tests[] = {
   // Base cases
   { "a", "lit{a}" },
   { "a.", "cat{lit{a}dot{}}" },
   { "a.b", "cat{lit{a}dot{}lit{b}}" },
   { "ab", "str{ab}" },
   { "a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}" },
   { "abc", "str{abc}" },
   { "a|^", "alt{lit{a}bol{}}" },
   { "a|b", "cc{0x61-0x62}" },
   { "(a)", "cap{lit{a}}" },
   { "(a)|b", "alt{cap{lit{a}}lit{b}}" },
   { "a*", "star{lit{a}}" },
   { "a+", "plus{lit{a}}" },
   { "a?", "que{lit{a}}" },
   { "a{2}", "rep{2,2 lit{a}}" },
   { "a{2,3}", "rep{2,3 lit{a}}" },
   { "a{2,}", "rep{2,-1 lit{a}}" },
   { "a*?", "nstar{lit{a}}" },
   { "a+?", "nplus{lit{a}}" },
   { "a??", "nque{lit{a}}" },
   { "a{2}?", "nrep{2,2 lit{a}}" },
   { "a{2,3}?", "nrep{2,3 lit{a}}" },
   { "a{2,}?", "nrep{2,-1 lit{a}}" },
   { "", "emp{}" },
   { "|", "alt{emp{}emp{}}" },
   { "|x|", "alt{emp{}lit{x}emp{}}" },
   { ".", "dot{}" },
   { "^", "bol{}" },
   { "$", "eol{}" },
   { "\\|", "lit{|}" },
   { "\\(", "lit{(}" },
   { "\\)", "lit{)}" },
   { "\\*", "lit{*}" },
   { "\\+", "lit{+}" },
   { "\\?", "lit{?}" },
   { "{", "lit{{}" },
   { "}", "lit{}}" },
   { "\\.", "lit{.}" },
   { "\\^", "lit{^}" },
   { "\\$", "lit{$}" },
   { "\\\\", "lit{\\}" },
   { "[ace]", "cc{0x61 0x63 0x65}" },
   { "[abc]", "cc{0x61-0x63}" },
   { "[a-z]", "cc{0x61-0x7a}" },
   { "[a]", "lit{a}" },
   { "\\-", "lit{-}" },
   { "-", "lit{-}" },
   { "\\_", "lit{_}" },

   // Posix and Perl extensions
   { "[[:lower:]]", "cc{0x61-0x7a}" },
   { "[a-z]", "cc{0x61-0x7a}" },
   { "[^[:lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
   { "[[:^lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
   { "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
   { "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
   { "(?i)[^[:lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
   { "(?i)[[:^lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
   { "\\d", "cc{0x30-0x39}" },
   { "\\D", "cc{0-0x2f 0x3a-0x10ffff}" },
   { "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" },
   { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" },
   { "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" },
   { "\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" },
   { "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
   { "(?i)\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
   { "[^\\\\]", "cc{0-0x5b 0x5d-0x10ffff}" },
   { "\\C", "byte{}" },

   // Unicode, negatives, and a double negative.
   { "\\p{Braille}", "cc{0x2800-0x28ff}" },
   { "\\P{Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
   { "\\p{^Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
   { "\\P{^Braille}", "cc{0x2800-0x28ff}" },

   // More interesting regular expressions.
   { "a{,2}", "str{a{,2}}" },
   { "\\.\\^\\$\\\\", "str{.^$\\}" },
   { "[a-zABC]", "cc{0x41-0x43 0x61-0x7a}" },
   { "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
   { "[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}" },  // utf-8
   { "a*{", "cat{star{lit{a}}lit{{}}" },

   // Test precedences
   { "(?:ab)*", "star{str{ab}}" },
   { "(ab)*", "star{cap{str{ab}}}" },
   { "ab|cd", "alt{str{ab}str{cd}}" },
   { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },

   // Test squashing of **, ++, ?? et cetera.
   { "(?:(?:a)*)*", "star{lit{a}}" },
   { "(?:(?:a)+)+", "plus{lit{a}}" },
   { "(?:(?:a)?)?", "que{lit{a}}" },
   { "(?:(?:a)*)+", "star{lit{a}}" },
   { "(?:(?:a)*)?", "star{lit{a}}" },
   { "(?:(?:a)+)*", "star{lit{a}}" },
   { "(?:(?:a)+)?", "star{lit{a}}" },
   { "(?:(?:a)?)*", "star{lit{a}}" },
   { "(?:(?:a)?)+", "star{lit{a}}" },

   // Test flattening.
   { "(?:a)", "lit{a}" },
   { "(?:ab)(?:cd)", "str{abcd}" },
   { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" },
   { "a|c", "cc{0x61 0x63}" },
   { "a|[cd]", "cc{0x61 0x63-0x64}" },
   { "a|.", "dot{}" },
   { "[ab]|c", "cc{0x61-0x63}" },
   { "[ab]|[cd]", "cc{0x61-0x64}" },
   { "[ab]|.", "dot{}" },
   { ".|c", "dot{}" },
   { ".|[cd]", "dot{}" },
   { ".|.", "dot{}" },

   // Test Perl quoted literals
   { "\\Q+|*?{[\\E", "str{+|*?{[}" },
   { "\\Q+\\E+", "plus{lit{+}}" },
   { "\\Q\\\\E", "lit{\\}" },
   { "\\Q\\\\\\E", "str{\\\\}" },
   { "\\Qa\\E*", "star{lit{a}}" },
   { "\\Qab\\E*", "cat{lit{a}star{lit{b}}}" },
   { "\\Qabc\\E*", "cat{str{ab}star{lit{c}}}" },

   // Test Perl \A and \z
   { "(?m)^", "bol{}" },
   { "(?m)$", "eol{}" },
   { "(?-m)^", "bot{}" },
   { "(?-m)$", "eot{}" },
   { "(?m)\\A", "bot{}" },
   { "(?m)\\z", "eot{\\z}" },
   { "(?-m)\\A", "bot{}" },
   { "(?-m)\\z", "eot{\\z}" },

   // Test named captures
   { "(?P<name>a)", "cap{name:lit{a}}" },

   // Case-folded literals
   { "[Aa]", "litfold{a}" },

   // Strings
   { "abcde", "str{abcde}" },
   { "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" },

   // Reported bug involving \n leaking in despite use of NeverNL.
   { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
   { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
   { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
   { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
   { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", TestZeroFlags },
   { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
   { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
   { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
   { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", TestZeroFlags },
   { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
   { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
   { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
   { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", TestZeroFlags },
   { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
   { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
   { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
   { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
   { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
   { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
   { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
   { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
   { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
   { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
   { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
   { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
   { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
   { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
   { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
   { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
     Regexp::PerlClasses },
   { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
     Regexp::PerlClasses | Regexp::FoldCase },
   { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
     Regexp::PerlClasses | Regexp::NeverNL },
   { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
     Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase },
   { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
     Regexp::PerlClasses },
   { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
     Regexp::PerlClasses | Regexp::FoldCase },
   { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
     Regexp::PerlClasses | Regexp::NeverNL },
   { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
     Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase },

   // Bug in Regexp::ToString() that emitted [^], which
   // would (obviously) fail to parse when fed back in.
   { "[\\s\\S]", "cc{0-0x10ffff}" },
 };

 bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) {
   return Regexp::Equal(a, b);
 }

 void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags,
                const std::string& title) {
   Regexp** re = new Regexp*[ntests];
   for (int i = 0; i < ntests; i++) {
     RegexpStatus status;
     Regexp::ParseFlags f = flags;
     if (tests[i].flags != 0) {
       f = tests[i].flags & ~TestZeroFlags;
     }
     re[i] = Regexp::Parse(tests[i].regexp, f, &status);
     ASSERT_TRUE(re[i] != NULL)
       << " " << tests[i].regexp << " " << status.Text();
     std::string s = re[i]->Dump();
     EXPECT_EQ(std::string(tests[i].parse), s)
         << "Regexp: " << tests[i].regexp
         << "\nparse: " << std::string(tests[i].parse)
         << " s: " << s << " flag=" << f;
   }

   for (int i = 0; i < ntests; i++) {
     for (int j = 0; j < ntests; j++) {
       EXPECT_EQ(std::string(tests[i].parse) == std::string(tests[j].parse),
                 RegexpEqualTestingOnly(re[i], re[j]))
         << "Regexp: " << tests[i].regexp << " " << tests[j].regexp;
     }
   }

   for (int i = 0; i < ntests; i++)
     re[i]->Decref();
   delete[] re;
 }

 // Test that regexps parse to expected structures.
 TEST(TestParse, SimpleRegexps) {
   TestParse(tests, arraysize(tests), kTestFlags, "simple");
 }

 Test foldcase_tests[] = {
   { "AbCdE", "strfold{abcde}" },
   { "[Aa]", "litfold{a}" },
   { "a", "litfold{a}" },

   // 0x17F is an old English long s (looks like an f) and folds to s.
   // 0x212A is the Kelvin symbol and folds to k.
   { "A[F-g]", "cat{litfold{a}cc{0x41-0x7a 0x17f 0x212a}}" },  // [Aa][A-z...]
   { "[[:upper:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
   { "[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
 };

 // Test that parsing with FoldCase works.
 TEST(TestParse, FoldCase) {
   TestParse(foldcase_tests, arraysize(foldcase_tests), Regexp::FoldCase, "foldcase");
 }

 Test literal_tests[] = {
   { "(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}" },
 };

 // Test that parsing with Literal works.
 TEST(TestParse, Literal) {
   TestParse(literal_tests, arraysize(literal_tests), Regexp::Literal, "literal");
 }

 Test matchnl_tests[] = {
   { ".", "dot{}" },
   { "\n", "lit{\n}" },
   { "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
   { "[a\\n]", "cc{0xa 0x61}" },
 };

 // Test that parsing with MatchNL works.
 // (Also tested above during simple cases.)
 TEST(TestParse, MatchNL) {
   TestParse(matchnl_tests, arraysize(matchnl_tests), Regexp::MatchNL, "with MatchNL");
 }

 Test nomatchnl_tests[] = {
   { ".", "cc{0-0x9 0xb-0x10ffff}" },
   { "\n", "lit{\n}" },
   { "[^a]", "cc{0-0x9 0xb-0x60 0x62-0x10ffff}" },
   { "[a\\n]", "cc{0xa 0x61}" },
 };

 // Test that parsing without MatchNL works.
 TEST(TestParse, NoMatchNL) {
   TestParse(nomatchnl_tests, arraysize(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL");
 }

 Test prefix_tests[] = {
   { "abc|abd", "cat{str{ab}cc{0x63-0x64}}" },
   { "a(?:b)c|abd", "cat{str{ab}cc{0x63-0x64}}" },
   { "abc|abd|aef|bcx|bcy",
     "alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}"
       "cat{str{bc}cc{0x78-0x79}}}" },
   { "abc|x|abd", "alt{str{abc}lit{x}str{abd}}" },
   { "(?i)abc|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" },
   { "[ab]c|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" },
   { ".c|.d", "cat{cc{0-0x9 0xb-0x10ffff}cc{0x63-0x64}}" },
   { "\\Cc|\\Cd", "cat{byte{}cc{0x63-0x64}}" },
   { "x{2}|x{2}[0-9]",
     "cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" },
   { "x{2}y|x{2}[0-9]y",
     "cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" },
   { "n|r|rs",
     "alt{lit{n}cat{lit{r}alt{emp{}lit{s}}}}" },
   { "n|rs|r",
     "alt{lit{n}cat{lit{r}alt{lit{s}emp{}}}}" },
   { "r|rs|n",
     "alt{cat{lit{r}alt{emp{}lit{s}}}lit{n}}" },
   { "rs|r|n",
     "alt{cat{lit{r}alt{lit{s}emp{}}}lit{n}}" },
   { "a\\C*?c|a\\C*?b",
     "cat{lit{a}alt{cat{nstar{byte{}}lit{c}}cat{nstar{byte{}}lit{b}}}}" },
   { "^/a/bc|^/a/de",
     "cat{bol{}cat{str{/a/}alt{str{bc}str{de}}}}" },
   // In the past, factoring was limited to kFactorAlternationMaxDepth (8).
   { "a|aa|aaa|aaaa|aaaaa|aaaaaa|aaaaaaa|aaaaaaaa|aaaaaaaaa|aaaaaaaaaa",
     "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
     "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
     "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
     "lit{a}}}}}}}}}}}}}}}}}}}" },
   { "a|aardvark|aardvarks|abaci|aback|abacus|abacuses|abaft|abalone|abalones",
     "cat{lit{a}alt{emp{}cat{str{ardvark}alt{emp{}lit{s}}}"
     "cat{str{ba}alt{cat{lit{c}alt{cc{0x69 0x6b}cat{str{us}alt{emp{}str{es}}}}}"
     "str{ft}cat{str{lone}alt{emp{}lit{s}}}}}}}" },
 };

 // Test that prefix factoring works.
 TEST(TestParse, Prefix) {
   TestParse(prefix_tests, arraysize(prefix_tests), Regexp::PerlX, "prefix");
 }

 Test nested_tests[] = {
   { "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))",
     "cap{cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}}}}}}}}" },
   { "((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
     "cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{1,1 lit{x}}}}}}}}}}}}}}}}}}}}}" },
   { "((((((((((x{0}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
     "cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{0,0 lit{x}}}}}}}}}}}}}}}}}}}}}" },
   { "((((((x{2}){2}){2}){5}){5}){5})",
     "cap{rep{5,5 cap{rep{5,5 cap{rep{5,5 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}" },
 };

 // Test that nested repetition works.
 TEST(TestParse, Nested) {
   TestParse(nested_tests, arraysize(nested_tests), Regexp::PerlX, "nested");
 }

 // Invalid regular expressions
 const char* badtests[] = {
   "(",
   ")",
   "(a",
   "(a|b|",
   "(a|b",
   "[a-z",
   "([a-z)",
   "x{1001}",
   "\xff",      // Invalid UTF-8
   "[\xff]",
   "[\\\xff]",
   "\\\xff",
   "(?P<name>a",
   "(?P<name>",
   "(?P<name",
   "(?P<x y>a)",
   "(?P<>a)",
   "[a-Z]",
   "(?i)[a-Z]",
   "a{100000}",
   "a{100000,}",
   "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
   "(((x{7}){11}){13})",
   "\\Q\\E*",
 };

 // Valid in Perl, bad in POSIX
 const char* only_perl[] = {
  "[a-b-c]",
  "\\Qabc\\E",
  "\\Q*+?{[\\E",
  "\\Q\\\\E",
  "\\Q\\\\\\E",
  "\\Q\\\\\\\\E",
  "\\Q\\\\\\\\\\E",
  "(?:a)",
  "(?P<name>a)",
 };

 // Valid in POSIX, bad in Perl.
 const char* only_posix[] = {
   "a++",
   "a**",
   "a?*",
   "a+*",
   "a{1}*",
 };

 // Test that parser rejects bad regexps.
 TEST(TestParse, InvalidRegexps) {
   for (size_t i = 0; i < arraysize(badtests); i++) {
     ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::PerlX, NULL) == NULL)
       << " " << badtests[i];
     ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::NoParseFlags, NULL) == NULL)
       << " " << badtests[i];
   }
   for (size_t i = 0; i < arraysize(only_posix); i++) {
     ASSERT_TRUE(Regexp::Parse(only_posix[i], Regexp::PerlX, NULL) == NULL)
       << " " << only_posix[i];
     Regexp* re = Regexp::Parse(only_posix[i], Regexp::NoParseFlags, NULL);
     ASSERT_TRUE(re != NULL) << " " << only_posix[i];
     re->Decref();
   }
   for (size_t i = 0; i < arraysize(only_perl); i++) {
     ASSERT_TRUE(Regexp::Parse(only_perl[i], Regexp::NoParseFlags, NULL) == NULL)
       << " " << only_perl[i];
     Regexp* re = Regexp::Parse(only_perl[i], Regexp::PerlX, NULL);
     ASSERT_TRUE(re != NULL) << " " << only_perl[i];
     re->Decref();
   }
 }

 // Test that ToString produces original regexp or equivalent one.
 TEST(TestToString, EquivalentParse) {
   for (size_t i = 0; i < arraysize(tests); i++) {
     RegexpStatus status;
     Regexp::ParseFlags f = kTestFlags;
     if (tests[i].flags != 0) {
       f = tests[i].flags & ~TestZeroFlags;
     }
     Regexp* re = Regexp::Parse(tests[i].regexp, f, &status);
     ASSERT_TRUE(re != NULL) << " " << tests[i].regexp << " " << status.Text();
     std::string s = re->Dump();
     EXPECT_EQ(std::string(tests[i].parse), s)
         << "Regexp: " << tests[i].regexp
         << "\nparse: " << std::string(tests[i].parse)
         << " s: " << s << " flag=" << f;
     std::string t = re->ToString();
     if (t != tests[i].regexp) {
       // If ToString didn't return the original regexp,
       // it must have found one with fewer parens.
       // Unfortunately we can't check the length here, because
       // ToString produces "\\{" for a literal brace,
       // but "{" is a shorter equivalent.
       // ASSERT_LT(t.size(), strlen(tests[i].regexp))
       //     << " t=" << t << " regexp=" << tests[i].regexp;

       // Test that if we parse the new regexp we get the same structure.
       Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status);
       ASSERT_TRUE(nre != NULL) << " reparse " << t << " " << status.Text();
       std::string ss = nre->Dump();
       std::string tt = nre->ToString();
       if (s != ss || t != tt)
         LOG(INFO) << "ToString(" << tests[i].regexp << ") = " << t;
       EXPECT_EQ(s, ss);
       EXPECT_EQ(t, tt);
       nre->Decref();
     }
     re->Decref();
   }
 }

 // Test that capture error args are correct.
 TEST(NamedCaptures, ErrorArgs) {
   RegexpStatus status;
   Regexp* re;

   re = Regexp::Parse("test(?P<name", Regexp::LikePerl, &status);
   EXPECT_TRUE(re == NULL);
   EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
   EXPECT_EQ(status.error_arg(), "(?P<name");

   re = Regexp::Parse("test(?P<space bar>z)", Regexp::LikePerl, &status);
   EXPECT_TRUE(re == NULL);
   EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
   EXPECT_EQ(status.error_arg(), "(?P<space bar>");
 }

 }  // namespace re2
	// Copyright 2006 The RE2 Authors. All Rights Reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	// Test parse.cc, dump.cc, and tostring.cc.

	#include <string>

	#include "util/test.h"
	#include "util/logging.h"
	#include "re2/regexp.h"

	namespace re2 {

	// In the past, we used 1<<30 here and zeroed the bit later, but that
	// has undefined behaviour, so now we use an internal-only flag because
	// otherwise we would have to introduce a new flag value just for this.
	static const Regexp::ParseFlags TestZeroFlags = Regexp::WasDollar;

	struct Test {
	const char* regexp;
	const char* parse;
	Regexp::ParseFlags flags;
	};

	static Regexp::ParseFlags kTestFlags = Regexp::MatchNL \|
	Regexp::PerlX \|
	Regexp::PerlClasses \|
	Regexp::UnicodeGroups;

	static Test tests[] = {
	// Base cases
	{ "a", "lit{a}" },
	{ "a.", "cat{lit{a}dot{}}" },
	{ "a.b", "cat{lit{a}dot{}lit{b}}" },
	{ "ab", "str{ab}" },
	{ "a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}" },
	{ "abc", "str{abc}" },
	{ "a\|^", "alt{lit{a}bol{}}" },
	{ "a\|b", "cc{0x61-0x62}" },
	{ "(a)", "cap{lit{a}}" },
	{ "(a)\|b", "alt{cap{lit{a}}lit{b}}" },
	{ "a*", "star{lit{a}}" },
	{ "a+", "plus{lit{a}}" },
	{ "a?", "que{lit{a}}" },
	{ "a{2}", "rep{2,2 lit{a}}" },
	{ "a{2,3}", "rep{2,3 lit{a}}" },
	{ "a{2,}", "rep{2,-1 lit{a}}" },
	{ "a*?", "nstar{lit{a}}" },
	{ "a+?", "nplus{lit{a}}" },
	{ "a??", "nque{lit{a}}" },
	{ "a{2}?", "nrep{2,2 lit{a}}" },
	{ "a{2,3}?", "nrep{2,3 lit{a}}" },
	{ "a{2,}?", "nrep{2,-1 lit{a}}" },
	{ "", "emp{}" },
	{ "\|", "alt{emp{}emp{}}" },
	{ "\|x\|", "alt{emp{}lit{x}emp{}}" },
	{ ".", "dot{}" },
	{ "^", "bol{}" },
	{ "$", "eol{}" },
	{ "\\\|", "lit{\|}" },
	{ "\\(", "lit{(}" },
	{ "\\)", "lit{)}" },
	{ "\\", "lit{}" },
	{ "\\+", "lit{+}" },
	{ "\\?", "lit{?}" },
	{ "{", "lit{{}" },
	{ "}", "lit{}}" },
	{ "\\.", "lit{.}" },
	{ "\\^", "lit{^}" },
	{ "\\$", "lit{$}" },
	{ "\\\\", "lit{\\}" },
	{ "[ace]", "cc{0x61 0x63 0x65}" },
	{ "[abc]", "cc{0x61-0x63}" },
	{ "[a-z]", "cc{0x61-0x7a}" },
	{ "[a]", "lit{a}" },
	{ "\\-", "lit{-}" },
	{ "-", "lit{-}" },
	{ "\\_", "lit{_}" },

	// Posix and Perl extensions
	{ "[[:lower:]]", "cc{0x61-0x7a}" },
	{ "[a-z]", "cc{0x61-0x7a}" },
	{ "[^[:lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
	{ "[[:^lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
	{ "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
	{ "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
	{ "(?i)[^[:lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
	{ "(?i)[[:^lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
	{ "\\d", "cc{0x30-0x39}" },
	{ "\\D", "cc{0-0x2f 0x3a-0x10ffff}" },
	{ "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" },
	{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" },
	{ "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" },
	{ "\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" },
	{ "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
	{ "(?i)\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
	{ "[^\\\\]", "cc{0-0x5b 0x5d-0x10ffff}" },
	{ "\\C", "byte{}" },

	// Unicode, negatives, and a double negative.
	{ "\\p{Braille}", "cc{0x2800-0x28ff}" },
	{ "\\P{Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
	{ "\\p{^Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
	{ "\\P{^Braille}", "cc{0x2800-0x28ff}" },

	// More interesting regular expressions.
	{ "a{,2}", "str{a{,2}}" },
	{ "\\.\\^\\$\\\\", "str{.^$\\}" },
	{ "[a-zABC]", "cc{0x41-0x43 0x61-0x7a}" },
	{ "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
	{ "[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}" }, // utf-8
	{ "a*{", "cat{star{lit{a}}lit{{}}" },

	// Test precedences
	{ "(?:ab)*", "star{str{ab}}" },
	{ "(ab)*", "star{cap{str{ab}}}" },
	{ "ab\|cd", "alt{str{ab}str{cd}}" },
	{ "a(b\|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },

	// Test squashing of **, ++, ?? et cetera.
	{ "(?:(?:a))", "star{lit{a}}" },
	{ "(?:(?:a)+)+", "plus{lit{a}}" },
	{ "(?:(?:a)?)?", "que{lit{a}}" },
	{ "(?:(?:a)*)+", "star{lit{a}}" },
	{ "(?:(?:a)*)?", "star{lit{a}}" },
	{ "(?:(?:a)+)*", "star{lit{a}}" },
	{ "(?:(?:a)+)?", "star{lit{a}}" },
	{ "(?:(?:a)?)*", "star{lit{a}}" },
	{ "(?:(?:a)?)+", "star{lit{a}}" },

	// Test flattening.
	{ "(?:a)", "lit{a}" },
	{ "(?:ab)(?:cd)", "str{abcd}" },
	{ "(?:a\|b)\|(?:c\|d)", "cc{0x61-0x64}" },
	{ "a\|c", "cc{0x61 0x63}" },
	{ "a\|[cd]", "cc{0x61 0x63-0x64}" },
	{ "a\|.", "dot{}" },
	{ "[ab]\|c", "cc{0x61-0x63}" },
	{ "[ab]\|[cd]", "cc{0x61-0x64}" },
	{ "[ab]\|.", "dot{}" },
	{ ".\|c", "dot{}" },
	{ ".\|[cd]", "dot{}" },
	{ ".\|.", "dot{}" },

	// Test Perl quoted literals
	{ "\\Q+\|?{[\\E", "str{+\|?{[}" },
	{ "\\Q+\\E+", "plus{lit{+}}" },
	{ "\\Q\\\\E", "lit{\\}" },
	{ "\\Q\\\\\\E", "str{\\\\}" },
	{ "\\Qa\\E*", "star{lit{a}}" },
	{ "\\Qab\\E*", "cat{lit{a}star{lit{b}}}" },
	{ "\\Qabc\\E*", "cat{str{ab}star{lit{c}}}" },

	// Test Perl \A and \z
	{ "(?m)^", "bol{}" },
	{ "(?m)$", "eol{}" },
	{ "(?-m)^", "bot{}" },
	{ "(?-m)$", "eot{}" },
	{ "(?m)\\A", "bot{}" },
	{ "(?m)\\z", "eot{\\z}" },
	{ "(?-m)\\A", "bot{}" },
	{ "(?-m)\\z", "eot{\\z}" },

	// Test named captures
	{ "(?P<name>a)", "cap{name:lit{a}}" },

	// Case-folded literals
	{ "[Aa]", "litfold{a}" },

	// Strings
	{ "abcde", "str{abcde}" },
	{ "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" },

	// Reported bug involving \n leaking in despite use of NeverNL.
	{ "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
	{ "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
	{ "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
	{ "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL \| Regexp::FoldCase },
	{ "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", TestZeroFlags },
	{ "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
	{ "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
	{ "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL \| Regexp::FoldCase },
	{ "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", TestZeroFlags },
	{ "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
	{ "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
	{ "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL \| Regexp::FoldCase },
	{ "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", TestZeroFlags },
	{ "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
	{ "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
	{ "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL \| Regexp::FoldCase },
	{ "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
	{ "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
	{ "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
	{ "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL \| Regexp::FoldCase },
	{ "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
	{ "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL \| Regexp::FoldCase },
	{ "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
	{ "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL \| Regexp::FoldCase },
	{ "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
	{ "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL \| Regexp::FoldCase },
	{ "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
	{ "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL \| Regexp::FoldCase },
	{ "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
	Regexp::PerlClasses },
	{ "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
	Regexp::PerlClasses \| Regexp::FoldCase },
	{ "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
	Regexp::PerlClasses \| Regexp::NeverNL },
	{ "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
	Regexp::PerlClasses \| Regexp::NeverNL \| Regexp::FoldCase },
	{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
	Regexp::PerlClasses },
	{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
	Regexp::PerlClasses \| Regexp::FoldCase },
	{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
	Regexp::PerlClasses \| Regexp::NeverNL },
	{ "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
	Regexp::PerlClasses \| Regexp::NeverNL \| Regexp::FoldCase },

	// Bug in Regexp::ToString() that emitted [^], which
	// would (obviously) fail to parse when fed back in.
	{ "[\\s\\S]", "cc{0-0x10ffff}" },
	};

	bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) {
	return Regexp::Equal(a, b);
	}

	void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags,
	const std::string& title) {
	Regexp** re = new Regexp*[ntests];
	for (int i = 0; i < ntests; i++) {
	RegexpStatus status;
	Regexp::ParseFlags f = flags;
	if (tests[i].flags != 0) {
	f = tests[i].flags & ~TestZeroFlags;
	}
	re[i] = Regexp::Parse(tests[i].regexp, f, &status);
	ASSERT_TRUE(re[i] != NULL)
	<< " " << tests[i].regexp << " " << status.Text();
	std::string s = re[i]->Dump();
	EXPECT_EQ(std::string(tests[i].parse), s)
	<< "Regexp: " << tests[i].regexp
	<< "\nparse: " << std::string(tests[i].parse)
	<< " s: " << s << " flag=" << f;
	}

	for (int i = 0; i < ntests; i++) {
	for (int j = 0; j < ntests; j++) {
	EXPECT_EQ(std::string(tests[i].parse) == std::string(tests[j].parse),
	RegexpEqualTestingOnly(re[i], re[j]))
	<< "Regexp: " << tests[i].regexp << " " << tests[j].regexp;
	}
	}

	for (int i = 0; i < ntests; i++)
	re[i]->Decref();
	delete[] re;
	}

	// Test that regexps parse to expected structures.
	TEST(TestParse, SimpleRegexps) {
	TestParse(tests, arraysize(tests), kTestFlags, "simple");
	}

	Test foldcase_tests[] = {
	{ "AbCdE", "strfold{abcde}" },
	{ "[Aa]", "litfold{a}" },
	{ "a", "litfold{a}" },

	// 0x17F is an old English long s (looks like an f) and folds to s.
	// 0x212A is the Kelvin symbol and folds to k.
	{ "A[F-g]", "cat{litfold{a}cc{0x41-0x7a 0x17f 0x212a}}" }, // [Aa][A-z...]
	{ "[[:upper:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
	{ "[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
	};

	// Test that parsing with FoldCase works.
	TEST(TestParse, FoldCase) {
	TestParse(foldcase_tests, arraysize(foldcase_tests), Regexp::FoldCase, "foldcase");
	}

	Test literal_tests[] = {
	{ "(\|)^$.[+?]{5,10},\\", "str{(\|)^$.[+?]{5,10},\\}" },
	};

	// Test that parsing with Literal works.
	TEST(TestParse, Literal) {
	TestParse(literal_tests, arraysize(literal_tests), Regexp::Literal, "literal");
	}

	Test matchnl_tests[] = {
	{ ".", "dot{}" },
	{ "\n", "lit{\n}" },
	{ "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
	{ "[a\\n]", "cc{0xa 0x61}" },
	};

	// Test that parsing with MatchNL works.
	// (Also tested above during simple cases.)
	TEST(TestParse, MatchNL) {
	TestParse(matchnl_tests, arraysize(matchnl_tests), Regexp::MatchNL, "with MatchNL");
	}

	Test nomatchnl_tests[] = {
	{ ".", "cc{0-0x9 0xb-0x10ffff}" },
	{ "\n", "lit{\n}" },
	{ "[^a]", "cc{0-0x9 0xb-0x60 0x62-0x10ffff}" },
	{ "[a\\n]", "cc{0xa 0x61}" },
	};

	// Test that parsing without MatchNL works.
	TEST(TestParse, NoMatchNL) {
	TestParse(nomatchnl_tests, arraysize(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL");
	}

	Test prefix_tests[] = {
	{ "abc\|abd", "cat{str{ab}cc{0x63-0x64}}" },
	{ "a(?:b)c\|abd", "cat{str{ab}cc{0x63-0x64}}" },
	{ "abc\|abd\|aef\|bcx\|bcy",
	"alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}"
	"cat{str{bc}cc{0x78-0x79}}}" },
	{ "abc\|x\|abd", "alt{str{abc}lit{x}str{abd}}" },
	{ "(?i)abc\|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" },
	{ "[ab]c\|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" },
	{ ".c\|.d", "cat{cc{0-0x9 0xb-0x10ffff}cc{0x63-0x64}}" },
	{ "\\Cc\|\\Cd", "cat{byte{}cc{0x63-0x64}}" },
	{ "x{2}\|x{2}[0-9]",
	"cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" },
	{ "x{2}y\|x{2}[0-9]y",
	"cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" },
	{ "n\|r\|rs",
	"alt{lit{n}cat{lit{r}alt{emp{}lit{s}}}}" },
	{ "n\|rs\|r",
	"alt{lit{n}cat{lit{r}alt{lit{s}emp{}}}}" },
	{ "r\|rs\|n",
	"alt{cat{lit{r}alt{emp{}lit{s}}}lit{n}}" },
	{ "rs\|r\|n",
	"alt{cat{lit{r}alt{lit{s}emp{}}}lit{n}}" },
	{ "a\\C?c\|a\\C?b",
	"cat{lit{a}alt{cat{nstar{byte{}}lit{c}}cat{nstar{byte{}}lit{b}}}}" },
	{ "^/a/bc\|^/a/de",
	"cat{bol{}cat{str{/a/}alt{str{bc}str{de}}}}" },
	// In the past, factoring was limited to kFactorAlternationMaxDepth (8).
	{ "a\|aa\|aaa\|aaaa\|aaaaa\|aaaaaa\|aaaaaaa\|aaaaaaaa\|aaaaaaaaa\|aaaaaaaaaa",
	"cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
	"cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
	"cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
	"lit{a}}}}}}}}}}}}}}}}}}}" },
	{ "a\|aardvark\|aardvarks\|abaci\|aback\|abacus\|abacuses\|abaft\|abalone\|abalones",
	"cat{lit{a}alt{emp{}cat{str{ardvark}alt{emp{}lit{s}}}"
	"cat{str{ba}alt{cat{lit{c}alt{cc{0x69 0x6b}cat{str{us}alt{emp{}str{es}}}}}"
	"str{ft}cat{str{lone}alt{emp{}lit{s}}}}}}}" },
	};

	// Test that prefix factoring works.
	TEST(TestParse, Prefix) {
	TestParse(prefix_tests, arraysize(prefix_tests), Regexp::PerlX, "prefix");
	}

	Test nested_tests[] = {
	{ "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))",
	"cap{cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}}}}}}}}" },
	{ "((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
	"cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{1,1 lit{x}}}}}}}}}}}}}}}}}}}}}" },
	{ "((((((((((x{0}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
	"cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{0,0 lit{x}}}}}}}}}}}}}}}}}}}}}" },
	{ "((((((x{2}){2}){2}){5}){5}){5})",
	"cap{rep{5,5 cap{rep{5,5 cap{rep{5,5 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}" },
	};

	// Test that nested repetition works.
	TEST(TestParse, Nested) {
	TestParse(nested_tests, arraysize(nested_tests), Regexp::PerlX, "nested");
	}

	// Invalid regular expressions
	const char* badtests[] = {
	"(",
	")",
	"(a",
	"(a\|b\|",
	"(a\|b",
	"[a-z",
	"([a-z)",
	"x{1001}",
	"\xff", // Invalid UTF-8
	"[\xff]",
	"[\\\xff]",
	"\\\xff",
	"(?P<name>a",
	"(?P<name>",
	"(?P<name",
	"(?P<x y>a)",
	"(?P<>a)",
	"[a-Z]",
	"(?i)[a-Z]",
	"a{100000}",
	"a{100000,}",
	"((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
	"(((x{7}){11}){13})",
	"\\Q\\E*",
	};

	// Valid in Perl, bad in POSIX
	const char* only_perl[] = {
	"[a-b-c]",
	"\\Qabc\\E",
	"\\Q*+?{[\\E",
	"\\Q\\\\E",
	"\\Q\\\\\\E",
	"\\Q\\\\\\\\E",
	"\\Q\\\\\\\\\\E",
	"(?:a)",
	"(?P<name>a)",
	};

	// Valid in POSIX, bad in Perl.
	const char* only_posix[] = {
	"a++",
	"a**",
	"a?*",
	"a+*",
	"a{1}*",
	};

	// Test that parser rejects bad regexps.
	TEST(TestParse, InvalidRegexps) {
	for (size_t i = 0; i < arraysize(badtests); i++) {
	ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::PerlX, NULL) == NULL)
	<< " " << badtests[i];
	ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::NoParseFlags, NULL) == NULL)
	<< " " << badtests[i];
	}
	for (size_t i = 0; i < arraysize(only_posix); i++) {
	ASSERT_TRUE(Regexp::Parse(only_posix[i], Regexp::PerlX, NULL) == NULL)
	<< " " << only_posix[i];
	Regexp* re = Regexp::Parse(only_posix[i], Regexp::NoParseFlags, NULL);
	ASSERT_TRUE(re != NULL) << " " << only_posix[i];
	re->Decref();
	}
	for (size_t i = 0; i < arraysize(only_perl); i++) {
	ASSERT_TRUE(Regexp::Parse(only_perl[i], Regexp::NoParseFlags, NULL) == NULL)
	<< " " << only_perl[i];
	Regexp* re = Regexp::Parse(only_perl[i], Regexp::PerlX, NULL);
	ASSERT_TRUE(re != NULL) << " " << only_perl[i];
	re->Decref();
	}
	}

	// Test that ToString produces original regexp or equivalent one.
	TEST(TestToString, EquivalentParse) {
	for (size_t i = 0; i < arraysize(tests); i++) {
	RegexpStatus status;
	Regexp::ParseFlags f = kTestFlags;
	if (tests[i].flags != 0) {
	f = tests[i].flags & ~TestZeroFlags;
	}
	Regexp* re = Regexp::Parse(tests[i].regexp, f, &status);
	ASSERT_TRUE(re != NULL) << " " << tests[i].regexp << " " << status.Text();
	std::string s = re->Dump();
	EXPECT_EQ(std::string(tests[i].parse), s)
	<< "Regexp: " << tests[i].regexp
	<< "\nparse: " << std::string(tests[i].parse)
	<< " s: " << s << " flag=" << f;
	std::string t = re->ToString();
	if (t != tests[i].regexp) {
	// If ToString didn't return the original regexp,
	// it must have found one with fewer parens.
	// Unfortunately we can't check the length here, because
	// ToString produces "\\{" for a literal brace,
	// but "{" is a shorter equivalent.
	// ASSERT_LT(t.size(), strlen(tests[i].regexp))
	// << " t=" << t << " regexp=" << tests[i].regexp;

	// Test that if we parse the new regexp we get the same structure.
	Regexp* nre = Regexp::Parse(t, Regexp::MatchNL \| Regexp::PerlX, &status);
	ASSERT_TRUE(nre != NULL) << " reparse " << t << " " << status.Text();
	std::string ss = nre->Dump();
	std::string tt = nre->ToString();
	if (s != ss \|\| t != tt)
	LOG(INFO) << "ToString(" << tests[i].regexp << ") = " << t;
	EXPECT_EQ(s, ss);
	EXPECT_EQ(t, tt);
	nre->Decref();
	}
	re->Decref();
	}
	}

	// Test that capture error args are correct.
	TEST(NamedCaptures, ErrorArgs) {
	RegexpStatus status;
	Regexp* re;

	re = Regexp::Parse("test(?P<name", Regexp::LikePerl, &status);
	EXPECT_TRUE(re == NULL);
	EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
	EXPECT_EQ(status.error_arg(), "(?P<name");

	re = Regexp::Parse("test(?P<space bar>z)", Regexp::LikePerl, &status);
	EXPECT_TRUE(re == NULL);
	EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
	EXPECT_EQ(status.error_arg(), "(?P<space bar>");
	}

	} // namespace re2