blob: 47188307815126e6b6d6185d121578e7241e9ca3 [file] [log] [blame]
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test prog.cc, compile.cc
#include <string>
#include "util/test.h"
#include "util/logging.h"
#include "re2/regexp.h"
#include "re2/prog.h"
namespace re2 {
// Simple input/output tests checking that
// the regexp compiles to the expected code.
// These are just to sanity check the basic implementation.
// The real confidence tests happen by testing the NFA/DFA
// that run the compiled code.
struct Test {
const char* regexp;
const char* code;
};
static Test tests[] = {
{ "a",
"3. byte [61-61] 0 -> 4\n"
"4. match! 0\n" },
{ "ab",
"3. byte [61-61] 0 -> 4\n"
"4. byte [62-62] 0 -> 5\n"
"5. match! 0\n" },
{ "a|c",
"3+ byte [61-61] 0 -> 5\n"
"4. byte [63-63] 0 -> 5\n"
"5. match! 0\n" },
{ "a|b",
"3. byte [61-62] 0 -> 4\n"
"4. match! 0\n" },
{ "[ab]",
"3. byte [61-62] 0 -> 4\n"
"4. match! 0\n" },
{ "a+",
"3. byte [61-61] 0 -> 4\n"
"4+ nop -> 3\n"
"5. match! 0\n" },
{ "a+?",
"3. byte [61-61] 0 -> 4\n"
"4+ match! 0\n"
"5. nop -> 3\n" },
{ "a*",
"3+ byte [61-61] 1 -> 3\n"
"4. match! 0\n" },
{ "a*?",
"3+ match! 0\n"
"4. byte [61-61] 0 -> 3\n" },
{ "a?",
"3+ byte [61-61] 1 -> 5\n"
"4. nop -> 5\n"
"5. match! 0\n" },
{ "a??",
"3+ nop -> 5\n"
"4. byte [61-61] 0 -> 5\n"
"5. match! 0\n" },
{ "a{4}",
"3. byte [61-61] 0 -> 4\n"
"4. byte [61-61] 0 -> 5\n"
"5. byte [61-61] 0 -> 6\n"
"6. byte [61-61] 0 -> 7\n"
"7. match! 0\n" },
{ "(a)",
"3. capture 2 -> 4\n"
"4. byte [61-61] 0 -> 5\n"
"5. capture 3 -> 6\n"
"6. match! 0\n" },
{ "(?:a)",
"3. byte [61-61] 0 -> 4\n"
"4. match! 0\n" },
{ "",
"3. match! 0\n" },
{ ".",
"3+ byte [00-09] 0 -> 5\n"
"4. byte [0b-ff] 0 -> 5\n"
"5. match! 0\n" },
{ "[^ab]",
"3+ byte [00-09] 0 -> 6\n"
"4+ byte [0b-60] 0 -> 6\n"
"5. byte [63-ff] 0 -> 6\n"
"6. match! 0\n" },
{ "[Aa]",
"3. byte/i [61-61] 0 -> 4\n"
"4. match! 0\n" },
{ "\\C+",
"3. byte [00-ff] 0 -> 4\n"
"4+ altmatch -> 5 | 6\n"
"5+ nop -> 3\n"
"6. match! 0\n" },
{ "\\C*",
"3+ altmatch -> 4 | 5\n"
"4+ byte [00-ff] 1 -> 3\n"
"5. match! 0\n" },
{ "\\C?",
"3+ byte [00-ff] 1 -> 5\n"
"4. nop -> 5\n"
"5. match! 0\n" },
// Issue 20992936
{ "[[-`]",
"3. byte [5b-60] 0 -> 4\n"
"4. match! 0\n" },
// Issue 310
{ "(?:|a)*",
"3+ nop -> 7\n"
"4. nop -> 9\n"
"5+ nop -> 7\n"
"6. nop -> 9\n"
"7+ nop -> 5\n"
"8. byte [61-61] 0 -> 5\n"
"9. match! 0\n" },
{ "(?:|a)+",
"3+ nop -> 5\n"
"4. byte [61-61] 0 -> 5\n"
"5+ nop -> 3\n"
"6. match! 0\n" },
};
TEST(TestRegexpCompileToProg, Simple) {
int failed = 0;
for (size_t i = 0; i < arraysize(tests); i++) {
const re2::Test& t = tests[i];
Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
if (re == NULL) {
LOG(ERROR) << "Cannot parse: " << t.regexp;
failed++;
continue;
}
Prog* prog = re->CompileToProg(0);
if (prog == NULL) {
LOG(ERROR) << "Cannot compile: " << t.regexp;
re->Decref();
failed++;
continue;
}
ASSERT_TRUE(re->CompileToProg(1) == NULL);
std::string s = prog->Dump();
if (s != t.code) {
LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
LOG(ERROR) << "Want:\n" << t.code;
LOG(ERROR) << "Got:\n" << s;
failed++;
}
delete prog;
re->Decref();
}
EXPECT_EQ(failed, 0);
}
static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags,
std::string* bytemap) {
Regexp* re = Regexp::Parse(pattern, flags, NULL);
EXPECT_TRUE(re != NULL);
{
Prog* prog = re->CompileToProg(0);
EXPECT_TRUE(prog != NULL);
*bytemap = prog->DumpByteMap();
delete prog;
}
{
Prog* prog = re->CompileToReverseProg(0);
EXPECT_TRUE(prog != NULL);
EXPECT_EQ(*bytemap, prog->DumpByteMap());
delete prog;
}
re->Decref();
}
TEST(TestCompile, Latin1Ranges) {
// The distinct byte ranges involved in the Latin-1 dot ([^\n]).
std::string bytemap;
DumpByteMap(".", Regexp::PerlX|Regexp::Latin1, &bytemap);
EXPECT_EQ("[00-09] -> 0\n"
"[0a-0a] -> 1\n"
"[0b-ff] -> 0\n",
bytemap);
}
TEST(TestCompile, OtherByteMapTests) {
std::string bytemap;
// Test that "absent" ranges are mapped to the same byte class.
DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap);
EXPECT_EQ("[00-2f] -> 0\n"
"[30-39] -> 1\n"
"[3a-40] -> 0\n"
"[41-46] -> 1\n"
"[47-60] -> 0\n"
"[61-66] -> 1\n"
"[67-ff] -> 0\n",
bytemap);
// Test the byte classes for \b.
DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &bytemap);
EXPECT_EQ("[00-2f] -> 0\n"
"[30-39] -> 1\n"
"[3a-40] -> 0\n"
"[41-5a] -> 1\n"
"[5b-5e] -> 0\n"
"[5f-5f] -> 1\n"
"[60-60] -> 0\n"
"[61-7a] -> 1\n"
"[7b-ff] -> 0\n",
bytemap);
// Bug in the ASCII case-folding optimization created too many byte classes.
DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &bytemap);
EXPECT_EQ("[00-5e] -> 0\n"
"[5f-5f] -> 1\n"
"[60-ff] -> 0\n",
bytemap);
}
TEST(TestCompile, UTF8Ranges) {
// The distinct byte ranges involved in the UTF-8 dot ([^\n]).
// Once, erroneously split between 0x3f and 0x40 because it is
// a 6-bit boundary.
std::string bytemap;
DumpByteMap(".", Regexp::PerlX, &bytemap);
EXPECT_EQ("[00-09] -> 0\n"
"[0a-0a] -> 1\n"
"[0b-7f] -> 0\n"
"[80-bf] -> 2\n"
"[c0-c1] -> 1\n"
"[c2-df] -> 3\n"
"[e0-ef] -> 4\n"
"[f0-f4] -> 5\n"
"[f5-ff] -> 1\n",
bytemap);
}
TEST(TestCompile, InsufficientMemory) {
Regexp* re = Regexp::Parse(
"^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$",
Regexp::LikePerl, NULL);
EXPECT_TRUE(re != NULL);
Prog* prog = re->CompileToProg(850);
// If the memory budget has been exhausted, compilation should fail
// and return NULL instead of trying to do anything with NoMatch().
EXPECT_TRUE(prog == NULL);
re->Decref();
}
static void Dump(StringPiece pattern, Regexp::ParseFlags flags,
std::string* forward, std::string* reverse) {
Regexp* re = Regexp::Parse(pattern, flags, NULL);
EXPECT_TRUE(re != NULL);
if (forward != NULL) {
Prog* prog = re->CompileToProg(0);
EXPECT_TRUE(prog != NULL);
*forward = prog->Dump();
delete prog;
}
if (reverse != NULL) {
Prog* prog = re->CompileToReverseProg(0);
EXPECT_TRUE(prog != NULL);
*reverse = prog->Dump();
delete prog;
}
re->Decref();
}
TEST(TestCompile, Bug26705922) {
// Bug in the compiler caused inefficient bytecode to be generated for Unicode
// groups: common suffixes were cached, but common prefixes were not factored.
std::string forward, reverse;
Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse);
EXPECT_EQ("3. byte [f0-f0] 0 -> 4\n"
"4. byte [90-90] 0 -> 5\n"
"5. byte [80-80] 0 -> 6\n"
"6+ byte [80-80] 0 -> 8\n"
"7. byte [90-90] 0 -> 8\n"
"8. match! 0\n",
forward);
EXPECT_EQ("3+ byte [80-80] 0 -> 5\n"
"4. byte [90-90] 0 -> 5\n"
"5. byte [80-80] 0 -> 6\n"
"6. byte [90-90] 0 -> 7\n"
"7. byte [f0-f0] 0 -> 8\n"
"8. match! 0\n",
reverse);
Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse);
EXPECT_EQ("3+ byte [e8-ef] 0 -> 5\n"
"4. byte [f0-f0] 0 -> 8\n"
"5. byte [80-bf] 0 -> 6\n"
"6. byte [80-bf] 0 -> 7\n"
"7. match! 0\n"
"8. byte [90-90] 0 -> 5\n",
forward);
EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
"4. byte [80-bf] 0 -> 5\n"
"5+ byte [e8-ef] 0 -> 7\n"
"6. byte [90-90] 0 -> 8\n"
"7. match! 0\n"
"8. byte [f0-f0] 0 -> 7\n",
reverse);
Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, &forward, &reverse);
EXPECT_EQ("3+ byte [c2-df] 0 -> 6\n"
"4+ byte [e0-ef] 0 -> 8\n"
"5. byte [f0-f4] 0 -> 9\n"
"6. byte [80-bf] 0 -> 7\n"
"7. match! 0\n"
"8. byte [80-bf] 0 -> 6\n"
"9. byte [80-bf] 0 -> 8\n",
forward);
EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
"4+ byte [c2-df] 0 -> 6\n"
"5. byte [80-bf] 0 -> 7\n"
"6. match! 0\n"
"7+ byte [e0-ef] 0 -> 6\n"
"8. byte [80-bf] 0 -> 9\n"
"9. byte [f0-f4] 0 -> 6\n",
reverse);
}
TEST(TestCompile, Bug35237384) {
// Bug in the compiler caused inefficient bytecode to be generated for
// nested nullable subexpressions.
std::string forward;
Dump("a**{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
EXPECT_EQ("3+ byte [61-61] 1 -> 3\n"
"4. nop -> 5\n"
"5+ byte [61-61] 1 -> 5\n"
"6. nop -> 7\n"
"7+ byte [61-61] 1 -> 7\n"
"8. match! 0\n",
forward);
Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
EXPECT_EQ("3+ nop -> 28\n"
"4. nop -> 30\n"
"5+ byte [61-61] 1 -> 5\n"
"6. nop -> 32\n"
"7+ byte [61-61] 1 -> 7\n"
"8. nop -> 26\n"
"9+ byte [61-61] 1 -> 9\n"
"10. nop -> 20\n"
"11+ byte [62-62] 1 -> 11\n"
"12. nop -> 20\n"
"13+ byte [62-62] 1 -> 13\n"
"14. nop -> 26\n"
"15+ byte [62-62] 1 -> 15\n"
"16. nop -> 32\n"
"17+ nop -> 9\n"
"18. nop -> 11\n"
"19. match! 0\n"
"20+ nop -> 17\n"
"21. nop -> 19\n"
"22+ nop -> 7\n"
"23. nop -> 13\n"
"24+ nop -> 17\n"
"25. nop -> 19\n"
"26+ nop -> 22\n"
"27. nop -> 24\n"
"28+ nop -> 5\n"
"29. nop -> 15\n"
"30+ nop -> 22\n"
"31. nop -> 24\n"
"32+ nop -> 28\n"
"33. nop -> 30\n",
forward);
Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
EXPECT_EQ("3+ nop -> 36\n"
"4+ nop -> 31\n"
"5. nop -> 33\n"
"6+ byte [00-09] 0 -> 8\n"
"7. byte [0b-ff] 0 -> 8\n"
"8+ nop -> 6\n"
"9+ nop -> 29\n"
"10. nop -> 28\n"
"11+ byte [00-09] 0 -> 13\n"
"12. byte [0b-ff] 0 -> 13\n"
"13+ nop -> 11\n"
"14+ nop -> 26\n"
"15. nop -> 28\n"
"16+ byte [00-09] 0 -> 18\n"
"17. byte [0b-ff] 0 -> 18\n"
"18+ nop -> 16\n"
"19+ nop -> 36\n"
"20. nop -> 33\n"
"21+ byte [00-09] 0 -> 23\n"
"22. byte [0b-ff] 0 -> 23\n"
"23+ nop -> 21\n"
"24+ nop -> 31\n"
"25. nop -> 33\n"
"26+ nop -> 28\n"
"27. byte [53-53] 0 -> 11\n"
"28. match! 0\n"
"29+ nop -> 28\n"
"30. byte [53-53] 0 -> 6\n"
"31+ nop -> 33\n"
"32. byte [53-53] 0 -> 21\n"
"33+ nop -> 29\n"
"34+ nop -> 26\n"
"35. nop -> 28\n"
"36+ nop -> 33\n"
"37. byte [53-53] 0 -> 16\n",
forward);
}
} // namespace re2