Note that case-insensitive prefixes are already lowercase.
Change-Id: Idbccb386ed50cbd3b40c72604aa42f0570e50a89
Reviewed-on: https://code-review.googlesource.com/c/re2/+/59030
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/prog.cc b/re2/prog.cc
index 3b2faaa..396b46c 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -924,12 +924,6 @@
// This function takes the prefix as std::string (i.e. not const std::string&
// as normal) because it's going to clobber it, so a temporary is convenient.
static uint64_t* BuildShiftDFA(std::string prefix) {
- // Convert any ASCII letters to lowercase; uppercase will be handled later.
- for (char& b : prefix) {
- if ('A' <= b && b <= 'Z')
- b += 'a' - 'A';
- }
-
// This constant is for convenience now and also for correctness later when
// we clobber the prefix, but still need to know how long it was initially.
const size_t size = prefix.size();
@@ -989,6 +983,10 @@
++dnext;
dfa[b] |= static_cast<uint64_t>(dnext * 6) << (dcurr * 6);
// Convert ASCII letters to uppercase and record the extra transitions.
+ // Note that ASCII letters are guaranteed to be lowercase at this point
+ // because that's how the parser normalises them. #FunFact: 'k' and 's'
+ // match U+212A and U+017F, respectively, so they won't occur here when
+ // using UTF-8 encoding because the parser will emit character classes.
if ('a' <= b && b <= 'z') {
b -= 'a' - 'A';
dfa[b] |= static_cast<uint64_t>(dnext * 6) << (dcurr * 6);
diff --git a/re2/testing/required_prefix_test.cc b/re2/testing/required_prefix_test.cc
index 820cf2b..60a11f8 100644
--- a/re2/testing/required_prefix_test.cc
+++ b/re2/testing/required_prefix_test.cc
@@ -131,6 +131,40 @@
}
}
+TEST(RequiredPrefixForAccel, CaseFoldingForKAndS) {
+  std::string prefix;
+  bool foldcase;
+  Regexp* regexp;
+
+  // Latin-1: `(?i)` prefixes may contain 'k' and 's' (reported lowercased).
+  regexp = Regexp::Parse("(?i)KLM", Regexp::LikePerl|Regexp::Latin1, NULL);
+  ASSERT_TRUE(regexp != NULL);
+  ASSERT_TRUE(regexp->RequiredPrefixForAccel(&prefix, &foldcase));
+  ASSERT_EQ(prefix, "klm");
+  ASSERT_TRUE(foldcase);
+  regexp->Decref();
+
+  regexp = Regexp::Parse("(?i)STU", Regexp::LikePerl|Regexp::Latin1, NULL);
+  ASSERT_TRUE(regexp != NULL);
+  ASSERT_TRUE(regexp->RequiredPrefixForAccel(&prefix, &foldcase));
+  ASSERT_EQ(prefix, "stu");
+  ASSERT_TRUE(foldcase);
+  regexp->Decref();
+
+  // UTF-8: 'k' and 's' additionally match U+212A and U+017F, respectively,
+  // so the parser emits character classes rather than literals and no
+  // `(?i)` prefix containing either letter can be reported.
+  regexp = Regexp::Parse("(?i)KLM", Regexp::LikePerl, NULL);
+  ASSERT_TRUE(regexp != NULL);
+  ASSERT_FALSE(regexp->RequiredPrefixForAccel(&prefix, &foldcase));
+  regexp->Decref();
+
+  regexp = Regexp::Parse("(?i)STU", Regexp::LikePerl, NULL);
+  ASSERT_TRUE(regexp != NULL);
+  ASSERT_FALSE(regexp->RequiredPrefixForAccel(&prefix, &foldcase));
+  regexp->Decref();
+}
+
static const char* prefix_accel_tests[] = {
"aababc\\d+",
"(?i)AABABC\\d+",