Note that case-insensitive prefixes are already lowercase.

Change-Id: Idbccb386ed50cbd3b40c72604aa42f0570e50a89
Reviewed-on: https://code-review.googlesource.com/c/re2/+/59030
Reviewed-by: Paul Wankadia <junyer@google.com>

commit: 4606f89f43b2991908b904576bb1b936891115bc [log] [tgz]
author: Paul Wankadia <junyer@google.com> Wed Jul 07 04:55:43 2021 -0700
committer: Paul Wankadia <junyer@google.com> Wed Jul 07 11:57:37 2021 +0000
tree: dd7b3f263f705407e1bf9b21b6e93879327799e5
parent: 79534807a0132d62a258c72088e4fe905867a248 [diff]
diff --git a/re2/prog.cc b/re2/prog.cc
index 3b2faaa..396b46c 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc

@@ -924,12 +924,6 @@
 // This function takes the prefix as std::string (i.e. not const std::string&
 // as normal) because it's going to clobber it, so a temporary is convenient.
 static uint64_t* BuildShiftDFA(std::string prefix) {
-  // Convert any ASCII letters to lowercase; uppercase will be handled later.
-  for (char& b : prefix) {
-    if ('A' <= b && b <= 'Z')
-      b += 'a' - 'A';
-  }
-
   // This constant is for convenience now and also for correctness later when
   // we clobber the prefix, but still need to know how long it was initially.
   const size_t size = prefix.size();
@@ -989,6 +983,10 @@
         ++dnext;
       dfa[b] |= static_cast<uint64_t>(dnext * 6) << (dcurr * 6);
       // Convert ASCII letters to uppercase and record the extra transitions.
+      // Note that ASCII letters are guaranteed to be lowercase at this point
+      // because that's how the parser normalises them. #FunFact: 'k' and 's'
+      // match U+212A and U+017F, respectively, so they won't occur here when
+      // using UTF-8 encoding because the parser will emit character classes.
       if ('a' <= b && b <= 'z') {
         b -= 'a' - 'A';
         dfa[b] |= static_cast<uint64_t>(dnext * 6) << (dcurr * 6);

diff --git a/re2/testing/required_prefix_test.cc b/re2/testing/required_prefix_test.cc
index 820cf2b..60a11f8 100644
--- a/re2/testing/required_prefix_test.cc
+++ b/re2/testing/required_prefix_test.cc

@@ -131,6 +131,40 @@
   }
 }
 
+TEST(RequiredPrefixForAccel, CaseFoldingForKAndS) {
+  Regexp* re;
+  std::string p;
+  bool f;
+
+  // With Latin-1 encoding, `(?i)` prefixes can include 'k' and 's'.
+  re = Regexp::Parse("(?i)KLM", Regexp::LikePerl|Regexp::Latin1, NULL);
+  ASSERT_TRUE(re != NULL);
+  ASSERT_TRUE(re->RequiredPrefixForAccel(&p, &f));
+  ASSERT_EQ(p, "klm");
+  ASSERT_EQ(f, true);
+  re->Decref();
+
+  re = Regexp::Parse("(?i)STU", Regexp::LikePerl|Regexp::Latin1, NULL);
+  ASSERT_TRUE(re != NULL);
+  ASSERT_TRUE(re->RequiredPrefixForAccel(&p, &f));
+  ASSERT_EQ(p, "stu");
+  ASSERT_EQ(f, true);
+  re->Decref();
+
+  // With UTF-8 encoding, `(?i)` prefixes can't include 'k' and 's'.
+  // This is because they match U+212A and U+017F, respectively, and
+  // so the parser ends up emitting character classes, not literals.
+  re = Regexp::Parse("(?i)KLM", Regexp::LikePerl, NULL);
+  ASSERT_TRUE(re != NULL);
+  ASSERT_FALSE(re->RequiredPrefixForAccel(&p, &f));
+  re->Decref();
+
+  re = Regexp::Parse("(?i)STU", Regexp::LikePerl, NULL);
+  ASSERT_TRUE(re != NULL);
+  ASSERT_FALSE(re->RequiredPrefixForAccel(&p, &f));
+  re->Decref();
+}
+
 static const char* prefix_accel_tests[] = {
     "aababc\\d+",
     "(?i)AABABC\\d+",
commit	4606f89f43b2991908b904576bb1b936891115bc	[log] [tgz]
author	Paul Wankadia <junyer@google.com>	Wed Jul 07 04:55:43 2021 -0700
committer	Paul Wankadia <junyer@google.com>	Wed Jul 07 11:57:37 2021 +0000
tree	dd7b3f263f705407e1bf9b21b6e93879327799e5
parent	79534807a0132d62a258c72088e4fe905867a248 [diff]