Implement Regexp::RequiredPrefixUnanchored().
Change-Id: I1adcb0f77c28c5b54cf0cee2abe56f67a24c96bc
Reviewed-on: https://code-review.googlesource.com/c/re2/+/56212
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/regexp.cc b/re2/regexp.cc
index ea8b37e..9f49ade 100644
--- a/re2/regexp.cc
+++ b/re2/regexp.cc
@@ -690,28 +690,51 @@
// 3. the rest
if (op_ != kRegexpConcat)
return false;
- Regexp** sub = this->sub();
int i = 0;
- while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
+ while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText)
i++;
if (i == 0 || i >= nsub_)
return false;
- if (sub[i]->op_ != kRegexpLiteral &&
- sub[i]->op_ != kRegexpLiteralString)
+ Regexp* re = sub()[i];
+ if (re->op_ != kRegexpLiteral &&
+ re->op_ != kRegexpLiteralString)
return false;
-
- bool latin1 = (sub[i]->parse_flags() & Latin1) != 0;
- Rune* runes = sub[i]->op_ == kRegexpLiteral ? &sub[i]->rune_ : sub[i]->runes_;
- int nrunes = sub[i]->op_ == kRegexpLiteral ? 1 : sub[i]->nrunes_;
- ConvertRunesToBytes(latin1, runes, nrunes, prefix);
- *foldcase = (sub[i]->parse_flags() & FoldCase) != 0;
- if (++i < nsub_) {
+ i++;
+ if (i < nsub_) {
for (int j = i; j < nsub_; j++)
- sub[j]->Incref();
- *suffix = Concat(sub + i, nsub_ - i, parse_flags());
+ sub()[j]->Incref();
+ *suffix = Concat(sub() + i, nsub_ - i, parse_flags());
} else {
*suffix = new Regexp(kRegexpEmptyMatch, parse_flags());
}
+
+ bool latin1 = (re->parse_flags() & Latin1) != 0;
+ Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
+ int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
+ ConvertRunesToBytes(latin1, runes, nrunes, prefix);
+ *foldcase = (re->parse_flags() & FoldCase) != 0;
+ return true;
+}
+
+// Determines whether regexp matches must be unanchored
+// with a fixed string prefix. If so, returns the prefix.
+// The prefix might be ASCII case-insensitive.
+bool Regexp::RequiredPrefixUnanchored(std::string* prefix, bool* foldcase) {
+ prefix->clear();
+ *foldcase = false;
+
+ // No need for a walker: the regexp must either begin with or be
+ // a literal char or string.
+ Regexp* re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this;
+ if (re->op_ != kRegexpLiteral &&
+ re->op_ != kRegexpLiteralString)
+ return false;
+
+ bool latin1 = (re->parse_flags() & Latin1) != 0;
+ Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
+ int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
+ ConvertRunesToBytes(latin1, runes, nrunes, prefix);
+ *foldcase = (re->parse_flags() & FoldCase) != 0;
return true;
}
diff --git a/re2/regexp.h b/re2/regexp.h
index a5d85c8..b39ef69 100644
--- a/re2/regexp.h
+++ b/re2/regexp.h
@@ -440,6 +440,13 @@
bool RequiredPrefix(std::string* prefix, bool* foldcase,
Regexp** suffix);
+ // Whether every match of this regexp must be unanchored and
+ // begin with a non-empty fixed string (perhaps after ASCII
+ // case-folding). If so, returns the prefix.
+ // Callers should expect *prefix and *foldcase to be "zeroed"
+ // regardless of the return value.
+ bool RequiredPrefixUnanchored(std::string* prefix, bool* foldcase);
+
private:
// Constructor allocates vectors as appropriate for operator.
explicit Regexp(RegexpOp op, ParseFlags parse_flags);
diff --git a/re2/testing/required_prefix_test.cc b/re2/testing/required_prefix_test.cc
index 5460045..67b94b6 100644
--- a/re2/testing/required_prefix_test.cc
+++ b/re2/testing/required_prefix_test.cc
@@ -19,10 +19,13 @@
};
static PrefixTest tests[] = {
- // If the regexp is missing a ^, there's no required prefix.
- { "abc", false },
+ // Empty cases.
{ "", false },
{ "(?m)^", false },
+ { "(?-m)^", false },
+
+ // If the regexp has no ^, there's no required prefix.
+ { "abc", false },
// If the regexp immediately goes into
// something not a literal match, there's no required prefix.
@@ -53,15 +56,15 @@
bool f;
Regexp* s;
ASSERT_EQ(t.return_value, re->RequiredPrefix(&p, &f, &s))
- << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf")
+ << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8")
<< " " << re->Dump();
if (t.return_value) {
ASSERT_EQ(p, std::string(t.prefix))
- << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf");
+ << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
ASSERT_EQ(f, t.foldcase)
- << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf");
+ << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
ASSERT_EQ(s->ToString(), std::string(t.suffix))
- << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf");
+ << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
s->Decref();
}
re->Decref();
@@ -69,4 +72,54 @@
}
}
+static PrefixTest unanchored_tests[] = {
+ // Empty cases.
+ { "", false },
+ { "(?m)^", false },
+ { "(?-m)^", false },
+
+ // If the regexp has a ^, there's no required prefix.
+ { "^abc", false },
+
+ // If the regexp immediately goes into
+ // something not a literal match, there's no required prefix.
+ { "(abc)", false },
+ { "a*", false },
+
+ // Otherwise, it should work.
+ { "abc$", true, "abc", false, },
+ { "abc", true, "abc", false, },
+ { "(?i)abc", true, "abc", true, },
+ { "abcd*", true, "abc", false, },
+ { "[Aa][Bb]cd*", true, "ab", true, },
+ { "ab[Cc]d*", true, "ab", false, },
+ { "☺abc", true, "☺abc", false, },
+};
+
+TEST(RequiredPrefixUnanchored, SimpleTests) {
+ for (size_t i = 0; i < arraysize(unanchored_tests); i++) {
+ const PrefixTest& t = unanchored_tests[i];
+ for (size_t j = 0; j < 2; j++) {
+ Regexp::ParseFlags flags = Regexp::LikePerl;
+ if (j == 0)
+ flags = flags | Regexp::Latin1;
+ Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
+ ASSERT_TRUE(re != NULL) << " " << t.regexp;
+
+ std::string p;
+ bool f;
+ ASSERT_EQ(t.return_value, re->RequiredPrefixUnanchored(&p, &f))
+ << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8")
+ << " " << re->Dump();
+ if (t.return_value) {
+ ASSERT_EQ(p, std::string(t.prefix))
+ << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
+ ASSERT_EQ(f, t.foldcase)
+ << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
+ }
+ re->Decref();
+ }
+ }
+}
+
} // namespace re2