Permit Unicode characters beyond ASCII in capture names.
Fixes #321.
Change-Id: I486473a926f097b28862e390f32f0e80f8c548f6
Reviewed-on: https://code-review.googlesource.com/c/re2/+/59130
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/parse.cc b/re2/parse.cc
index 87ff2ca..f1710af 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc
@@ -1409,13 +1409,15 @@
}
}
- status->set_code(kRegexpBadUTF8);
- status->set_error_arg(StringPiece());
+ if (status != NULL) {
+ status->set_code(kRegexpBadUTF8);
+ status->set_error_arg(StringPiece());
+ }
return -1;
}
-// Return whether name is valid UTF-8.
-// If not, set status to kRegexpBadUTF8.
+// Returns whether s is valid UTF-8.
+// If not, sets status to kRegexpBadUTF8.
static bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) {
StringPiece t = s;
Rune r;
@@ -2013,19 +2015,33 @@
return true;
}
-// Is this a valid capture name? [A-Za-z0-9_]+
-// PCRE limits names to 32 bytes.
-// Python rejects names starting with digits.
-// We don't enforce either of those.
+// Returns whether name is a valid capture name.
static bool IsValidCaptureName(const StringPiece& name) {
if (name.empty())
return false;
- for (size_t i = 0; i < name.size(); i++) {
- int c = name[i];
- if (('0' <= c && c <= '9') ||
- ('a' <= c && c <= 'z') ||
- ('A' <= c && c <= 'Z') ||
- c == '_')
+
+ // Historically, we effectively used [0-9A-Za-z_]+ to validate; that
+ // followed Python 2 except for not restricting the first character.
+ // As of Python 3, Unicode characters beyond ASCII are also allowed;
+ // accordingly, we permit the Lu, Ll, Lt, Lm, Lo, Nl, Mn, Mc, Nd and
+ // Pc categories, but again without restricting the first character.
+ // Also, Unicode normalization (e.g. NFKC) isn't performed: Python 3
+ // performs it for identifiers, but seemingly not for capture names;
+ // if they start doing that for capture names, we won't follow suit.
+ static const CharClass* const cc = []() {
+ CharClassBuilder ccb;
+ for (StringPiece group :
+ {"Lu", "Ll", "Lt", "Lm", "Lo", "Nl", "Mn", "Mc", "Nd", "Pc"})
+ AddUGroup(&ccb, LookupUnicodeGroup(group), +1, Regexp::NoParseFlags);
+ return ccb.GetCharClass();
+ }();
+
+ StringPiece t = name;
+ Rune r;
+ while (!t.empty()) {
+ if (StringPieceToRune(&r, &t, NULL) < 0)
+ return false;
+ if (cc->Contains(r))
continue;
return false;
}
diff --git a/re2/regexp.cc b/re2/regexp.cc
index 1a38418..2e1bfac 100644
--- a/re2/regexp.cc
+++ b/re2/regexp.cc
@@ -955,7 +955,7 @@
return cc;
}
-bool CharClass::Contains(Rune r) {
+bool CharClass::Contains(Rune r) const {
RuneRange* rr = ranges_;
int n = nranges_;
while (n > 0) {
diff --git a/re2/regexp.h b/re2/regexp.h
index 2f40642..b6446f9 100644
--- a/re2/regexp.h
+++ b/re2/regexp.h
@@ -254,7 +254,7 @@
bool full() { return nrunes_ == Runemax+1; }
bool FoldsASCII() { return folds_ascii_; }
- bool Contains(Rune r);
+ bool Contains(Rune r) const;
CharClass* Negate();
private:
diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc
index 3446526..e571127 100644
--- a/re2/testing/parse_test.cc
+++ b/re2/testing/parse_test.cc
@@ -164,6 +164,7 @@
// Test named captures
{ "(?P<name>a)", "cap{name:lit{a}}" },
+ { "(?P<中文>a)", "cap{中文:lit{a}}" },
// Case-folded literals
{ "[Aa]", "litfold{a}" },