Permit Unicode characters beyond ASCII in capture names.

Fixes #321.

Change-Id: I486473a926f097b28862e390f32f0e80f8c548f6
Reviewed-on: https://code-review.googlesource.com/c/re2/+/59130
Reviewed-by: Paul Wankadia <junyer@google.com>

commit: 6a994180b85293eafcce21d9f3eb8a3526498248 [log] [tgz]
author: Paul Wankadia <junyer@google.com> Tue Aug 03 02:28:32 2021 -0700
committer: Paul Wankadia <junyer@google.com> Tue Aug 03 09:32:12 2021 +0000
tree: 15b79b6f117a687897b3200249e76db71d5bebb3
parent: 3a95199bcbea2f8762f899a66ea5f2e61fbd0395 [diff]
diff --git a/re2/parse.cc b/re2/parse.cc
index 87ff2ca..f1710af 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc

@@ -1409,13 +1409,15 @@
     }
   }
 
-  status->set_code(kRegexpBadUTF8);
-  status->set_error_arg(StringPiece());
+  if (status != NULL) {
+    status->set_code(kRegexpBadUTF8);
+    status->set_error_arg(StringPiece());
+  }
   return -1;
 }
 
-// Return whether name is valid UTF-8.
-// If not, set status to kRegexpBadUTF8.
+// Returns whether s is valid UTF-8.
+// If not, sets status to kRegexpBadUTF8.
 static bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) {
   StringPiece t = s;
   Rune r;
@@ -2013,19 +2015,33 @@
   return true;
 }
 
-// Is this a valid capture name?  [A-Za-z0-9_]+
-// PCRE limits names to 32 bytes.
-// Python rejects names starting with digits.
-// We don't enforce either of those.
+// Returns whether name is a valid capture name.
 static bool IsValidCaptureName(const StringPiece& name) {
   if (name.empty())
     return false;
-  for (size_t i = 0; i < name.size(); i++) {
-    int c = name[i];
-    if (('0' <= c && c <= '9') ||
-        ('a' <= c && c <= 'z') ||
-        ('A' <= c && c <= 'Z') ||
-        c == '_')
+
+  // Historically, we effectively used [0-9A-Za-z_]+ to validate; that
+  // followed Python 2 except for not restricting the first character.
+  // As of Python 3, Unicode characters beyond ASCII are also allowed;
+  // accordingly, we permit the Lu, Ll, Lt, Lm, Lo, Nl, Mn, Mc, Nd and
+  // Pc categories, but again without restricting the first character.
+  // Also, Unicode normalization (e.g. NFKC) isn't performed: Python 3
+  // performs it for identifiers, but seemingly not for capture names;
+  // if they start doing that for capture names, we won't follow suit.
+  static const CharClass* const cc = []() {
+    CharClassBuilder ccb;
+    for (StringPiece group :
+         {"Lu", "Ll", "Lt", "Lm", "Lo", "Nl", "Mn", "Mc", "Nd", "Pc"})
+      AddUGroup(&ccb, LookupUnicodeGroup(group), +1, Regexp::NoParseFlags);
+    return ccb.GetCharClass();
+  }();
+
+  StringPiece t = name;
+  Rune r;
+  while (!t.empty()) {
+    if (StringPieceToRune(&r, &t, NULL) < 0)
+      return false;
+    if (cc->Contains(r))
       continue;
     return false;
   }

diff --git a/re2/regexp.cc b/re2/regexp.cc
index 1a38418..2e1bfac 100644
--- a/re2/regexp.cc
+++ b/re2/regexp.cc

@@ -955,7 +955,7 @@
   return cc;
 }
 
-bool CharClass::Contains(Rune r) {
+bool CharClass::Contains(Rune r) const {
   RuneRange* rr = ranges_;
   int n = nranges_;
   while (n > 0) {

diff --git a/re2/regexp.h b/re2/regexp.h
index 2f40642..b6446f9 100644
--- a/re2/regexp.h
+++ b/re2/regexp.h

@@ -254,7 +254,7 @@
   bool full() { return nrunes_ == Runemax+1; }
   bool FoldsASCII() { return folds_ascii_; }
 
-  bool Contains(Rune r);
+  bool Contains(Rune r) const;
   CharClass* Negate();
 
  private:

diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc
index 3446526..e571127 100644
--- a/re2/testing/parse_test.cc
+++ b/re2/testing/parse_test.cc

@@ -164,6 +164,7 @@
 
   // Test named captures
   { "(?P<name>a)", "cap{name:lit{a}}" },
+  { "(?P<中文>a)", "cap{中文:lit{a}}" },
 
   // Case-folded literals
   { "[Aa]", "litfold{a}" },
commit	6a994180b85293eafcce21d9f3eb8a3526498248	[log] [tgz]
author	Paul Wankadia <junyer@google.com>	Tue Aug 03 02:28:32 2021 -0700
committer	Paul Wankadia <junyer@google.com>	Tue Aug 03 09:32:12 2021 +0000
tree	15b79b6f117a687897b3200249e76db71d5bebb3
parent	3a95199bcbea2f8762f899a66ea5f2e61fbd0395 [diff]