Optimise the hot loop some more for Clang.

Change-Id: I751c9f5a6bebc068c86ff6f2f78f5488f5ee3d23
Reviewed-on: https://code-review.googlesource.com/c/re2/+/58931
Reviewed-by: Paul Wankadia <junyer@google.com>

commit: baadb363939b3448f17136c1133180430dd53215 [log] [tgz]
author: Paul Wankadia <junyer@google.com> Tue Jul 06 13:06:34 2021 -0700
committer: Paul Wankadia <junyer@google.com> Tue Jul 06 20:07:53 2021 +0000
tree: 15b7784d8c264ed4010fb73321dd338fda88c613
parent: 4c0fad4f3c245b7182fe09c869e9094fe14afa1d [diff]
diff --git a/re2/prog.cc b/re2/prog.cc
index 5139523..3979bd2 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc

@@ -917,6 +917,10 @@
   }
 }
 
+// The final state will always be this, which frees up a register for the hot
+// loop and thus avoids the spilling that can occur when building with Clang.
+static const size_t kShiftDFAFinal = 9;
+
 // This function takes the prefix as std::string (i.e. not const std::string&
 // as normal) because it's going to clobber it, so a temporary is convenient.
 static uint64_t* BuildShiftDFA(std::string prefix) {
@@ -965,6 +969,8 @@
     uint16_t ncurr = states[dcurr];
     uint16_t nnext = nfa[b] & ((ncurr << 1) | 1);
     size_t dnext = dcurr+1;
+    if (dnext == size)
+      dnext = kShiftDFAFinal;
     states[dnext] = nnext;
   }
 
@@ -994,7 +1000,7 @@
   // in the hot loop, we check for a match only at the end of each iteration,
   // so we must keep signalling the match until we get around to checking it.
   for (int b = 0; b < 256; ++b)
-    dfa[b] |= static_cast<uint64_t>(size * 6) << (size * 6);
+    dfa[b] |= static_cast<uint64_t>(kShiftDFAFinal * 6) << (kShiftDFAFinal * 6);
 
   return dfa;
 }
@@ -1006,7 +1012,7 @@
   if (prefix_foldcase_) {
     // Use PrefixAccel_ShiftDFA().
     // ... and no more than nine bytes of the prefix. (See above for details.)
-    prefix_size_ = std::min(prefix_size_, size_t{9});
+    prefix_size_ = std::min(prefix_size_, kShiftDFAFinal);
     prefix_dfa_ = BuildShiftDFA(prefix.substr(0, prefix_size_));
   } else if (prefix_size_ != 1) {
     // Use PrefixAccel_FrontAndBack().
@@ -1057,7 +1063,7 @@
       uint64_t curr6 = next6 >> (curr5 & 63);
       uint64_t curr7 = next7 >> (curr6 & 63);
 
-      if ((curr7 & 63) == prefix_size_ * 6) {
+      if ((curr7 & 63) == kShiftDFAFinal * 6) {
         // At the time of writing, using the same masking subexpressions from
         // the preceding lines caused Clang to clutter the hot loop computing
         // them - even though they aren't actually needed for shifting! Hence
@@ -1085,7 +1091,7 @@
     uint8_t b = *p++;
     uint64_t next = prefix_dfa_[b];
     curr = next >> (curr & 63);
-    if ((curr & 63) == prefix_size_ * 6)
+    if ((curr & 63) == kShiftDFAFinal * 6)
       return p-prefix_size_;
   }
   return NULL;
commit	baadb363939b3448f17136c1133180430dd53215	[log] [tgz]
author	Paul Wankadia <junyer@google.com>	Tue Jul 06 13:06:34 2021 -0700
committer	Paul Wankadia <junyer@google.com>	Tue Jul 06 20:07:53 2021 +0000
tree	15b7784d8c264ed4010fb73321dd338fda88c613
parent	4c0fad4f3c245b7182fe09c869e9094fe14afa1d [diff]