Use the standard first-byte analysis for the DFA too. Change-Id: I1b773ac456091b6e9e25d016119829daabc910b0 Reviewed-on: https://code-review.googlesource.com/27270 Reviewed-by: Paul Wankadia <junyer@google.com>

commit: eb69e9172ec4015fa4ba74ac7b026f818abf8cf1 [log] [tgz]
author: Paul Wankadia <junyer@google.com> Sat May 05 07:03:14 2018 -0700
committer: Paul Wankadia <junyer@google.com> Sat May 05 14:07:59 2018 +0000
tree: 11ff88a572956be49d519e940d9279462c81bd37
parent: 71e6699778cebf99ed11e78e48b9370f79b9ea39 [diff]
diff --git a/re2/dfa.cc b/re2/dfa.cc
index c1c8460..9bc8499 100644
--- a/re2/dfa.cc
+++ b/re2/dfa.cc

@@ -177,11 +177,10 @@
   typedef std::unordered_set<State*, StateHash, StateEqual> StateSet;
 
  private:
-  // Special "firstbyte" values for a state.  (Values >= 0 denote actual bytes.)
+  // Special "first_byte" values for a state.  (Values >= 0 denote actual bytes.)
   enum {
     kFbUnknown = -1,   // No analysis has been performed.
-    kFbMany = -2,      // Many bytes will lead out of this state.
-    kFbNone = -3,      // No bytes lead out of this state.
+    kFbNone = -2,      // The first-byte trick cannot be used.
   };
 
   enum {
@@ -255,7 +254,7 @@
         want_earliest_match(false),
         run_forward(false),
         start(NULL),
-        firstbyte(kFbUnknown),
+        first_byte(kFbUnknown),
         cache_lock(cache_lock),
         failed(false),
         ep(NULL),
@@ -267,7 +266,7 @@
     bool want_earliest_match;
     bool run_forward;
     State* start;
-    int firstbyte;
+    int first_byte;
     RWLocker *cache_lock;
     bool failed;     // "out" parameter: whether search gave up
     const char* ep;  // "out" parameter: end pointer for match
@@ -280,14 +279,14 @@
 
   // Before each search, the parameters to Search are analyzed by
   // AnalyzeSearch to determine the state in which to start and the
-  // "firstbyte" for that state, if any.
+  // "first_byte" for that state, if any.
   struct StartInfo {
-    StartInfo() : start(NULL), firstbyte(kFbUnknown) { }
+    StartInfo() : start(NULL), first_byte(kFbUnknown) {}
     State* start;
-    std::atomic<int> firstbyte;
+    std::atomic<int> first_byte;
   };
 
-  // Fills in params->start and params->firstbyte using
+  // Fills in params->start and params->first_byte using
   // the other search parameters.  Returns true on success,
   // false on failure.
   // cache_mutex_.r <= L < mutex_
@@ -299,7 +298,7 @@
   // cache_mutex_.r <= L < mutex_
   // Might unlock and relock cache_mutex_ via params->cache_lock.
   inline bool InlinedSearchLoop(SearchParams* params,
-                                bool have_firstbyte,
+                                bool have_first_byte,
                                 bool want_earliest_match,
                                 bool run_forward);
 
@@ -1169,7 +1168,7 @@
   // Clear the cache, reset the memory budget.
   for (int i = 0; i < kMaxStart; i++) {
     start_[i].start = NULL;
-    start_[i].firstbyte.store(kFbUnknown, std::memory_order_relaxed);
+    start_[i].first_byte.store(kFbUnknown, std::memory_order_relaxed);
   }
   ClearCache();
   mem_budget_ = state_budget_;
@@ -1286,7 +1285,7 @@
 // Instead, it can call memchr to search very quickly for the byte c.
 // Whether the start state has this property is determined during a
 // pre-compilation pass, and if so, the byte b is passed to the search
-// loop as the "firstbyte" argument, along with a boolean "have_firstbyte".
+// loop as the "first_byte" argument, along with a boolean "have_first_byte".
 //
 // Fourth, the desired behavior is to search for the leftmost-best match
 // (approximately, the same one that Perl would find), which is not
@@ -1319,7 +1318,7 @@
 // making them function arguments lets the inliner specialize
 // this function to each combination (see two paragraphs above).
 inline bool DFA::InlinedSearchLoop(SearchParams* params,
-                                   bool have_firstbyte,
+                                   bool have_first_byte,
                                    bool want_earliest_match,
                                    bool run_forward) {
   State* start = params->start;
@@ -1364,18 +1363,18 @@
       fprintf(stderr, "@%td: %s\n",
               p - bp, DumpState(s).c_str());
 
-    if (have_firstbyte && s == start) {
-      // In start state, only way out is to find firstbyte,
+    if (have_first_byte && s == start) {
+      // In start state, only way out is to find first_byte,
       // so use optimized assembly in memchr to skip ahead.
-      // If firstbyte isn't found, we can skip to the end
+      // If first_byte isn't found, we can skip to the end
       // of the string.
       if (run_forward) {
-        if ((p = BytePtr(memchr(p, params->firstbyte, ep - p))) == NULL) {
+        if ((p = BytePtr(memchr(p, params->first_byte, ep - p))) == NULL) {
           p = ep;
           break;
         }
       } else {
-        if ((p = BytePtr(memrchr(ep, params->firstbyte, p - ep))) == NULL) {
+        if ((p = BytePtr(memrchr(ep, params->first_byte, p - ep))) == NULL) {
           p = ep;
           break;
         }
@@ -1580,7 +1579,7 @@
 // For debugging, calls the general code directly.
 bool DFA::SlowSearchLoop(SearchParams* params) {
   return InlinedSearchLoop(params,
-                           params->firstbyte >= 0,
+                           params->first_byte >= 0,
                            params->want_earliest_match,
                            params->run_forward);
 }
@@ -1601,8 +1600,8 @@
     &DFA::SearchTTT,
   };
 
-  bool have_firstbyte = (params->firstbyte >= 0);
-  int index = 4 * have_firstbyte +
+  bool have_first_byte = params->first_byte >= 0;
+  int index = 4 * have_first_byte +
               2 * params->want_earliest_match +
               1 * params->run_forward;
   return (this->*Searches[index])(params);
@@ -1678,7 +1677,7 @@
       flags = 0;
     }
   }
-  if (params->anchored || prog_->anchor_start())
+  if (params->anchored)
     start |= kStartAnchored;
   StartInfo* info = &start_[start];
 
@@ -1695,12 +1694,12 @@
   }
 
   if (ExtraDebug)
-    fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s firstbyte=%d\n",
+    fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s first_byte=%d\n",
             params->anchored, params->run_forward, flags,
-            DumpState(info->start).c_str(), info->firstbyte.load());
+            DumpState(info->start).c_str(), info->first_byte.load());
 
   params->start = info->start;
-  params->firstbyte = info->firstbyte.load(std::memory_order_acquire);
+  params->first_byte = info->first_byte.load(std::memory_order_acquire);
 
   return true;
 }
@@ -1709,12 +1708,12 @@
 bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
                               uint32_t flags) {
   // Quick check.
-  int fb = info->firstbyte.load(std::memory_order_acquire);
+  int fb = info->first_byte.load(std::memory_order_acquire);
   if (fb != kFbUnknown)
     return true;
 
   MutexLock l(&mutex_);
-  fb = info->firstbyte.load(std::memory_order_relaxed);
+  fb = info->first_byte.load(std::memory_order_relaxed);
   if (fb != kFbUnknown)
     return true;
 
@@ -1728,40 +1727,28 @@
 
   if (info->start == DeadState) {
     // Synchronize with "quick check" above.
-    info->firstbyte.store(kFbNone, std::memory_order_release);
+    info->first_byte.store(kFbNone, std::memory_order_release);
     return true;
   }
 
   if (info->start == FullMatchState) {
     // Synchronize with "quick check" above.
-    info->firstbyte.store(kFbNone, std::memory_order_release);  // will be ignored
+    info->first_byte.store(kFbNone, std::memory_order_release);  // will be ignored
     return true;
   }
 
-  // Compute info->firstbyte by running state on all
-  // possible byte values, looking for a single one that
-  // leads to a different state.
-  int firstbyte = kFbNone;
-  for (int i = 0; i < 256; i++) {
-    State* s = RunStateOnByte(info->start, i);
-    if (s == NULL) {
-      // Synchronize with "quick check" above.
-      info->firstbyte.store(kFbUnknown, std::memory_order_release);
-      return false;
-    }
-    if (s == info->start)
-      continue;
-    // Goes to new state...
-    if (firstbyte == kFbNone) {
-      firstbyte = i;        // ... first one
-    } else {
-      firstbyte = kFbMany;  // ... too many
-      break;
-    }
-  }
+  // Even if we have a first_byte, we cannot use it when anchored and,
+  // less obviously, we cannot use it when we are going to need flags.
+  // This trick works only when there is a single byte that leads to a
+  // different state!
+  int first_byte = prog_->first_byte();
+  if (first_byte == -1 ||
+      params->anchored ||
+      info->start->flag_ >> kFlagNeedShift != 0)
+    first_byte = kFbNone;
 
   // Synchronize with "quick check" above.
-  info->firstbyte.store(firstbyte, std::memory_order_release);
+  info->first_byte.store(first_byte, std::memory_order_release);
   return true;
 }
 
@@ -1873,9 +1860,8 @@
   bool carat = anchor_start();
   bool dollar = anchor_end();
   if (reversed_) {
-    bool t = carat;
-    carat = dollar;
-    dollar = t;
+    using std::swap;
+    swap(carat, dollar);
   }
   if (carat && context.begin() != text.begin())
     return false;
commit	eb69e9172ec4015fa4ba74ac7b026f818abf8cf1	[log] [tgz]
author	Paul Wankadia <junyer@google.com>	Sat May 05 07:03:14 2018 -0700
committer	Paul Wankadia <junyer@google.com>	Sat May 05 14:07:59 2018 +0000
tree	11ff88a572956be49d519e940d9279462c81bd37
parent	71e6699778cebf99ed11e78e48b9370f79b9ea39 [diff]