Use the standard first-byte analysis for the DFA too.
Change-Id: I1b773ac456091b6e9e25d016119829daabc910b0
Reviewed-on: https://code-review.googlesource.com/27270
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/dfa.cc b/re2/dfa.cc
index c1c8460..9bc8499 100644
--- a/re2/dfa.cc
+++ b/re2/dfa.cc
@@ -177,11 +177,10 @@
typedef std::unordered_set<State*, StateHash, StateEqual> StateSet;
private:
- // Special "firstbyte" values for a state. (Values >= 0 denote actual bytes.)
+ // Special "first_byte" values for a state. (Values >= 0 denote actual bytes.)
enum {
kFbUnknown = -1, // No analysis has been performed.
- kFbMany = -2, // Many bytes will lead out of this state.
- kFbNone = -3, // No bytes lead out of this state.
+ kFbNone = -2, // The first-byte trick cannot be used.
};
enum {
@@ -255,7 +254,7 @@
want_earliest_match(false),
run_forward(false),
start(NULL),
- firstbyte(kFbUnknown),
+ first_byte(kFbUnknown),
cache_lock(cache_lock),
failed(false),
ep(NULL),
@@ -267,7 +266,7 @@
bool want_earliest_match;
bool run_forward;
State* start;
- int firstbyte;
+ int first_byte;
RWLocker *cache_lock;
bool failed; // "out" parameter: whether search gave up
const char* ep; // "out" parameter: end pointer for match
@@ -280,14 +279,14 @@
// Before each search, the parameters to Search are analyzed by
// AnalyzeSearch to determine the state in which to start and the
- // "firstbyte" for that state, if any.
+ // "first_byte" for that state, if any.
struct StartInfo {
- StartInfo() : start(NULL), firstbyte(kFbUnknown) { }
+ StartInfo() : start(NULL), first_byte(kFbUnknown) {}
State* start;
- std::atomic<int> firstbyte;
+ std::atomic<int> first_byte;
};
- // Fills in params->start and params->firstbyte using
+ // Fills in params->start and params->first_byte using
// the other search parameters. Returns true on success,
// false on failure.
// cache_mutex_.r <= L < mutex_
@@ -299,7 +298,7 @@
// cache_mutex_.r <= L < mutex_
// Might unlock and relock cache_mutex_ via params->cache_lock.
inline bool InlinedSearchLoop(SearchParams* params,
- bool have_firstbyte,
+ bool have_first_byte,
bool want_earliest_match,
bool run_forward);
@@ -1169,7 +1168,7 @@
// Clear the cache, reset the memory budget.
for (int i = 0; i < kMaxStart; i++) {
start_[i].start = NULL;
- start_[i].firstbyte.store(kFbUnknown, std::memory_order_relaxed);
+ start_[i].first_byte.store(kFbUnknown, std::memory_order_relaxed);
}
ClearCache();
mem_budget_ = state_budget_;
@@ -1286,7 +1285,7 @@
// Instead, it can call memchr to search very quickly for the byte c.
// Whether the start state has this property is determined during a
// pre-compilation pass, and if so, the byte b is passed to the search
-// loop as the "firstbyte" argument, along with a boolean "have_firstbyte".
+// loop as the "first_byte" argument, along with a boolean "have_first_byte".
//
// Fourth, the desired behavior is to search for the leftmost-best match
// (approximately, the same one that Perl would find), which is not
@@ -1319,7 +1318,7 @@
// making them function arguments lets the inliner specialize
// this function to each combination (see two paragraphs above).
inline bool DFA::InlinedSearchLoop(SearchParams* params,
- bool have_firstbyte,
+ bool have_first_byte,
bool want_earliest_match,
bool run_forward) {
State* start = params->start;
@@ -1364,18 +1363,18 @@
fprintf(stderr, "@%td: %s\n",
p - bp, DumpState(s).c_str());
- if (have_firstbyte && s == start) {
- // In start state, only way out is to find firstbyte,
+ if (have_first_byte && s == start) {
+ // In start state, only way out is to find first_byte,
// so use optimized assembly in memchr to skip ahead.
- // If firstbyte isn't found, we can skip to the end
+ // If first_byte isn't found, we can skip to the end
// of the string.
if (run_forward) {
- if ((p = BytePtr(memchr(p, params->firstbyte, ep - p))) == NULL) {
+ if ((p = BytePtr(memchr(p, params->first_byte, ep - p))) == NULL) {
p = ep;
break;
}
} else {
- if ((p = BytePtr(memrchr(ep, params->firstbyte, p - ep))) == NULL) {
+ if ((p = BytePtr(memrchr(ep, params->first_byte, p - ep))) == NULL) {
p = ep;
break;
}
@@ -1580,7 +1579,7 @@
// For debugging, calls the general code directly.
bool DFA::SlowSearchLoop(SearchParams* params) {
return InlinedSearchLoop(params,
- params->firstbyte >= 0,
+ params->first_byte >= 0,
params->want_earliest_match,
params->run_forward);
}
@@ -1601,8 +1600,8 @@
&DFA::SearchTTT,
};
- bool have_firstbyte = (params->firstbyte >= 0);
- int index = 4 * have_firstbyte +
+ bool have_first_byte = params->first_byte >= 0;
+ int index = 4 * have_first_byte +
2 * params->want_earliest_match +
1 * params->run_forward;
return (this->*Searches[index])(params);
@@ -1678,7 +1677,7 @@
flags = 0;
}
}
- if (params->anchored || prog_->anchor_start())
+ if (params->anchored)
start |= kStartAnchored;
StartInfo* info = &start_[start];
@@ -1695,12 +1694,12 @@
}
if (ExtraDebug)
- fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s firstbyte=%d\n",
+ fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s first_byte=%d\n",
params->anchored, params->run_forward, flags,
- DumpState(info->start).c_str(), info->firstbyte.load());
+ DumpState(info->start).c_str(), info->first_byte.load());
params->start = info->start;
- params->firstbyte = info->firstbyte.load(std::memory_order_acquire);
+ params->first_byte = info->first_byte.load(std::memory_order_acquire);
return true;
}
@@ -1709,12 +1708,12 @@
bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
uint32_t flags) {
// Quick check.
- int fb = info->firstbyte.load(std::memory_order_acquire);
+ int fb = info->first_byte.load(std::memory_order_acquire);
if (fb != kFbUnknown)
return true;
MutexLock l(&mutex_);
- fb = info->firstbyte.load(std::memory_order_relaxed);
+ fb = info->first_byte.load(std::memory_order_relaxed);
if (fb != kFbUnknown)
return true;
@@ -1728,40 +1727,28 @@
if (info->start == DeadState) {
// Synchronize with "quick check" above.
- info->firstbyte.store(kFbNone, std::memory_order_release);
+ info->first_byte.store(kFbNone, std::memory_order_release);
return true;
}
if (info->start == FullMatchState) {
// Synchronize with "quick check" above.
- info->firstbyte.store(kFbNone, std::memory_order_release); // will be ignored
+ info->first_byte.store(kFbNone, std::memory_order_release); // will be ignored
return true;
}
- // Compute info->firstbyte by running state on all
- // possible byte values, looking for a single one that
- // leads to a different state.
- int firstbyte = kFbNone;
- for (int i = 0; i < 256; i++) {
- State* s = RunStateOnByte(info->start, i);
- if (s == NULL) {
- // Synchronize with "quick check" above.
- info->firstbyte.store(kFbUnknown, std::memory_order_release);
- return false;
- }
- if (s == info->start)
- continue;
- // Goes to new state...
- if (firstbyte == kFbNone) {
- firstbyte = i; // ... first one
- } else {
- firstbyte = kFbMany; // ... too many
- break;
- }
- }
+ // Even if we have a first_byte, we cannot use it when anchored and,
+ // less obviously, we cannot use it when we are going to need flags.
+ // This trick works only when there is a single byte that leads to a
+ // different state!
+ int first_byte = prog_->first_byte();
+ if (first_byte == -1 ||
+ params->anchored ||
+ info->start->flag_ >> kFlagNeedShift != 0)
+ first_byte = kFbNone;
// Synchronize with "quick check" above.
- info->firstbyte.store(firstbyte, std::memory_order_release);
+ info->first_byte.store(first_byte, std::memory_order_release);
return true;
}
@@ -1873,9 +1860,8 @@
bool carat = anchor_start();
bool dollar = anchor_end();
if (reversed_) {
- bool t = carat;
- carat = dollar;
- dollar = t;
+ using std::swap;
+ swap(carat, dollar);
}
if (carat && context.begin() != text.begin())
return false;