Compute first_byte_ eagerly.
Change-Id: Id4ebc31a21914c0cfdde40037da1bec9ab538c76
Reviewed-on: https://code-review.googlesource.com/c/re2/+/55234
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/compile.cc b/re2/compile.cc
index 326ab15..b3e5bdc 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc
@@ -1186,6 +1186,7 @@
prog_->Optimize();
prog_->Flatten();
prog_->ComputeByteMap();
+ prog_->ComputeFirstByte();
// Record remaining memory for DFA.
if (max_mem_ <= 0) {
diff --git a/re2/nfa.cc b/re2/nfa.cc
index 77fb5fb..75ca306 100644
--- a/re2/nfa.cc
+++ b/re2/nfa.cc
@@ -629,10 +629,7 @@
return false;
}
-// Computes whether all successful matches have a common first byte,
-// and if so, returns that byte. If not, returns -1.
-int Prog::ComputeFirstByte() {
- int b = -1;
+void Prog::ComputeFirstByte() {
SparseSet q(size());
q.insert(start());
for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) {
@@ -645,23 +642,27 @@
case kInstMatch:
// The empty string matches: no first byte.
- return -1;
+ first_byte_ = -1;
+ return;
case kInstByteRange:
if (!ip->last())
q.insert(id+1);
- // Must match only a single byte
- if (ip->lo() != ip->hi())
- return -1;
- if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
- return -1;
+ // Must match only a single byte.
+ if (ip->lo() != ip->hi() ||
+ (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')) {
+ first_byte_ = -1;
+ return;
+ }
// If we haven't seen any bytes yet, record it;
// otherwise must match the one we saw before.
- if (b == -1)
- b = ip->lo();
- else if (b != ip->lo())
- return -1;
+ if (first_byte_ == -1) {
+ first_byte_ = ip->lo();
+ } else if (first_byte_ != ip->lo()) {
+ first_byte_ = -1;
+ return;
+ }
break;
case kInstNop:
@@ -687,7 +688,6 @@
break;
}
}
- return b;
}
bool
diff --git a/re2/prog.cc b/re2/prog.cc
index 9e91faf..9244319 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -184,14 +184,31 @@
return map;
}
-int Prog::first_byte() {
- std::call_once(first_byte_once_, [](Prog* prog) {
- prog->first_byte_ = prog->ComputeFirstByte();
- }, this);
- return first_byte_;
-}
+// Is ip a guaranteed match at end of text, perhaps after some capturing?
+static bool IsMatch(Prog* prog, Prog::Inst* ip) {
+ for (;;) {
+ switch (ip->opcode()) {
+ default:
+ LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
+ return false;
-static bool IsMatch(Prog*, Prog::Inst*);
+ case kInstAlt:
+ case kInstAltMatch:
+ case kInstByteRange:
+ case kInstFail:
+ case kInstEmptyWidth:
+ return false;
+
+ case kInstCapture:
+ case kInstNop:
+ ip = prog->inst(ip->out());
+ break;
+
+ case kInstMatch:
+ return true;
+ }
+ }
+}
// Peep-hole optimizer.
void Prog::Optimize() {
@@ -257,32 +274,6 @@
}
}
-// Is ip a guaranteed match at end of text, perhaps after some capturing?
-static bool IsMatch(Prog* prog, Prog::Inst* ip) {
- for (;;) {
- switch (ip->opcode()) {
- default:
- LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
- return false;
-
- case kInstAlt:
- case kInstAltMatch:
- case kInstByteRange:
- case kInstFail:
- case kInstEmptyWidth:
- return false;
-
- case kInstCapture:
- case kInstNop:
- ip = prog->inst(ip->out());
- break;
-
- case kInstMatch:
- return true;
- }
- }
-}
-
uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) {
int flags = 0;
diff --git a/re2/prog.h b/re2/prog.h
index 4ba72c0..4306672 100644
--- a/re2/prog.h
+++ b/re2/prog.h
@@ -215,9 +215,7 @@
void set_anchor_end(bool b) { anchor_end_ = b; }
int bytemap_range() { return bytemap_range_; }
const uint8_t* bytemap() { return bytemap_; }
-
- // Lazily computed.
- int first_byte();
+ int first_byte() { return first_byte_; }
// Returns string representation of program for debugging.
std::string Dump();
@@ -295,9 +293,8 @@
// Compute bytemap.
void ComputeByteMap();
- // Computes whether all matches must begin with the same first
- // byte, and if so, returns that byte. If not, returns -1.
- int ComputeFirstByte();
+ // Computes whether all matches must begin with the same first byte.
+ void ComputeFirstByte();
// Run peep-hole optimizer on program.
void Optimize();
@@ -416,7 +413,6 @@
uint8_t bytemap_[256]; // map from input bytes to byte classes
- std::once_flag first_byte_once_;
std::once_flag dfa_first_once_;
std::once_flag dfa_longest_once_;