blob: 499f560f8d64fc01752a3501f1f0bd9f14c3b803 [file] [log] [blame]
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Compiled regular expression representation.
// Tested by compile_test.cc
#include "util/util.h"
#include "util/sparse_set.h"
#include "re2/prog.h"
#include "re2/stringpiece.h"
namespace re2 {
// Constructors per Inst opcode
void Prog::Inst::InitAlt(uint32 out, uint32 out1) {
DCHECK_EQ(out_opcode_, 0);
set_out_opcode(out, kInstAlt);
out1_ = out1;
}
void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) {
DCHECK_EQ(out_opcode_, 0);
set_out_opcode(out, kInstByteRange);
lo_ = lo & 0xFF;
hi_ = hi & 0xFF;
foldcase_ = foldcase & 0xFF;
}
void Prog::Inst::InitCapture(int cap, uint32 out) {
DCHECK_EQ(out_opcode_, 0);
set_out_opcode(out, kInstCapture);
cap_ = cap;
}
void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) {
DCHECK_EQ(out_opcode_, 0);
set_out_opcode(out, kInstEmptyWidth);
empty_ = empty;
}
void Prog::Inst::InitMatch(int32 id) {
DCHECK_EQ(out_opcode_, 0);
set_opcode(kInstMatch);
match_id_ = id;
}
void Prog::Inst::InitNop(uint32 out) {
DCHECK_EQ(out_opcode_, 0);
set_opcode(kInstNop);
}
void Prog::Inst::InitFail() {
DCHECK_EQ(out_opcode_, 0);
set_opcode(kInstFail);
}
string Prog::Inst::Dump() {
switch (opcode()) {
default:
return StringPrintf("opcode %d", static_cast<int>(opcode()));
case kInstAlt:
return StringPrintf("alt -> %d | %d", out(), out1_);
case kInstAltMatch:
return StringPrintf("altmatch -> %d | %d", out(), out1_);
case kInstByteRange:
return StringPrintf("byte%s [%02x-%02x] -> %d",
foldcase_ ? "/i" : "",
lo_, hi_, out());
case kInstCapture:
return StringPrintf("capture %d -> %d", cap_, out());
case kInstEmptyWidth:
return StringPrintf("emptywidth %#x -> %d",
static_cast<int>(empty_), out());
case kInstMatch:
return StringPrintf("match! %d", match_id());
case kInstNop:
return StringPrintf("nop -> %d", out());
case kInstFail:
return StringPrintf("fail");
}
}
Prog::Prog()
: anchor_start_(false),
anchor_end_(false),
reversed_(false),
did_onepass_(false),
start_(0),
start_unanchored_(0),
size_(0),
byte_inst_count_(0),
bytemap_range_(0),
flags_(0),
onepass_statesize_(0),
inst_(NULL),
dfa_first_(NULL),
dfa_longest_(NULL),
dfa_mem_(0),
delete_dfa_(NULL),
unbytemap_(NULL),
onepass_nodes_(NULL),
onepass_start_(NULL) {
}
Prog::~Prog() {
if (delete_dfa_) {
if (dfa_first_)
delete_dfa_(dfa_first_);
if (dfa_longest_)
delete_dfa_(dfa_longest_);
}
delete[] onepass_nodes_;
delete[] inst_;
delete[] unbytemap_;
}
typedef SparseSet Workq;
static inline void AddToQueue(Workq* q, int id) {
if (id != 0)
q->insert(id);
}
static string ProgToString(Prog* prog, Workq* q) {
string s;
for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
int id = *i;
Prog::Inst* ip = prog->inst(id);
StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str());
AddToQueue(q, ip->out());
if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
AddToQueue(q, ip->out1());
}
return s;
}
string Prog::Dump() {
string map;
if (false) { // Debugging
int lo = 0;
StringAppendF(&map, "byte map:\n");
for (int i = 0; i < bytemap_range_; i++) {
StringAppendF(&map, "\t%d. [%02x-%02x]\n", i, lo, unbytemap_[i]);
lo = unbytemap_[i] + 1;
}
StringAppendF(&map, "\n");
}
Workq q(size_);
AddToQueue(&q, start_);
return map + ProgToString(this, &q);
}
string Prog::DumpUnanchored() {
Workq q(size_);
AddToQueue(&q, start_unanchored_);
return ProgToString(this, &q);
}
static bool IsMatch(Prog*, Prog::Inst*);
// Peep-hole optimizer.
void Prog::Optimize() {
Workq q(size_);
// Eliminate nops. Most are taken out during compilation
// but a few are hard to avoid.
q.clear();
AddToQueue(&q, start_);
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
int id = *i;
Inst* ip = inst(id);
int j = ip->out();
Inst* jp;
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
j = jp->out();
}
ip->set_out(j);
AddToQueue(&q, ip->out());
if (ip->opcode() == kInstAlt) {
j = ip->out1();
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
j = jp->out();
}
ip->out1_ = j;
AddToQueue(&q, ip->out1());
}
}
// Insert kInstAltMatch instructions
// Look for
// ip: Alt -> j | k
// j: ByteRange [00-FF] -> ip
// k: Match
// or the reverse (the above is the greedy one).
// Rewrite Alt to AltMatch.
q.clear();
AddToQueue(&q, start_);
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
int id = *i;
Inst* ip = inst(id);
AddToQueue(&q, ip->out());
if (ip->opcode() == kInstAlt)
AddToQueue(&q, ip->out1());
if (ip->opcode() == kInstAlt) {
Inst* j = inst(ip->out());
Inst* k = inst(ip->out1());
if (j->opcode() == kInstByteRange && j->out() == id &&
j->lo() == 0x00 && j->hi() == 0xFF &&
IsMatch(this, k)) {
ip->set_opcode(kInstAltMatch);
continue;
}
if (IsMatch(this, j) &&
k->opcode() == kInstByteRange && k->out() == id &&
k->lo() == 0x00 && k->hi() == 0xFF) {
ip->set_opcode(kInstAltMatch);
}
}
}
}
// Is ip a guaranteed match at end of text, perhaps after some capturing?
static bool IsMatch(Prog* prog, Prog::Inst* ip) {
for (;;) {
switch (ip->opcode()) {
default:
LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
return false;
case kInstAlt:
case kInstAltMatch:
case kInstByteRange:
case kInstFail:
case kInstEmptyWidth:
return false;
case kInstCapture:
case kInstNop:
ip = prog->inst(ip->out());
break;
case kInstMatch:
return true;
}
}
}
uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
int flags = 0;
// ^ and \A
if (p == text.begin())
flags |= kEmptyBeginText | kEmptyBeginLine;
else if (p[-1] == '\n')
flags |= kEmptyBeginLine;
// $ and \z
if (p == text.end())
flags |= kEmptyEndText | kEmptyEndLine;
else if (p < text.end() && p[0] == '\n')
flags |= kEmptyEndLine;
// \b and \B
if (p == text.begin() && p == text.end()) {
// no word boundary here
} else if (p == text.begin()) {
if (IsWordChar(p[0]))
flags |= kEmptyWordBoundary;
} else if (p == text.end()) {
if (IsWordChar(p[-1]))
flags |= kEmptyWordBoundary;
} else {
if (IsWordChar(p[-1]) != IsWordChar(p[0]))
flags |= kEmptyWordBoundary;
}
if (!(flags & kEmptyWordBoundary))
flags |= kEmptyNonWordBoundary;
return flags;
}
void Prog::MarkByteRange(int lo, int hi) {
DCHECK_GE(lo, 0);
DCHECK_GE(hi, 0);
DCHECK_LE(lo, 255);
DCHECK_LE(hi, 255);
DCHECK_LE(lo, hi);
if (0 < lo && lo <= 255)
byterange_.Set(lo - 1);
if (0 <= hi && hi <= 255)
byterange_.Set(hi);
}
void Prog::ComputeByteMap() {
// Fill in bytemap with byte classes for prog_.
// Ranges of bytes that are treated as indistinguishable
// by the regexp program are mapped to a single byte class.
// The vector prog_->byterange() marks the end of each
// such range.
const Bitmap<256>& v = byterange();
COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
uint8 n = 0;
uint32 bits = 0;
for (int i = 0; i < 256; i++) {
if ((i&31) == 0)
bits = v.Word(i >> 5);
bytemap_[i] = n;
n += bits & 1;
bits >>= 1;
}
bytemap_range_ = bytemap_[255] + 1;
unbytemap_ = new uint8[bytemap_range_];
for (int i = 0; i < 256; i++)
unbytemap_[bytemap_[i]] = static_cast<uint8>(i);
if (0) { // For debugging: use trivial byte map.
for (int i = 0; i < 256; i++) {
bytemap_[i] = static_cast<uint8>(i);
unbytemap_[i] = static_cast<uint8>(i);
}
bytemap_range_ = 256;
LOG(INFO) << "Using trivial bytemap.";
}
}
} // namespace re2