Make RE2::Set sort the patterns before compiling.
Change-Id: I44bc22b9eb6e5c294c26b0418dfc301110aa4904
Reviewed-on: https://code-review.googlesource.com/17830
Reviewed-by: Paul Wankadia <junyer@google.com>
diff --git a/re2/set.cc b/re2/set.cc
index 86a32a7..3b8fe61 100644
--- a/re2/set.cc
+++ b/re2/set.cc
@@ -5,6 +5,7 @@
#include "re2/set.h"
#include <stddef.h>
+#include <algorithm>
#include <memory>
#include "util/util.h"
@@ -26,8 +27,8 @@
}
RE2::Set::~Set() {
- for (size_t i = 0; i < re_.size(); i++)
- re_[i]->Decref();
+ for (size_t i = 0; i < elem_.size(); i++)
+ elem_[i].second->Decref();
delete prog_;
}
@@ -50,7 +51,7 @@
}
// Concatenate with match index and push on vector.
- int n = static_cast<int>(re_.size());
+ int n = static_cast<int>(elem_.size());
re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
if (re->op() == kRegexpConcat) {
int nsub = re->nsub();
@@ -67,7 +68,7 @@
sub[1] = m;
re = re2::Regexp::Concat(sub, 2, pf);
}
- re_.push_back(re);
+ elem_.emplace_back(pattern.ToString(), re);
return n;
}
@@ -77,13 +78,26 @@
return false;
}
compiled_ = true;
- size_ = static_cast<int>(re_.size());
+ size_ = static_cast<int>(elem_.size());
+
+ // Sort the elements by their patterns. This is "good enough" because
+ // we are only hoping to improve the factoring of common prefixes and
+ // that can be achieved without chasing pointers around Regexp graphs
+ // right now. (The factoring logic will do exactly that soon enough.)
+ std::sort(elem_.begin(), elem_.end(),
+ [](const Elem& a, const Elem& b) -> bool {
+ return a.first < b.first;
+ });
+
+ re2::Regexp** sub = new re2::Regexp*[size_];
+ for (size_t i = 0; i < elem_.size(); i++)
+ sub[i] = elem_[i].second;
+ elem_.clear();
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
options_.ParseFlags());
- re2::Regexp* re = re2::Regexp::Alternate(const_cast<re2::Regexp**>(re_.data()),
- static_cast<int>(re_.size()), pf);
- re_.clear();
+ re2::Regexp* re = re2::Regexp::Alternate(sub, size_, pf);
+ delete[] sub;
prog_ = Prog::CompileSet(re, anchor_, options_.max_mem());
re->Decref();
diff --git a/re2/set.h b/re2/set.h
index f3d37d8..f97d8a5 100644
--- a/re2/set.h
+++ b/re2/set.h
@@ -6,6 +6,7 @@
#define RE2_SET_H_
#include <string>
+#include <utility>
#include <vector>
#include "re2/re2.h"
@@ -45,9 +46,11 @@
bool Match(const StringPiece& text, std::vector<int>* v) const;
private:
+ typedef std::pair<string, re2::Regexp*> Elem;
+
RE2::Options options_;
RE2::Anchor anchor_;
- std::vector<re2::Regexp*> re_;
+ std::vector<Elem> elem_;
re2::Prog* prog_;
bool compiled_;
int size_;