re2/simplify.cc - re2 - Git at Google

 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 // Rewrite POSIX and other features in re
 // to use simple extended regular expression features.
 // Also sort and simplify character classes.

 #include <string>

 #include "util/util.h"
 #include "util/logging.h"
 #include "util/pod_array.h"
 #include "util/utf.h"
 #include "re2/regexp.h"
 #include "re2/walker-inl.h"

 namespace re2 {

 // Parses the regexp src and then simplifies it and sets *dst to the
 // string representation of the simplified form.  Returns true on success.
 // Returns false and sets *error (if error != NULL) on error.
 bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
                             std::string* dst, RegexpStatus* status) {
   Regexp* re = Parse(src, flags, status);
   if (re == NULL)
     return false;
   Regexp* sre = re->Simplify();
   re->Decref();
   if (sre == NULL) {
     // Should not happen, since Simplify never fails.
     LOG(ERROR) << "Simplify failed on " << src;
     if (status) {
       status->set_code(kRegexpInternalError);
       status->set_error_arg(src);
     }
     return false;
   }
   *dst = sre->ToString();
   sre->Decref();
   return true;
 }

 // Assuming the simple_ flags on the children are accurate,
 // is this Regexp* simple?
 bool Regexp::ComputeSimple() {
   Regexp** subs;
   switch (op_) {
     case kRegexpNoMatch:
     case kRegexpEmptyMatch:
     case kRegexpLiteral:
     case kRegexpLiteralString:
     case kRegexpBeginLine:
     case kRegexpEndLine:
     case kRegexpBeginText:
     case kRegexpWordBoundary:
     case kRegexpNoWordBoundary:
     case kRegexpEndText:
     case kRegexpAnyChar:
     case kRegexpAnyByte:
     case kRegexpHaveMatch:
       return true;
     case kRegexpConcat:
     case kRegexpAlternate:
       // These are simple as long as the subpieces are simple.
       subs = sub();
       for (int i = 0; i < nsub_; i++)
         if (!subs[i]->simple())
           return false;
       return true;
     case kRegexpCharClass:
       // Simple as long as the char class is not empty, not full.
       if (ccb_ != NULL)
         return !ccb_->empty() && !ccb_->full();
       return !cc_->empty() && !cc_->full();
     case kRegexpCapture:
       subs = sub();
       return subs[0]->simple();
     case kRegexpStar:
     case kRegexpPlus:
     case kRegexpQuest:
       subs = sub();
       if (!subs[0]->simple())
         return false;
       switch (subs[0]->op_) {
         case kRegexpStar:
         case kRegexpPlus:
         case kRegexpQuest:
         case kRegexpEmptyMatch:
         case kRegexpNoMatch:
           return false;
         default:
           break;
       }
       return true;
     case kRegexpRepeat:
       return false;
   }
   LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
   return false;
 }

 // Walker subclass used by Simplify.
 // Coalesces runs of star/plus/quest/repeat of the same literal along with any
 // occurrences of that literal into repeats of that literal. It also works for
 // char classes, any char and any byte.
 // PostVisit creates the coalesced result, which should then be simplified.
 class CoalesceWalker : public Regexp::Walker<Regexp*> {
  public:
   CoalesceWalker() {}
   virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
                             Regexp** child_args, int nchild_args);
   virtual Regexp* Copy(Regexp* re);
   virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);

  private:
   // These functions are declared inside CoalesceWalker so that
   // they can edit the private fields of the Regexps they construct.

   // Returns true if r1 and r2 can be coalesced. In particular, ensures that
   // the parse flags are consistent. (They will not be checked again later.)
   static bool CanCoalesce(Regexp* r1, Regexp* r2);

   // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
   // will be empty match and the coalesced op. In other cases, where part of a
   // literal string was removed to be coalesced, the array elements afterwards
   // will be the coalesced op and the remainder of the literal string.
   static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);

   CoalesceWalker(const CoalesceWalker&) = delete;
   CoalesceWalker& operator=(const CoalesceWalker&) = delete;
 };

 // Walker subclass used by Simplify.
 // The simplify walk is purely post-recursive: given the simplified children,
 // PostVisit creates the simplified result.
 // The child_args are simplified Regexp*s.
 class SimplifyWalker : public Regexp::Walker<Regexp*> {
  public:
   SimplifyWalker() {}
   virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
   virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
                             Regexp** child_args, int nchild_args);
   virtual Regexp* Copy(Regexp* re);
   virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);

  private:
   // These functions are declared inside SimplifyWalker so that
   // they can edit the private fields of the Regexps they construct.

   // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
   // Caller must Decref return value when done with it.
   static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);

   // Simplifies the expression re{min,max} in terms of *, +, and ?.
   // Returns a new regexp.  Does not edit re.  Does not consume reference to re.
   // Caller must Decref return value when done with it.
   static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
                                 Regexp::ParseFlags parse_flags);

   // Simplifies a character class by expanding any named classes
   // into rune ranges.  Does not edit re.  Does not consume ref to re.
   // Caller must Decref return value when done with it.
   static Regexp* SimplifyCharClass(Regexp* re);

   SimplifyWalker(const SimplifyWalker&) = delete;
   SimplifyWalker& operator=(const SimplifyWalker&) = delete;
 };

 // Simplifies a regular expression, returning a new regexp.
 // The new regexp uses traditional Unix egrep features only,
 // plus the Perl (?:) non-capturing parentheses.
 // Otherwise, no POSIX or Perl additions.  The new regexp
 // captures exactly the same subexpressions (with the same indices)
 // as the original.
 // Does not edit current object.
 // Caller must Decref() return value when done with it.

 Regexp* Regexp::Simplify() {
   CoalesceWalker cw;
   Regexp* cre = cw.Walk(this, NULL);
   if (cre == NULL)
     return cre;
   SimplifyWalker sw;
   Regexp* sre = sw.Walk(cre, NULL);
   cre->Decref();
   return sre;
 }

 #define Simplify DontCallSimplify  // Avoid accidental recursion

 // Utility function for PostVisit implementations that compares re->sub() with
 // child_args to determine whether any child_args changed. In the common case,
 // where nothing changed, calls Decref() for all child_args and returns false,
 // so PostVisit must return re->Incref(). Otherwise, returns true.
 static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
   for (int i = 0; i < re->nsub(); i++) {
     Regexp* sub = re->sub()[i];
     Regexp* newsub = child_args[i];
     if (newsub != sub)
       return true;
   }
   for (int i = 0; i < re->nsub(); i++) {
     Regexp* newsub = child_args[i];
     newsub->Decref();
   }
   return false;
 }

 Regexp* CoalesceWalker::Copy(Regexp* re) {
   return re->Incref();
 }

 Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
   // This should never be called, since we use Walk and not
   // WalkExponential.
   LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
   return re->Incref();
 }

 Regexp* CoalesceWalker::PostVisit(Regexp* re,
                                   Regexp* parent_arg,
                                   Regexp* pre_arg,
                                   Regexp** child_args,
                                   int nchild_args) {
   if (re->nsub() == 0)
     return re->Incref();

   if (re->op() != kRegexpConcat) {
     if (!ChildArgsChanged(re, child_args))
       return re->Incref();

     // Something changed. Build a new op.
     Regexp* nre = new Regexp(re->op(), re->parse_flags());
     nre->AllocSub(re->nsub());
     Regexp** nre_subs = nre->sub();
     for (int i = 0; i < re->nsub(); i++)
       nre_subs[i] = child_args[i];
     // Repeats and Captures have additional data that must be copied.
     if (re->op() == kRegexpRepeat) {
       nre->min_ = re->min();
       nre->max_ = re->max();
     } else if (re->op() == kRegexpCapture) {
       nre->cap_ = re->cap();
     }
     return nre;
   }

   bool can_coalesce = false;
   for (int i = 0; i < re->nsub(); i++) {
     if (i+1 < re->nsub() &&
         CanCoalesce(child_args[i], child_args[i+1])) {
       can_coalesce = true;
       break;
     }
   }
   if (!can_coalesce) {
     if (!ChildArgsChanged(re, child_args))
       return re->Incref();

     // Something changed. Build a new op.
     Regexp* nre = new Regexp(re->op(), re->parse_flags());
     nre->AllocSub(re->nsub());
     Regexp** nre_subs = nre->sub();
     for (int i = 0; i < re->nsub(); i++)
       nre_subs[i] = child_args[i];
     return nre;
   }

   for (int i = 0; i < re->nsub(); i++) {
     if (i+1 < re->nsub() &&
         CanCoalesce(child_args[i], child_args[i+1]))
       DoCoalesce(&child_args[i], &child_args[i+1]);
   }
   // Determine how many empty matches were left by DoCoalesce.
   int n = 0;
   for (int i = n; i < re->nsub(); i++) {
     if (child_args[i]->op() == kRegexpEmptyMatch)
       n++;
   }
   // Build a new op.
   Regexp* nre = new Regexp(re->op(), re->parse_flags());
   nre->AllocSub(re->nsub() - n);
   Regexp** nre_subs = nre->sub();
   for (int i = 0, j = 0; i < re->nsub(); i++) {
     if (child_args[i]->op() == kRegexpEmptyMatch) {
       child_args[i]->Decref();
       continue;
     }
     nre_subs[j] = child_args[i];
     j++;
   }
   return nre;
 }

 bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
   // r1 must be a star/plus/quest/repeat of a literal, char class, any char or
   // any byte.
   if ((r1->op() == kRegexpStar ||
        r1->op() == kRegexpPlus ||
        r1->op() == kRegexpQuest ||
        r1->op() == kRegexpRepeat) &&
       (r1->sub()[0]->op() == kRegexpLiteral ||
        r1->sub()[0]->op() == kRegexpCharClass ||
        r1->sub()[0]->op() == kRegexpAnyChar ||
        r1->sub()[0]->op() == kRegexpAnyByte)) {
     // r2 must be a star/plus/quest/repeat of the same literal, char class,
     // any char or any byte.
     if ((r2->op() == kRegexpStar ||
          r2->op() == kRegexpPlus ||
          r2->op() == kRegexpQuest ||
          r2->op() == kRegexpRepeat) &&
         Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
         // The parse flags must be consistent.
         ((r1->parse_flags() & Regexp::NonGreedy) ==
          (r2->parse_flags() & Regexp::NonGreedy))) {
       return true;
     }
     // ... OR an occurrence of that literal, char class, any char or any byte
     if (Regexp::Equal(r1->sub()[0], r2)) {
       return true;
     }
     // ... OR a literal string that begins with that literal.
     if (r1->sub()[0]->op() == kRegexpLiteral &&
         r2->op() == kRegexpLiteralString &&
         r2->runes()[0] == r1->sub()[0]->rune() &&
         // The parse flags must be consistent.
         ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
          (r2->parse_flags() & Regexp::FoldCase))) {
       return true;
     }
   }
   return false;
 }

 void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
   Regexp* r1 = *r1ptr;
   Regexp* r2 = *r2ptr;

   Regexp* nre = Regexp::Repeat(
       r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);

   switch (r1->op()) {
     case kRegexpStar:
       nre->min_ = 0;
       nre->max_ = -1;
       break;

     case kRegexpPlus:
       nre->min_ = 1;
       nre->max_ = -1;
       break;

     case kRegexpQuest:
       nre->min_ = 0;
       nre->max_ = 1;
       break;

     case kRegexpRepeat:
       nre->min_ = r1->min();
       nre->max_ = r1->max();
       break;

     default:
       LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
       nre->Decref();
       return;
   }

   switch (r2->op()) {
     case kRegexpStar:
       nre->max_ = -1;
       goto LeaveEmpty;

     case kRegexpPlus:
       nre->min_++;
       nre->max_ = -1;
       goto LeaveEmpty;

     case kRegexpQuest:
       if (nre->max() != -1)
         nre->max_++;
       goto LeaveEmpty;

     case kRegexpRepeat:
       nre->min_ += r2->min();
       if (r2->max() == -1)
         nre->max_ = -1;
       else if (nre->max() != -1)
         nre->max_ += r2->max();
       goto LeaveEmpty;

     case kRegexpLiteral:
     case kRegexpCharClass:
     case kRegexpAnyChar:
     case kRegexpAnyByte:
       nre->min_++;
       if (nre->max() != -1)
         nre->max_++;
       goto LeaveEmpty;

     LeaveEmpty:
       *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
       *r2ptr = nre;
       break;

     case kRegexpLiteralString: {
       Rune r = r1->sub()[0]->rune();
       // Determine how much of the literal string is removed.
       // We know that we have at least one rune. :)
       int n = 1;
       while (n < r2->nrunes() && r2->runes()[n] == r)
         n++;
       nre->min_ += n;
       if (nre->max() != -1)
         nre->max_ += n;
       if (n == r2->nrunes())
         goto LeaveEmpty;
       *r1ptr = nre;
       *r2ptr = Regexp::LiteralString(
           &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
       break;
     }

     default:
       LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
       nre->Decref();
       return;
   }

   r1->Decref();
   r2->Decref();
 }

 Regexp* SimplifyWalker::Copy(Regexp* re) {
   return re->Incref();
 }

 Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
   // This should never be called, since we use Walk and not
   // WalkExponential.
   LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
   return re->Incref();
 }

 Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
   if (re->simple()) {
     *stop = true;
     return re->Incref();
   }
   return NULL;
 }

 Regexp* SimplifyWalker::PostVisit(Regexp* re,
                                   Regexp* parent_arg,
                                   Regexp* pre_arg,
                                   Regexp** child_args,
                                   int nchild_args) {
   switch (re->op()) {
     case kRegexpNoMatch:
     case kRegexpEmptyMatch:
     case kRegexpLiteral:
     case kRegexpLiteralString:
     case kRegexpBeginLine:
     case kRegexpEndLine:
     case kRegexpBeginText:
     case kRegexpWordBoundary:
     case kRegexpNoWordBoundary:
     case kRegexpEndText:
     case kRegexpAnyChar:
     case kRegexpAnyByte:
     case kRegexpHaveMatch:
       // All these are always simple.
       re->simple_ = true;
       return re->Incref();

     case kRegexpConcat:
     case kRegexpAlternate: {
       // These are simple as long as the subpieces are simple.
       if (!ChildArgsChanged(re, child_args)) {
         re->simple_ = true;
         return re->Incref();
       }
       Regexp* nre = new Regexp(re->op(), re->parse_flags());
       nre->AllocSub(re->nsub());
       Regexp** nre_subs = nre->sub();
       for (int i = 0; i < re->nsub(); i++)
         nre_subs[i] = child_args[i];
       nre->simple_ = true;
       return nre;
     }

     case kRegexpCapture: {
       Regexp* newsub = child_args[0];
       if (newsub == re->sub()[0]) {
         newsub->Decref();
         re->simple_ = true;
         return re->Incref();
       }
       Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
       nre->AllocSub(1);
       nre->sub()[0] = newsub;
       nre->cap_ = re->cap();
       nre->simple_ = true;
       return nre;
     }

     case kRegexpStar:
     case kRegexpPlus:
     case kRegexpQuest: {
       Regexp* newsub = child_args[0];
       // Special case: repeat the empty string as much as
       // you want, but it's still the empty string.
       if (newsub->op() == kRegexpEmptyMatch)
         return newsub;

       // These are simple as long as the subpiece is simple.
       if (newsub == re->sub()[0]) {
         newsub->Decref();
         re->simple_ = true;
         return re->Incref();
       }

       // These are also idempotent if flags are constant.
       if (re->op() == newsub->op() &&
           re->parse_flags() == newsub->parse_flags())
         return newsub;

       Regexp* nre = new Regexp(re->op(), re->parse_flags());
       nre->AllocSub(1);
       nre->sub()[0] = newsub;
       nre->simple_ = true;
       return nre;
     }

     case kRegexpRepeat: {
       Regexp* newsub = child_args[0];
       // Special case: repeat the empty string as much as
       // you want, but it's still the empty string.
       if (newsub->op() == kRegexpEmptyMatch)
         return newsub;

       Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
                                    re->parse_flags());
       newsub->Decref();
       nre->simple_ = true;
       return nre;
     }

     case kRegexpCharClass: {
       Regexp* nre = SimplifyCharClass(re);
       nre->simple_ = true;
       return nre;
     }
   }

   LOG(ERROR) << "Simplify case not handled: " << re->op();
   return re->Incref();
 }

 // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
 // Returns a new Regexp, handing the ref to the caller.
 Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
                                 Regexp::ParseFlags parse_flags) {
   Regexp* re = new Regexp(kRegexpConcat, parse_flags);
   re->AllocSub(2);
   Regexp** subs = re->sub();
   subs[0] = re1;
   subs[1] = re2;
   return re;
 }

 // Simplifies the expression re{min,max} in terms of *, +, and ?.
 // Returns a new regexp.  Does not edit re.  Does not consume reference to re.
 // Caller must Decref return value when done with it.
 // The result will *not* necessarily have the right capturing parens
 // if you call ToString() and re-parse it: (x){2} becomes (x)(x),
 // but in the Regexp* representation, both (x) are marked as $1.
 Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
                                        Regexp::ParseFlags f) {
   // x{n,} means at least n matches of x.
   if (max == -1) {
     // Special case: x{0,} is x*
     if (min == 0)
       return Regexp::Star(re->Incref(), f);

     // Special case: x{1,} is x+
     if (min == 1)
       return Regexp::Plus(re->Incref(), f);

     // General case: x{4,} is xxxx+
     PODArray<Regexp*> nre_subs(min);
     for (int i = 0; i < min-1; i++)
       nre_subs[i] = re->Incref();
     nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
     return Regexp::Concat(nre_subs.data(), min, f);
   }

   // Special case: (x){0} matches only empty string.
   if (min == 0 && max == 0)
     return new Regexp(kRegexpEmptyMatch, f);

   // Special case: x{1} is just x.
   if (min == 1 && max == 1)
     return re->Incref();

   // General case: x{n,m} means n copies of x and m copies of x?.
   // The machine will do less work if we nest the final m copies,
   // so that x{2,5} = xx(x(x(x)?)?)?

   // Build leading prefix: xx.  Capturing only on the last one.
   Regexp* nre = NULL;
   if (min > 0) {
     PODArray<Regexp*> nre_subs(min);
     for (int i = 0; i < min; i++)
       nre_subs[i] = re->Incref();
     nre = Regexp::Concat(nre_subs.data(), min, f);
   }

   // Build and attach suffix: (x(x(x)?)?)?
   if (max > min) {
     Regexp* suf = Regexp::Quest(re->Incref(), f);
     for (int i = min+1; i < max; i++)
       suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
     if (nre == NULL)
       nre = suf;
     else
       nre = Concat2(nre, suf, f);
   }

   if (nre == NULL) {
     // Some degenerate case, like min > max, or min < max < 0.
     // This shouldn't happen, because the parser rejects such regexps.
     LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
     return new Regexp(kRegexpNoMatch, f);
   }

   return nre;
 }

 // Simplifies a character class.
 // Caller must Decref return value when done with it.
 Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
   CharClass* cc = re->cc();

   // Special cases
   if (cc->empty())
     return new Regexp(kRegexpNoMatch, re->parse_flags());
   if (cc->full())
     return new Regexp(kRegexpAnyChar, re->parse_flags());

   return re->Incref();
 }

 }  // namespace re2
	// Copyright 2006 The RE2 Authors. All Rights Reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	// Rewrite POSIX and other features in re
	// to use simple extended regular expression features.
	// Also sort and simplify character classes.

	#include <string>

	#include "util/util.h"
	#include "util/logging.h"
	#include "util/pod_array.h"
	#include "util/utf.h"
	#include "re2/regexp.h"
	#include "re2/walker-inl.h"

	namespace re2 {

	// Parses the regexp src and then simplifies it and sets *dst to the
	// string representation of the simplified form. Returns true on success.
	// Returns false and sets *error (if error != NULL) on error.
	bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
	std::string* dst, RegexpStatus* status) {
	Regexp* re = Parse(src, flags, status);
	if (re == NULL)
	return false;
	Regexp* sre = re->Simplify();
	re->Decref();
	if (sre == NULL) {
	// Should not happen, since Simplify never fails.
	LOG(ERROR) << "Simplify failed on " << src;
	if (status) {
	status->set_code(kRegexpInternalError);
	status->set_error_arg(src);
	}
	return false;
	}
	*dst = sre->ToString();
	sre->Decref();
	return true;
	}

	// Assuming the simple_ flags on the children are accurate,
	// is this Regexp* simple?
	bool Regexp::ComputeSimple() {
	Regexp** subs;
	switch (op_) {
	case kRegexpNoMatch:
	case kRegexpEmptyMatch:
	case kRegexpLiteral:
	case kRegexpLiteralString:
	case kRegexpBeginLine:
	case kRegexpEndLine:
	case kRegexpBeginText:
	case kRegexpWordBoundary:
	case kRegexpNoWordBoundary:
	case kRegexpEndText:
	case kRegexpAnyChar:
	case kRegexpAnyByte:
	case kRegexpHaveMatch:
	return true;
	case kRegexpConcat:
	case kRegexpAlternate:
	// These are simple as long as the subpieces are simple.
	subs = sub();
	for (int i = 0; i < nsub_; i++)
	if (!subs[i]->simple())
	return false;
	return true;
	case kRegexpCharClass:
	// Simple as long as the char class is not empty, not full.
	if (ccb_ != NULL)
	return !ccb_->empty() && !ccb_->full();
	return !cc_->empty() && !cc_->full();
	case kRegexpCapture:
	subs = sub();
	return subs[0]->simple();
	case kRegexpStar:
	case kRegexpPlus:
	case kRegexpQuest:
	subs = sub();
	if (!subs[0]->simple())
	return false;
	switch (subs[0]->op_) {
	case kRegexpStar:
	case kRegexpPlus:
	case kRegexpQuest:
	case kRegexpEmptyMatch:
	case kRegexpNoMatch:
	return false;
	default:
	break;
	}
	return true;
	case kRegexpRepeat:
	return false;
	}
	LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
	return false;
	}

	// Walker subclass used by Simplify.
	// Coalesces runs of star/plus/quest/repeat of the same literal along with any
	// occurrences of that literal into repeats of that literal. It also works for
	// char classes, any char and any byte.
	// PostVisit creates the coalesced result, which should then be simplified.
	class CoalesceWalker : public Regexp::Walker<Regexp*> {
	public:
	CoalesceWalker() {}
	virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
	Regexp** child_args, int nchild_args);
	virtual Regexp* Copy(Regexp* re);
	virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);

	private:
	// These functions are declared inside CoalesceWalker so that
	// they can edit the private fields of the Regexps they construct.

	// Returns true if r1 and r2 can be coalesced. In particular, ensures that
	// the parse flags are consistent. (They will not be checked again later.)
	static bool CanCoalesce(Regexp* r1, Regexp* r2);

	// Coalesces r1ptr and r2ptr. In most cases, the array elements afterwards
	// will be empty match and the coalesced op. In other cases, where part of a
	// literal string was removed to be coalesced, the array elements afterwards
	// will be the coalesced op and the remainder of the literal string.
	static void DoCoalesce(Regexp r1ptr, Regexp r2ptr);

	CoalesceWalker(const CoalesceWalker&) = delete;
	CoalesceWalker& operator=(const CoalesceWalker&) = delete;
	};

	// Walker subclass used by Simplify.
	// The simplify walk is purely post-recursive: given the simplified children,
	// PostVisit creates the simplified result.
	// The child_args are simplified Regexp*s.
	class SimplifyWalker : public Regexp::Walker<Regexp*> {
	public:
	SimplifyWalker() {}
	virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
	virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
	Regexp** child_args, int nchild_args);
	virtual Regexp* Copy(Regexp* re);
	virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);

	private:
	// These functions are declared inside SimplifyWalker so that
	// they can edit the private fields of the Regexps they construct.

	// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
	// Caller must Decref return value when done with it.
	static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);

	// Simplifies the expression re{min,max} in terms of *, +, and ?.
	// Returns a new regexp. Does not edit re. Does not consume reference to re.
	// Caller must Decref return value when done with it.
	static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
	Regexp::ParseFlags parse_flags);

	// Simplifies a character class by expanding any named classes
	// into rune ranges. Does not edit re. Does not consume ref to re.
	// Caller must Decref return value when done with it.
	static Regexp* SimplifyCharClass(Regexp* re);

	SimplifyWalker(const SimplifyWalker&) = delete;
	SimplifyWalker& operator=(const SimplifyWalker&) = delete;
	};

	// Simplifies a regular expression, returning a new regexp.
	// The new regexp uses traditional Unix egrep features only,
	// plus the Perl (?:) non-capturing parentheses.
	// Otherwise, no POSIX or Perl additions. The new regexp
	// captures exactly the same subexpressions (with the same indices)
	// as the original.
	// Does not edit current object.
	// Caller must Decref() return value when done with it.

	Regexp* Regexp::Simplify() {
	CoalesceWalker cw;
	Regexp* cre = cw.Walk(this, NULL);
	if (cre == NULL)
	return cre;
	SimplifyWalker sw;
	Regexp* sre = sw.Walk(cre, NULL);
	cre->Decref();
	return sre;
	}

	#define Simplify DontCallSimplify // Avoid accidental recursion

	// Utility function for PostVisit implementations that compares re->sub() with
	// child_args to determine whether any child_args changed. In the common case,
	// where nothing changed, calls Decref() for all child_args and returns false,
	// so PostVisit must return re->Incref(). Otherwise, returns true.
	static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
	for (int i = 0; i < re->nsub(); i++) {
	Regexp* sub = re->sub()[i];
	Regexp* newsub = child_args[i];
	if (newsub != sub)
	return true;
	}
	for (int i = 0; i < re->nsub(); i++) {
	Regexp* newsub = child_args[i];
	newsub->Decref();
	}
	return false;
	}

	Regexp* CoalesceWalker::Copy(Regexp* re) {
	return re->Incref();
	}

	Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
	// This should never be called, since we use Walk and not
	// WalkExponential.
	LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
	return re->Incref();
	}

	Regexp* CoalesceWalker::PostVisit(Regexp* re,
	Regexp* parent_arg,
	Regexp* pre_arg,
	Regexp** child_args,
	int nchild_args) {
	if (re->nsub() == 0)
	return re->Incref();

	if (re->op() != kRegexpConcat) {
	if (!ChildArgsChanged(re, child_args))
	return re->Incref();

	// Something changed. Build a new op.
	Regexp* nre = new Regexp(re->op(), re->parse_flags());
	nre->AllocSub(re->nsub());
	Regexp** nre_subs = nre->sub();
	for (int i = 0; i < re->nsub(); i++)
	nre_subs[i] = child_args[i];
	// Repeats and Captures have additional data that must be copied.
	if (re->op() == kRegexpRepeat) {
	nre->min_ = re->min();
	nre->max_ = re->max();
	} else if (re->op() == kRegexpCapture) {
	nre->cap_ = re->cap();
	}
	return nre;
	}

	bool can_coalesce = false;
	for (int i = 0; i < re->nsub(); i++) {
	if (i+1 < re->nsub() &&
	CanCoalesce(child_args[i], child_args[i+1])) {
	can_coalesce = true;
	break;
	}
	}
	if (!can_coalesce) {
	if (!ChildArgsChanged(re, child_args))
	return re->Incref();

	// Something changed. Build a new op.
	Regexp* nre = new Regexp(re->op(), re->parse_flags());
	nre->AllocSub(re->nsub());
	Regexp** nre_subs = nre->sub();
	for (int i = 0; i < re->nsub(); i++)
	nre_subs[i] = child_args[i];
	return nre;
	}

	for (int i = 0; i < re->nsub(); i++) {
	if (i+1 < re->nsub() &&
	CanCoalesce(child_args[i], child_args[i+1]))
	DoCoalesce(&child_args[i], &child_args[i+1]);
	}
	// Determine how many empty matches were left by DoCoalesce.
	int n = 0;
	for (int i = n; i < re->nsub(); i++) {
	if (child_args[i]->op() == kRegexpEmptyMatch)
	n++;
	}
	// Build a new op.
	Regexp* nre = new Regexp(re->op(), re->parse_flags());
	nre->AllocSub(re->nsub() - n);
	Regexp** nre_subs = nre->sub();
	for (int i = 0, j = 0; i < re->nsub(); i++) {
	if (child_args[i]->op() == kRegexpEmptyMatch) {
	child_args[i]->Decref();
	continue;
	}
	nre_subs[j] = child_args[i];
	j++;
	}
	return nre;
	}

	bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
	// r1 must be a star/plus/quest/repeat of a literal, char class, any char or
	// any byte.
	if ((r1->op() == kRegexpStar \|\|
	r1->op() == kRegexpPlus \|\|
	r1->op() == kRegexpQuest \|\|
	r1->op() == kRegexpRepeat) &&
	(r1->sub()[0]->op() == kRegexpLiteral \|\|
	r1->sub()[0]->op() == kRegexpCharClass \|\|
	r1->sub()[0]->op() == kRegexpAnyChar \|\|
	r1->sub()[0]->op() == kRegexpAnyByte)) {
	// r2 must be a star/plus/quest/repeat of the same literal, char class,
	// any char or any byte.
	if ((r2->op() == kRegexpStar \|\|
	r2->op() == kRegexpPlus \|\|
	r2->op() == kRegexpQuest \|\|
	r2->op() == kRegexpRepeat) &&
	Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
	// The parse flags must be consistent.
	((r1->parse_flags() & Regexp::NonGreedy) ==
	(r2->parse_flags() & Regexp::NonGreedy))) {
	return true;
	}
	// ... OR an occurrence of that literal, char class, any char or any byte
	if (Regexp::Equal(r1->sub()[0], r2)) {
	return true;
	}
	// ... OR a literal string that begins with that literal.
	if (r1->sub()[0]->op() == kRegexpLiteral &&
	r2->op() == kRegexpLiteralString &&
	r2->runes()[0] == r1->sub()[0]->rune() &&
	// The parse flags must be consistent.
	((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
	(r2->parse_flags() & Regexp::FoldCase))) {
	return true;
	}
	}
	return false;
	}

	void CoalesceWalker::DoCoalesce(Regexp r1ptr, Regexp r2ptr) {
	Regexp* r1 = *r1ptr;
	Regexp* r2 = *r2ptr;

	Regexp* nre = Regexp::Repeat(
	r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);

	switch (r1->op()) {
	case kRegexpStar:
	nre->min_ = 0;
	nre->max_ = -1;
	break;

	case kRegexpPlus:
	nre->min_ = 1;
	nre->max_ = -1;
	break;

	case kRegexpQuest:
	nre->min_ = 0;
	nre->max_ = 1;
	break;

	case kRegexpRepeat:
	nre->min_ = r1->min();
	nre->max_ = r1->max();
	break;

	default:
	LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
	nre->Decref();
	return;
	}

	switch (r2->op()) {
	case kRegexpStar:
	nre->max_ = -1;
	goto LeaveEmpty;

	case kRegexpPlus:
	nre->min_++;
	nre->max_ = -1;
	goto LeaveEmpty;

	case kRegexpQuest:
	if (nre->max() != -1)
	nre->max_++;
	goto LeaveEmpty;

	case kRegexpRepeat:
	nre->min_ += r2->min();
	if (r2->max() == -1)
	nre->max_ = -1;
	else if (nre->max() != -1)
	nre->max_ += r2->max();
	goto LeaveEmpty;

	case kRegexpLiteral:
	case kRegexpCharClass:
	case kRegexpAnyChar:
	case kRegexpAnyByte:
	nre->min_++;
	if (nre->max() != -1)
	nre->max_++;
	goto LeaveEmpty;

	LeaveEmpty:
	*r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
	*r2ptr = nre;
	break;

	case kRegexpLiteralString: {
	Rune r = r1->sub()[0]->rune();
	// Determine how much of the literal string is removed.
	// We know that we have at least one rune. :)
	int n = 1;
	while (n < r2->nrunes() && r2->runes()[n] == r)
	n++;
	nre->min_ += n;
	if (nre->max() != -1)
	nre->max_ += n;
	if (n == r2->nrunes())
	goto LeaveEmpty;
	*r1ptr = nre;
	*r2ptr = Regexp::LiteralString(
	&r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
	break;
	}

	default:
	LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
	nre->Decref();
	return;
	}

	r1->Decref();
	r2->Decref();
	}

	Regexp* SimplifyWalker::Copy(Regexp* re) {
	return re->Incref();
	}

	Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
	// This should never be called, since we use Walk and not
	// WalkExponential.
	LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
	return re->Incref();
	}

	Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
	if (re->simple()) {
	*stop = true;
	return re->Incref();
	}
	return NULL;
	}

	Regexp* SimplifyWalker::PostVisit(Regexp* re,
	Regexp* parent_arg,
	Regexp* pre_arg,
	Regexp** child_args,
	int nchild_args) {
	switch (re->op()) {
	case kRegexpNoMatch:
	case kRegexpEmptyMatch:
	case kRegexpLiteral:
	case kRegexpLiteralString:
	case kRegexpBeginLine:
	case kRegexpEndLine:
	case kRegexpBeginText:
	case kRegexpWordBoundary:
	case kRegexpNoWordBoundary:
	case kRegexpEndText:
	case kRegexpAnyChar:
	case kRegexpAnyByte:
	case kRegexpHaveMatch:
	// All these are always simple.
	re->simple_ = true;
	return re->Incref();

	case kRegexpConcat:
	case kRegexpAlternate: {
	// These are simple as long as the subpieces are simple.
	if (!ChildArgsChanged(re, child_args)) {
	re->simple_ = true;
	return re->Incref();
	}
	Regexp* nre = new Regexp(re->op(), re->parse_flags());
	nre->AllocSub(re->nsub());
	Regexp** nre_subs = nre->sub();
	for (int i = 0; i < re->nsub(); i++)
	nre_subs[i] = child_args[i];
	nre->simple_ = true;
	return nre;
	}

	case kRegexpCapture: {
	Regexp* newsub = child_args[0];
	if (newsub == re->sub()[0]) {
	newsub->Decref();
	re->simple_ = true;
	return re->Incref();
	}
	Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
	nre->AllocSub(1);
	nre->sub()[0] = newsub;
	nre->cap_ = re->cap();
	nre->simple_ = true;
	return nre;
	}

	case kRegexpStar:
	case kRegexpPlus:
	case kRegexpQuest: {
	Regexp* newsub = child_args[0];
	// Special case: repeat the empty string as much as
	// you want, but it's still the empty string.
	if (newsub->op() == kRegexpEmptyMatch)
	return newsub;

	// These are simple as long as the subpiece is simple.
	if (newsub == re->sub()[0]) {
	newsub->Decref();
	re->simple_ = true;
	return re->Incref();
	}

	// These are also idempotent if flags are constant.
	if (re->op() == newsub->op() &&
	re->parse_flags() == newsub->parse_flags())
	return newsub;

	Regexp* nre = new Regexp(re->op(), re->parse_flags());
	nre->AllocSub(1);
	nre->sub()[0] = newsub;
	nre->simple_ = true;
	return nre;
	}

	case kRegexpRepeat: {
	Regexp* newsub = child_args[0];
	// Special case: repeat the empty string as much as
	// you want, but it's still the empty string.
	if (newsub->op() == kRegexpEmptyMatch)
	return newsub;

	Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
	re->parse_flags());
	newsub->Decref();
	nre->simple_ = true;
	return nre;
	}

	case kRegexpCharClass: {
	Regexp* nre = SimplifyCharClass(re);
	nre->simple_ = true;
	return nre;
	}
	}

	LOG(ERROR) << "Simplify case not handled: " << re->op();
	return re->Incref();
	}

	// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
	// Returns a new Regexp, handing the ref to the caller.
	Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
	Regexp::ParseFlags parse_flags) {
	Regexp* re = new Regexp(kRegexpConcat, parse_flags);
	re->AllocSub(2);
	Regexp** subs = re->sub();
	subs[0] = re1;
	subs[1] = re2;
	return re;
	}

	// Simplifies the expression re{min,max} in terms of *, +, and ?.
	// Returns a new regexp. Does not edit re. Does not consume reference to re.
	// Caller must Decref return value when done with it.
	// The result will not necessarily have the right capturing parens
	// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
	// but in the Regexp* representation, both (x) are marked as $1.
	Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
	Regexp::ParseFlags f) {
	// x{n,} means at least n matches of x.
	if (max == -1) {
	// Special case: x{0,} is x*
	if (min == 0)
	return Regexp::Star(re->Incref(), f);

	// Special case: x{1,} is x+
	if (min == 1)
	return Regexp::Plus(re->Incref(), f);

	// General case: x{4,} is xxxx+
	PODArray<Regexp*> nre_subs(min);
	for (int i = 0; i < min-1; i++)
	nre_subs[i] = re->Incref();
	nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
	return Regexp::Concat(nre_subs.data(), min, f);
	}

	// Special case: (x){0} matches only empty string.
	if (min == 0 && max == 0)
	return new Regexp(kRegexpEmptyMatch, f);

	// Special case: x{1} is just x.
	if (min == 1 && max == 1)
	return re->Incref();

	// General case: x{n,m} means n copies of x and m copies of x?.
	// The machine will do less work if we nest the final m copies,
	// so that x{2,5} = xx(x(x(x)?)?)?

	// Build leading prefix: xx. Capturing only on the last one.
	Regexp* nre = NULL;
	if (min > 0) {
	PODArray<Regexp*> nre_subs(min);
	for (int i = 0; i < min; i++)
	nre_subs[i] = re->Incref();
	nre = Regexp::Concat(nre_subs.data(), min, f);
	}

	// Build and attach suffix: (x(x(x)?)?)?
	if (max > min) {
	Regexp* suf = Regexp::Quest(re->Incref(), f);
	for (int i = min+1; i < max; i++)
	suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
	if (nre == NULL)
	nre = suf;
	else
	nre = Concat2(nre, suf, f);
	}

	if (nre == NULL) {
	// Some degenerate case, like min > max, or min < max < 0.
	// This shouldn't happen, because the parser rejects such regexps.
	LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
	return new Regexp(kRegexpNoMatch, f);
	}

	return nre;
	}

	// Simplifies a character class.
	// Caller must Decref return value when done with it.
	Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
	CharClass* cc = re->cc();

	// Special cases
	if (cc->empty())
	return new Regexp(kRegexpNoMatch, re->parse_flags());
	if (cc->full())
	return new Regexp(kRegexpAnyChar, re->parse_flags());

	return re->Incref();
	}

	} // namespace re2