// re2/regexp.cc - re2 - Git at Google

 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 // Regular expression representation.
 // Tested by parse_test.cc

 #include "re2/regexp.h"

 #include <stddef.h>
 #include <stdint.h>
 #include <string.h>
 #include <algorithm>
 #include <map>
 #include <mutex>
 #include <string>
 #include <vector>

 #include "util/util.h"
 #include "util/logging.h"
 #include "util/mutex.h"
 #include "util/utf.h"
 #include "re2/stringpiece.h"
 #include "re2/walker-inl.h"

 namespace re2 {

 // Constructor.  Records the operator and parse flags (narrowed to the
 // widths of op_ and parse_flags_), starts the reference count at 1 with
 // no subexpressions, and zero-fills the_union_.
 // Nothing is allocated here; the factory functions in this file call
 // AllocSub() afterwards when the node needs subexpressions.
 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
   : op_(static_cast<uint8_t>(op)),
     simple_(false),
     parse_flags_(static_cast<uint16_t>(parse_flags)),
     ref_(1),
     nsub_(0),
     down_(NULL) {
   subone_ = NULL;
   // Clear the operator-specific storage in one go.
   memset(the_union_, 0, sizeof the_union_);
 }

 // Destructor.  Expects the children to have been released already.
 // Private: destroy Regexps with Decref(), never with delete.
 // Decref() cannot be called on the sub-Regexps from here because
 // that could recurse arbitrarily deep, so Decref() is required
 // to have dealt with them before we get here.
 Regexp::~Regexp() {
   if (nsub_ > 0)
     LOG(DFATAL) << "Regexp not destroyed.";

   // Release whatever operator-specific storage this node owns;
   // all other operators own nothing extra.
   if (op_ == kRegexpCapture) {
     delete name_;
   } else if (op_ == kRegexpLiteralString) {
     delete[] runes_;
   } else if (op_ == kRegexpCharClass) {
     if (cc_)
       cc_->Delete();
     delete ccb_;
   }
 }

 // Destroys this regexp right away when that needs no recursion,
 // i.e. when it has no subexpressions.  Returns true if the object
 // was deleted, false if it was left untouched.
 bool Regexp::QuickDestroy() {
   if (nsub_ != 0)
     return false;

   delete this;
   return true;
 }

 // Overflow storage for reference counts that no longer fit in ref_.
 // Lazily allocated: both are created by the first Incref() call that
 // needs them.
 static Mutex* ref_mutex;  // locked around every access to ref_map
 static std::map<Regexp*, int>* ref_map;  // regexp -> its reference count

 // Returns the current reference count, consulting the overflow map
 // when the count is too large to be held in ref_.
 int Regexp::Ref() {
   if (ref_ >= kMaxRef) {
     MutexLock l(ref_mutex);
     return (*ref_map)[this];
   }
   return ref_;
 }

 // Adds one reference and returns this object as a convenience.
 Regexp* Regexp::Incref() {
   // Common case: the count still fits in ref_.
   if (ref_ < kMaxRef-1) {
     ref_++;
     return this;
   }

   // The count is in, or is about to enter, the overflow map.
   // Create the map and its mutex the first time they are needed.
   static std::once_flag ref_once;
   std::call_once(ref_once, []() {
     ref_mutex = new Mutex;
     ref_map = new std::map<Regexp*, int>;
   });

   MutexLock l(ref_mutex);
   if (ref_ != kMaxRef) {
     // Overflowing now: ref_ becomes the marker value and the
     // count is kept in the map from here on.
     (*ref_map)[this] = kMaxRef;
     ref_ = kMaxRef;
   } else {
     // Already overflowed: bump the count kept in the map.
     (*ref_map)[this]++;
   }
   return this;
 }

 // Drops one reference; the object is destroyed once the count hits 0.
 void Regexp::Decref() {
   if (ref_ != kMaxRef) {
     // The count lives in ref_ itself.
     if (--ref_ == 0)
       Destroy();
     return;
   }

   // The count lives in the overflow map.
   MutexLock l(ref_mutex);
   const int remaining = (*ref_map)[this] - 1;
   if (remaining >= kMaxRef) {
     (*ref_map)[this] = remaining;
   } else {
     // Fits in ref_ again: move it back and drop the map entry.
     ref_ = static_cast<uint16_t>(remaining);
     ref_map->erase(this);
   }
 }

 // Deletes this object; the reference count has reached 0.
 // Each subexpression loses one reference, and any subexpression whose
 // count drops to 0 is destroyed as well.  The work is done with an
 // explicit list rather than recursion.
 void Regexp::Destroy() {
   if (QuickDestroy())
     return;

   // Handle recursive Destroy with explicit stack
   // to avoid arbitrarily deep recursion on process stack [sigh].
   // The stack is a linked list threaded through the down_ fields.
   down_ = NULL;
   Regexp* stack = this;
   while (stack != NULL) {
     // Pop the next regexp to tear down.
     Regexp* re = stack;
     stack = re->down_;
     if (re->ref_ != 0)
       LOG(DFATAL) << "Bad reference count " << re->ref_;
     if (re->nsub_ > 0) {
       Regexp** subs = re->sub();
       for (int i = 0; i < re->nsub_; i++) {
         Regexp* sub = subs[i];
         if (sub == NULL)
           continue;
         // Drop re's reference to sub.  An overflowed count is kept in
         // the overflow map, so that case goes through Decref().
         if (sub->ref_ == kMaxRef)
           sub->Decref();
         else
           --sub->ref_;
         // Unreferenced now: delete it on the spot if it has no
         // subexpressions, otherwise push it for a later iteration.
         if (sub->ref_ == 0 && !sub->QuickDestroy()) {
           sub->down_ = stack;
           stack = sub;
         }
       }
       // Only an array of two or more subexpressions is freed here.
       if (re->nsub_ > 1)
         delete[] subs;
       // Zero nsub_ so the destructor's "not destroyed" check passes.
       re->nsub_ = 0;
     }
     delete re;
   }
 }

 // Appends r to this literal string, growing the rune buffer when full.
 // The buffer starts with room for 8 runes and doubles each time the
 // length reaches a power of two (8, 16, 32, ...).
 void Regexp::AddRuneToString(Rune r) {
   DCHECK(op_ == kRegexpLiteralString);
   if (nrunes_ == 0) {
     // First rune: allocate the initial buffer.
     runes_ = new Rune[8];
   } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
     // Buffer is exactly full: move the contents into one twice the size.
     Rune* grown = new Rune[nrunes_ * 2];
     std::copy(runes_, runes_ + nrunes_, grown);
     delete[] runes_;
     runes_ = grown;
   }

   runes_[nrunes_] = r;
   nrunes_++;
 }

 // Returns a newly allocated kRegexpHaveMatch node that carries match_id.
 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
   Regexp* re = new Regexp(kRegexpHaveMatch, flags);
   re->match_id_ = match_id;
   return re;
 }

 // Builds op (Star, Plus or Quest) applied to sub, collapsing stacked
 // repetition operators that carry the same flags.  sub is stored or
 // returned without an extra Incref, so the caller's reference to it is
 // handed over to the result.
 Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
   if (flags == sub->parse_flags()) {
     // ** ++ and ?? collapse to the operator that is already there.
     if (sub->op() == op)
       return sub;

     // *+ *? +* +? ?* and ?+ all collapse to *.  op is known to be
     // Star/Plus/Quest, so only sub->op() needs checking.
     if (sub->op() == kRegexpStar)
       return sub;  // Already a Star; reuse it untouched.

     if (sub->op() == kRegexpPlus || sub->op() == kRegexpQuest) {
       // Replace sub with a Star over sub's own operand.
       Regexp* star = new Regexp(kRegexpStar, flags);
       star->AllocSub(1);
       star->sub()[0] = sub->sub()[0]->Incref();
       sub->Decref();  // sub itself is not kept, so release it.
       return star;
     }
   }

   // No squashing applies: wrap sub in a fresh node.
   Regexp* wrapped = new Regexp(op, flags);
   wrapped->AllocSub(1);
   wrapped->sub()[0] = sub;
   return wrapped;
 }

 // Returns sub+ built via StarPlusOrQuest(), which may squash it with an
 // operator already on sub.
 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
   return StarPlusOrQuest(kRegexpPlus, sub, flags);
 }

 // Returns sub* built via StarPlusOrQuest(), which may squash it with an
 // operator already on sub.
 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
   return StarPlusOrQuest(kRegexpStar, sub, flags);
 }

 // Returns sub? built via StarPlusOrQuest(), which may squash it with an
 // operator already on sub.
 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
   return StarPlusOrQuest(kRegexpQuest, sub, flags);
 }

 // Shared implementation of Concat, Alternate and AlternateNoFactor.
 // Builds a node of type op over the nsub expressions in sub.
 //   - nsub == 1 returns sub[0] itself.
 //   - nsub == 0 returns a new NoMatch (for alternation) or EmptyMatch
 //     (for concatenation).
 //   - can_factor lets an alternation be run through FactorAlternation()
 //     first; the caller's array is not modified.
 // The elements of sub are stored without Incref.
 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
                                   ParseFlags flags, bool can_factor) {
   if (nsub == 1)
     return sub[0];

   if (nsub == 0) {
     if (op == kRegexpAlternate)
       return new Regexp(kRegexpNoMatch, flags);
     else
       return new Regexp(kRegexpEmptyMatch, flags);
   }

   // Non-NULL only when a private copy of sub was made below; it is
   // freed on every return path after that point.
   Regexp** subcopy = NULL;
   if (op == kRegexpAlternate && can_factor) {
     // Going to edit sub; make a copy so we don't step on caller.
     subcopy = new Regexp*[nsub];
     memmove(subcopy, sub, nsub * sizeof sub[0]);
     sub = subcopy;
     nsub = FactorAlternation(sub, nsub, flags);
     if (nsub == 1) {
       // Factoring left a single expression; return it directly.
       Regexp* re = sub[0];
       delete[] subcopy;
       return re;
     }
   }

   if (nsub > kMaxNsub) {
     // Too many subexpressions to fit in a single Regexp.
     // Make a two-level tree.  Two levels gets us to 65535^2.
     int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;  // ceil(nsub / kMaxNsub)
     Regexp* re = new Regexp(op, flags);
     re->AllocSub(nbigsub);
     Regexp** subs = re->sub();
     // Every group but the last holds exactly kMaxNsub expressions.
     // The recursive calls pass can_factor == false.
     for (int i = 0; i < nbigsub - 1; i++)
       subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
     // The last group takes whatever remains.
     subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
                                           nsub - (nbigsub-1)*kMaxNsub, flags,
                                           false);
     delete[] subcopy;
     return re;
   }

   // Common case: everything fits in one node.
   Regexp* re = new Regexp(op, flags);
   re->AllocSub(nsub);
   Regexp** subs = re->sub();
   for (int i = 0; i < nsub; i++)
     subs[i] = sub[i];

   delete[] subcopy;
   return re;
 }

 // Returns the concatenation of the nsub expressions in sub.
 // Delegates to ConcatOrAlternate() with factoring disabled.
 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
   return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
 }

 // Returns the alternation of the nsub expressions in sub.
 // Delegates to ConcatOrAlternate() with factoring enabled.
 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
   return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
 }

 // Like Alternate(), but passes can_factor == false to
 // ConcatOrAlternate(), so the alternatives are not factored.
 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
   return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
 }

 // Returns a new capture node with index cap wrapping sub.
 // sub is stored without Incref.
 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
   Regexp* re = new Regexp(kRegexpCapture, flags);
   re->AllocSub(1);
   re->sub()[0] = sub;
   re->cap_ = cap;
   return re;
 }

 // Returns a new repeat node over sub with the given min and max.
 // min and max are stored as given, with no validation here.
 // sub is stored without Incref.
 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
   Regexp* re = new Regexp(kRegexpRepeat, flags);
   re->AllocSub(1);
   re->sub()[0] = sub;
   re->min_ = min;
   re->max_ = max;
   return re;
 }

 // Returns a new node holding the single literal rune.
 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
   Regexp* re = new Regexp(kRegexpLiteral, flags);
   re->rune_ = rune;
   return re;
 }

 // Returns a regexp for the nrunes runes starting at runes:
 // an empty match for nrunes <= 0, a single literal for nrunes == 1,
 // and a literal-string node holding a copy of the runes otherwise.
 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
   if (nrunes <= 0)
     return new Regexp(kRegexpEmptyMatch, flags);

   if (nrunes == 1)
     return NewLiteral(runes[0], flags);

   Regexp* str = new Regexp(kRegexpLiteralString, flags);
   for (Rune* p = runes; p != runes + nrunes; ++p)
     str->AddRuneToString(*p);
   return str;
 }

 // Returns a new character-class node that points at cc.
 // The pointer is stored as is; the node's destructor calls
 // cc->Delete(), so the caller must not free cc separately.
 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
   Regexp* re = new Regexp(kRegexpCharClass, flags);
   re->cc_ = cc;
   return re;
 }

 // Exchanges the complete contents of *this and *that byte for byte,
 // going through a temporary buffer the size of a Regexp.
 void Regexp::Swap(Regexp* that) {
   // Regexp is not trivially copyable, so we cannot freely copy it with
   // memmove(3), but swapping objects like so is safe for our purposes.
   char tmp[sizeof *this];
   // Going through void* copies raw storage without treating the
   // source or destination as a Regexp.
   void* vthis = reinterpret_cast<void*>(this);
   void* vthat = reinterpret_cast<void*>(that);
   memmove(tmp, vthis, sizeof *this);
   memmove(vthis, vthat, sizeof *this);
   memmove(vthat, tmp, sizeof *this);
 }

 // Tests equality of all top-level structure but not subregexps.
 // Returns true when a and b have the same operator and agree on the
 // operator-specific data that matters for that operator (rune(s),
 // relevant parse flags, repeat bounds, capture index and name, match id,
 // character class ranges, or number of subexpressions).
 // Neither a nor b may be NULL.
 static bool TopEqual(Regexp* a, Regexp* b) {
   if (a->op() != b->op())
     return false;

   switch (a->op()) {
     case kRegexpNoMatch:
     case kRegexpEmptyMatch:
     case kRegexpAnyChar:
     case kRegexpAnyByte:
     case kRegexpBeginLine:
     case kRegexpEndLine:
     case kRegexpWordBoundary:
     case kRegexpNoWordBoundary:
     case kRegexpBeginText:
       // Nothing beyond the operator to compare.
       return true;

     case kRegexpEndText:
       // The parse flags remember whether it's \z or (?-m:$),
       // which matters when testing against PCRE.
       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;

     case kRegexpLiteral:
       return a->rune() == b->rune() &&
              ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;

     case kRegexpLiteralString:
       return a->nrunes() == b->nrunes() &&
              ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
              memcmp(a->runes(), b->runes(),
                     a->nrunes() * sizeof a->runes()[0]) == 0;

     case kRegexpAlternate:
     case kRegexpConcat:
       // The subexpressions themselves are compared by the caller.
       return a->nsub() == b->nsub();

     case kRegexpStar:
     case kRegexpPlus:
     case kRegexpQuest:
       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;

     case kRegexpRepeat:
       return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
              a->min() == b->min() &&
              a->max() == b->max();

     case kRegexpCapture:
       if (a->cap() != b->cap())
         return false;
       // name() is a pointer to the group's name, NULL when the group is
       // unnamed.  Two nodes normally own separate string objects, so the
       // names have to be compared by value, not by address.
       if (a->name() == NULL || b->name() == NULL)
         return a->name() == b->name();  // equal only if both are unnamed
       return *a->name() == *b->name();

     case kRegexpHaveMatch:
       return a->match_id() == b->match_id();

     case kRegexpCharClass: {
       // Same number of runes, same number of ranges, identical ranges.
       CharClass* acc = a->cc();
       CharClass* bcc = b->cc();
       return acc->size() == bcc->size() &&
              acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
              memcmp(acc->begin(), bcc->begin(),
                     (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
     }
   }

   LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
   return false;
 }

 // Returns true if a and b have the same structure: their top levels
 // match according to TopEqual() and so do all corresponding
 // subexpressions.  A NULL argument equals only another NULL.
 // The comparison is iterative, using an explicit stack of pending pairs.
 bool Regexp::Equal(Regexp* a, Regexp* b) {
   if (a == NULL || b == NULL)
     return a == b;

   if (!TopEqual(a, b))
     return false;

   // Fast path:
   // return without allocating vector if there are no subregexps.
   switch (a->op()) {
     case kRegexpAlternate:
     case kRegexpConcat:
     case kRegexpStar:
     case kRegexpPlus:
     case kRegexpQuest:
     case kRegexpRepeat:
     case kRegexpCapture:
       break;

     default:
       return true;
   }

   // Committed to doing real work.
   // The stack (vector) has pairs of regexps waiting to
   // be compared.  The regexps are only equal if
   // all the pairs end up being equal.
   // Each pair is stored as two consecutive elements: a's side, then b's.
   std::vector<Regexp*> stk;

   for (;;) {
     // Invariant: TopEqual(a, b) == true.
     Regexp* a2;
     Regexp* b2;
     switch (a->op()) {
       default:
         break;
       case kRegexpAlternate:
       case kRegexpConcat:
         // TopEqual already established a->nsub() == b->nsub().
         for (int i = 0; i < a->nsub(); i++) {
           a2 = a->sub()[i];
           b2 = b->sub()[i];
           if (!TopEqual(a2, b2))
             return false;
           stk.push_back(a2);
           stk.push_back(b2);
         }
         break;

       case kRegexpStar:
       case kRegexpPlus:
       case kRegexpQuest:
       case kRegexpRepeat:
       case kRegexpCapture:
         a2 = a->sub()[0];
         b2 = b->sub()[0];
         if (!TopEqual(a2, b2))
           return false;
         // Really:
         //   stk.push_back(a2);
         //   stk.push_back(b2);
         //   break;
         // but faster to assign directly and loop.
         a = a2;
         b = b2;
         continue;
     }

     // Done with this pair; take the next one off the stack, if any.
     size_t n = stk.size();
     if (n == 0)
       break;

     DCHECK_GE(n, 2);
     a = stk[n-2];
     b = stk[n-1];
     stk.resize(n-2);
   }

   return true;
 }

 // Message text for each status code, indexed by the code's value.
 // Keep in sync with enum RegexpStatusCode in regexp.h
 // Both the characters and the pointers are const, so the table
 // cannot be modified at run time.
 static const char* const kErrorStrings[] = {
   "no error",
   "unexpected error",
   "invalid escape sequence",
   "invalid character class",
   "invalid character class range",
   "missing ]",
   "missing )",
   "trailing \\",
   "no argument for repetition operator",
   "invalid repetition size",
   "bad repetition operator",
   "invalid perl operator",
   "invalid UTF-8",
   "invalid named capture group",
 };

 // Returns the fixed message for code.  A code that falls outside
 // kErrorStrings is reported with the kRegexpInternalError message.
 std::string RegexpStatus::CodeText(enum RegexpStatusCode code) {
   if (code >= 0 && code < arraysize(kErrorStrings))
     return kErrorStrings[code];
   return kErrorStrings[kRegexpInternalError];
 }

 // Returns the message for this status: the text for code_, followed by
 // ": " and the error argument when there is one.
 std::string RegexpStatus::Text() const {
   std::string text = CodeText(code_);
   if (!error_arg_.empty()) {
     text.append(": ");
     text.append(error_arg_.data(), error_arg_.size());
   }
   return text;
 }

 // Makes this status a copy of status: same code, same error argument.
 void RegexpStatus::Copy(const RegexpStatus& status) {
   code_ = status.code_;
   error_arg_ = status.error_arg_;
 }

 // Placeholder argument/result type for the walkers below, which
 // collect their results in member variables instead.
 typedef int Ignored;  // Walker<void> doesn't exist

 // Walker that counts the capturing groups in a regexp.
 class NumCapturesWalker : public Regexp::Walker<Ignored> {
  public:
   NumCapturesWalker() : ncapture_(0) {}

   // Number of capture nodes seen so far.
   int ncapture() { return ncapture_; }

   // Counts re if it is a capture; the walk value is passed through.
   virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
     const bool is_capture = (re->op() == kRegexpCapture);
     if (is_capture)
       ++ncapture_;
     return ignored;
   }

   // Not expected to run, because Walk is used rather than
   // WalkExponential; complains if it does.
   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
     LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
     return ignored;
   }

  private:
   // Not copyable.
   NumCapturesWalker(const NumCapturesWalker&) = delete;
   NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;

   int ncapture_;  // running count of capture nodes
 };

 // Returns the number of capture nodes counted by walking this regexp
 // with a NumCapturesWalker.
 int Regexp::NumCaptures() {
   NumCapturesWalker w;
   w.Walk(this, 0);
   return w.ncapture();
 }

 // Walker that collects the named capture groups of a regexp into a
 // map from group name to capture index.
 class NamedCapturesWalker : public Regexp::Walker<Ignored> {
  public:
   NamedCapturesWalker() : map_(NULL) {}
   ~NamedCapturesWalker() { delete map_; }

   // Hands the map over to the caller and forgets about it.
   // Returns NULL if no named group was seen.
   std::map<std::string, int>* TakeMap() {
     std::map<std::string, int>* taken = map_;
     map_ = NULL;
     return taken;
   }

   Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
     // Only named capture groups are of interest.
     if (re->op() != kRegexpCapture || re->name() == NULL)
       return ignored;

     // The map is created on demand, when the first name turns up.
     if (map_ == NULL)
       map_ = new std::map<std::string, int>;

     // insert() leaves an existing entry untouched, so only the first
     // occurrence of a name is recorded.  (The rule is that if the same
     // name appears several times, only the leftmost one counts.)
     map_->insert(
         std::map<std::string, int>::value_type(*re->name(), re->cap()));
     return ignored;
   }

   // Not expected to run, because Walk is used rather than
   // WalkExponential; complains if it does.
   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
     LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
     return ignored;
   }

  private:
   // Not copyable.
   NamedCapturesWalker(const NamedCapturesWalker&) = delete;
   NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;

   std::map<std::string, int>* map_;  // NULL until a name is found
 };

 // Returns a newly allocated map from capture group name to index,
 // or NULL if this regexp has no named groups.  The walker gives up
 // the map in TakeMap(), so it is the caller's to delete.
 std::map<std::string, int>* Regexp::NamedCaptures() {
   NamedCapturesWalker w;
   w.Walk(this, 0);
   return w.TakeMap();
 }

 // Walker that collects the named capture groups of a regexp into a
 // map from capture index to group name.
 class CaptureNamesWalker : public Regexp::Walker<Ignored> {
  public:
   CaptureNamesWalker() : map_(NULL) {}
   ~CaptureNamesWalker() { delete map_; }

   // Hands the map over to the caller and forgets about it.
   // Returns NULL if no named group was seen.
   std::map<int, std::string>* TakeMap() {
     std::map<int, std::string>* taken = map_;
     map_ = NULL;
     return taken;
   }

   Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
     // Only named capture groups are of interest.
     if (re->op() != kRegexpCapture || re->name() == NULL)
       return ignored;

     // The map is created on demand, when the first name turns up.
     if (map_ == NULL)
       map_ = new std::map<int, std::string>;

     (*map_)[re->cap()] = *re->name();
     return ignored;
   }

   // Not expected to run, because Walk is used rather than
   // WalkExponential; complains if it does.
   virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
     LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
     return ignored;
   }

  private:
   // Not copyable.
   CaptureNamesWalker(const CaptureNamesWalker&) = delete;
   CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;

   std::map<int, std::string>* map_;  // NULL until a name is found
 };

 // Returns a newly allocated map from capture group index to name,
 // or NULL if this regexp has no named groups.  The walker gives up
 // the map in TakeMap(), so it is the caller's to delete.
 std::map<int, std::string>* Regexp::CaptureNames() {
   CaptureNamesWalker w;
   w.Walk(this, 0);
   return w.TakeMap();
 }

 // Determines whether regexp matches must be anchored
 // with a fixed string prefix.  If so, returns the prefix and
 // the regexp that remains after the prefix.  The prefix might
 // be ASCII case-insensitive.
 //
 // On success returns true with *prefix set to the literal text (Latin-1
 // or UTF-8 bytes, depending on the literal's parse flags), *foldcase
 // set from the literal's FoldCase flag, and *suffix set to a regexp for
 // the rest.  On failure returns false with *prefix empty, *foldcase
 // false and *suffix NULL.
 bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase,
                             Regexp** suffix) {
   // No need for a walker: the regexp must be of the form
   // 1. some number of ^ anchors
   // 2. a literal char or string
   // 3. the rest
   prefix->clear();
   *foldcase = false;
   *suffix = NULL;
   if (op_ != kRegexpConcat)
     return false;

   // Some number of anchors, then a literal or concatenation.
   int i = 0;
   Regexp** sub = this->sub();
   while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
     i++;
   // Need at least one anchor and at least one element after the anchors.
   if (i == 0 || i >= nsub_)
     return false;

   Regexp* re = sub[i];
   switch (re->op_) {
     default:
       return false;

     case kRegexpLiteralString:
       // Convert to string in proper encoding.
       if (re->parse_flags() & Latin1) {
         // One byte per rune.
         prefix->resize(re->nrunes_);
         for (int j = 0; j < re->nrunes_; j++)
           (*prefix)[j] = static_cast<char>(re->runes_[j]);
       } else {
         // Convert to UTF-8 in place.
         // Assume worst-case space and then trim.
         prefix->resize(re->nrunes_ * UTFmax);
         char *p = &(*prefix)[0];
         for (int j = 0; j < re->nrunes_; j++) {
           Rune r = re->runes_[j];
           // Runes below Runeself are written as a single byte.
           if (r < Runeself)
             *p++ = static_cast<char>(r);
           else
             p += runetochar(p, &r);
         }
         prefix->resize(p - &(*prefix)[0]);
       }
       break;

     case kRegexpLiteral:
       if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
         prefix->append(1, static_cast<char>(re->rune_));
       } else {
         char buf[UTFmax];
         prefix->append(buf, runetochar(buf, &re->rune_));
       }
       break;
   }
   *foldcase = (sub[i]->parse_flags() & FoldCase) != 0;
   i++;

   // The rest.
   if (i < nsub_) {
     // The suffix gets its own reference to each remaining
     // subexpression; this regexp keeps its references too.
     for (int j = i; j < nsub_; j++)
       sub[j]->Incref();
     re = Concat(sub + i, nsub_ - i, parse_flags());
   } else {
     // Nothing follows the literal: the suffix matches the empty string.
     re = new Regexp(kRegexpEmptyMatch, parse_flags());
   }
   *suffix = re;
   return true;
 }

 // Character class builder is a balanced binary tree (STL set)
 // containing non-overlapping, non-abutting RuneRanges.
 // The less-than operator used in the tree treats two
 // ranges as equal if they overlap at all, so that
 // lookups for a particular Rune are possible.

 // Constructs an empty class: no runes, and no letters recorded in
 // the upper_/lower_ bitmaps.
 CharClassBuilder::CharClassBuilder() {
   nrunes_ = 0;
   upper_ = 0;
   lower_ = 0;
 }

 // Add lo-hi to the class; return whether class got bigger.
 // Returns false without changing the set of ranges when hi < lo or when
 // [lo, hi] already lies inside a single existing range.  Otherwise
 // merges [lo, hi] with every range it overlaps or abuts, keeps nrunes_
 // up to date, and returns true.
 bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
   if (hi < lo)
     return false;

   if (lo <= 'z' && hi >= 'A') {
     // Overlaps some alpha, maybe not all.
     // Update bitmaps telling which ASCII letters are in the set.
     // Bit k of upper_ stands for 'A'+k, bit k of lower_ for 'a'+k.
     Rune lo1 = std::max<Rune>(lo, 'A');
     Rune hi1 = std::min<Rune>(hi, 'Z');
     if (lo1 <= hi1)
       upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');

     lo1 = std::max<Rune>(lo, 'a');
     hi1 = std::min<Rune>(hi, 'z');
     if (lo1 <= hi1)
       lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
   }

   {  // Check whether lo, hi is already in the class.
     iterator it = ranges_.find(RuneRange(lo, lo));
     if (it != end() && it->lo <= lo && hi <= it->hi)
       return false;
   }

   // Look for a range abutting lo on the left.
   // If it exists, take it out and increase our range.
   // (The lookup also finds a range that contains lo-1 and extends
   // past it, which is why hi may need to grow as well.)
   if (lo > 0) {
     iterator it = ranges_.find(RuneRange(lo-1, lo-1));
     if (it != end()) {
       lo = it->lo;
       if (it->hi > hi)
         hi = it->hi;
       nrunes_ -= it->hi - it->lo + 1;
       ranges_.erase(it);
     }
   }

   // Look for a range abutting hi on the right.
   // If it exists, take it out and increase our range.
   if (hi < Runemax) {
     iterator it = ranges_.find(RuneRange(hi+1, hi+1));
     if (it != end()) {
       hi = it->hi;
       nrunes_ -= it->hi - it->lo + 1;
       ranges_.erase(it);
     }
   }

   // Look for ranges between lo and hi.  Take them out.
   // This is only safe because the set has no overlapping ranges.
   // We've already removed any ranges abutting lo and hi, so
   // any that overlap [lo, hi] must be contained within it.
   for (;;) {
     iterator it = ranges_.find(RuneRange(lo, hi));
     if (it == end())
       break;
     nrunes_ -= it->hi - it->lo + 1;
     ranges_.erase(it);
   }

   // Finally, add [lo, hi].
   nrunes_ += hi - lo + 1;
   ranges_.insert(RuneRange(lo, hi));
   return true;
 }

 // Adds every range of cc to this class.
 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
   iterator it = cc->begin();
   while (it != cc->end()) {
     AddRange(it->lo, it->hi);
     ++it;
   }
 }

 // Returns whether r is in the class, by looking up the one-rune range
 // [r, r] in the set of ranges.
 bool CharClassBuilder::Contains(Rune r) {
   return ranges_.find(RuneRange(r, r)) != end();
 }

 // Does the character class behave the same on A-Z as on a-z?
 // True when the upper_ and lower_ letter bitmaps agree on every bit
 // selected by AlphaMask.
 bool CharClassBuilder::FoldsASCII() {
   return ((upper_ ^ lower_) & AlphaMask) == 0;
 }

 // Returns a newly allocated builder holding the same ranges, letter
 // bitmaps and rune count as this one.
 CharClassBuilder* CharClassBuilder::Copy() {
   CharClassBuilder* dup = new CharClassBuilder;
   dup->nrunes_ = nrunes_;
   dup->upper_ = upper_;
   dup->lower_ = lower_;

   iterator it = begin();
   while (it != end()) {
     dup->ranges_.insert(RuneRange(it->lo, it->hi));
     ++it;
   }
   return dup;
 }


 // Removes every rune greater than r from the class.
 // Does nothing when r >= Runemax.
 void CharClassBuilder::RemoveAbove(Rune r) {
   if (r >= Runemax)
     return;

   // Clear the lowercase letters above r from the lower_ bitmap.
   if (r < 'z') {
     if (r < 'a')
       lower_ = 0;
     else
       lower_ &= AlphaMask >> ('z' - r);
   }

   // Likewise for the uppercase letters in upper_.
   if (r < 'Z') {
     if (r < 'A')
       upper_ = 0;
     else
       upper_ &= AlphaMask >> ('Z' - r);
   }

   // Repeatedly take out a range that reaches above r.  A range that
   // starts at or below r is put back truncated to end at r.
   for (;;) {

     iterator it = ranges_.find(RuneRange(r + 1, Runemax));
     if (it == end())
       break;
     RuneRange rr = *it;
     ranges_.erase(it);
     nrunes_ -= rr.hi - rr.lo + 1;
     if (rr.lo <= r) {
       rr.hi = r;
       ranges_.insert(rr);
       nrunes_ += rr.hi - rr.lo + 1;
     }
   }
 }

 void CharClassBuilder::Negate() {
   // Build up negation and then copy in.
   // Could edit ranges in place, but C++ won't let me.
   std::vector<RuneRange> v;
   v.reserve(ranges_.size() + 1);

   // In negation, first range begins at 0, unless
   // the current class begins at 0.
   iterator it = begin();
   if (it == end()) {
     v.push_back(RuneRange(0, Runemax));
   } else {
     int nextlo = 0;
     if (it->lo == 0) {
       nextlo = it->hi + 1;
       ++it;
     }
     for (; it != end(); ++it) {
       v.push_back(RuneRange(nextlo, it->lo - 1));
       nextlo = it->hi + 1;
     }
     if (nextlo <= Runemax)
       v.push_back(RuneRange(nextlo, Runemax));
   }

   ranges_.clear();
   for (size_t i = 0; i < v.size(); i++)
     ranges_.insert(v[i]);

   upper_ = AlphaMask & ~upper_;
   lower_ = AlphaMask & ~lower_;
   nrunes_ = Runemax+1 - nrunes_;
 }

 // Character class is a sorted list of ranges.
 // The ranges are allocated in the same block as the header,
 // necessitating a special allocator and Delete method.

 // Returns a new, empty CharClass with room for maxranges ranges.
 // The object is carved out of a single uint8_t array and must be
 // released with Delete(), not with delete.
 CharClass* CharClass::New(int maxranges) {
   CharClass* cc;
   // One allocation: the header followed by space for maxranges ranges.
   uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
   cc = reinterpret_cast<CharClass*>(data);
   // The range array starts right after the header.
   cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
   cc->nranges_ = 0;
   cc->folds_ascii_ = false;
   cc->nrunes_ = 0;
   return cc;
 }

 // Frees this CharClass.  The storage is released as the uint8_t array
 // that New() allocated, so this must be used instead of delete.
 void CharClass::Delete() {
   uint8_t* data = reinterpret_cast<uint8_t*>(this);
   delete[] data;
 }

 // Returns a newly allocated CharClass holding the complement of this
 // one over [0, Runemax].  This object is left unchanged.
 CharClass* CharClass::Negate() {
   CharClass* neg = CharClass::New(nranges_+1);
   neg->folds_ascii_ = folds_ascii_;
   neg->nrunes_ = Runemax + 1 - nrunes_;

   int count = 0;   // ranges written to neg so far
   int gap_lo = 0;  // where the next gap would begin
   for (CharClass::iterator it = begin(); it != end(); ++it) {
     // Emit the gap in front of this range, unless the range starts
     // exactly where the gap would.
     if (it->lo != gap_lo)
       neg->ranges_[count++] = RuneRange(gap_lo, it->lo - 1);
     gap_lo = it->hi + 1;
   }

   // Trailing gap up to Runemax, if any.
   if (gap_lo <= Runemax)
     neg->ranges_[count++] = RuneRange(gap_lo, Runemax);

   neg->nranges_ = count;
   return neg;
 }

 // Returns whether r falls inside one of the ranges, using a binary
 // search over the sorted range array.
 bool CharClass::Contains(Rune r) {
   // Candidates are ranges_[first] .. ranges_[limit-1].
   int first = 0;
   int limit = nranges_;
   while (first < limit) {
     const int mid = first + (limit - first) / 2;
     const RuneRange& range = ranges_[mid];
     if (range.hi < r) {
       first = mid + 1;  // r lies above this range
     } else if (r < range.lo) {
       limit = mid;      // r lies below this range
     } else {
       return true;      // range.lo <= r && r <= range.hi
     }
   }
   return false;
 }

 // Returns a newly allocated CharClass holding a snapshot of this
 // builder: its ranges in iteration order, its rune count and whether
 // it folds ASCII case.
 CharClass* CharClassBuilder::GetCharClass() {
   CharClass* cc = CharClass::New(static_cast<int>(ranges_.size()));

   int n = 0;
   iterator it = begin();
   while (it != end()) {
     cc->ranges_[n] = *it;
     ++n;
     ++it;
   }
   cc->nranges_ = n;
   DCHECK_LE(n, static_cast<int>(ranges_.size()));

   cc->nrunes_ = nrunes_;
   cc->folds_ascii_ = FoldsASCII();
   return cc;
 }

 }  // namespace re2