re2/regexp.h - re2 - Git at Google

 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 #ifndef RE2_REGEXP_H_
 #define RE2_REGEXP_H_

 // --- SPONSORED LINK --------------------------------------------------
 // If you want to use this library for regular expression matching,
 // you should use re2/re2.h, which provides a class RE2 that
 // mimics the PCRE interface provided by PCRE's C++ wrappers.
 // This header describes the low-level interface used to implement RE2
 // and may change in backwards-incompatible ways from time to time.
 // In contrast, RE2's interface will not.
 // ---------------------------------------------------------------------

 // Regular expression library: parsing, execution, and manipulation
 // of regular expressions.
 //
 // Any operation that traverses the Regexp structures should be written
 // using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
 // regular expressions such as x++++++++++++++++++++... might cause recursive
 // traversals to overflow the stack.
 //
 // It is the caller's responsibility to provide appropriate mutual exclusion
 // around manipulation of the regexps.  RE2 does this.
 //
 // PARSING
 //
 // Regexp::Parse parses regular expressions encoded in UTF-8.
 // The default syntax is POSIX extended regular expressions,
 // with the following changes:
 //
 //   1.  Backreferences (optional in POSIX EREs) are not supported.
 //         (Supporting them precludes the use of DFA-based
 //          matching engines.)
 //
 //   2.  Collating elements and collation classes are not supported.
 //         (No one has needed or wanted them.)
 //
 // The exact syntax accepted can be modified by passing flags to
 // Regexp::Parse.  In particular, many of the basic Perl additions
 // are available.  The flags are documented below (search for LikePerl).
 //
 // If parsed with the flag Regexp::Latin1, both the regular expression
 // and the input to the matching routines are assumed to be encoded in
 // Latin-1, not UTF-8.
 //
 // EXECUTION
 //
 // Once Regexp has parsed a regular expression, it provides methods
 // to search text using that regular expression.  These methods are
 // implemented via calling out to other regular expression libraries.
 // (Let's call them the sublibraries.)
 //
 // To call a sublibrary, Regexp does not simply prepare a
 // string version of the regular expression and hand it to the
 // sublibrary.  Instead, Regexp prepares, from its own parsed form, the
 // corresponding internal representation used by the sublibrary.
 // This has the drawback of needing to know the internal representation
 // used by the sublibrary, but it has two important benefits:
 //
 //   1. The syntax and meaning of regular expressions is guaranteed
 //      to be that used by Regexp's parser, not the syntax expected
 //      by the sublibrary.  Regexp might accept a restricted or
 //      expanded syntax for regular expressions as compared with
 //      the sublibrary.  As long as Regexp can translate from its
 //      internal form into the sublibrary's, clients need not know
 //      exactly which sublibrary they are using.
 //
 //   2. The sublibrary parsers are bypassed.  For whatever reason,
 //      sublibrary regular expression parsers often have security
 //      problems.  For example, plan9grep's regular expression parser
 //      has a buffer overflow in its handling of large character
 //      classes, and PCRE's parser has had buffer overflow problems
 //      in the past.  Security-team requires sandboxing of sublibrary
 //      regular expression parsers.  Avoiding the sublibrary parsers
 //      avoids the sandbox.
 //
 // The execution methods we use now are provided by the compiled form,
 // Prog, described in prog.h
 //
 // MANIPULATION
 //
 // Unlike other regular expression libraries, Regexp makes its parsed
 // form accessible to clients, so that client code can analyze the
 // parsed regular expressions.

 #include <stdint.h>
 #include <map>
 #include <set>
 #include <string>

 #include "util/util.h"
 #include "util/logging.h"
 #include "util/utf.h"
 #include "re2/stringpiece.h"

 namespace re2 {

 // Keep in sync with string list kOpcodeNames[] in testing/dump.cc
 enum RegexpOp {
   // Matches no strings.
   kRegexpNoMatch = 1,

   // Matches empty string.
   kRegexpEmptyMatch,

   // Matches rune_.
   kRegexpLiteral,

   // Matches runes_.
   kRegexpLiteralString,

   // Matches concatenation of sub_[0..nsub-1].
   kRegexpConcat,
   // Matches union of sub_[0..nsub-1].
   kRegexpAlternate,

   // Matches sub_[0] zero or more times.
   kRegexpStar,
   // Matches sub_[0] one or more times.
   kRegexpPlus,
   // Matches sub_[0] zero or one times.
   kRegexpQuest,

   // Matches sub_[0] at least min_ times, at most max_ times.
   // max_ == -1 means no upper limit.
   kRegexpRepeat,

   // Parenthesized (capturing) subexpression.  Index is cap_.
   // Optionally, capturing name is name_.
   kRegexpCapture,

   // Matches any character.
   kRegexpAnyChar,

   // Matches any byte [sic].
   kRegexpAnyByte,

   // Matches empty string at beginning of line.
   kRegexpBeginLine,
   // Matches empty string at end of line.
   kRegexpEndLine,

   // Matches word boundary "\b".
   kRegexpWordBoundary,
   // Matches not-a-word boundary "\B".
   kRegexpNoWordBoundary,

   // Matches empty string at beginning of text.
   kRegexpBeginText,
   // Matches empty string at end of text.
   kRegexpEndText,

   // Matches character class given by cc_.
   kRegexpCharClass,

   // Forces match of entire expression right now,
   // with match ID match_id_ (used by RE2::Set).
   kRegexpHaveMatch,

   kMaxRegexpOp = kRegexpHaveMatch,
 };

 // Keep in sync with string list in regexp.cc
 enum RegexpStatusCode {
   // No error
   kRegexpSuccess = 0,

   // Unexpected error
   kRegexpInternalError,

   // Parse errors
   kRegexpBadEscape,          // bad escape sequence
   kRegexpBadCharClass,       // bad character class
   kRegexpBadCharRange,       // bad character class range
   kRegexpMissingBracket,     // missing closing ]
   kRegexpMissingParen,       // missing closing )
   kRegexpTrailingBackslash,  // at end of regexp
   kRegexpRepeatArgument,     // repeat argument missing, e.g. "*"
   kRegexpRepeatSize,         // bad repetition argument
   kRegexpRepeatOp,           // bad repetition operator
   kRegexpBadPerlOp,          // bad perl operator
   kRegexpBadUTF8,            // invalid UTF-8 in regexp
   kRegexpBadNamedCapture,    // bad named capture
 };

 // Error status for certain operations.
 class RegexpStatus {
  public:
   RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
   ~RegexpStatus() { delete tmp_; }

   void set_code(RegexpStatusCode code) { code_ = code; }
   void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
   void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }
   RegexpStatusCode code() const { return code_; }
   const StringPiece& error_arg() const { return error_arg_; }
   bool ok() const { return code() == kRegexpSuccess; }

   // Copies state from status.
   void Copy(const RegexpStatus& status);

   // Returns text equivalent of code, e.g.:
   //   "Bad character class"
   static string CodeText(RegexpStatusCode code);

   // Returns text describing error, e.g.:
   //   "Bad character class: [z-a]"
   string Text() const;

  private:
   RegexpStatusCode code_;  // Kind of error
   StringPiece error_arg_;       // Piece of regexp containing syntax error.
   string* tmp_;                 // Temporary storage, possibly where error_arg_ is.

   RegexpStatus(const RegexpStatus&) = delete;
   RegexpStatus& operator=(const RegexpStatus&) = delete;
 };

 // Compiled form; see prog.h
 class Prog;

 struct RuneRange {
   RuneRange() : lo(0), hi(0) { }
   RuneRange(int l, int h) : lo(l), hi(h) { }
   Rune lo;
   Rune hi;
 };

 // Less-than on RuneRanges treats a == b if they overlap at all.
 // This lets us look in a set to find the range covering a particular Rune.
 struct RuneRangeLess {
   bool operator()(const RuneRange& a, const RuneRange& b) const {
     return a.hi < b.lo;
   }
 };

 class CharClassBuilder;

 class CharClass {
  public:
   void Delete();

   typedef RuneRange* iterator;
   iterator begin() { return ranges_; }
   iterator end() { return ranges_ + nranges_; }

   int size() { return nrunes_; }
   bool empty() { return nrunes_ == 0; }
   bool full() { return nrunes_ == Runemax+1; }
   bool FoldsASCII() { return folds_ascii_; }

   bool Contains(Rune r);
   CharClass* Negate();

  private:
   CharClass();  // not implemented
   ~CharClass();  // not implemented
   static CharClass* New(int maxranges);

   friend class CharClassBuilder;

   bool folds_ascii_;
   int nrunes_;
   RuneRange *ranges_;
   int nranges_;

   CharClass(const CharClass&) = delete;
   CharClass& operator=(const CharClass&) = delete;
 };

 class Regexp {
  public:

   // Flags for parsing.  Can be ORed together.
   enum ParseFlags {
     NoParseFlags  = 0,
     FoldCase      = 1<<0,   // Fold case during matching (case-insensitive).
     Literal       = 1<<1,   // Treat s as literal string instead of a regexp.
     ClassNL       = 1<<2,   // Allow char classes like [^a-z] and \D and \s
                             // and [[:space:]] to match newline.
     DotNL         = 1<<3,   // Allow . to match newline.
     MatchNL       = ClassNL | DotNL,
     OneLine       = 1<<4,   // Treat ^ and $ as only matching at beginning and
                             // end of text, not around embedded newlines.
                             // (Perl's default)
     Latin1        = 1<<5,   // Regexp and text are in Latin1, not UTF-8.
     NonGreedy     = 1<<6,   // Repetition operators are non-greedy by default.
     PerlClasses   = 1<<7,   // Allow Perl character classes like \d.
     PerlB         = 1<<8,   // Allow Perl's \b and \B.
     PerlX         = 1<<9,   // Perl extensions:
                             //   non-capturing parens - (?: )
                             //   non-greedy operators - *? +? ?? {}?
                             //   flag edits - (?i) (?-i) (?i: )
                             //     i - FoldCase
                             //     m - !OneLine
                             //     s - DotNL
                             //     U - NonGreedy
                             //   line ends: \A \z
                             //   \Q and \E to disable/enable metacharacters
                             //   (?P<name>expr) for named captures
                             //   \C to match any single byte
     UnicodeGroups = 1<<10,  // Allow \p{Han} for Unicode Han group
                             //   and \P{Han} for its negation.
     NeverNL       = 1<<11,  // Never match NL, even if the regexp mentions
                             //   it explicitly.
     NeverCapture  = 1<<12,  // Parse all parens as non-capturing.

     // As close to Perl as we can get.
     LikePerl      = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
                     UnicodeGroups,

     // Internal use only.
     WasDollar     = 1<<13,  // on kRegexpEndText: was $ in regexp text
     AllParseFlags = (1<<14)-1,
   };

   // Get.  No set, Regexps are logically immutable once created.
   RegexpOp op() { return static_cast<RegexpOp>(op_); }
   int nsub() { return nsub_; }
   bool simple() { return simple_ != 0; }
   ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
   int Ref();  // For testing.

   Regexp** sub() {
     if(nsub_ <= 1)
       return &subone_;
     else
       return submany_;
   }

   int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
   int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
   Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
   CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
   int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
   const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
   Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
   int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
   int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }

   // Increments reference count, returns object as convenience.
   Regexp* Incref();

   // Decrements reference count and deletes this object if count reaches 0.
   void Decref();

   // Parses string s to produce regular expression, returned.
   // Caller must release return value with re->Decref().
   // On failure, sets *status (if status != NULL) and returns NULL.
   static Regexp* Parse(const StringPiece& s, ParseFlags flags,
                        RegexpStatus* status);

   // Returns a _new_ simplified version of the current regexp.
   // Does not edit the current regexp.
   // Caller must release return value with re->Decref().
   // Simplified means that counted repetition has been rewritten
   // into simpler terms and all Perl/POSIX features have been
   // removed.  The result will capture exactly the same
   // subexpressions the original did, unless formatted with ToString.
   Regexp* Simplify();
   friend class CoalesceWalker;
   friend class SimplifyWalker;

   // Parses the regexp src and then simplifies it and sets *dst to the
   // string representation of the simplified form.  Returns true on success.
   // Returns false and sets *status (if status != NULL) on parse error.
   static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
                              string* dst,
                              RegexpStatus* status);

   // Returns the number of capturing groups in the regexp.
   int NumCaptures();
   friend class NumCapturesWalker;

   // Returns a map from names to capturing group indices,
   // or NULL if the regexp contains no named capture groups.
   // The caller is responsible for deleting the map.
   std::map<string, int>* NamedCaptures();

   // Returns a map from capturing group indices to capturing group
   // names or NULL if the regexp contains no named capture groups. The
   // caller is responsible for deleting the map.
   std::map<int, string>* CaptureNames();

   // Returns a string representation of the current regexp,
   // using as few parentheses as possible.
   string ToString();

   // Convenience functions.  They consume the passed reference,
   // so in many cases you should use, e.g., Plus(re->Incref(), flags).
   // They do not consume allocated arrays like subs or runes.
   static Regexp* Plus(Regexp* sub, ParseFlags flags);
   static Regexp* Star(Regexp* sub, ParseFlags flags);
   static Regexp* Quest(Regexp* sub, ParseFlags flags);
   static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
   static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
   static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
   static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
   static Regexp* NewLiteral(Rune rune, ParseFlags flags);
   static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
   static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
   static Regexp* HaveMatch(int match_id, ParseFlags flags);

   // Like Alternate but does not factor out common prefixes.
   static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);

   // Debugging function.  Returns string format for regexp
   // that makes structure clear.  Does NOT use regexp syntax.
   string Dump();

   // Helper traversal class, defined fully in walker-inl.h.
   template<typename T> class Walker;

   // Compile to Prog.  See prog.h
   // Reverse prog expects to be run over text backward.
   // Construction and execution of prog will
   // stay within approximately max_mem bytes of memory.
   // If max_mem <= 0, a reasonable default is used.
   Prog* CompileToProg(int64_t max_mem);
   Prog* CompileToReverseProg(int64_t max_mem);

   // Whether to expect this library to find exactly the same answer as PCRE
   // when running this regexp.  Most regexps do mimic PCRE exactly, but a few
   // obscure cases behave differently.  Technically this is more a property
   // of the Prog than the Regexp, but the computation is much easier to do
   // on the Regexp.  See mimics_pcre.cc for the exact conditions.
   bool MimicsPCRE();

   // Benchmarking function.
   void NullWalk();

   // Whether every match of this regexp must be anchored and
   // begin with a non-empty fixed string (perhaps after ASCII
   // case-folding).  If so, returns the prefix and the sub-regexp that
   // follows it.
   // Callers should expect *prefix, *foldcase and *suffix to be "zeroed"
   // regardless of the return value.
   bool RequiredPrefix(string* prefix, bool* foldcase, Regexp** suffix);

  private:
   // Constructor allocates vectors as appropriate for operator.
   explicit Regexp(RegexpOp op, ParseFlags parse_flags);

   // Use Decref() instead of delete to release Regexps.
   // This is private to catch deletes at compile time.
   ~Regexp();
   void Destroy();
   bool QuickDestroy();

   // Helpers for Parse.  Listed here so they can edit Regexps.
   class ParseState;

   friend class ParseState;
   friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
                              RegexpStatus* status);

   // Helper for testing [sic].
   friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);

   // Computes whether Regexp is already simple.
   bool ComputeSimple();

   // Constructor that generates a Star, Plus or Quest,
   // squashing the pair if sub is also a Star, Plus or Quest.
   static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags);

   // Constructor that generates a concatenation or alternation,
   // enforcing the limit on the number of subexpressions for
   // a particular Regexp.
   static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
                                    ParseFlags flags, bool can_factor);

   // Returns the leading string that re starts with.
   // The returned Rune* points into a piece of re,
   // so it must not be used after the caller calls re->Decref().
   static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);

   // Removes the first n leading runes from the beginning of re.
   // Edits re in place.
   static void RemoveLeadingString(Regexp* re, int n);

   // Returns the leading regexp in re's top-level concatenation.
   // The returned Regexp* points at re or a sub-expression of re,
   // so it must not be used after the caller calls re->Decref().
   static Regexp* LeadingRegexp(Regexp* re);

   // Removes LeadingRegexp(re) from re and returns the remainder.
   // Might edit re in place.
   static Regexp* RemoveLeadingRegexp(Regexp* re);

   // Simplifies an alternation of literal strings by factoring out
   // common prefixes.
   static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
   friend class FactorAlternationImpl;

   // Is a == b?  Only efficient on regexps that have not been through
   // Simplify yet - the expansion of a kRegexpRepeat will make this
   // take a long time.  Do not call on such regexps, hence private.
   static bool Equal(Regexp* a, Regexp* b);

   // Allocate space for n sub-regexps.
   void AllocSub(int n) {
     DCHECK(n >= 0 && static_cast<uint16_t>(n) == n);
     if (n > 1)
       submany_ = new Regexp*[n];
     nsub_ = static_cast<uint16_t>(n);
   }

   // Add Rune to LiteralString
   void AddRuneToString(Rune r);

   // Swaps this with that, in place.
   void Swap(Regexp *that);

   // Operator.  See description of operators above.
   // uint8_t instead of RegexpOp to control space usage.
   uint8_t op_;

   // Is this regexp structure already simple
   // (has it been returned by Simplify)?
   // uint8_t instead of bool to control space usage.
   uint8_t simple_;

   // Flags saved from parsing and used during execution.
   // (Only FoldCase is used.)
   // uint16_t instead of ParseFlags to control space usage.
   uint16_t parse_flags_;

   // Reference count.  Exists so that SimplifyRegexp can build
   // regexp structures that are dags rather than trees to avoid
   // exponential blowup in space requirements.
   // uint16_t to control space usage.
   // The standard regexp routines will never generate a
   // ref greater than the maximum repeat count (kMaxRepeat),
   // but even so, Incref and Decref consult an overflow map
   // when ref_ reaches kMaxRef.
   uint16_t ref_;
   static const uint16_t kMaxRef = 0xffff;

   // Subexpressions.
   // uint16_t to control space usage.
   // Concat and Alternate handle larger numbers of subexpressions
   // by building concatenation or alternation trees.
   // Other routines should call Concat or Alternate instead of
   // filling in sub() by hand.
   uint16_t nsub_;
   static const uint16_t kMaxNsub = 0xffff;
   union {
     Regexp** submany_;  // if nsub_ > 1
     Regexp* subone_;  // if nsub_ == 1
   };

   // Extra space for parse and teardown stacks.
   Regexp* down_;

   // Arguments to operator.  See description of operators above.
   union {
     struct {  // Repeat
       int max_;
       int min_;
     };
     struct {  // Capture
       int cap_;
       string* name_;
     };
     struct {  // LiteralString
       int nrunes_;
       Rune* runes_;
     };
     struct {  // CharClass
       // These two could be in separate union members,
       // but it wouldn't save any space (there are other two-word structs)
       // and keeping them separate avoids confusion during parsing.
       CharClass* cc_;
       CharClassBuilder* ccb_;
     };
     Rune rune_;  // Literal
     int match_id_;  // HaveMatch
     void *the_union_[2];  // as big as any other element, for memset
   };

   Regexp(const Regexp&) = delete;
   Regexp& operator=(const Regexp&) = delete;
 };

 // Character class set: contains non-overlapping, non-abutting RuneRanges.
 typedef std::set<RuneRange, RuneRangeLess> RuneRangeSet;

 class CharClassBuilder {
  public:
   CharClassBuilder();

   typedef RuneRangeSet::iterator iterator;
   iterator begin() { return ranges_.begin(); }
   iterator end() { return ranges_.end(); }

   int size() { return nrunes_; }
   bool empty() { return nrunes_ == 0; }
   bool full() { return nrunes_ == Runemax+1; }

   bool Contains(Rune r);
   bool FoldsASCII();
   bool AddRange(Rune lo, Rune hi);  // returns whether class changed
   CharClassBuilder* Copy();
   void AddCharClass(CharClassBuilder* cc);
   void Negate();
   void RemoveAbove(Rune r);
   CharClass* GetCharClass();
   void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);

  private:
   static const uint32_t AlphaMask = (1<<26) - 1;
   uint32_t upper_;  // bitmap of A-Z
   uint32_t lower_;  // bitmap of a-z
   int nrunes_;
   RuneRangeSet ranges_;

   CharClassBuilder(const CharClassBuilder&) = delete;
   CharClassBuilder& operator=(const CharClassBuilder&) = delete;
 };

 // Bitwise ops on ParseFlags produce ParseFlags.
 inline Regexp::ParseFlags operator|(Regexp::ParseFlags a,
                                     Regexp::ParseFlags b) {
   return static_cast<Regexp::ParseFlags>(
       static_cast<int>(a) | static_cast<int>(b));
 }

 inline Regexp::ParseFlags operator^(Regexp::ParseFlags a,
                                     Regexp::ParseFlags b) {
   return static_cast<Regexp::ParseFlags>(
       static_cast<int>(a) ^ static_cast<int>(b));
 }

 inline Regexp::ParseFlags operator&(Regexp::ParseFlags a,
                                     Regexp::ParseFlags b) {
   return static_cast<Regexp::ParseFlags>(
       static_cast<int>(a) & static_cast<int>(b));
 }

 inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) {
   // Attempting to produce a value out of enum's range has undefined behaviour.
   return static_cast<Regexp::ParseFlags>(
       ~static_cast<int>(a) & static_cast<int>(Regexp::AllParseFlags));
 }

 }  // namespace re2

 #endif  // RE2_REGEXP_H_