| /* |
| * reserved comment block |
| * DO NOT REMOVE OR ALTER! |
| */ |
| /* |
| * Copyright 1999-2004 The Apache Software Foundation. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package com.sun.org.apache.regexp.internal; |
| |
| import com.sun.org.apache.regexp.internal.RE; |
| import java.util.Hashtable; |
| |
| /** |
| * A regular expression compiler class. This class compiles a pattern string into a |
| * regular expression program interpretable by the RE evaluator class. The 'recompile' |
| * command line tool uses this compiler to pre-compile regular expressions for use |
| * with RE. For a description of the syntax accepted by RECompiler and what you can |
| * do with regular expressions, see the documentation for the RE matcher class. |
| * |
| * @see RE |
| * @see recompile |
| * |
| * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a> |
| * @author <a href="mailto:gholam@xtra.co.nz">Michael McCallum</a> |
| */ |
| public class RECompiler |
| { |
| // The compiled program |
| char[] instruction; // The compiled RE 'program' instruction buffer |
| int lenInstruction; // The amount of the program buffer currently in use |
| |
| // Input state for compiling regular expression |
| String pattern; // Input string |
| int len; // Length of the pattern string |
| int idx; // Current input index into ac |
| int parens; // Total number of paren pairs |
| |
| // Node flags |
| static final int NODE_NORMAL = 0; // No flags (nothing special) |
| static final int NODE_NULLABLE = 1; // True if node is potentially null |
| static final int NODE_TOPLEVEL = 2; // True if top level expr |
| |
| // Special types of 'escapes' |
| static final int ESC_MASK = 0xffff0; // Escape complexity mask |
| static final int ESC_BACKREF = 0xfffff; // Escape is really a backreference |
| static final int ESC_COMPLEX = 0xffffe; // Escape isn't really a true character |
| static final int ESC_CLASS = 0xffffd; // Escape represents a whole class of characters |
| |
| // {m,n} stacks |
| int maxBrackets = 10; // Maximum number of bracket pairs |
| static final int bracketUnbounded = -1; // Unbounded value |
| int brackets = 0; // Number of bracket sets |
| int[] bracketStart = null; // Starting point |
| int[] bracketEnd = null; // Ending point |
| int[] bracketMin = null; // Minimum number of matches |
| int[] bracketOpt = null; // Additional optional matches |
| |
| // Lookup table for POSIX character class names |
| static Hashtable hashPOSIX = new Hashtable(); |
| static |
| { |
| hashPOSIX.put("alnum", new Character(RE.POSIX_CLASS_ALNUM)); |
| hashPOSIX.put("alpha", new Character(RE.POSIX_CLASS_ALPHA)); |
| hashPOSIX.put("blank", new Character(RE.POSIX_CLASS_BLANK)); |
| hashPOSIX.put("cntrl", new Character(RE.POSIX_CLASS_CNTRL)); |
| hashPOSIX.put("digit", new Character(RE.POSIX_CLASS_DIGIT)); |
| hashPOSIX.put("graph", new Character(RE.POSIX_CLASS_GRAPH)); |
| hashPOSIX.put("lower", new Character(RE.POSIX_CLASS_LOWER)); |
| hashPOSIX.put("print", new Character(RE.POSIX_CLASS_PRINT)); |
| hashPOSIX.put("punct", new Character(RE.POSIX_CLASS_PUNCT)); |
| hashPOSIX.put("space", new Character(RE.POSIX_CLASS_SPACE)); |
| hashPOSIX.put("upper", new Character(RE.POSIX_CLASS_UPPER)); |
| hashPOSIX.put("xdigit", new Character(RE.POSIX_CLASS_XDIGIT)); |
| hashPOSIX.put("javastart", new Character(RE.POSIX_CLASS_JSTART)); |
| hashPOSIX.put("javapart", new Character(RE.POSIX_CLASS_JPART)); |
| } |
| |
| /** |
| * Constructor. Creates (initially empty) storage for a regular expression program. |
| */ |
| public RECompiler() |
| { |
| // Start off with a generous, yet reasonable, initial size |
| instruction = new char[128]; |
| lenInstruction = 0; |
| } |
| |
| /** |
| * Ensures that n more characters can fit in the program buffer. |
| * If n more can't fit, then the size is doubled until it can. |
| * @param n Number of additional characters to ensure will fit. |
| */ |
| void ensure(int n) |
| { |
| // Get current program length |
| int curlen = instruction.length; |
| |
| // If the current length + n more is too much |
| if (lenInstruction + n >= curlen) |
| { |
| // Double the size of the program array until n more will fit |
| while (lenInstruction + n >= curlen) |
| { |
| curlen *= 2; |
| } |
| |
| // Allocate new program array and move data into it |
| char[] newInstruction = new char[curlen]; |
| System.arraycopy(instruction, 0, newInstruction, 0, lenInstruction); |
| instruction = newInstruction; |
| } |
| } |
| |
| /** |
| * Emit a single character into the program stream. |
| * @param c Character to add |
| */ |
| void emit(char c) |
| { |
| // Make room for character |
| ensure(1); |
| |
| // Add character |
| instruction[lenInstruction++] = c; |
| } |
| |
| /** |
| * Inserts a node with a given opcode and opdata at insertAt. The node relative next |
| * pointer is initialized to 0. |
| * @param opcode Opcode for new node |
| * @param opdata Opdata for new node (only the low 16 bits are currently used) |
| * @param insertAt Index at which to insert the new node in the program |
| */ |
| void nodeInsert(char opcode, int opdata, int insertAt) |
| { |
| // Make room for a new node |
| ensure(RE.nodeSize); |
| |
| // Move everything from insertAt to the end down nodeSize elements |
| System.arraycopy(instruction, insertAt, instruction, insertAt + RE.nodeSize, lenInstruction - insertAt); |
| instruction[insertAt + RE.offsetOpcode] = opcode; |
| instruction[insertAt + RE.offsetOpdata] = (char)opdata; |
| instruction[insertAt + RE.offsetNext] = 0; |
| lenInstruction += RE.nodeSize; |
| } |
| |
| /** |
| * Appends a node to the end of a node chain |
| * @param node Start of node chain to traverse |
| * @param pointTo Node to have the tail of the chain point to |
| */ |
| void setNextOfEnd(int node, int pointTo) |
| { |
| // Traverse the chain until the next offset is 0 |
| int next = instruction[node + RE.offsetNext]; |
| // while the 'node' is not the last in the chain |
| // and the 'node' is not the last in the program. |
| while ( next != 0 && node < lenInstruction ) |
| { |
| // if the node we are supposed to point to is in the chain then |
| // point to the end of the program instead. |
| // Michael McCallum <gholam@xtra.co.nz> |
| // FIXME: // This is a _hack_ to stop infinite programs. |
| // I believe that the implementation of the reluctant matches is wrong but |
| // have not worked out a better way yet. |
| if ( node == pointTo ) { |
| pointTo = lenInstruction; |
| } |
| node += next; |
| next = instruction[node + RE.offsetNext]; |
| } |
| // if we have reached the end of the program then dont set the pointTo. |
| // im not sure if this will break any thing but passes all the tests. |
| if ( node < lenInstruction ) { |
| // Point the last node in the chain to pointTo. |
| instruction[node + RE.offsetNext] = (char)(short)(pointTo - node); |
| } |
| } |
| |
| /** |
| * Adds a new node |
| * @param opcode Opcode for node |
| * @param opdata Opdata for node (only the low 16 bits are currently used) |
| * @return Index of new node in program |
| */ |
| int node(char opcode, int opdata) |
| { |
| // Make room for a new node |
| ensure(RE.nodeSize); |
| |
| // Add new node at end |
| instruction[lenInstruction + RE.offsetOpcode] = opcode; |
| instruction[lenInstruction + RE.offsetOpdata] = (char)opdata; |
| instruction[lenInstruction + RE.offsetNext] = 0; |
| lenInstruction += RE.nodeSize; |
| |
| // Return index of new node |
| return lenInstruction - RE.nodeSize; |
| } |
| |
| |
| /** |
| * Throws a new internal error exception |
| * @exception Error Thrown in the event of an internal error. |
| */ |
| void internalError() throws Error |
| { |
| throw new Error("Internal error!"); |
| } |
| |
| /** |
| * Throws a new syntax error exception |
| * @exception RESyntaxException Thrown if the regular expression has invalid syntax. |
| */ |
| void syntaxError(String s) throws RESyntaxException |
| { |
| throw new RESyntaxException(s); |
| } |
| |
| /** |
| * Allocate storage for brackets only as needed |
| */ |
| void allocBrackets() |
| { |
| // Allocate bracket stacks if not already done |
| if (bracketStart == null) |
| { |
| // Allocate storage |
| bracketStart = new int[maxBrackets]; |
| bracketEnd = new int[maxBrackets]; |
| bracketMin = new int[maxBrackets]; |
| bracketOpt = new int[maxBrackets]; |
| |
| // Initialize to invalid values |
| for (int i = 0; i < maxBrackets; i++) |
| { |
| bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1; |
| } |
| } |
| } |
| |
| /** Enlarge storage for brackets only as needed. */ |
| synchronized void reallocBrackets() { |
| // trick the tricky |
| if (bracketStart == null) { |
| allocBrackets(); |
| } |
| |
| int new_size = maxBrackets * 2; |
| int[] new_bS = new int[new_size]; |
| int[] new_bE = new int[new_size]; |
| int[] new_bM = new int[new_size]; |
| int[] new_bO = new int[new_size]; |
| // Initialize to invalid values |
| for (int i=brackets; i<new_size; i++) { |
| new_bS[i] = new_bE[i] = new_bM[i] = new_bO[i] = -1; |
| } |
| System.arraycopy(bracketStart,0, new_bS,0, brackets); |
| System.arraycopy(bracketEnd,0, new_bE,0, brackets); |
| System.arraycopy(bracketMin,0, new_bM,0, brackets); |
| System.arraycopy(bracketOpt,0, new_bO,0, brackets); |
| bracketStart = new_bS; |
| bracketEnd = new_bE; |
| bracketMin = new_bM; |
| bracketOpt = new_bO; |
| maxBrackets = new_size; |
| } |
| |
| /** |
| * Match bracket {m,n} expression put results in bracket member variables |
| * @exception RESyntaxException Thrown if the regular expression has invalid syntax. |
| */ |
| void bracket() throws RESyntaxException |
| { |
| // Current character must be a '{' |
| if (idx >= len || pattern.charAt(idx++) != '{') |
| { |
| internalError(); |
| } |
| |
| // Next char must be a digit |
| if (idx >= len || !Character.isDigit(pattern.charAt(idx))) |
| { |
| syntaxError("Expected digit"); |
| } |
| |
| // Get min ('m' of {m,n}) number |
| StringBuffer number = new StringBuffer(); |
| while (idx < len && Character.isDigit(pattern.charAt(idx))) |
| { |
| number.append(pattern.charAt(idx++)); |
| } |
| try |
| { |
| bracketMin[brackets] = Integer.parseInt(number.toString()); |
| } |
| catch (NumberFormatException e) |
| { |
| syntaxError("Expected valid number"); |
| } |
| |
| // If out of input, fail |
| if (idx >= len) |
| { |
| syntaxError("Expected comma or right bracket"); |
| } |
| |
| // If end of expr, optional limit is 0 |
| if (pattern.charAt(idx) == '}') |
| { |
| idx++; |
| bracketOpt[brackets] = 0; |
| return; |
| } |
| |
| // Must have at least {m,} and maybe {m,n}. |
| if (idx >= len || pattern.charAt(idx++) != ',') |
| { |
| syntaxError("Expected comma"); |
| } |
| |
| // If out of input, fail |
| if (idx >= len) |
| { |
| syntaxError("Expected comma or right bracket"); |
| } |
| |
| // If {m,} max is unlimited |
| if (pattern.charAt(idx) == '}') |
| { |
| idx++; |
| bracketOpt[brackets] = bracketUnbounded; |
| return; |
| } |
| |
| // Next char must be a digit |
| if (idx >= len || !Character.isDigit(pattern.charAt(idx))) |
| { |
| syntaxError("Expected digit"); |
| } |
| |
| // Get max number |
| number.setLength(0); |
| while (idx < len && Character.isDigit(pattern.charAt(idx))) |
| { |
| number.append(pattern.charAt(idx++)); |
| } |
| try |
| { |
| bracketOpt[brackets] = Integer.parseInt(number.toString()) - bracketMin[brackets]; |
| } |
| catch (NumberFormatException e) |
| { |
| syntaxError("Expected valid number"); |
| } |
| |
| // Optional repetitions must be >= 0 |
| if (bracketOpt[brackets] < 0) |
| { |
| syntaxError("Bad range"); |
| } |
| |
| // Must have close brace |
| if (idx >= len || pattern.charAt(idx++) != '}') |
| { |
| syntaxError("Missing close brace"); |
| } |
| } |
| |
| /** |
| * Match an escape sequence. Handles quoted chars and octal escapes as well |
| * as normal escape characters. Always advances the input stream by the |
| * right amount. This code "understands" the subtle difference between an |
| * octal escape and a backref. You can access the type of ESC_CLASS or |
| * ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1]. |
| * @return ESC_* code or character if simple escape |
| * @exception RESyntaxException Thrown if the regular expression has invalid syntax. |
| */ |
| int escape() throws RESyntaxException |
| { |
| // "Shouldn't" happen |
| if (pattern.charAt(idx) != '\\') |
| { |
| internalError(); |
| } |
| |
| // Escape shouldn't occur as last character in string! |
| if (idx + 1 == len) |
| { |
| syntaxError("Escape terminates string"); |
| } |
| |
| // Switch on character after backslash |
| idx += 2; |
| char escapeChar = pattern.charAt(idx - 1); |
| switch (escapeChar) |
| { |
| case RE.E_BOUND: |
| case RE.E_NBOUND: |
| return ESC_COMPLEX; |
| |
| case RE.E_ALNUM: |
| case RE.E_NALNUM: |
| case RE.E_SPACE: |
| case RE.E_NSPACE: |
| case RE.E_DIGIT: |
| case RE.E_NDIGIT: |
| return ESC_CLASS; |
| |
| case 'u': |
| case 'x': |
| { |
| // Exact required hex digits for escape type |
| int hexDigits = (escapeChar == 'u' ? 4 : 2); |
| |
| // Parse up to hexDigits characters from input |
| int val = 0; |
| for ( ; idx < len && hexDigits-- > 0; idx++) |
| { |
| // Get char |
| char c = pattern.charAt(idx); |
| |
| // If it's a hexadecimal digit (0-9) |
| if (c >= '0' && c <= '9') |
| { |
| // Compute new value |
| val = (val << 4) + c - '0'; |
| } |
| else |
| { |
| // If it's a hexadecimal letter (a-f) |
| c = Character.toLowerCase(c); |
| if (c >= 'a' && c <= 'f') |
| { |
| // Compute new value |
| val = (val << 4) + (c - 'a') + 10; |
| } |
| else |
| { |
| // If it's not a valid digit or hex letter, the escape must be invalid |
| // because hexDigits of input have not been absorbed yet. |
| syntaxError("Expected " + hexDigits + " hexadecimal digits after \\" + escapeChar); |
| } |
| } |
| } |
| return val; |
| } |
| |
| case 't': |
| return '\t'; |
| |
| case 'n': |
| return '\n'; |
| |
| case 'r': |
| return '\r'; |
| |
| case 'f': |
| return '\f'; |
| |
| case '0': |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| case '8': |
| case '9': |
| |
| // An octal escape starts with a 0 or has two digits in a row |
| if ((idx < len && Character.isDigit(pattern.charAt(idx))) || escapeChar == '0') |
| { |
| // Handle \nnn octal escapes |
| int val = escapeChar - '0'; |
| if (idx < len && Character.isDigit(pattern.charAt(idx))) |
| { |
| val = ((val << 3) + (pattern.charAt(idx++) - '0')); |
| if (idx < len && Character.isDigit(pattern.charAt(idx))) |
| { |
| val = ((val << 3) + (pattern.charAt(idx++) - '0')); |
| } |
| } |
| return val; |
| } |
| |
| // It's actually a backreference (\[1-9]), not an escape |
| return ESC_BACKREF; |
| |
| default: |
| |
| // Simple quoting of a character |
| return escapeChar; |
| } |
| } |
| |
| /** |
| * Compile a character class |
| * @return Index of class node |
| * @exception RESyntaxException Thrown if the regular expression has invalid syntax. |
| */ |
| int characterClass() throws RESyntaxException |
| { |
| // Check for bad calling or empty class |
| if (pattern.charAt(idx) != '[') |
| { |
| internalError(); |
| } |
| |
| // Check for unterminated or empty class |
| if ((idx + 1) >= len || pattern.charAt(++idx) == ']') |
| { |
| syntaxError("Empty or unterminated class"); |
| } |
| |
| // Check for POSIX character class |
| if (idx < len && pattern.charAt(idx) == ':') |
| { |
| // Skip colon |
| idx++; |
| |
| // POSIX character classes are denoted with lowercase ASCII strings |
| int idxStart = idx; |
| while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z') |
| { |
| idx++; |
| } |
| |
| // Should be a ":]" to terminate the POSIX character class |
| if ((idx + 1) < len && pattern.charAt(idx) == ':' && pattern.charAt(idx + 1) == ']') |
| { |
| // Get character class |
| String charClass = pattern.substring(idxStart, idx); |
| |
| // Select the POSIX class id |
| Character i = (Character)hashPOSIX.get(charClass); |
| if (i != null) |
| { |
| // Move past colon and right bracket |
| idx += 2; |
| |
| // Return new POSIX character class node |
| return node(RE.OP_POSIXCLASS, i.charValue()); |
| } |
| syntaxError("Invalid POSIX character class '" + charClass + "'"); |
| } |
| syntaxError("Invalid POSIX character class syntax"); |
| } |
| |
| // Try to build a class. Create OP_ANYOF node |
| int ret = node(RE.OP_ANYOF, 0); |
| |
| // Parse class declaration |
| char CHAR_INVALID = Character.MAX_VALUE; |
| char last = CHAR_INVALID; |
| char simpleChar = 0; |
| boolean include = true; |
| boolean definingRange = false; |
| int idxFirst = idx; |
| char rangeStart = Character.MIN_VALUE; |
| char rangeEnd; |
| RERange range = new RERange(); |
| while (idx < len && pattern.charAt(idx) != ']') |
| { |
| |
| switchOnCharacter: |
| |
| // Switch on character |
| switch (pattern.charAt(idx)) |
| { |
| case '^': |
| include = !include; |
| if (idx == idxFirst) |
| { |
| range.include(Character.MIN_VALUE, Character.MAX_VALUE, true); |
| } |
| idx++; |
| continue; |
| |
| case '\\': |
| { |
| // Escape always advances the stream |
| int c; |
| switch (c = escape ()) |
| { |
| case ESC_COMPLEX: |
| case ESC_BACKREF: |
| |
| // Word boundaries and backrefs not allowed in a character class! |
| syntaxError("Bad character class"); |
| |
| case ESC_CLASS: |
| |
| // Classes can't be an endpoint of a range |
| if (definingRange) |
| { |
| syntaxError("Bad character class"); |
| } |
| |
| // Handle specific type of class (some are ok) |
| switch (pattern.charAt(idx - 1)) |
| { |
| case RE.E_NSPACE: |
| case RE.E_NDIGIT: |
| case RE.E_NALNUM: |
| syntaxError("Bad character class"); |
| |
| case RE.E_SPACE: |
| range.include('\t', include); |
| range.include('\r', include); |
| range.include('\f', include); |
| range.include('\n', include); |
| range.include('\b', include); |
| range.include(' ', include); |
| break; |
| |
| case RE.E_ALNUM: |
| range.include('a', 'z', include); |
| range.include('A', 'Z', include); |
| range.include('_', include); |
| |
| // Fall through! |
| |
| case RE.E_DIGIT: |
| range.include('0', '9', include); |
| break; |
| } |
| |
| // Make last char invalid (can't be a range start) |
| last = CHAR_INVALID; |
| break; |
| |
| default: |
| |
| // Escape is simple so treat as a simple char |
| simpleChar = (char) c; |
| break switchOnCharacter; |
| } |
| } |
| continue; |
| |
| case '-': |
| |
| // Start a range if one isn't already started |
| if (definingRange) |
| { |
| syntaxError("Bad class range"); |
| } |
| definingRange = true; |
| |
| // If no last character, start of range is 0 |
| rangeStart = (last == CHAR_INVALID ? 0 : last); |
| |
| // Premature end of range. define up to Character.MAX_VALUE |
| if ((idx + 1) < len && pattern.charAt(++idx) == ']') |
| { |
| simpleChar = Character.MAX_VALUE; |
| break; |
| } |
| continue; |
| |
| default: |
| simpleChar = pattern.charAt(idx++); |
| break; |
| } |
| |
| // Handle simple character simpleChar |
| if (definingRange) |
| { |
| // if we are defining a range make it now |
| rangeEnd = simpleChar; |
| |
| // Actually create a range if the range is ok |
| if (rangeStart >= rangeEnd) |
| { |
| syntaxError("Bad character class"); |
| } |
| range.include(rangeStart, rangeEnd, include); |
| |
| // We are done defining the range |
| last = CHAR_INVALID; |
| definingRange = false; |
| } |
| else |
| { |
| // If simple character and not start of range, include it |
| if (idx >= len || pattern.charAt(idx) != '-') |
| { |
| range.include(simpleChar, include); |
| } |
| last = simpleChar; |
| } |
| } |
| |
| // Shouldn't be out of input |
| if (idx == len) |
| { |
| syntaxError("Unterminated character class"); |
| } |
| |
| // Absorb the ']' end of class marker |
| idx++; |
| |
| // Emit character class definition |
| instruction[ret + RE.offsetOpdata] = (char)range.num; |
| for (int i = 0; i < range.num; i++) |
| { |
| emit((char)range.minRange[i]); |
| emit((char)range.maxRange[i]); |
| } |
| return ret; |
| } |
| |
| /** |
| * Absorb an atomic character string. This method is a little tricky because |
| * it can un-include the last character of string if a closure operator follows. |
| * This is correct because *+? have higher precedence than concatentation (thus |
| * ABC* means AB(C*) and NOT (ABC)*). |
| * @return Index of new atom node |
| * @exception RESyntaxException Thrown if the regular expression has invalid syntax. |
| */ |
| int atom() throws RESyntaxException |
| { |
| // Create a string node |
| int ret = node(RE.OP_ATOM, 0); |
| |
| // Length of atom |
| int lenAtom = 0; |
| |
| // Loop while we've got input |
| |
| atomLoop: |
| |
| while (idx < len) |
| { |
| // Is there a next char? |
| if ((idx + 1) < len) |
| { |
| char c = pattern.charAt(idx + 1); |
| |
| // If the next 'char' is an escape, look past the whole escape |
| if (pattern.charAt(idx) == '\\') |
| { |
| int idxEscape = idx; |
| escape(); |
| if (idx < len) |
| { |
| c = pattern.charAt(idx); |
| } |
| idx = idxEscape; |
| } |
| |
| // Switch on next char |
| switch (c) |
| { |
| case '{': |
| case '?': |
| case '*': |
| case '+': |
| |
| // If the next character is a closure operator and our atom is non-empty, the |
| // current character should bind to the closure operator rather than the atom |
| if (lenAtom != 0) |
| { |
| break atomLoop; |
| } |
| } |
| } |
| |
| // Switch on current char |
| switch (pattern.charAt(idx)) |
| { |
| case ']': |
| case '^': |
| case '$': |
| case '.': |
| case '[': |
| case '(': |
| case ')': |
| case '|': |
| break atomLoop; |
| |
| case '{': |
| case '?': |
| case '*': |
| case '+': |
| |
| // We should have an atom by now |
| if (lenAtom == 0) |
| { |
| // No atom before closure |
| syntaxError("Missing operand to closure"); |
| } |
| break atomLoop; |
| |
| case '\\': |
| |
| { |
| // Get the escaped character (advances input automatically) |
| int idxBeforeEscape = idx; |
| int c = escape(); |
| |
| // Check if it's a simple escape (as opposed to, say, a backreference) |
| if ((c & ESC_MASK) == ESC_MASK) |
| { |
| // Not a simple escape, so backup to where we were before the escape. |
| idx = idxBeforeEscape; |
| break atomLoop; |
| } |
| |
| // Add escaped char to atom |
| emit((char) c); |
| lenAtom++; |
| } |
| break; |
| |
| default: |
| |
| // Add normal character to atom |
| emit(pattern.charAt(idx++)); |
| lenAtom++; |
| break; |
| } |
| } |
| |
| // This "shouldn't" happen |
| if (lenAtom == 0) |
| { |
| internalError(); |
| } |
| |
| // Emit the atom length into the program |
| instruction[ret + RE.offsetOpdata] = (char)lenAtom; |
| return ret; |
| } |
| |
| /** |
| * Match a terminal node. |
| * @param flags Flags |
| * @return Index of terminal node (closeable) |
| * @exception RESyntaxException Thrown if the regular expression has invalid syntax. |
| */ |
| int terminal(int[] flags) throws RESyntaxException |
| { |
| switch (pattern.charAt(idx)) |
| { |
| case RE.OP_EOL: |
| case RE.OP_BOL: |
| case RE.OP_ANY: |
| return node(pattern.charAt(idx++), 0); |
| |
| case '[': |
| return characterClass(); |
| |
| case '(': |
| return expr(flags); |
| |
| case ')': |
| syntaxError("Unexpected close paren"); |
| |
| case '|': |
| internalError(); |
| |
| case ']': |
| syntaxError("Mismatched class"); |
| |
| case 0: |
| syntaxError("Unexpected end of input"); |
| |
| case '?': |
| case '+': |
| case '{': |
| case '*': |
| syntaxError("Missing operand to closure"); |
| |
| case '\\': |
| { |
| // Don't forget, escape() advances the input stream! |
| int idxBeforeEscape = idx; |
| |
| // Switch on escaped character |
| switch (escape()) |
| { |
| case ESC_CLASS: |
| case ESC_COMPLEX: |
| flags[0] &= ~NODE_NULLABLE; |
| return node(RE.OP_ESCAPE, pattern.charAt(idx - 1)); |
| |
| case ESC_BACKREF: |
| { |
| char backreference = (char)(pattern.charAt(idx - 1) - '0'); |
| if (parens <= backreference) |
| { |
| syntaxError("Bad backreference"); |
| } |
| flags[0] |= NODE_NULLABLE; |
| return node(RE.OP_BACKREF, backreference); |
| } |
| |
| default: |
| |
| // We had a simple escape and we want to have it end up in |
| // an atom, so we back up and fall though to the default handling |
| idx = idxBeforeEscape; |
| flags[0] &= ~NODE_NULLABLE; |
| break; |
| } |
| } |
| } |
| |
| // Everything above either fails or returns. |
| // If it wasn't one of the above, it must be the start of an atom. |
| flags[0] &= ~NODE_NULLABLE; |
| return atom(); |
| } |
| |
| /** |
| * Compile a possibly closured terminal |
| * @param flags Flags passed by reference |
| * @return Index of closured node |
| * @exception RESyntaxException Thrown if the regular expression has invalid syntax. |
| */ |
| int closure(int[] flags) throws RESyntaxException |
| { |
| // Before terminal |
| int idxBeforeTerminal = idx; |
| |
| // Values to pass by reference to terminal() |
| int[] terminalFlags = { NODE_NORMAL }; |
| |
| // Get terminal symbol |
| int ret = terminal(terminalFlags); |
| |
| // Or in flags from terminal symbol |
| flags[0] |= terminalFlags[0]; |
| |
| // Advance input, set NODE_NULLABLE flag and do sanity checks |
| if (idx >= len) |
| { |
| return ret; |
| } |
| boolean greedy = true; |
| char closureType = pattern.charAt(idx); |
| switch (closureType) |
| { |
| case '?': |
| case '*': |
| |
| // The current node can be null |
| flags[0] |= NODE_NULLABLE; |
| |
| case '+': |
| |
| // Eat closure character |
| idx++; |
| |
| case '{': |
| |
| // Don't allow blantant stupidity |
| int opcode = instruction[ret + RE.offsetOpcode]; |
| if (opcode == RE.OP_BOL || opcode == RE.OP_EOL) |
| { |
| syntaxError("Bad closure operand"); |
| } |
| if ((terminalFlags[0] & NODE_NULLABLE) != 0) |
| { |
| syntaxError("Closure operand can't be nullable"); |
| } |
| break; |
| } |
| |
| // If the next character is a '?', make the closure non-greedy (reluctant) |
| if (idx < len && pattern.charAt(idx) == '?') |
| { |
| idx++; |
| greedy = false; |
| } |
| |
| if (greedy) |
| { |
| // Actually do the closure now |
| switch (closureType) |
| { |
| case '{': |
| { |
| // We look for our bracket in the list |
| boolean found = false; |
| int i; |
| allocBrackets(); |
| for (i = 0; i < brackets; i++) |
| { |
| if (bracketStart[i] == idx) |
| { |
| found = true; |
| break; |
| } |
| } |
| |
| // If its not in the list we parse the {m,n} |
| if (!found) |
| { |
| if (brackets >= maxBrackets) |
| { |
| reallocBrackets(); |
| } |
| bracketStart[brackets] = idx; |
| bracket(); |
| bracketEnd[brackets] = idx; |
| i = brackets++; |
| } |
| |
| // Process min first |
| if (bracketMin[i]-- > 0) |
| { |
| if (bracketMin[i] > 0 || bracketOpt[i] != 0) { |
| // Rewind stream and run it through again - more matchers coming |
| for (int j = 0; j < brackets; j++) { |
| if (j != i && bracketStart[j] < idx |
| && bracketStart[j] >= idxBeforeTerminal) |
| { |
| brackets--; |
| bracketStart[j] = bracketStart[brackets]; |
| bracketEnd[j] = bracketEnd[brackets]; |
| bracketMin[j] = bracketMin[brackets]; |
| bracketOpt[j] = bracketOpt[brackets]; |
| } |
| } |
| |
| idx = idxBeforeTerminal; |
| } else { |
| // Bug #1030: No optinal matches - no need to rewind |
| idx = bracketEnd[i]; |
| } |
| break; |
| } |
| |
| // Do the right thing for maximum ({m,}) |
| if (bracketOpt[i] == bracketUnbounded) |
| { |
| // Drop through now and closure expression. |
| // We are done with the {m,} expr, so skip rest |
| closureType = '*'; |
| bracketOpt[i] = 0; |
| idx = bracketEnd[i]; |
| } |
| else |
| if (bracketOpt[i]-- > 0) |
| { |
| if (bracketOpt[i] > 0) |
| { |
| // More optional matchers - 'play it again sam!' |
| idx = idxBeforeTerminal; |
| } else { |
| // Bug #1030: We are done - this one is last and optional |
| idx = bracketEnd[i]; |
| } |
| // Drop through to optionally close |
| closureType = '?'; |
| } |
| else |
| { |
| // Rollback terminal - neither min nor opt matchers present |
| lenInstruction = ret; |
| node(RE.OP_NOTHING, 0); |
| |
| // We are done. skip the rest of {m,n} expr |
| idx = bracketEnd[i]; |
| break; |
| } |
| } |
| |
| // Fall through! |
| |
| case '?': |
| case '*': |
| |
| if (!greedy) |
| { |
| break; |
| } |
| |
| if (closureType == '?') |
| { |
| // X? is compiled as (X|) |
| nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X |
| setNextOfEnd(ret, node (RE.OP_BRANCH, 0)); // inserted branch to option |
| int nothing = node (RE.OP_NOTHING, 0); // which is OP_NOTHING |
| setNextOfEnd(ret, nothing); // point (second) branch to OP_NOTHING |
| setNextOfEnd(ret + RE.nodeSize, nothing); // point the end of X to OP_NOTHING node |
| } |
| |
| if (closureType == '*') |
| { |
| // X* is compiled as (X{gotoX}|) |
| nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X |
| setNextOfEnd(ret + RE.nodeSize, node(RE.OP_BRANCH, 0)); // end of X points to an option |
| setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0)); // to goto |
| setNextOfEnd(ret + RE.nodeSize, ret); // the start again |
| setNextOfEnd(ret, node(RE.OP_BRANCH, 0)); // the other option is |
| setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // OP_NOTHING |
| } |
| break; |
| |
| case '+': |
| { |
| // X+ is compiled as X({gotoX}|) |
| int branch; |
| branch = node(RE.OP_BRANCH, 0); // a new branch |
| setNextOfEnd(ret, branch); // is added to the end of X |
| setNextOfEnd(node(RE.OP_GOTO, 0), ret); // one option is to go back to the start |
| setNextOfEnd(branch, node(RE.OP_BRANCH, 0)); // the other option |
| setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // is OP_NOTHING |
| } |
| break; |
| } |
| } |
| else |
| { |
| // Add end after closured subexpr |
| setNextOfEnd(ret, node(RE.OP_END, 0)); |
| |
| // Actually do the closure now |
| switch (closureType) |
| { |
| case '?': |
| nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret); |
| break; |
| |
| case '*': |
| nodeInsert(RE.OP_RELUCTANTSTAR, 0, ret); |
| break; |
| |
| case '+': |
| nodeInsert(RE.OP_RELUCTANTPLUS, 0, ret); |
| break; |
| } |
| |
| // Point to the expr after the closure |
| setNextOfEnd(ret, lenInstruction); |
| } |
| return ret; |
| } |
| |
| /** |
| * Compile one branch of an or operator (implements concatenation) |
| * @param flags Flags passed by reference |
| * @return Pointer to branch node |
| * @exception RESyntaxException Thrown if the regular expression has invalid syntax. |
| */ |
| int branch(int[] flags) throws RESyntaxException |
| { |
| // Get each possibly closured piece and concat |
| int node; |
| int ret = node(RE.OP_BRANCH, 0); |
| int chain = -1; |
| int[] closureFlags = new int[1]; |
| boolean nullable = true; |
| while (idx < len && pattern.charAt(idx) != '|' && pattern.charAt(idx) != ')') |
| { |
| // Get new node |
| closureFlags[0] = NODE_NORMAL; |
| node = closure(closureFlags); |
| if (closureFlags[0] == NODE_NORMAL) |
| { |
| nullable = false; |
| } |
| |
| // If there's a chain, append to the end |
| if (chain != -1) |
| { |
| setNextOfEnd(chain, node); |
| } |
| |
| // Chain starts at current |
| chain = node; |
| } |
| |
| // If we don't run loop, make a nothing node |
| if (chain == -1) |
| { |
| node(RE.OP_NOTHING, 0); |
| } |
| |
| // Set nullable flag for this branch |
| if (nullable) |
| { |
| flags[0] |= NODE_NULLABLE; |
| } |
| return ret; |
| } |
| |
| /** |
| * Compile an expression with possible parens around it. Paren matching |
| * is done at this level so we can tie the branch tails together. |
| * @param flags Flag value passed by reference |
| * @return Node index of expression in instruction array |
| * @exception RESyntaxException Thrown if the regular expression has invalid syntax. |
| */ |
| int expr(int[] flags) throws RESyntaxException |
| { |
| // Create open paren node unless we were called from the top level (which has no parens) |
| int paren = -1; |
| int ret = -1; |
| int closeParens = parens; |
| if ((flags[0] & NODE_TOPLEVEL) == 0 && pattern.charAt(idx) == '(') |
| { |
| // if its a cluster ( rather than a proper subexpression ie with backrefs ) |
| if ( idx + 2 < len && pattern.charAt( idx + 1 ) == '?' && pattern.charAt( idx + 2 ) == ':' ) |
| { |
| paren = 2; |
| idx += 3; |
| ret = node( RE.OP_OPEN_CLUSTER, 0 ); |
| } |
| else |
| { |
| paren = 1; |
| idx++; |
| ret = node(RE.OP_OPEN, parens++); |
| } |
| } |
| flags[0] &= ~NODE_TOPLEVEL; |
| |
| // Create a branch node |
| int branch = branch(flags); |
| if (ret == -1) |
| { |
| ret = branch; |
| } |
| else |
| { |
| setNextOfEnd(ret, branch); |
| } |
| |
| // Loop through branches |
| while (idx < len && pattern.charAt(idx) == '|') |
| { |
| idx++; |
| branch = branch(flags); |
| setNextOfEnd(ret, branch); |
| } |
| |
| // Create an ending node (either a close paren or an OP_END) |
| int end; |
| if ( paren > 0 ) |
| { |
| if (idx < len && pattern.charAt(idx) == ')') |
| { |
| idx++; |
| } |
| else |
| { |
| syntaxError("Missing close paren"); |
| } |
| if ( paren == 1 ) |
| { |
| end = node(RE.OP_CLOSE, closeParens); |
| } |
| else |
| { |
| end = node( RE.OP_CLOSE_CLUSTER, 0 ); |
| } |
| } |
| else |
| { |
| end = node(RE.OP_END, 0); |
| } |
| |
| // Append the ending node to the ret nodelist |
| setNextOfEnd(ret, end); |
| |
| // Hook the ends of each branch to the end node |
| int currentNode = ret; |
| int nextNodeOffset = instruction[ currentNode + RE.offsetNext ]; |
| // while the next node o |
| while ( nextNodeOffset != 0 && currentNode < lenInstruction ) |
| { |
| // If branch, make the end of the branch's operand chain point to the end node. |
| if ( instruction[ currentNode + RE.offsetOpcode ] == RE.OP_BRANCH ) |
| { |
| setNextOfEnd( currentNode + RE.nodeSize, end ); |
| } |
| nextNodeOffset = instruction[ currentNode + RE.offsetNext ]; |
| currentNode += nextNodeOffset; |
| } |
| |
| // Return the node list |
| return ret; |
| } |
| |
| /** |
| * Compiles a regular expression pattern into a program runnable by the pattern |
| * matcher class 'RE'. |
| * @param pattern Regular expression pattern to compile (see RECompiler class |
| * for details). |
| * @return A compiled regular expression program. |
| * @exception RESyntaxException Thrown if the regular expression has invalid syntax. |
| * @see RECompiler |
| * @see RE |
| */ |
| public REProgram compile(String pattern) throws RESyntaxException |
| { |
| // Initialize variables for compilation |
| this.pattern = pattern; // Save pattern in instance variable |
| len = pattern.length(); // Precompute pattern length for speed |
| idx = 0; // Set parsing index to the first character |
| lenInstruction = 0; // Set emitted instruction count to zero |
| parens = 1; // Set paren level to 1 (the implicit outer parens) |
| brackets = 0; // No bracketed closures yet |
| |
| // Initialize pass by reference flags value |
| int[] flags = { NODE_TOPLEVEL }; |
| |
| // Parse expression |
| expr(flags); |
| |
| // Should be at end of input |
| if (idx != len) |
| { |
| if (pattern.charAt(idx) == ')') |
| { |
| syntaxError("Unmatched close paren"); |
| } |
| syntaxError("Unexpected input remains"); |
| } |
| |
| // Return the result |
| char[] ins = new char[lenInstruction]; |
| System.arraycopy(instruction, 0, ins, 0, lenInstruction); |
| return new REProgram(parens, ins); |
| } |
| |
| /** |
| * Local, nested class for maintaining character ranges for character classes. |
| */ |
| class RERange |
| { |
| int size = 16; // Capacity of current range arrays |
| int[] minRange = new int[size]; // Range minima |
| int[] maxRange = new int[size]; // Range maxima |
| int num = 0; // Number of range array elements in use |
| |
| /** |
| * Deletes the range at a given index from the range lists |
| * @param index Index of range to delete from minRange and maxRange arrays. |
| */ |
| void delete(int index) |
| { |
| // Return if no elements left or index is out of range |
| if (num == 0 || index >= num) |
| { |
| return; |
| } |
| |
| // Move elements down |
| while (++index < num) |
| { |
| if (index - 1 >= 0) |
| { |
| minRange[index-1] = minRange[index]; |
| maxRange[index-1] = maxRange[index]; |
| } |
| } |
| |
| // One less element now |
| num--; |
| } |
| |
| /** |
| * Merges a range into the range list, coalescing ranges if possible. |
| * @param min Minimum end of range |
| * @param max Maximum end of range |
| */ |
| void merge(int min, int max) |
| { |
| // Loop through ranges |
| for (int i = 0; i < num; i++) |
| { |
| // Min-max is subsumed by minRange[i]-maxRange[i] |
| if (min >= minRange[i] && max <= maxRange[i]) |
| { |
| return; |
| } |
| |
| // Min-max subsumes minRange[i]-maxRange[i] |
| else if (min <= minRange[i] && max >= maxRange[i]) |
| { |
| delete(i); |
| merge(min, max); |
| return; |
| } |
| |
| // Min is in the range, but max is outside |
| else if (min >= minRange[i] && min <= maxRange[i]) |
| { |
| delete(i); |
| min = minRange[i]; |
| merge(min, max); |
| return; |
| } |
| |
| // Max is in the range, but min is outside |
| else if (max >= minRange[i] && max <= maxRange[i]) |
| { |
| delete(i); |
| max = maxRange[i]; |
| merge(min, max); |
| return; |
| } |
| } |
| |
| // Must not overlap any other ranges |
| if (num >= size) |
| { |
| size *= 2; |
| int[] newMin = new int[size]; |
| int[] newMax = new int[size]; |
| System.arraycopy(minRange, 0, newMin, 0, num); |
| System.arraycopy(maxRange, 0, newMax, 0, num); |
| minRange = newMin; |
| maxRange = newMax; |
| } |
| minRange[num] = min; |
| maxRange[num] = max; |
| num++; |
| } |
| |
| /** |
| * Removes a range by deleting or shrinking all other ranges |
| * @param min Minimum end of range |
| * @param max Maximum end of range |
| */ |
| void remove(int min, int max) |
| { |
| // Loop through ranges |
| for (int i = 0; i < num; i++) |
| { |
| // minRange[i]-maxRange[i] is subsumed by min-max |
| if (minRange[i] >= min && maxRange[i] <= max) |
| { |
| delete(i); |
| i--; |
| return; |
| } |
| |
| // min-max is subsumed by minRange[i]-maxRange[i] |
| else if (min >= minRange[i] && max <= maxRange[i]) |
| { |
| int minr = minRange[i]; |
| int maxr = maxRange[i]; |
| delete(i); |
| if (minr < min) |
| { |
| merge(minr, min - 1); |
| } |
| if (max < maxr) |
| { |
| merge(max + 1, maxr); |
| } |
| return; |
| } |
| |
| // minRange is in the range, but maxRange is outside |
| else if (minRange[i] >= min && minRange[i] <= max) |
| { |
| minRange[i] = max + 1; |
| return; |
| } |
| |
| // maxRange is in the range, but minRange is outside |
| else if (maxRange[i] >= min && maxRange[i] <= max) |
| { |
| maxRange[i] = min - 1; |
| return; |
| } |
| } |
| } |
| |
| /** |
| * Includes (or excludes) the range from min to max, inclusive. |
| * @param min Minimum end of range |
| * @param max Maximum end of range |
| * @param include True if range should be included. False otherwise. |
| */ |
| void include(int min, int max, boolean include) |
| { |
| if (include) |
| { |
| merge(min, max); |
| } |
| else |
| { |
| remove(min, max); |
| } |
| } |
| |
| /** |
| * Includes a range with the same min and max |
| * @param minmax Minimum and maximum end of range (inclusive) |
| * @param include True if range should be included. False otherwise. |
| */ |
| void include(char minmax, boolean include) |
| { |
| include(minmax, minmax, include); |
| } |
| } |
| } |