blob: 31812c9babb1440d9a626d46f8c08821c70bf576 [file] [log] [blame]
// Copyright 2008 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.common.labs.matcher;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.google.common.base.Preconditions;
/**
* This class parses a Google URL pattern into an immutable representation that
* provides equivalent Java regexes,
* exact-match patterns and prefix patterns, as appropriate. For a description
* of Google URL patterns, see the
* documentation in <a
* href="http://code.google.com/apis/searchappliance/documentation/50/admin/URL_patterns.html">
* this document</a>.
* <p>
* All Google URL patterns can be translated into an equivalent Java regex (with
* some exceptions and caveats, see below). This class provides access to an
* equivalent Java regex through {@link #getUrlRegex()}.
* <p>
* In addition, the class provides further analysis and special kinds of
* patterns, depending on these top-level predicates:
* <ul>
* <li> {@link #isHostPathType()}: Returns {@code true} if the parsed pattern
* is a "host-path" pattern. A "host-path" pattern is a pattern that can be
* parsed into two regexes, a host regex and a path regex, such that a subject
* URL matches the original URL pattern iff the host portion matches the host
* regex and the path portion matches the path regex. If
* {@code isHostPathType()} is true, then {@link #getHostRegex()} and
* {@link #getPathRegex()} return the corresponding regexes. </li>
* <li> {@link #isPathPrefixMatch()}: Returns {@code true} if the parsed
* pattern is a "host-path" pattern and the path portion of the pattern is
* simply a fixed string that must appear at the beginning of the path. In this
* case, {@link #getPathPrefixString()} returns a simple string (not a regex)
* that can be matched against the start of the subject URL's path. </li>
* <li> {@link #isPathExactMatch()}: Returns {@code true} if the parsed
* pattern is a "host-path" pattern and the path portion of the pattern is an
* exact-match string. In this case, {@link #getPathExactString()} returns a
* simple string (not a regex) that can be matched exactly against the subject
* URL's path. </li>
* </ul>
* In summary:
* <ul>
* <li> {@code getUrlRegex()} provides an equivalent Java regex for the entire
* pattern. </li>
* <li> If {@code isHostPathType()} is true, then, {@code getHostRegex()} and
* {@code getPathRegex()} return regexes for the two portions.</li>
* <li> If {@code isPathPrefixMatch()} is true, then,
* {@code getPathPrefixString()} returns a simple string pattern for
* prefix match.</li>
* <li> If {@code isPathExactMatch()} is true, then, in addition,
* {@code getPathExactString()} returns a simple string pattern for exact
* match.</li>
* </ul>
* <p>
* Note: the "path" portion is the hierarchical part, that is, everything
* following the first slash (not the {@code ://}). The "host" portion is
* everything before that. For example: for the URL
* {@code http://www.example.com/foo/bar}, the host portion is
* {@code http://www.example.com/} and the path portion is {@code /foo/bar}.
* Note that the middle slash appears in both portions.
* <p>
* A parser is provided to separate a URL string into host and path portions:
* {@link AnalyzedUrl}. You can access the host and path portions through
* {@link AnalyzedUrl#getHostPart()} and {@link AnalyzedUrl#getPathPart()}. It
* is recommended that this parser be used rather than the standard
* {@code getHost()} and {@code getPath()} functions of {@link java.net.URL},
* because this class and {@code AnalyzedUrl} share parsing infrastructure and
* at present, there is at least one significant difference:
* {@code AnalyzedUrl.getPathPart()} includes the leading slash but
* {@code java.net.URL.getPath()} does not. TODO: fix this.
* <p>
* Exceptions and caveats: not all forms of Google URL patterns are currently
* supported. At present, these exceptions and special cases apply:
* <ul>
* <li> {@code www?:} patterns are not supported </li>
* <li> {@code regexp:} and {@code regexpCase:} patterns are translated simply
* by removing those two prefixes. Thus, the remaining pattern is assumed to be
* a Java regex, not a GNU regex (as documented on the <a
* href="http://code.google.com/apis/searchappliance/documentation/50/admin/URL_patterns.html">
* reference site</a>). </li>
* <li> {@code regexpIgnoreCase:} patterns are handled similarly. In this case,
* the prefix is removed and the pattern is enclosed in {@code (?i:}...{@code )}</li>
* <li> Exception patterns (patterns with leading {@code -} or {@code +-}) are
* not supported.</li>
* </ul>
*/
public class ParsedUrlPattern {

  private final String urlPattern;
  private final String urlRegex;
  private final boolean hostPathType;
  private final String hostRegex;
  private final String pathRegex;
  private final boolean pathExactMatch;
  private final String pathExactMatchPattern;
  private final boolean prefixPathMatch;
  private final String prefixPathMatchPattern;

  /**
   * Parses a Google URL pattern to Java regexes. Google URL patterns are
   * publicly documented <a
   * href="http://code.google.com/apis/searchappliance/documentation/50/admin/URL_patterns.html">
   * here</a>.
   *
   * @param urlPattern A Google URL pattern
   * @throws NullPointerException if {@code urlPattern} is null
   * @throws IllegalArgumentException if the URL pattern is unsupported
   *         ({@code www?:} patterns and exception patterns starting with
   *         {@code -} or {@code +-}) or can not be parsed
   */
  public ParsedUrlPattern(String urlPattern) {
    // All parsing happens in the mutable builder; this object only keeps the
    // results, so it is immutable once constructed.
    ParsedUrlPatternBuilder t = new ParsedUrlPatternBuilder(urlPattern);
    this.urlPattern = t.urlPattern;
    this.urlRegex = t.urlRegex;
    this.hostPathType = t.hostPathType;
    this.hostRegex = t.hostRegex;
    this.pathRegex = t.pathRegex;
    this.pathExactMatch = t.pathExactMatch;
    this.pathExactMatchPattern = t.pathExactMatchPattern;
    this.prefixPathMatch = t.prefixPathMatch;
    this.prefixPathMatchPattern = t.prefixPathMatchPattern;
  }

  /**
   * Returns a regex that matches the entire URL. A subject string matches the
   * URL pattern iff it matches this regex.
   *
   * @return a regex that matches the entire URL
   */
  public String getUrlRegex() {
    return urlRegex;
  }

  /**
   * Returns {@code true} if the parsed pattern is a "host-path" pattern. A
   * "host-path" pattern is a pattern that can be parsed into two regexes, a
   * host regex and a path regex, such that a subject url matches the pattern
   * iff the host portion matches the host regex and the path portion matches
   * the path regex.
   * <p>
   * For example, the pattern {@code example.com/foo} might be parsed into two
   * regexes, host regex: {@code example.com/$} and path regex: {@code ^/foo}.
   *
   * @return {@code true} if the parsed pattern is a "host-path" pattern
   */
  public boolean isHostPathType() {
    return hostPathType;
  }

  /**
   * Returns a regex that matches the host (protocol and authority) portion of
   * the URL. If this is a host-path regex then a subject string matches the url
   * pattern iff the host portion matches this regex and the path portion
   * matches the corresponding path regex (obtained by {@link #getPathRegex()}).
   * <p>
   * This should be used against URLs that have been parsed using the
   * {@link AnalyzedUrl} class.
   * <p>
   * Note: this should only be used if {@code isHostPathType()} is true; if not,
   * then this method throws an {@code IllegalStateException}.
   *
   * @return a regex that matches the host (protocol and authority) portion of
   *         the URL
   * @throws IllegalStateException if {@code isHostPathType()} is false
   */
  public String getHostRegex() {
    if (!isHostPathType()) {
      throw new IllegalStateException("not a host-path pattern: " + urlPattern);
    }
    return hostRegex;
  }

  /**
   * Returns a regex that matches the path (hierarchical) portion of the URL.
   * <p>
   * This should be used against URLs that have been parsed using the
   * {@link AnalyzedUrl} class.
   * <p>
   * Note: this should only be used if {@link #isHostPathType()} is true; if
   * not, then this method throws an {@code IllegalStateException}.
   *
   * @return a regex that matches the path (hierarchical) portion of the URL
   * @throws IllegalStateException if {@code isHostPathType()} is false
   */
  public String getPathRegex() {
    if (!isHostPathType()) {
      throw new IllegalStateException("not a host-path pattern: " + urlPattern);
    }
    return pathRegex;
  }

  /**
   * Indicates whether the parsed pattern gives a prefix match pattern. If this
   * is true, then this pattern can be obtained using
   * {@link #getPathPrefixString()}.
   *
   * @return {@code true} if the parsed pattern gives a prefix match pattern.
   */
  public boolean isPathPrefixMatch() {
    return prefixPathMatch;
  }

  /**
   * If {@link #isPathPrefixMatch()} is true, then this returns a simple string
   * that can be matched against the path portion of a subject string using
   * {@link String#startsWith(String)}.
   * <p>
   * Note: this should only be used if {@code isPathPrefixMatch()} is true; if
   * not, then this method throws an {@code IllegalStateException}.
   *
   * @return a string that matches a prefix of the path portion of the URL
   * @throws IllegalStateException if {@code isPathPrefixMatch()} is false
   */
  public String getPathPrefixString() {
    if (!isPathPrefixMatch()) {
      throw new IllegalStateException("not a path-prefix pattern: " + urlPattern);
    }
    return prefixPathMatchPattern;
  }

  /**
   * Returns whether the parsed pattern gives an exact match pattern. If this is
   * true, then this pattern can be obtained using {@link #getPathExactString()}.
   *
   * @return {@code true} if the parsed pattern gives an exact match pattern.
   */
  public boolean isPathExactMatch() {
    return pathExactMatch;
  }

  /**
   * If {@link #isPathExactMatch()} is true, then this returns a simple string
   * that can be matched against the path portion of a subject string using
   * {@link String#equals(Object)}. Note: this should only be used if
   * {@code isPathExactMatch()} is true; if not, then this method throws an
   * {@code IllegalStateException}.
   *
   * @return a string that matches the entire path
   * @throws IllegalStateException if {@code isPathExactMatch()} is false
   */
  public String getPathExactString() {
    if (!isPathExactMatch()) {
      throw new IllegalStateException("not a path-exact pattern: " + urlPattern);
    }
    return pathExactMatchPattern;
  }

  /**
   * Returns the original URL pattern.
   *
   * @return the original URL pattern.
   */
  public String getUrlPattern() {
    return urlPattern;
  }

  // This is the master meta-regex. This is used both for parsing URL patterns
  // and for parsing URLs
  private static final String URL_METAPATTERN_STRING =
      "\\A(\\^)?((?:([^/:$<]*)((?:(?::|(?::/))?\\Z)|(?:://)))?" +
      // ___1_____2a__3_________4b__c____d____________e
      "(?:([^/:@]*)@)?([^/:<]*)?(?::([^/<]*))?)(/|(?:</>))?(?:(.*?)(\\Z|\\$)?)?\\Z"
      // f__5___________6_________g___7__________8__h________i__9____0
      ;
  // Groups: (capturing groups are numbered, non-capturing are lettered)
  // 1 anchor (^)
  // 2 protocol + authority (not including /)
  // a protocol + ((nothing or : or :/ followed by end of pattern) or ://)
  // 3 protocol
  // 4 protocol separator ((nothing or : or :/ followed by end of pattern) or
  // ://)
  // b nothing or : or :/ followed by end of pattern
  // c : or :/
  // d :/
  // e ://
  // f userinfo + @
  // 5 userinfo
  // 6 host
  // g : + port
  // 7 port
  // 8 slash (after authority) (could be a slash or "</>")
  // h </>
  // i file + anchor
  // 9 file
  // 10 anchor ($)

  // This Pattern is package visible so it can be used by AnalyzedUrl
  static final Pattern URL_METAPATTERN = Pattern.compile(URL_METAPATTERN_STRING);

  // As above, the enum is package visible so it can be used by AnalyzedUrl
  // Note: if you change the master regex, you should change this enum to match
  enum MetaRegexGroup {
    LEFT_ANCHOR(1), PROTOCOL_AUTHORITY(2), PROTOCOL(3), PROTOCOL_SEPARATOR(4), USERINFO(5),
    HOST(6), PORT(7), SLASH_AFTER_AUTHORITY(8), FILE(9), RIGHT_ANCHOR(10);

    // Index of the capturing group in URL_METAPATTERN.
    private final int n;

    MetaRegexGroup(int n) {
      this.n = n;
    }

    int intValue() {
      return n;
    }
  }

  /**
   * Returns the text captured by group {@code g} of {@code m}, or the empty
   * string if that group did not participate in the match. This static helper
   * is also shared with the AnalyzedUrl.
   */
  static String getGroup(Matcher m, MetaRegexGroup g) {
    String s = m.group(g.intValue());
    return (s == null) ? "" : s;
  }

  /**
   * Mutable scratch object that validates and analyzes a URL pattern inside
   * its constructor. The enclosing class copies the resulting fields into its
   * own final fields.
   */
  private static class ParsedUrlPatternBuilder {
    // suffix patterns that contain no slash jam up my master meta-regex: the
    // string before the $ gets put in the wrong capturing group. I fought with
    // it a while but then bailed and just made a special meta-regex for them
    private static final String NO_SLASH_SUFFIX_PATTERN_STRING = "\\A([^/]*)\\$\\Z";
    private static final Pattern NO_SLASH_SUFFIX_PATTERN =
        Pattern.compile(NO_SLASH_SUFFIX_PATTERN_STRING);

    private static final String CONTAINS_PATTERNS_METAPATTERN_PREFIX = "contains:";
    private static final String REGEXP_PATTERNS_METAPATTERN_PREFIX = "regexp:";
    private static final String REGEXPCASE_PATTERNS_METAPATTERN_PREFIX = "regexpCase:";
    private static final String REGEXPIGNORECASE_PATTERNS_METAPATTERN_PREFIX = "regexpIgnoreCase:";

    // Unsupported forms: "www?:" patterns and exception patterns, i.e. patterns
    // with a leading "-" or a leading "+-".
    private static final String UNSUPPORTED_PATTERNS_METAPATTERN_STRING =
        "\\A(?:(www\\?:)|(\\+?-))";
    private static final Pattern UNSUPPORTED_PATTERNS_METAPATTERN =
        Pattern.compile(UNSUPPORTED_PATTERNS_METAPATTERN_STRING);

    private static final String OUTPUT_RIGHT_ANCHOR_PATTERN_STRING = "\\Z";
    private static final String OUTPUT_LEFT_ANCHOR_PATTERN_STRING = "\\A";
    private static final String OUTPUT_SLASH = "/";
    private static final String OUTPUT_ANY_OR_NO_PORT_PATTERN = "(\\:[^/]*)?";
    private static final String OUTPUT_ANY_PORT_PATTERN = "\\:[^/]*";

    String urlPattern;
    String urlRegex;
    boolean hostPathType;
    String hostRegex;
    String pathRegex;
    boolean pathExactMatch;
    String pathExactMatchPattern;
    boolean prefixPathMatch;
    String prefixPathMatchPattern;

    /**
     * Validates and then fully analyzes {@code urlPattern}.
     *
     * @throws NullPointerException if {@code urlPattern} is null
     * @throws IllegalArgumentException if the pattern is unsupported or can
     *         not be parsed
     */
    ParsedUrlPatternBuilder(String urlPattern) {
      checkPatternValidity(urlPattern);
      this.urlPattern = urlPattern;
      analyze();
    }

    /**
     * Classifies the pattern and fills in every result field. The prefixed
     * forms (contains:, regexp:, regexpCase:, regexpIgnoreCase:) yield only a
     * whole-URL regex; everything else is treated as a host-path pattern.
     */
    private void analyze() {
      if (urlPattern.startsWith(CONTAINS_PATTERNS_METAPATTERN_PREFIX)) {
        // The remainder is a literal substring, so it is quoted.
        initNonHostPathPattern(
            Pattern.quote(remainderAfter(CONTAINS_PATTERNS_METAPATTERN_PREFIX)));
        return;
      }
      if (urlPattern.startsWith(REGEXP_PATTERNS_METAPATTERN_PREFIX)) {
        // The remainder is used verbatim as a Java regex.
        initNonHostPathPattern(remainderAfter(REGEXP_PATTERNS_METAPATTERN_PREFIX));
        return;
      }
      if (urlPattern.startsWith(REGEXPCASE_PATTERNS_METAPATTERN_PREFIX)) {
        initNonHostPathPattern(remainderAfter(REGEXPCASE_PATTERNS_METAPATTERN_PREFIX));
        return;
      }
      if (urlPattern.startsWith(REGEXPIGNORECASE_PATTERNS_METAPATTERN_PREFIX)) {
        initNonHostPathPattern(
            "(?i:" + remainderAfter(REGEXPIGNORECASE_PATTERNS_METAPATTERN_PREFIX) + ")");
        return;
      }

      initHostPathPattern();
      if (isNullOrEmpty(urlPattern)) {
        // The empty pattern keeps the empty regexes set above and is reported
        // as a prefix match on "/".
        prefixPathMatch = true;
        return;
      }
      if (testForAndHandleNoSlashSuffixPattern()) {
        return;
      }
      Matcher m = URL_METAPATTERN.matcher(urlPattern);
      if (!m.find()) {
        throw new IllegalArgumentException("problem parsing urlpattern: " + urlPattern);
      }
      urlRegex = buildUrlRegex(m);
      // buildPathRegex also sets the exact/prefix match fields as a side effect.
      pathRegex = buildPathRegex(m);
      hostRegex = buildHostRegex(m);
    }

    /** Returns the part of the URL pattern that follows {@code prefix}. */
    private String remainderAfter(String prefix) {
      return urlPattern.substring(prefix.length());
    }

    /**
     * Records {@code regex} as the whole-URL regex of a pattern that has no
     * host/path decomposition and clears every host/path related field.
     */
    private void initNonHostPathPattern(String regex) {
      urlRegex = regex;
      hostPathType = false;
      pathRegex = null;
      hostRegex = null;
      pathExactMatch = false;
      pathExactMatchPattern = null;
      prefixPathMatch = false;
      prefixPathMatchPattern = null;
    }

    /** Sets the defaults for a host-path pattern before detailed parsing. */
    private void initHostPathPattern() {
      hostPathType = true;
      urlRegex = "";
      pathRegex = "";
      hostRegex = "";
      pathExactMatch = false;
      pathExactMatchPattern = null;
      prefixPathMatch = false;
      prefixPathMatchPattern = "/";
    }

    /**
     * Handles a suffix pattern (ends in $) that has no slash; such a pattern
     * just doesn't parse well with the metapattern, so a special pattern is
     * used for this case.
     *
     * @return {@code true} if the pattern was of this form and all fields have
     *         been set; {@code false} if nothing was changed
     */
    private boolean testForAndHandleNoSlashSuffixPattern() {
      Matcher m = NO_SLASH_SUFFIX_PATTERN.matcher(urlPattern);
      if (!m.find()) {
        return false;
      }
      urlRegex = Pattern.quote(m.group(1)) + OUTPUT_RIGHT_ANCHOR_PATTERN_STRING;
      // The same suffix regex is used for the path; the empty host regex
      // places no constraint on the host portion.
      pathRegex = urlRegex;
      hostRegex = "";
      pathExactMatch = false;
      pathExactMatchPattern = null;
      prefixPathMatch = false;
      prefixPathMatchPattern = null;
      return true;
    }

    private static boolean isNullOrEmpty(String s) {
      return s == null || s.length() == 0;
    }

    // These helper functions whose names match buildSOMETHINGPattern build a
    // regex to match the SOMETHING in their names. They should be usable,
    // appropriately quoted regexes

    /**
     * Returns the quoted concatenation of protocol, protocol separator,
     * userinfo (plus "@") and host, or "" if all of them are empty.
     */
    private static String buildProtocolUserinfoHostPattern(Matcher m) {
      StringBuilder sb = new StringBuilder();
      sb.append(getGroup(m, MetaRegexGroup.PROTOCOL));
      sb.append(getGroup(m, MetaRegexGroup.PROTOCOL_SEPARATOR));
      String userInfo = getGroup(m, MetaRegexGroup.USERINFO);
      if (!isNullOrEmpty(userInfo)) {
        sb.append(userInfo).append("@");
      }
      sb.append(getGroup(m, MetaRegexGroup.HOST));
      String unquotedPattern = sb.toString();
      return isNullOrEmpty(unquotedPattern) ? "" : Pattern.quote(unquotedPattern);
    }

    /**
     * Returns a regex for the port. Port is tricky because the absence of a
     * port in a pattern should match any specific port in a target. The result
     * is never empty.
     */
    private static String buildPortPattern(Matcher m) {
      String port = getGroup(m, MetaRegexGroup.PORT);
      if (isNullOrEmpty(port)) {
        // port was empty - match any port - default or explicit
        return OUTPUT_ANY_OR_NO_PORT_PATTERN;
      }
      if (port.equals("*")) {
        // port was explicitly "*" - match any explicitly specified port
        return OUTPUT_ANY_PORT_PATTERN;
      }
      // port was explicit and not "*" - match only that port
      return "\\:" + Pattern.quote(port);
    }

    /** Returns the raw file part of the pattern, or "" if there is none. */
    private static String buildUnquotedFilePattern(Matcher m) {
      return getGroup(m, MetaRegexGroup.FILE);
    }

    /** Returns the quoted file part of the pattern, or "" if there is none. */
    private static String buildQuotedFilePattern(Matcher m) {
      String unquotedPattern = buildUnquotedFilePattern(m);
      return isNullOrEmpty(unquotedPattern) ? "" : Pattern.quote(unquotedPattern);
    }

    // The helper functions whose names match buildSOMETHINGRegex each build one
    // of the three public regexes: the urlRegex, the hostRegex and the
    // pathRegex.
    // The main reason that the urlRegex is not simply the concatenation of the
    // hostRegex and the pathRegex is the anchors. Both for correctness and
    // efficiency, we want to use anchors only where appropriate:
    // using \A.*foo is considerably slower than just using foo.

    /** Builds the regex that matches the pattern against an entire URL. */
    private String buildUrlRegex(Matcher m) {
      StringBuilder sb = new StringBuilder();
      String leftAnchor = getGroup(m, MetaRegexGroup.LEFT_ANCHOR);
      String slashAfterAuthority = getGroup(m, MetaRegexGroup.SLASH_AFTER_AUTHORITY);

      // prefix patterns need to be handled specially
      if (!isNullOrEmpty(leftAnchor)) {
        sb.append(OUTPUT_LEFT_ANCHOR_PATTERN_STRING);
      }
      sb.append(buildProtocolUserinfoHostPattern(m));
      // The port pattern only makes sense after an anchor or a host.
      // NOTE(review): this also drops an explicit port when the host is empty
      // and there is no "^" (e.g. ":8080/foo"), unlike buildHostRegex -
      // confirm whether that is intended.
      if (sb.length() > 0) {
        sb.append(buildPortPattern(m));
      }
      if (!isNullOrEmpty(slashAfterAuthority)) {
        if ("</>".equals(slashAfterAuthority) && sb.length() < 1) {
          // "</>" with nothing in front of it: anchor and skip over any
          // protocol and authority so the slash is the one after the authority.
          sb.append(OUTPUT_LEFT_ANCHOR_PATTERN_STRING);
          sb.append("[^/]*//[^/]*");
        }
        sb.append(OUTPUT_SLASH);
      }
      sb.append(buildQuotedFilePattern(m));
      // The right anchor is "" or "$"; a "$" is appended verbatim (unlike
      // buildPathRegex, which emits \Z).
      sb.append(getGroup(m, MetaRegexGroup.RIGHT_ANCHOR));
      return sb.toString();
    }

    /** Builds the regex that matches the host portion of a URL. */
    private String buildHostRegex(Matcher m) {
      StringBuilder sb = new StringBuilder();
      String leftAnchor = getGroup(m, MetaRegexGroup.LEFT_ANCHOR);
      String slashAfterAuthority = getGroup(m, MetaRegexGroup.SLASH_AFTER_AUTHORITY);

      // prefix patterns need to be handled specially
      if (!isNullOrEmpty(leftAnchor)) {
        sb.append(OUTPUT_LEFT_ANCHOR_PATTERN_STRING);
      }
      sb.append(buildProtocolUserinfoHostPattern(m));
      sb.append(buildPortPattern(m));
      if (!isNullOrEmpty(slashAfterAuthority)) {
        sb.append(OUTPUT_SLASH);
      }
      return sb.toString();
    }

    /**
     * Builds the regex that matches the path portion of a URL and, when the
     * path is left-anchored, records the equivalent exact-match or
     * prefix-match string.
     * <p>
     * We expect that, in practice, the pathRegex will be used much more often
     * than the hostRegex (there will probably be a hashtable for the host
     * portion), so we really want to make sure that the pathRegexes are simple
     * prefix matches, as often as possible.
     */
    private String buildPathRegex(Matcher m) {
      StringBuilder sb = new StringBuilder();
      String protocolAuthority = getGroup(m, MetaRegexGroup.PROTOCOL_AUTHORITY);
      String slashAfterAuthority = getGroup(m, MetaRegexGroup.SLASH_AFTER_AUTHORITY);
      String unquotedFilePattern = buildUnquotedFilePattern(m);

      // two conditions for this being a prefix pattern:
      // either there was a protocolAuthority OR there was a </>
      // slashAfterAuthority
      boolean hasLeftAnchor =
          !isNullOrEmpty(protocolAuthority) || "</>".equals(slashAfterAuthority);
      boolean hasRightAnchor = !isNullOrEmpty(getGroup(m, MetaRegexGroup.RIGHT_ANCHOR));

      if (hasLeftAnchor) {
        sb.append(OUTPUT_LEFT_ANCHOR_PATTERN_STRING);
      }
      if (!isNullOrEmpty(slashAfterAuthority)) {
        sb.append(OUTPUT_SLASH);
      }
      // Quoted unconditionally, so an empty file part contributes "\Q\E".
      sb.append(Pattern.quote(unquotedFilePattern));
      if (hasRightAnchor) {
        sb.append(OUTPUT_RIGHT_ANCHOR_PATTERN_STRING);
      }

      // Without a left anchor the defaults from initHostPathPattern() stay.
      if (hasLeftAnchor) {
        String plainPath = "/" + unquotedFilePattern;
        pathExactMatch = hasRightAnchor;
        pathExactMatchPattern = hasRightAnchor ? plainPath : null;
        prefixPathMatch = !hasRightAnchor;
        prefixPathMatchPattern = hasRightAnchor ? null : plainPath;
      }
      return sb.toString();
    }

    /**
     * Rejects null patterns and the unsupported pattern forms.
     *
     * @throws NullPointerException if {@code s} is null
     * @throws IllegalArgumentException if {@code s} starts with {@code www?:},
     *         {@code -} or {@code +-}
     */
    private static void checkPatternValidity(String s) {
      if (s == null) {
        throw new NullPointerException("urlPattern must not be null");
      }
      if (UNSUPPORTED_PATTERNS_METAPATTERN.matcher(s).find()) {
        throw new IllegalArgumentException("unsupported urlpattern: " + s);
      }
    }
  }
}