| // Copyright 2011 Google Inc. All Rights Reserved. | 
 | // | 
 | // Licensed under the Apache License, Version 2.0 (the "License"); | 
 | // you may not use this file except in compliance with the License. | 
 | // You may obtain a copy of the License at | 
 | // | 
 | //      http://www.apache.org/licenses/LICENSE-2.0 | 
 | // | 
 | // Unless required by applicable law or agreed to in writing, software | 
 | // distributed under the License is distributed on an "AS IS" BASIS, | 
 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
 | // See the License for the specific language governing permissions and | 
 | // limitations under the License. | 
 |  | 
 | package com.google.enterprise.adaptor; | 
 |  | 
 | import java.io.ByteArrayOutputStream; | 
 | import java.io.IOException; | 
 | import java.io.InputStream; | 
 | import java.net.URI; | 
 | import java.net.URISyntaxException; | 
 | import java.nio.ByteBuffer; | 
 | import java.nio.charset.Charset; | 
 | import java.util.ArrayList; | 
 | import java.util.Collections; | 
 | import java.util.Date; | 
 | import java.util.HashMap; | 
 | import java.util.Map; | 
 | import java.util.Set; | 
 | import java.util.TreeSet; | 
 | import java.util.logging.Level; | 
 | import java.util.logging.Logger; | 
 | import java.util.regex.Matcher; | 
 | import java.util.regex.Pattern; | 
 |  | 
 | /** | 
 |  * Parses the adaptor data format into individual commands with associated data. | 
 |  * | 
 |  * This format is used for communication between the adaptor library and various command line | 
 |  * adaptor components (lister, retriever, transformer, authorizer, etc.). It supports responses | 
 |  * coming back from the command line adaptor implementation. The format supports a mixture of | 
 |  * character and binary data. All character data must be encoded in UTF-8.<p> | 
 |  * | 
 |  * <h3>Header Format</h3> | 
 |  * | 
 |  * Communications (via either file or stream) begin with the header:<p> | 
 |  * | 
 |  * {@code GSA Adaptor Data Version 1 [<delimiter>]}<p> | 
 |  * | 
 |  * The version number must be proceeded by a single space and followed by a single space. The | 
 |  * version number may increase in the future should the format be enhanced.<p> | 
 |  * | 
 |  * The string between the two square brackets will be used as the delimiter for the remainder of the | 
 |  * file being read or for the duration of the communication session.<p> | 
 |  * | 
 |  * Care must be taken that the delimiter character string can never occur in a document ID, metadata | 
 |  * name, metadata value, user name, or any other data that will be represented using the format with | 
 |  * the exception of document contents, which can contain the delimiter string. The safest delimiter | 
 |  * is likely to be the null character (the character with a value of zero). This character is | 
 |  * unlikely to be present in existing names, paths, metadata, etc. Another possible choice is the | 
 |  * newline character, though in many systems it is possible for this character to be present in | 
 |  * document names and document paths, etc. If in doubt, the null character is recommended. A | 
 |  * delimiter can be made up of more than one character so it is possible to have a delimiter that is | 
 |  * <CR><LF> or a highly unique string (such as a GUID) that has an exceptionally low probability of | 
 |  * occurring in the data.<p> | 
 |  * | 
 |  * The following characters may not be used in the delimiter:<p> 'A'-'Z', 'a'-'z' and '0'-'9' the | 
 |  * alphanumeric characters<br> ':'  colon<br> '/'  slash<br> '-'  hyphen<br> '_'  underscore<br> ' ' | 
 |  * space<br> '=' equals<br> '+' plus<br> '[' left square bracket<br> ']' right square bracket<p> | 
 |  * | 
 |  * <h3>Body Format</h3> Elements in the file start with one of the following commands. Commands | 
 |  * where data precedes the next delimiter include an equal sign. Commands that are immediately | 
 |  * followed by a delimiter do not include an equal sign. The first command must specify a document | 
 |  * ID ("id=" or "id-list"). Command that don't specify a document ID are associated with the most | 
 |  * recent previously specified document ID.<p> | 
 |  * | 
 |  * <h1>Common Commands:</h1> | 
 |  * | 
 |  * "id=" -- specifies a document id<p> | 
 |  * | 
 |  * "id-list" -- Starts a list of document ids each separated by | 
 |  * the specified delimiter, the list is terminated by two consecutive delimiters or EOS | 
 |  * (End-Of-Stream). ids in an id-list cannot have any of the associated commands listed below.<p> | 
 |  * | 
 |  * "repository-unavailable=" -- the document repository is unavailable. The string following the "=" | 
 |  * character includes additional information that will be logged with the error. | 
 |  * | 
 |  * | 
 |  * <h1>Lister Commands:</h1> | 
 |  * | 
 |  * "result-link=" -- specifies an alternative link to be displayed in the search results. | 
 |  * This must be a properly formed URL. A "result link" is sometimes referred to as a "display URL". | 
 |  * If no results-link is specified then the URL used for crawling is also used in the | 
 |  * search results.<p> | 
 |  * | 
 |  * "last-modified=" -- Specifies the last time the document or its metadata has changed. | 
 |  * The argument is a number representing the number of seconds since the standard base | 
 |  * time known as the epoch", namely January 1, 1970, 00:00:00 GMT. If last-modified is specified | 
 |  * and the document has never been crawled before or has been crawled prior to the last-modified | 
 |  * time then the ocument will be marked as "crawl-immediately" by the GSA.<p> | 
 |  * | 
 |  * "crawl-immediately" -- Increases the crawling priority of the document such | 
 |  * that the GSA will retrieve it sooner than normally crawled documents.<p> | 
 |  * | 
 |  * "crawl-once" -- specifies that the document will be crawled by the | 
 |  *                 GSA one time but then never re-crawled.<p> | 
 |  * | 
 |  * "lock" -- Causes the document to remain in the index unless explicitly removed. | 
 |  * Failure to retrieve the document during re-crawling will not result in | 
 |  * removal of the document. If every document in the GSA is | 
 |  * locked then locked document may be forced out when maximum capacity is | 
 |  * reached.<p> | 
 |  * | 
 |  * "delete" -- this document should be deleted from the GSA index.<p> | 
 |  * | 
 |  * <h1>Retriever Commands:</h1> | 
 |  * | 
 |  * "up-to-date" -- specifies that the document is up-to-date with respect to its last crawled | 
 |  * time.<p> | 
 |  * | 
 |  * "not-found" -- the document does not exists in the repository<p> | 
 |  * | 
 |  * "mime-type=" -- specifies the document's mime-type. If unspecified then the GSA will | 
 |  * automatically assign a type to the document. <p> | 
 |  * | 
 |  * "meta-name=" -- specifies a metadata key, to be followed by a metadata-value<p> | 
 |  * | 
 |  * "meta-value=" -- specifies a metadata value associated with | 
 |  * immediately preceding metadata-name<p> | 
 |  * | 
 |  * "content" -- signals the beginning of binary content which | 
 |  * continues to the end of the file or stream<p> | 
 |  * | 
 |  * "last-modified=" -- specifies the last time the document or its metadata has changed. | 
 |  * The argument is a number representing the number of seconds since the standard base | 
 |  * time known as the epoch", namely January 1, 1970, 00:00:00 GMT.<p> | 
 |  * | 
 |  * "secure=" -- specifies whether the document is non-public. The argument is either 'true' or | 
 |  * 'false'.<p> | 
 |  * | 
 |  * "anchor-uri=" -- specifies an anchor URI, to be followed by anchor-text.<p> | 
 |  * | 
 |  * "anchor-text=" -- specifies the text associated with an anchor-uri.<p> | 
 |  * | 
 |  * "no-index=" -- specifies whether the document should be indexed by the GSA. The argument is | 
 |  * either 'true' or 'false'.<p> | 
 |  * | 
 |  * "no-follow=" -- specifies whether the document's links should be followed by the GSA. The | 
 |  * argument is either 'true' or 'false'.<p> | 
 |  * | 
 |  * "no-archive=" -- specifies whether GSA document will allow the user to see a cached version of | 
 |  * the document. The argument is either 'true' or 'false'.<p> | 
 |  * | 
 |  * "display-url=" -- specifies an alternative link to be displayed in the search results. | 
 |  * This must be a properly formed URL.<p> | 
 |  * | 
 |  * "crawl-once=" -- specifies that the document will be crawled by the | 
 |  * GSA one time but then never re-crawled. The argument should be 'true' or 'false'.<p> | 
 |  * | 
 |  * "lock=" -- Causes the document to remain in the index unless explicitly removed. | 
 |  * If every document in the GSA is locked then locked document may be forced out when maximum | 
 |  * capacity is reached.<p> | 
 |  * | 
 |  * "acl" -- when provided, an ACL is sent along with document. The ACL is made of  | 
 |  * values provided for other commands starting with "acl-" and "namespace" | 
 |  * command. If no acl command is provided then all other ACL commands are  | 
 |  * ignored. <p> | 
 |  * | 
 |  * "namespace=" -- namespace used on all user and group principals until another | 
 |  * another namespace is provided.  Defaults to the default namespace.<p> | 
 |  * | 
 |  * "acl-permit-user=" -- a user name, either with domain or without, that will | 
 |  * be permitted to view document being returned.<p> | 
 |  * | 
 |  * "acl-deny-user=" -- a user name, either with domain or without, that will | 
 |  * be denied access to document being returned.<p> | 
 |  * | 
 |  * "acl-permit-group=" -- a group name, either with domain or without, that  | 
 |  * will be permitted to view document being returned.<p> | 
 |  * | 
 |  * "acl-deny-group=" -- a group name, either with domain or without, that | 
 |  * will be denied to view document being returned.<p> | 
 |  * | 
 |  * "acl-inherit-from=" -- document id that this document inherits permissions | 
 |  * from. <p> | 
 |  * | 
 |  * "acl-inherit-fragment=" -- optional fragment supplementing acl-inherit-from. | 
 |  * Together acl-inherit-from and acl-inherit-fragment are what is being | 
 |  * inherited from.<p> | 
 |  *  | 
 |  * "acl-inheritance-type=" -- the type of inheritance {@code | 
 |  * com.google.enterprise.adaptor.Acl.InheritanceType}.  Valid values are: | 
 |  * and-both-permit, child-overrides, leaf-node, and parent-overrides<p> | 
 |  * | 
 |  * "acl-case-sensitive=" -- the principals of this document are case sensitive. | 
 |  * <p> | 
 |  * | 
 |  * "acl-case-insensitive=" -- the principals of this document are case | 
 |  * insensitive. <p> | 
 |  * | 
 |  * <h1>Authorizer Commands:</h1> | 
 |  * | 
 |  * "authz-status=" -- specifies whether a document is visible to a | 
 |  *     specified user. The argument must be PERMIT, DENY or INDETERMINATE<p> | 
 |  * | 
 |  *  "user=" -- specifies the user for whom the authorization check will be made<p> | 
 |  * | 
 |  *  "password=" -- specifies the password for the user. (optional)<p> | 
 |  *     | 
 |  *  "group=" -- specifies a security group to which the user belongs.<p> | 
 |  * | 
 |  * End-of-stream terminates the data transmission. Multiple consecutive delimiters are collapsed | 
 |  * into a single delimiter and terminates the current id-list should one exist.<p> | 
 |  * | 
 |  * Unrecognized commands generate a warning but are otherwise ignored. | 
 |  * | 
 |  * <h3>Examples</h3> | 
 |  * | 
 |  * Example 1:<p> | 
 |  * | 
 |  * <pre> | 
 |  * {@code | 
 |  * GSA Adaptor Data Version 1 [<delimiter>] | 
 |  * id-list | 
 |  * /home/repository/docs/file1 | 
 |  * /home/repository/docs/file2 | 
 |  * /home/repository/docs/file3 | 
 |  * /home/repository/docs/file4 | 
 |  * /home/repository/docs/file5 | 
 |  * }</pre> | 
 |  * | 
 |  * Example 2:<p> | 
 |  * | 
 |  * <pre> | 
 |  * {@code | 
 |  * GSA Adaptor Data Version 1 [<delimiter>] | 
 |  * id=/home/repository/docs/file1 | 
 |  * id=/home/repository/docs/file2 | 
 |  * crawl-immediately | 
 |  * last-modified=20110803 16:07:23 | 
 |  * | 
 |  * meta-name=Department | 
 |  * meta-content=Engineering | 
 |  * | 
 |  * meta-name=Creator | 
 |  * meta-content=howardhawks | 
 |  * | 
 |  * id=/home/repository/docs/file3 | 
 |  * id=/home/repository/docs/file4 | 
 |  * id=/home/repository/docs/file5 | 
 |  * }</pre> | 
 |  * | 
 |  * Data passed to command line authorizer via stdin for authz check. | 
 |  * Entries will always occur in this order: user, password, group, id. | 
 |  * password and group information is optional. Any number of group and | 
 |  * id entries can exist. Each of the documents with a listed id should | 
 |  * be checked. | 
 |  * <pre> | 
 |  * {@code | 
 |  * GSA Adaptor Data Version 1 [<delimiter>] | 
 |  * user=tim_smith | 
 |  * password=abc123 | 
 |  * group=managers | 
 |  * group=research | 
 |  * id=/home/repository/docs/file1 | 
 |  * id=/home/repository/docs/file2 | 
 |  * }</pre> | 
 |  * | 
 |  * AuthZ response passed from command line authorizer via stdout. | 
 |  * Each doc id must include an authz-status entry. | 
 |  * <pre> | 
 |  * {@code | 
 |  * GSA Adaptor Data Version 1 [<delimiter>] | 
 |  * id=/home/repository/docs/file1 | 
 |  * authz-status=PERMIT | 
 |  * id=/home/repository/docs/file2 | 
 |  * authz-status=DENY | 
 |  * }</pre> | 
 |  */ | 
 | public class CommandStreamParser { | 
 |  | 
 |   private static enum Operation { | 
 |     ID("id"), | 
 |     RESULT_LINK("result-link"), | 
 |     LAST_MODIFIED("last-modified"), | 
 |     CRAWL_IMMEDIATELY("crawl-immediately"), | 
 |     CRAWL_ONCE("crawl-once"), | 
 |     LOCK("lock"), | 
 |     DELETE("delete"), | 
 |     UP_TO_DATE("up-to-date"), | 
 |     NOT_FOUND("not-found"), | 
 |     MIME_TYPE("mime-type"), | 
 |     META_NAME("meta-name"), | 
 |     META_VALUE("meta-value"), | 
 |     CONTENT("content"), | 
 |     AUTHZ_STATUS("authz-status"), | 
 |     SECURE("secure"), | 
 |     ANCHOR_URI("anchor-uri"), | 
 |     ANCHOR_TEXT("anchor-text"), | 
 |     NO_INDEX("no-index"), | 
 |     NO_FOLLOW("no-follow"), | 
 |     NO_ARCHIVE("no-archive"), | 
 |     DISPLAY_URL("display-url"), | 
 |     ACL("acl"), | 
 |     NAMESPACE("namespace"), | 
 |     ACL_PERMIT_USER("acl-permit-user"), | 
 |     ACL_DENY_USER("acl-deny-user"), | 
 |     ACL_PERMIT_GROUP("acl-permit-group"), | 
 |     ACL_DENY_GROUP("acl-deny-group"), | 
 |     ACL_INHERIT_FROM("acl-inherit-from"), | 
 |     ACL_INHERIT_FRAGMENT("acl-inherit-fragment"), | 
 |     ACL_INHERITANCE_TYPE("acl-inheritance-type"), | 
 |     ACL_CASE_SENSITIVE("acl-case-sensitive"), | 
 |     ACL_CASE_INSENSITIVE("acl-case-insensitive"), | 
 |     ; | 
 |  | 
 |     private final String commandName; | 
 |  | 
 |     private Operation(String commandName) { | 
 |       this.commandName = commandName; | 
 |     } | 
 |  | 
 |     public String getCommandName() { | 
 |       return commandName; | 
 |     } | 
 |   } | 
 |  | 
 |   private static final Logger log = Logger.getLogger(CommandStreamParser.class.getName()); | 
 |   private static final String HEADER_PREFIX = "GSA Adaptor Data Version"; | 
 |   private static final String DISALLOWED_DELIMITER_CHARS_REGEX = "[a-zA-Z0-9:/\\-_ =\\+\\[\\]]"; | 
 |   private static final Charset CHARSET = Charset.forName("UTF-8"); | 
 |  | 
 |   private static final Map<String, Operation> STRING_TO_OPERATION; | 
 |   static { | 
 |     Map<String, Operation> stringToOperation = new HashMap<String, Operation>(); | 
 |     for (Operation operation : Operation.values()) { | 
 |       stringToOperation.put(operation.getCommandName(), operation); | 
 |     } | 
 |     STRING_TO_OPERATION = Collections.unmodifiableMap(stringToOperation); | 
 |   } | 
 |  | 
 |   private static final Map<String, Acl.InheritanceType> STRING_TO_INHERITANCE_TYPE; | 
 |   static { | 
 |     Map<String, Acl.InheritanceType> stringToType | 
 |         = new HashMap<String, Acl.InheritanceType>(); | 
 |     for (Acl.InheritanceType type : Acl.InheritanceType.values()) { | 
 |       stringToType.put(type.getCommonForm(), type); | 
 |     } | 
 |     STRING_TO_INHERITANCE_TYPE = Collections.unmodifiableMap(stringToType); | 
 |   } | 
 |  | 
 |   private InputStream inputStream; | 
 |   private int versionNumber = 0; | 
 |   private String delimiter; | 
 |   private boolean inIdList; | 
 |  | 
 |   /** */ | 
 |   private static class Command { | 
 |  | 
 |     private Operation operation; | 
 |     private String argument; | 
 |  | 
 |     Command(Operation operation, String argument) { | 
 |       this.operation = operation; | 
 |       this.argument = argument; | 
 |     } | 
 |  | 
 |     public Operation getOperation() { | 
 |       return operation; | 
 |     } | 
 |  | 
 |     public String getArgument() { | 
 |       return argument; | 
 |     } | 
 |  | 
 |     public boolean hasArgument() { | 
 |       return argument != null; | 
 |     } | 
 |   } | 
 |  | 
 |   public CommandStreamParser(InputStream inputStream) { | 
 |     this.inputStream = inputStream; | 
 |     inIdList = false; | 
 |   } | 
 |  | 
 |   public int getVersionNumber() throws IOException { | 
 |     checkHeader(); | 
 |     return versionNumber; | 
 |   } | 
 |  | 
 |   public Map<DocId, AuthzStatus> readFromAuthorizer() throws IOException { | 
 |     Map<DocId, AuthzStatus> result = new HashMap<DocId, AuthzStatus>(); | 
 |     String docId = null; | 
 |     AuthzStatus authzStatus = null; | 
 |     Command command = readCommand(); | 
 |  | 
 |     // Starting out at end-of-stream so return an empty list. | 
 |     if (command == null) { | 
 |       return result; | 
 |     } | 
 |  | 
 |     // The first operation must be a doc ID. | 
 |     if (command.getOperation() != Operation.ID) { | 
 |       throw new IOException("Authorizer Error: the first operator must be a document ID. " | 
 |           + " Instead encountered '" + command.getOperation() + "'."); | 
 |     } | 
 |     while (command != null) { | 
 |       switch (command.getOperation()) { | 
 |         case ID: | 
 |           if (docId != null) { | 
 |             result.put(new DocId(docId), authzStatus); | 
 |           } | 
 |           docId = command.getArgument(); | 
 |           authzStatus = null; | 
 |           break; | 
 |         case AUTHZ_STATUS: | 
 |           String authzStatusString = command.getArgument(); | 
 |           try { | 
 |             authzStatus = AuthzStatus.valueOf(authzStatusString); | 
 |           } catch (IllegalArgumentException ex) { | 
 |             log.warning("Unrecognized authz-status of '" + authzStatusString + "' for document: '" + | 
 |                 docId + "'"); | 
 |           } | 
 |           break; | 
 |         default: | 
 |           throw new IOException("Authorizer Error: invalid operation: '" + command.getOperation() + | 
 |               (command.hasArgument() ? "' with argument: '"  + command.getArgument() + "'" : "'")); | 
 |       } | 
 |       command = readCommand(); | 
 |     } | 
 |     result.put(new DocId(docId), authzStatus); | 
 |  | 
 |     return Collections.unmodifiableMap(result); | 
 |   } | 
 |  | 
 |   public void readFromRetriever(DocId docId, Response response) throws IOException { | 
 |     Command command = readCommand(); | 
 |  | 
 |     if (command == null) { | 
 |       throw new IOException("Invalid or missing retriever data."); | 
 |     } else if (command.getOperation() != Operation.ID) { | 
 |       throw new IOException("Retriever Error: the first operator must be a document ID. " | 
 |           + " Instead encountered '" + command.getOperation() + "'."); | 
 |     } | 
 |  | 
 |     DocId foundDocId = new DocId(command.getArgument()); | 
 |     if (!docId.equals(foundDocId)) { | 
 |       throw new IOException("requested document "  + docId + " does not match retrieved " | 
 |           + "document " + foundDocId + "."); | 
 |     } | 
 |  | 
 |     boolean sendAclWithDocument = false; | 
 |     Acl.Builder aclBuilder = new Acl.Builder(); | 
 |     DocId inheritFrom = null;  // saves inherit-from in case fragment comes | 
 |     Set<Principal> permits = new TreeSet<Principal>();  // accumulates acl state | 
 |     Set<Principal> denies = new TreeSet<Principal>();  // accumulates acl state | 
 |     String namespace = Principal.DEFAULT_NAMESPACE;  // last given namespace | 
 |  | 
 |     command = readCommand(); | 
 |     while (command != null) { | 
 |       switch (command.getOperation()) { | 
 |         case ID: | 
 |           throw new IOException("Only one document ID can be specified in a retriever message"); | 
 |         case CONTENT: | 
 |           IOHelper.copyStream(inputStream, response.getOutputStream()); | 
 |           break; | 
 |         case META_NAME: | 
 |           String metaName = command.getArgument(); | 
 |           command = readCommand(); | 
 |           if (command == null || command.getOperation() != Operation.META_VALUE) { | 
 |             throw new IOException("meta-name must be immediately followed by meta-value"); | 
 |           } | 
 |           log.log(Level.FINEST, "Retriever: {0} has metadata {1}={2}", | 
 |               new Object[] {docId.getUniqueId(), metaName, | 
 |                 command.getArgument()}); | 
 |           response.addMetadata(metaName, command.getArgument()); | 
 |           break; | 
 |         case UP_TO_DATE: | 
 |           log.log(Level.FINEST, "Retriever: {0} is up to date.", docId.getUniqueId()); | 
 |           response.respondNotModified(); | 
 |           break; | 
 |         case NOT_FOUND: | 
 |           response.respondNotFound(); | 
 |           break; | 
 |         case MIME_TYPE: | 
 |           log.log(Level.FINEST, "Retriever: {0} has mime-type {1}", | 
 |               new Object[] {docId.getUniqueId(), command.getArgument()}); | 
 |           response.setContentType(command.getArgument()); | 
 |           break; | 
 |         case LAST_MODIFIED: | 
 |           // Convert seconds to milliseconds for Date constructor. | 
 |           response.setLastModified(new Date(Long.parseLong(command.getArgument()) * 1000)); | 
 |           break; | 
 |         case SECURE: | 
 |           response.setSecure(Boolean.parseBoolean(command.getArgument())); | 
 |           break; | 
 |         case ANCHOR_URI: | 
 |           URI anchorUri = URI.create(command.getArgument()); | 
 |           command = readCommand(); | 
 |           if (command == null || command.getOperation() != Operation.ANCHOR_TEXT) { | 
 |             throw new IOException("anchor-uri must be immediately followed by anchor-text"); | 
 |           } | 
 |           response.addAnchor(anchorUri, command.getArgument()); | 
 |           break; | 
 |         case NO_INDEX: | 
 |           response.setNoIndex(Boolean.parseBoolean(command.getArgument())); | 
 |           break; | 
 |         case NO_FOLLOW: | 
 |           response.setNoFollow(Boolean.parseBoolean(command.getArgument())); | 
 |           break; | 
 |         case NO_ARCHIVE: | 
 |           response.setNoArchive(Boolean.parseBoolean(command.getArgument())); | 
 |           break; | 
 |         case DISPLAY_URL: | 
 |           response.setDisplayUrl(URI.create(command.getArgument())); | 
 |           break; | 
 |         case CRAWL_ONCE: | 
 |           response.setCrawlOnce(Boolean.parseBoolean(command.getArgument())); | 
 |           break; | 
 |         case LOCK: | 
 |           response.setLock(Boolean.parseBoolean(command.getArgument())); | 
 |           break; | 
 |         case ACL: | 
 |           sendAclWithDocument = true; | 
 |           break; | 
 |         case NAMESPACE: | 
 |           namespace = command.getArgument(); | 
 |           break; | 
 |         case ACL_PERMIT_USER: | 
 |           permits.add(new UserPrincipal(command.getArgument(), namespace)); | 
 |           break; | 
 |         case ACL_DENY_USER: | 
 |           denies.add(new UserPrincipal(command.getArgument(), namespace)); | 
 |           break; | 
 |         case ACL_PERMIT_GROUP: | 
 |           permits.add(new GroupPrincipal(command.getArgument(), namespace)); | 
 |           break; | 
 |         case ACL_DENY_GROUP: | 
 |           denies.add(new GroupPrincipal(command.getArgument(), namespace)); | 
 |           break; | 
 |         case ACL_INHERIT_FROM: | 
 |           inheritFrom = new DocId(command.getArgument()); | 
 |           aclBuilder.setInheritFrom(inheritFrom); | 
 |           break; | 
 |         case ACL_INHERIT_FRAGMENT: | 
 |           if (null == inheritFrom) { | 
 |             throw new IOException("acl-inherit-fragment cannot preceed acl-inherit-from"); | 
 |           } | 
 |           aclBuilder.setInheritFrom(inheritFrom, command.getArgument()); | 
 |           break; | 
 |         case ACL_INHERITANCE_TYPE: | 
 |           Acl.InheritanceType type = STRING_TO_INHERITANCE_TYPE.get(command.getArgument()); | 
 |           if (null == type) { | 
 |             throw new IOException("invalid acl-inheritance-type: " + command.getArgument()); | 
 |           } | 
 |           aclBuilder.setInheritanceType(type); | 
 |           break; | 
 |         case ACL_CASE_SENSITIVE: | 
 |           aclBuilder.setEverythingCaseSensitive(); | 
 |           break; | 
 |         case ACL_CASE_INSENSITIVE: | 
 |           aclBuilder.setEverythingCaseInsensitive(); | 
 |           break; | 
 |         default: | 
 |           throw new IOException("Retriever Error: invalid operation: '" + command.getOperation() + | 
 |               (command.hasArgument() ? "' with argument: '"  + command.getArgument() + "'" : "'")); | 
 |       } | 
 |       command = readCommand(); | 
 |     } | 
 |     // Finish by putting accumulated ACL into response. | 
 |     if (sendAclWithDocument) { | 
 |       aclBuilder.setPermits(permits); | 
 |       aclBuilder.setDenies(denies); | 
 |       response.setAcl(aclBuilder.build()); | 
 |     } | 
 |   } | 
 |  | 
 |   /** | 
 |    * Parse a listing response, sending results to {@code pusher}. If {@code handler} is {@code | 
 |    * null}, then {@code pusher}'s default handler will be used. In case of failure sending in | 
 |    * {@code pusher}, the rest of the input stream may not be read. | 
 |    * | 
 |    * @return {@code null} on success, otherwise the first Record to fail | 
 |    */ | 
 |   public DocIdPusher.Record readFromLister(DocIdPusher pusher, ExceptionHandler handler) | 
 |       throws IOException, InterruptedException { | 
 |     ArrayList<DocIdPusher.Record> result = new ArrayList<DocIdPusher.Record>(); | 
 |     DocIdPusher.Record.Builder builder = null; | 
 |     Command command = readCommand(); | 
 |  | 
 |     // Starting out at end-of-stream so don't send anything. | 
 |     if (command == null) { | 
 |       return null; | 
 |     } | 
 |  | 
 |     // The first operation must be a doc ID. | 
 |     if (command.getOperation() != Operation.ID) { | 
 |       throw new IOException("Lister Error: the first operator must be a document ID. " | 
 |           + " Instead encountered '" + command.getOperation() + "'."); | 
 |     } | 
 |     while (command != null) { | 
 |       switch (command.getOperation()) { | 
 |         case ID: | 
 |           if (builder != null) { | 
 |             result.add(builder.build()); | 
 |             // TODO(ejona): make threshold smarter. | 
 |             if (result.size() >= 10000) { | 
 |               DocIdPusher.Record errorRecord = pusher.pushRecords(result, handler); | 
 |               if (errorRecord != null) { | 
 |                 return errorRecord; | 
 |               } | 
 |               result.clear(); | 
 |             } | 
 |           } | 
 |           builder = new DocIdPusher.Record.Builder(new DocId(command.getArgument())); | 
 |           break; | 
 |         case LAST_MODIFIED: | 
 |           // Convert seconds to milliseconds for Date constructor. | 
 |           builder.setLastModified(new Date(Long.parseLong(command.getArgument()) * 1000)); | 
 |           break; | 
 |         case RESULT_LINK: | 
 |           try { | 
 |             builder.setResultLink(new URI(command.getArgument())); | 
 |           } catch (URISyntaxException e) { | 
 |             throw new IOException("Lister Error: invalid URL: '" + command.getOperation() | 
 |                 + (command.hasArgument() ? "' with argument: '" | 
 |                 + command.getArgument() + "'" : "'"), e); | 
 |           } | 
 |           break; | 
 |         case CRAWL_IMMEDIATELY: | 
 |           builder.setCrawlImmediately(true); | 
 |           break; | 
 |         case CRAWL_ONCE: | 
 |           builder.setCrawlOnce(true); | 
 |           break; | 
 |         case LOCK: | 
 |           builder.setLock(true); | 
 |           break; | 
 |         case DELETE: | 
 |           builder.setDeleteFromIndex(true); | 
 |           break; | 
 |         default: | 
 |           throw new IOException("Lister Error: invalid operation: '" + command.getOperation() + | 
 |               (command.hasArgument() ? "' with argument: '"  + command.getArgument() + "'" : "'")); | 
 |       } | 
 |       command = readCommand(); | 
 |     } | 
 |     result.add(builder.build()); | 
 |     return pusher.pushRecords(result, handler); | 
 |   } | 
 |  | 
 |   /** | 
 |    * Read a command from the command stream | 
 |    * | 
 |    * @return The next command from the command stream. for end-of-steam null is returned. | 
 |    * @throws IOException on stream read error | 
 |    */ | 
 |   private Command readCommand() throws IOException { | 
 |  | 
 |     Command result = null; | 
 |  | 
 |     while (result == null) { | 
 |       String commandTokens[] = parseNextLine(); | 
 |       if (commandTokens == null) { | 
 |         return null; | 
 |       } else if ((commandTokens[0].equals("repository-unavailable"))) { | 
 |         throw new IOException("Error: repository unavailable. " | 
 |             + (commandTokens.length > 1 ? commandTokens[1] : "")); | 
 |       } | 
 |  | 
 |       Operation operation = STRING_TO_OPERATION.get(commandTokens[0]); | 
 |       // Skip over unrecognized commands | 
 |       if (operation == null) { | 
 |         log.warning("Unrecognized command: " + commandTokens[0]); | 
 |         continue; | 
 |       } | 
 |  | 
 |       String argument = null; | 
 |  | 
 |       if (commandTokens.length > 1) { | 
 |         argument = commandTokens[1]; | 
 |       } | 
 |  | 
 |       result = new Command(operation, argument); | 
 |     } | 
 |     return result; | 
 |   } | 
 |  | 
 |   private String[] parseNextLine() throws IOException { | 
 |     checkHeader(); | 
 |     String line = ""; | 
 |     while (line.length() == 0) { | 
 |       line = readCharsUntilMarker(delimiter); | 
 |       // On End-Of-Stream return the end-message command | 
 |       if (line == null) { | 
 |         return null; | 
 |       } | 
 |       // If nothing is between the last delimiter and this one then exit ID list mode | 
 |       if (inIdList && line.length() == 0) { | 
 |         inIdList = false; | 
 |       } else if (!inIdList && line.equals("id-list")) { | 
 |         inIdList = true; | 
 |         line = ""; // loop again | 
 |       } | 
 |     } | 
 |     if (inIdList) { | 
 |       return new String[]{"id", line}; | 
 |     } | 
 |     return line.split("=", 2); | 
 |   } | 
 |  | 
 |   /** | 
 |    * Read and verify the data format header if needed. | 
 |    */ | 
 |   private void checkHeader() throws IOException { | 
 |     if (this.delimiter != null) { | 
 |       return; | 
 |     } | 
 |  | 
 |     String line = readCharsUntilMarker("["); | 
 |     if ((line == null) || (line.length() < HEADER_PREFIX.length()) || | 
 |         !line.substring(0, HEADER_PREFIX.length()).equals(HEADER_PREFIX)) { | 
 |       throw new IOException("Adaptor data must begin with '" + HEADER_PREFIX + "'"); | 
 |     } | 
 |  | 
 |     String versionNumberString = line.substring(HEADER_PREFIX.length()); | 
 |     if (versionNumberString.length() < 3) { | 
 |       throw new IOException("Format version '" + versionNumberString + "' is invalid. " + | 
 |           "The version must be at least one digit with one leading space and one trailing space."); | 
 |     } | 
 |  | 
 |     delimiter = readCharsUntilMarker("]"); | 
 |     if ((delimiter == null) || (delimiter.length() < 1)) { | 
 |       throw new IOException("Delimiter must be at least one character long."); | 
 |     } | 
 |  | 
 |     Pattern pattern = Pattern.compile(DISALLOWED_DELIMITER_CHARS_REGEX); | 
 |     Matcher matcher = pattern.matcher(delimiter); | 
 |  | 
 |     if (matcher.find()) { | 
 |       throw new IOException("Invalid character in delimiter."); | 
 |     } | 
 |  | 
 |     try { | 
 |       versionNumber = Integer.parseInt(versionNumberString.trim()); | 
 |     } catch (NumberFormatException e) { | 
 |       throw new IOException("Format version '" + versionNumberString + "' is invalid."); | 
 |     } | 
 |   } | 
 |  | 
 |  | 
 |   private byte[] readBytesUntilMarker(byte[] marker) throws IOException { | 
 |  | 
 |     if (marker.length == 0) { | 
 |       throw new IOException("Internal Error: Marker length must be greater than zero."); | 
 |     } | 
 |     ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); | 
 |     int matchPosition = 0; | 
 |     int nextByte = 0; | 
 |  | 
 |     while (matchPosition < marker.length) { | 
 |       nextByte = inputStream.read(); | 
 |       if (nextByte == ((int) marker[matchPosition] & 0xff)) { | 
 |         matchPosition += 1; | 
 |       } else { | 
 |         if (matchPosition > 0) { | 
 |           byteArrayOutputStream.write(marker, 0, matchPosition); | 
 |           matchPosition = 0; | 
 |         } | 
 |         if (nextByte == -1) { | 
 |           break; | 
 |         } else { | 
 |           byteArrayOutputStream.write(nextByte); | 
 |         } | 
 |       } | 
 |     } | 
 |     byte[] result = byteArrayOutputStream.toByteArray(); | 
 |     if (nextByte == -1 && result.length == 0) { | 
 |       return null; | 
 |     } else { | 
 |       return result; | 
 |     } | 
 |   } | 
 |  | 
 |   private String readCharsUntilMarker(String marker) throws IOException { | 
 |     byte[] byteMarker = marker.getBytes(CHARSET); | 
 |     byte[] bytes = readBytesUntilMarker(byteMarker); | 
 |     if (bytes == null) { | 
 |       return null; | 
 |     } | 
 |     return CHARSET.newDecoder().decode(ByteBuffer.wrap(bytes)).toString(); | 
 |   } | 
 | } |