src/adaptorlib/CommandStreamParser.java - plexi - Git at Google

 // Copyright 2011 Google Inc. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 package adaptorlib;

 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 /**
  * Parses the adaptor data format into individual commands with associated data.
  *
  * This format is used for communication between the adaptor library and various command line
  * adaptor components (lister, retriever, transformer, authorizor, etc.). It supports responses
  * coming back from the command line adaptor implementation. The format supports a mixture of
  * character and binary data. All character data must be encoded in UTF-8.<p>
  *
  * <h3>Header Format</h3>
  *
  * Communications (via either file or stream) begin with the header:<p>
  *
  * {@code GSA Adaptor Data Version 1 [<delimiter>]}<p>
  *
  * The version number must be proceeded by a single space and followed by a single space. The
  * version number may increase in the future should the format be enhanced.<p>
  *
  * The string between the two square brackets will be used as the delimiter for the remainder of the
  * file being read or for the duration of the communication session.<p>
  *
  * Care must be taken that the delimiter character string can never occur in a document ID, metadata
  * name, metadata value, user name, or any other data that will be represented using the format with
  * the exception of document contents, which can contain the delimiter string. The safest delimiter
  * is likely to be the null character (the character with a value of zero). This character is
  * unlikely to be present in existing names, paths, metadata, etc. Another possible choice is the
  * newline character, though in many systems it is possible for this character to be present in
  * document names and document paths, etc. If in doubt, the null character is recommended. A
  * delimiter can be made up of more than one character so it is possible to have a delimiter that is
  * <CR><LF> or a highly unique string (such as a GUID) that has an exceptionally low probability of
  * occurring in the data.<p>
  *
  * The following characters may not be used in the delimiter:<p> 'A'-'Z', 'a'-'z' and '0'-'9' the
  * alphanumeric characters<br> ':'  colon<br> '/'  slash<br> '-'  hyphen<br> '_'  underscore<br> ' '
  * space<br> '=' equals<br> '+' plus<br> '[' left square bracket<br> ']' right square bracket<p>
  *
  * <h3>Body Format</h3> Elements in the file start with one of the following commands. Commands
  * where data precedes the next delimiter include an equal sign. Commands that are immediately
  * followed by a delimiter do not include an equal sign. The first command must specify a document
  * ID ("id=" or "id-list"). Command that don't specify a document ID are associated with the most
  * recent previously specified document ID.<p>
  *
  * <h1>Common Commands:</h1>
  *
  * "id=" -- specifies a document id<p>
  *
  * "id-list" -- Starts a list of document ids each separated by
  * the specified delimiter, the list is terminated by two consecutive delimiters or EOS
  * (End-Of-Stream). ids in an id-list cannot have any of the associated commands listed below.<p>
  *
  * "repository-unavailable=" -- the document repository is unavailable. The string following the "="
  * character includes additional information that will be logged with the error.
  *
  *
  * <h1>Lister Commands:</h1>
  *
  * "last-modified=" -- specifies the last time the document or its metadata has changed in
  * milliseconds from epoch. If last-modified is specified and the document has never been crawled
  * before or have been crawled prior to the last-modified time then the document will be marked as
  * "crawl-immediate".<p>
  *
  * "crawl-immediately" -- Increases the crawling priority of the document such
  * that the GSA will retrieve it sooner than normally crawled documents.<p>
  *
  * "crawl-once" -- specifies that the document will be crawled by the
  *                 GSA one time but then never re-crawled.<p>
  *
  * "lock" -- Causes the document to remain in the index unless explicitly removed.
  * Failure to retrieve the document during re-crawling will not result in
  * removal of the document. If every document in the GSA is
  * locked then locked document may be forced out when maximum capacity is
  * reached.<p>
  *
  * "delete" -- this document should be deleted from the GSA index.<p>
  *
  * <h1>Retriever Commands:</h1>
  *
  * "up-to-date" -- specifies that the document is up-to-date with respect to its last crawled
  * time.<p>
  *
  * "document-not-found" -- the document does not exists in the repository<p>
  *
  * "mime-type=" -- specifies the document's mime-type. If unspecified then the GSA will
  * automatically assign a type to the document. <p>
  *
  * "meta-value=" -- specifies a metadata value associated with
  * immediately preceding metadata-name<p>
  *
  * "content" -- signals the beginning of binary content which
  * continues to the end of the file or stream<p>
  *
  *
  * End-of-stream terminates the data transmission. Multiple consecutive delimiters are collapsed
  * into a single delimiter and terminates the current id-list should one exist.<p>
  *
  * Unrecognized commands generate a warning but are otherwise ignored.
  *
  * <h3>Examples</h3>
  *
  * Example 1:<p>
  *
  * <pre>
  * {@code
  * GSA Adaptor Data Version 1 [<delimiter>]
  * id-list
  * /home/repository/docs/file1
  * /home/repository/docs/file2
  * /home/repository/docs/file3
  * /home/repository/docs/file4
  * /home/repository/docs/file5
  * }
  * </pre>
  *
  * Example 2:<p>
  *
  * <pre>
  * {@code
  * GSA Adaptor Data Version 1 [<delimiter>]
  * id=/home/repository/docs/file1
  * id=/home/repository/docs/file2
  * crawl-immediately
  * last-modified=20110803 16:07:23
  *
  * meta-name=Department
  * meta-content=Engineering
  *
  * meta-name=Creator
  * meta-content=howardhawks
  *
  * id=/home/repository/docs/file3
  * id=/home/repository/docs/file4
  * id=/home/repository/docs/file5
  * }
  * </pre>
  */
 public class CommandStreamParser {


   public static enum Operation {
     ID,
     LAST_MODIFIED,
     CRAWL_IMMEDIATELY,
     CRAWL_ONCE,
     LOCK,
     DELETE,
     UP_TO_DATE,
     NOT_FOUND,
     MIME_TYPE,
     META_NAME,
     META_VALUE,
     CONTENT
   }

   private static final String HEADER_PREFIX = "GSA Adaptor Data Version";
   private static final String DISALLOWED_DELIMITER_CHARS_REGEX = "[a-zA-Z0-9:/\\-_ =\\+\\[\\]]";
   private static final Charset CHARSET = Charset.forName("UTF-8");

   private static final Map<String, Operation> STRING_TO_OPERATION;

   static {
     Map<String, Operation> stringToOperation = new HashMap<String, Operation>();
     stringToOperation.put("id", Operation.ID);
     stringToOperation.put("last-modified", Operation.LAST_MODIFIED);
     stringToOperation.put("crawl-immediately", Operation.CRAWL_IMMEDIATELY);
     stringToOperation.put("crawl-once", Operation.CRAWL_ONCE);
     stringToOperation.put("lock", Operation.LOCK);
     stringToOperation.put("delete", Operation.DELETE);
     stringToOperation.put("up-to-date", Operation.UP_TO_DATE);
     stringToOperation.put("not-found", Operation.NOT_FOUND);
     stringToOperation.put("mime-type", Operation.MIME_TYPE);
     stringToOperation.put("meta-name", Operation.META_NAME);
     stringToOperation.put("meta-value", Operation.META_VALUE);
     stringToOperation.put("content", Operation.CONTENT);

     // Confirm that every operation is in the map exactly once
     Collection<Operation> opsInMap = stringToOperation.values();
     Operation[] opsInEnum = Operation.class.getEnumConstants();

     if (!opsInMap.containsAll(Arrays.asList(opsInEnum)) || opsInMap.size() != opsInEnum.length) {
       throw new RuntimeException("Internal Error: Every operation must have exactly one"
           + "entry in the stringToOperation map");
     }

     STRING_TO_OPERATION = Collections.unmodifiableMap(stringToOperation);
   }

   private InputStream inputStream;
   private int versionNumber = 0;
   private String delimiter;
   private boolean inIdList;
   private CharsetDecoder charsetDecoder = CHARSET.newDecoder();

   /** */
   public static class RetrieverInfo {

     private boolean upToDate;
     private boolean notFound;
     private DocId docId;
     private String mimeType;
     private Metadata metadata;
     private byte[] contents;

     RetrieverInfo(DocId docId, Metadata metadata, byte[] contents, boolean upToDate,
         String mimeType, boolean notFound) {
       this.docId = docId;
       this.metadata = metadata;
       this.contents = contents;
       this.upToDate = upToDate;
       this.mimeType = mimeType;
       this.notFound = notFound;
     }

     public String getMimeType() {
       return mimeType;
     }

     public boolean isUpToDate() {
       return upToDate;
     }

     public boolean notFound() {
       return notFound;
     }

     public DocId getDocId() {
       return docId;
     }

     public Metadata getMetadata() {
       return metadata;
     }

     public byte[] getContents() {
       return contents;
     }
   }

   /** */
   private static class Command {

     private Operation operation;
     private String argument;
     private byte[] contents;

     Command(Operation operation, String argument, byte[] contents) {
       this.operation = operation;
       this.argument = argument;
       this.contents = contents;
     }

     public Operation getOperation() {
       return operation;
     }

     public String getArgument() {
       return argument;
     }

     public byte[] getContents() {
       return contents;
     }
   }

   public CommandStreamParser(InputStream inputStream) {
     this.inputStream = inputStream;
     inIdList = false;
   }

   public int getVersionNumber() throws IOException {
     checkHeader();
     return versionNumber;
   }

   public ArrayList<DocIdPusher.Record> readFromLister() throws IOException {
     ArrayList<DocIdPusher.Record> result = new ArrayList<DocIdPusher.Record>();
     String docId = null;
     String lastModified = null;
     boolean crawlOnce = false;
     boolean crawlImmediately = false;
     boolean lock = false;
     boolean deleteDocument = false;
     Command command = readCommand();

     // Starting out at end-of-stream so return an empty list.
     if (command == null) {
       return result;
     }

     // The first operation must be a doc ID.
     if (command.getOperation() != Operation.ID) {
       throw new IOException("Lister Error: the first operator must be a document ID. "
           + " Instead encountered '" + command.getOperation() + "'.");
     }
     while (command != null) {
       switch (command.getOperation()) {
         case ID:
           if (docId != null) {
             // TODO (johnfelton) add lister options when API is available
             result.add(new DocIdPusher.Record.Builder(new DocId(docId)).build());
           }
           docId = command.getArgument();
           lastModified = null;
           crawlOnce = false;
           crawlImmediately = false;
           lock = false;
           deleteDocument = false;
           break;
         case LAST_MODIFIED:
           lastModified = command.getArgument();
           break;
         case CRAWL_IMMEDIATELY:
           crawlImmediately = true;
           break;
         case CRAWL_ONCE:
           crawlOnce = true;
           break;
         case LOCK:
           lock = true;
           break;
         case DELETE:
           deleteDocument = true;
           break;
         default:
           throw new IOException("Lister Error: invalid operation: '" + command.getArgument() + "");
       }
       command = readCommand();
     }
     // TODO (johnfelton) add lister options when API is available
     result.add(new DocIdPusher.Record.Builder(new DocId(docId)).build());

     return result;
   }

   public RetrieverInfo readFromRetriever() throws IOException {

     Set<MetaItem> metadata = new HashSet<MetaItem>();
     byte[] content = null;
     boolean upToDate = false;
     boolean notFound = false;
     String mimeType = null;
     Command command = readCommand();

     if (command == null) {
       throw new IOException("Invalid or missing retriever data.");
     } else if (command.getOperation() != Operation.ID) {
       throw new IOException("Retriever Error: the first operator must be a document ID. "
           + " Instead encountered '" + command.getOperation() + "'.");
     }

     String docId = command.getArgument();
     command = readCommand();
     while (command != null) {
       switch (command.getOperation()) {
         case ID:
           throw new IOException("Only one document ID can be specified in a retriever message");
         case CONTENT:
           content = command.getContents();
           break;
         case META_NAME:
           String metaName = command.getArgument();
           command = readCommand();
           if (command == null || command.getOperation() != Operation.META_VALUE) {
             throw new IOException("meta-name must be immediately followed by meta-value");
           }
           metadata.add(MetaItem.raw(metaName, command.getArgument()));
           break;
         case UP_TO_DATE:
           upToDate = true;
           break;
         case NOT_FOUND:
           notFound = true;
           break;
         case MIME_TYPE:
           mimeType = command.getArgument();
           break;
         default:
           throw new IOException(
               "Retriever Error: invalid operation: '" + command.getArgument() + "");
       }
       command = readCommand();
     }

     return new RetrieverInfo(new DocId(docId), new Metadata(metadata),
         content, upToDate, mimeType, notFound);
   }

   /**
    * Read a command from the command stream
    *
    * @return The next command from the command stream. for end-of-steam null is returned.
    * @throws IOException on stream read error
    */
   private Command readCommand() throws IOException {

     Command result = null;

     while (result == null) {
       String commandTokens[] = parseNextLine();
       if (commandTokens == null) {
         return null;
       } else if ((commandTokens[0].equals("repository-unavailable"))) {
         throw new IOException("Error: repository unavailable. "
             + (commandTokens.length > 1 ? commandTokens[1] : ""));
       }

       Operation operation = STRING_TO_OPERATION.get(commandTokens[0]);
       // Skip over unrecognized commands
       if (operation == null) {
         // TODO (johnfelton) add a warning about an unrecognized command
         continue;
       }

       String argument = null;
       byte content[] = null;

       if (commandTokens.length > 1) {
         argument = commandTokens[1];
       }

       if (operation == Operation.CONTENT) {
         content = readBytesUntilEnd();
       }
       result = new Command(operation, argument, content);
     }
     return result;
   }

   private String[] parseNextLine() throws IOException {
     checkHeader();
     String line = "";
     while (line.length() == 0) {
       line = readCharsUntilMarker(delimiter);
       // On End-Of-Stream return the end-message command
       if (line == null) {
         return null;
       }
       // If nothing is between the last delimiter and this one then exit ID list mode
       if (inIdList && line.length() == 0) {
         inIdList = false;
       } else if (!inIdList && line.equals("id-list")) {
         inIdList = true;
         line = ""; // loop again
       }
     }
     if (inIdList) {
       return new String[]{"id", line};
     }
     return line.split("=", 2);
   }

   /**
    * Read and verify the data format header if needed.
    */
   private void checkHeader() throws IOException {
     if (this.delimiter != null) {
       return;
     }

     String line = readCharsUntilMarker("[");
     if ((line == null) || (line.length() < HEADER_PREFIX.length()) ||
         !line.substring(0, HEADER_PREFIX.length()).equals(HEADER_PREFIX)) {
       throw new IOException("Adaptor data must begin with '" + HEADER_PREFIX + "'");
     }

     String versionNumberString = line.substring(HEADER_PREFIX.length());
     if (versionNumberString.length() < 3) {
       throw new IOException("Format version '" + versionNumberString + "' is invalid. " +
           "The version must be at least one digit with one leading space and one trailing space.");
     }

     delimiter = readCharsUntilMarker("]");
     if ((delimiter == null) || (delimiter.length() < 1)) {
       throw new IOException("Delimiter must be at least one character long.");
     }

     Pattern pattern = Pattern.compile(DISALLOWED_DELIMITER_CHARS_REGEX);
     Matcher matcher = pattern.matcher(delimiter);

     if (matcher.find()) {
       throw new IOException("Invalid character in delimiter.");
     }

     try {
       versionNumber = Integer.parseInt(versionNumberString.trim());
     } catch (NumberFormatException e) {
       throw new IOException("Format version '" + versionNumberString + "' is invalid.");
     }
   }


   private byte[] readBytesUntilMarker(byte[] marker) throws IOException {

     if (marker.length == 0) {
       throw new IOException("Internal Error: Marker length must be greater than zero.");
     }
     ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
     int matchPosition = 0;
     int nextByte = 0;

     while (matchPosition < marker.length) {
       nextByte = inputStream.read();
       if (nextByte == ((int) marker[matchPosition] & 0xff)) {
         matchPosition += 1;
       } else {
         if (matchPosition > 0) {
           byteArrayOutputStream.write(marker, 0, matchPosition);
           matchPosition = 0;
         }
         if (nextByte == -1) {
           break;
         } else {
           byteArrayOutputStream.write(nextByte);
         }
       }
     }
     byte[] result = byteArrayOutputStream.toByteArray();
     if (nextByte == -1 && result.length == 0) {
       return null;
     } else {
       return result;
     }

   }

   private String readCharsUntilMarker(String marker) throws IOException {
     byte[] byteMarker = marker.getBytes(CHARSET);
     byte[] bytes = readBytesUntilMarker(byteMarker);
     if (bytes == null) {
       return null;
     }
     return charsetDecoder.decode(ByteBuffer.wrap(bytes)).toString();
   }

   private byte[] readBytesUntilEnd() throws IOException {
     return IOHelper.readInputStreamToByteArray(inputStream);
   }


   private byte[] readBytes(int byteCount) throws IOException {
     byte[] result = new byte[byteCount];
     int bytesRead = IOHelper.readFully(inputStream, result, 0, byteCount);
     if (bytesRead != byteCount) {
       return null;
     } else {
       return result;
     }
   }

 }
	// Copyright 2011 Google Inc. All Rights Reserved.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	package adaptorlib;

	import java.io.ByteArrayOutputStream;
	import java.io.IOException;
	import java.io.InputStream;
	import java.nio.ByteBuffer;
	import java.nio.charset.Charset;
	import java.nio.charset.CharsetDecoder;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collection;
	import java.util.Collections;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.Map;
	import java.util.Set;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	/**
	* Parses the adaptor data format into individual commands with associated data.
	*
	* This format is used for communication between the adaptor library and various command line
	* adaptor components (lister, retriever, transformer, authorizor, etc.). It supports responses
	* coming back from the command line adaptor implementation. The format supports a mixture of
	* character and binary data. All character data must be encoded in UTF-8.<p>
	*
	* <h3>Header Format</h3>
	*
	* Communications (via either file or stream) begin with the header:<p>
	*
	* {@code GSA Adaptor Data Version 1 [<delimiter>]}<p>
	*
	* The version number must be proceeded by a single space and followed by a single space. The
	* version number may increase in the future should the format be enhanced.<p>
	*
	* The string between the two square brackets will be used as the delimiter for the remainder of the
	* file being read or for the duration of the communication session.<p>
	*
	* Care must be taken that the delimiter character string can never occur in a document ID, metadata
	* name, metadata value, user name, or any other data that will be represented using the format with
	* the exception of document contents, which can contain the delimiter string. The safest delimiter
	* is likely to be the null character (the character with a value of zero). This character is
	* unlikely to be present in existing names, paths, metadata, etc. Another possible choice is the
	* newline character, though in many systems it is possible for this character to be present in
	* document names and document paths, etc. If in doubt, the null character is recommended. A
	* delimiter can be made up of more than one character so it is possible to have a delimiter that is
	* <CR><LF> or a highly unique string (such as a GUID) that has an exceptionally low probability of
	* occurring in the data.<p>
	*
	* The following characters may not be used in the delimiter:<p> 'A'-'Z', 'a'-'z' and '0'-'9' the
	* alphanumeric characters<br> ':' colon<br> '/' slash<br> '-' hyphen<br> '_' underscore<br> ' '
	* space<br> '=' equals<br> '+' plus<br> '[' left square bracket<br> ']' right square bracket<p>
	*
	* <h3>Body Format</h3> Elements in the file start with one of the following commands. Commands
	* where data precedes the next delimiter include an equal sign. Commands that are immediately
	* followed by a delimiter do not include an equal sign. The first command must specify a document
	* ID ("id=" or "id-list"). Command that don't specify a document ID are associated with the most
	* recent previously specified document ID.<p>
	*
	* <h1>Common Commands:</h1>
	*
	* "id=" -- specifies a document id<p>
	*
	* "id-list" -- Starts a list of document ids each separated by
	* the specified delimiter, the list is terminated by two consecutive delimiters or EOS
	* (End-Of-Stream). ids in an id-list cannot have any of the associated commands listed below.<p>
	*
	* "repository-unavailable=" -- the document repository is unavailable. The string following the "="
	* character includes additional information that will be logged with the error.
	*
	*
	* <h1>Lister Commands:</h1>
	*
	* "last-modified=" -- specifies the last time the document or its metadata has changed in
	* milliseconds from epoch. If last-modified is specified and the document has never been crawled
	* before or have been crawled prior to the last-modified time then the document will be marked as
	* "crawl-immediate".<p>
	*
	* "crawl-immediately" -- Increases the crawling priority of the document such
	* that the GSA will retrieve it sooner than normally crawled documents.<p>
	*
	* "crawl-once" -- specifies that the document will be crawled by the
	* GSA one time but then never re-crawled.<p>
	*
	* "lock" -- Causes the document to remain in the index unless explicitly removed.
	* Failure to retrieve the document during re-crawling will not result in
	* removal of the document. If every document in the GSA is
	* locked then locked document may be forced out when maximum capacity is
	* reached.<p>
	*
	* "delete" -- this document should be deleted from the GSA index.<p>
	*
	* <h1>Retriever Commands:</h1>
	*
	* "up-to-date" -- specifies that the document is up-to-date with respect to its last crawled
	* time.<p>
	*
	* "document-not-found" -- the document does not exists in the repository<p>
	*
	* "mime-type=" -- specifies the document's mime-type. If unspecified then the GSA will
	* automatically assign a type to the document. <p>
	*
	* "meta-value=" -- specifies a metadata value associated with
	* immediately preceding metadata-name<p>
	*
	* "content" -- signals the beginning of binary content which
	* continues to the end of the file or stream<p>
	*
	*
	* End-of-stream terminates the data transmission. Multiple consecutive delimiters are collapsed
	* into a single delimiter and terminates the current id-list should one exist.<p>
	*
	* Unrecognized commands generate a warning but are otherwise ignored.
	*
	* <h3>Examples</h3>
	*
	* Example 1:<p>
	*
	* <pre>
	* {@code
	* GSA Adaptor Data Version 1 [<delimiter>]
	* id-list
	* /home/repository/docs/file1
	* /home/repository/docs/file2
	* /home/repository/docs/file3
	* /home/repository/docs/file4
	* /home/repository/docs/file5
	* }
	* </pre>
	*
	* Example 2:<p>
	*
	* <pre>
	* {@code
	* GSA Adaptor Data Version 1 [<delimiter>]
	* id=/home/repository/docs/file1
	* id=/home/repository/docs/file2
	* crawl-immediately
	* last-modified=20110803 16:07:23
	*
	* meta-name=Department
	* meta-content=Engineering
	*
	* meta-name=Creator
	* meta-content=howardhawks
	*
	* id=/home/repository/docs/file3
	* id=/home/repository/docs/file4
	* id=/home/repository/docs/file5
	* }
	* </pre>
	*/
	public class CommandStreamParser {


	public static enum Operation {
	ID,
	LAST_MODIFIED,
	CRAWL_IMMEDIATELY,
	CRAWL_ONCE,
	LOCK,
	DELETE,
	UP_TO_DATE,
	NOT_FOUND,
	MIME_TYPE,
	META_NAME,
	META_VALUE,
	CONTENT
	}

	private static final String HEADER_PREFIX = "GSA Adaptor Data Version";
	private static final String DISALLOWED_DELIMITER_CHARS_REGEX = "[a-zA-Z0-9:/\\-_ =\\+\\[\\]]";
	private static final Charset CHARSET = Charset.forName("UTF-8");

	private static final Map<String, Operation> STRING_TO_OPERATION;

	static {
	Map<String, Operation> stringToOperation = new HashMap<String, Operation>();
	stringToOperation.put("id", Operation.ID);
	stringToOperation.put("last-modified", Operation.LAST_MODIFIED);
	stringToOperation.put("crawl-immediately", Operation.CRAWL_IMMEDIATELY);
	stringToOperation.put("crawl-once", Operation.CRAWL_ONCE);
	stringToOperation.put("lock", Operation.LOCK);
	stringToOperation.put("delete", Operation.DELETE);
	stringToOperation.put("up-to-date", Operation.UP_TO_DATE);
	stringToOperation.put("not-found", Operation.NOT_FOUND);
	stringToOperation.put("mime-type", Operation.MIME_TYPE);
	stringToOperation.put("meta-name", Operation.META_NAME);
	stringToOperation.put("meta-value", Operation.META_VALUE);
	stringToOperation.put("content", Operation.CONTENT);

	// Confirm that every operation is in the map exactly once
	Collection<Operation> opsInMap = stringToOperation.values();
	Operation[] opsInEnum = Operation.class.getEnumConstants();

	if (!opsInMap.containsAll(Arrays.asList(opsInEnum)) \|\| opsInMap.size() != opsInEnum.length) {
	throw new RuntimeException("Internal Error: Every operation must have exactly one"
	+ "entry in the stringToOperation map");
	}

	STRING_TO_OPERATION = Collections.unmodifiableMap(stringToOperation);
	}

	private InputStream inputStream;
	private int versionNumber = 0;
	private String delimiter;
	private boolean inIdList;
	private CharsetDecoder charsetDecoder = CHARSET.newDecoder();

	/** */
	public static class RetrieverInfo {

	private boolean upToDate;
	private boolean notFound;
	private DocId docId;
	private String mimeType;
	private Metadata metadata;
	private byte[] contents;

	RetrieverInfo(DocId docId, Metadata metadata, byte[] contents, boolean upToDate,
	String mimeType, boolean notFound) {
	this.docId = docId;
	this.metadata = metadata;
	this.contents = contents;
	this.upToDate = upToDate;
	this.mimeType = mimeType;
	this.notFound = notFound;
	}

	public String getMimeType() {
	return mimeType;
	}

	public boolean isUpToDate() {
	return upToDate;
	}

	public boolean notFound() {
	return notFound;
	}

	public DocId getDocId() {
	return docId;
	}

	public Metadata getMetadata() {
	return metadata;
	}

	public byte[] getContents() {
	return contents;
	}
	}

	/** */
	private static class Command {

	private Operation operation;
	private String argument;
	private byte[] contents;

	Command(Operation operation, String argument, byte[] contents) {
	this.operation = operation;
	this.argument = argument;
	this.contents = contents;
	}

	public Operation getOperation() {
	return operation;
	}

	public String getArgument() {
	return argument;
	}

	public byte[] getContents() {
	return contents;
	}
	}

	public CommandStreamParser(InputStream inputStream) {
	this.inputStream = inputStream;
	inIdList = false;
	}

	public int getVersionNumber() throws IOException {
	checkHeader();
	return versionNumber;
	}

	public ArrayList<DocIdPusher.Record> readFromLister() throws IOException {
	ArrayList<DocIdPusher.Record> result = new ArrayList<DocIdPusher.Record>();
	String docId = null;
	String lastModified = null;
	boolean crawlOnce = false;
	boolean crawlImmediately = false;
	boolean lock = false;
	boolean deleteDocument = false;
	Command command = readCommand();

	// Starting out at end-of-stream so return an empty list.
	if (command == null) {
	return result;
	}

	// The first operation must be a doc ID.
	if (command.getOperation() != Operation.ID) {
	throw new IOException("Lister Error: the first operator must be a document ID. "
	+ " Instead encountered '" + command.getOperation() + "'.");
	}
	while (command != null) {
	switch (command.getOperation()) {
	case ID:
	if (docId != null) {
	// TODO (johnfelton) add lister options when API is available
	result.add(new DocIdPusher.Record.Builder(new DocId(docId)).build());
	}
	docId = command.getArgument();
	lastModified = null;
	crawlOnce = false;
	crawlImmediately = false;
	lock = false;
	deleteDocument = false;
	break;
	case LAST_MODIFIED:
	lastModified = command.getArgument();
	break;
	case CRAWL_IMMEDIATELY:
	crawlImmediately = true;
	break;
	case CRAWL_ONCE:
	crawlOnce = true;
	break;
	case LOCK:
	lock = true;
	break;
	case DELETE:
	deleteDocument = true;
	break;
	default:
	throw new IOException("Lister Error: invalid operation: '" + command.getArgument() + "");
	}
	command = readCommand();
	}
	// TODO (johnfelton) add lister options when API is available
	result.add(new DocIdPusher.Record.Builder(new DocId(docId)).build());

	return result;
	}

	public RetrieverInfo readFromRetriever() throws IOException {

	Set<MetaItem> metadata = new HashSet<MetaItem>();
	byte[] content = null;
	boolean upToDate = false;
	boolean notFound = false;
	String mimeType = null;
	Command command = readCommand();

	if (command == null) {
	throw new IOException("Invalid or missing retriever data.");
	} else if (command.getOperation() != Operation.ID) {
	throw new IOException("Retriever Error: the first operator must be a document ID. "
	+ " Instead encountered '" + command.getOperation() + "'.");
	}

	String docId = command.getArgument();
	command = readCommand();
	while (command != null) {
	switch (command.getOperation()) {
	case ID:
	throw new IOException("Only one document ID can be specified in a retriever message");
	case CONTENT:
	content = command.getContents();
	break;
	case META_NAME:
	String metaName = command.getArgument();
	command = readCommand();
	if (command == null \|\| command.getOperation() != Operation.META_VALUE) {
	throw new IOException("meta-name must be immediately followed by meta-value");
	}
	metadata.add(MetaItem.raw(metaName, command.getArgument()));
	break;
	case UP_TO_DATE:
	upToDate = true;
	break;
	case NOT_FOUND:
	notFound = true;
	break;
	case MIME_TYPE:
	mimeType = command.getArgument();
	break;
	default:
	throw new IOException(
	"Retriever Error: invalid operation: '" + command.getArgument() + "");
	}
	command = readCommand();
	}

	return new RetrieverInfo(new DocId(docId), new Metadata(metadata),
	content, upToDate, mimeType, notFound);
	}

	/**
	* Read a command from the command stream
	*
	* @return The next command from the command stream. for end-of-steam null is returned.
	* @throws IOException on stream read error
	*/
	private Command readCommand() throws IOException {

	Command result = null;

	while (result == null) {
	String commandTokens[] = parseNextLine();
	if (commandTokens == null) {
	return null;
	} else if ((commandTokens[0].equals("repository-unavailable"))) {
	throw new IOException("Error: repository unavailable. "
	+ (commandTokens.length > 1 ? commandTokens[1] : ""));
	}

	Operation operation = STRING_TO_OPERATION.get(commandTokens[0]);
	// Skip over unrecognized commands
	if (operation == null) {
	// TODO (johnfelton) add a warning about an unrecognized command
	continue;
	}

	String argument = null;
	byte content[] = null;

	if (commandTokens.length > 1) {
	argument = commandTokens[1];
	}

	if (operation == Operation.CONTENT) {
	content = readBytesUntilEnd();
	}
	result = new Command(operation, argument, content);
	}
	return result;
	}

	private String[] parseNextLine() throws IOException {
	checkHeader();
	String line = "";
	while (line.length() == 0) {
	line = readCharsUntilMarker(delimiter);
	// On End-Of-Stream return the end-message command
	if (line == null) {
	return null;
	}
	// If nothing is between the last delimiter and this one then exit ID list mode
	if (inIdList && line.length() == 0) {
	inIdList = false;
	} else if (!inIdList && line.equals("id-list")) {
	inIdList = true;
	line = ""; // loop again
	}
	}
	if (inIdList) {
	return new String[]{"id", line};
	}
	return line.split("=", 2);
	}

	/**
	* Read and verify the data format header if needed.
	*/
	private void checkHeader() throws IOException {
	if (this.delimiter != null) {
	return;
	}

	String line = readCharsUntilMarker("[");
	if ((line == null) \|\| (line.length() < HEADER_PREFIX.length()) \|\|
	!line.substring(0, HEADER_PREFIX.length()).equals(HEADER_PREFIX)) {
	throw new IOException("Adaptor data must begin with '" + HEADER_PREFIX + "'");
	}

	String versionNumberString = line.substring(HEADER_PREFIX.length());
	if (versionNumberString.length() < 3) {
	throw new IOException("Format version '" + versionNumberString + "' is invalid. " +
	"The version must be at least one digit with one leading space and one trailing space.");
	}

	delimiter = readCharsUntilMarker("]");
	if ((delimiter == null) \|\| (delimiter.length() < 1)) {
	throw new IOException("Delimiter must be at least one character long.");
	}

	Pattern pattern = Pattern.compile(DISALLOWED_DELIMITER_CHARS_REGEX);
	Matcher matcher = pattern.matcher(delimiter);

	if (matcher.find()) {
	throw new IOException("Invalid character in delimiter.");
	}

	try {
	versionNumber = Integer.parseInt(versionNumberString.trim());
	} catch (NumberFormatException e) {
	throw new IOException("Format version '" + versionNumberString + "' is invalid.");
	}
	}


	private byte[] readBytesUntilMarker(byte[] marker) throws IOException {

	if (marker.length == 0) {
	throw new IOException("Internal Error: Marker length must be greater than zero.");
	}
	ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
	int matchPosition = 0;
	int nextByte = 0;

	while (matchPosition < marker.length) {
	nextByte = inputStream.read();
	if (nextByte == ((int) marker[matchPosition] & 0xff)) {
	matchPosition += 1;
	} else {
	if (matchPosition > 0) {
	byteArrayOutputStream.write(marker, 0, matchPosition);
	matchPosition = 0;
	}
	if (nextByte == -1) {
	break;
	} else {
	byteArrayOutputStream.write(nextByte);
	}
	}
	}
	byte[] result = byteArrayOutputStream.toByteArray();
	if (nextByte == -1 && result.length == 0) {
	return null;
	} else {
	return result;
	}

	}

	private String readCharsUntilMarker(String marker) throws IOException {
	byte[] byteMarker = marker.getBytes(CHARSET);
	byte[] bytes = readBytesUntilMarker(byteMarker);
	if (bytes == null) {
	return null;
	}
	return charsetDecoder.decode(ByteBuffer.wrap(bytes)).toString();
	}

	private byte[] readBytesUntilEnd() throws IOException {
	return IOHelper.readInputStreamToByteArray(inputStream);
	}


	private byte[] readBytes(int byteCount) throws IOException {
	byte[] result = new byte[byteCount];
	int bytesRead = IOHelper.readFully(inputStream, result, 0, byteCount);
	if (bytesRead != byteCount) {
	return null;
	} else {
	return result;
	}
	}

	}