Remove unsupported character codes from XML response
b/16505221 Binary character references failed to parse in SP connector v4
Code Review : https://codereview.appspot.com/113520043/
diff --git a/src/com/google/enterprise/adaptor/sharepoint/SiteDataClient.java b/src/com/google/enterprise/adaptor/sharepoint/SiteDataClient.java
index 19b073c..70c9ce8 100644
--- a/src/com/google/enterprise/adaptor/sharepoint/SiteDataClient.java
+++ b/src/com/google/enterprise/adaptor/sharepoint/SiteDataClient.java
@@ -34,6 +34,7 @@
 import java.io.IOException;
 import java.io.StringReader;
 import java.util.logging.Logger;
+import java.util.regex.Pattern;
 
 import javax.xml.XMLConstants;
 import javax.xml.bind.JAXBContext;
@@ -69,6 +70,22 @@
    */
   private static final Schema schema;
 
+  // Unused character range 1 : decimal codes 0 - 8 (zero padding is optional)
+  private static final String UNUSED_CHAR_RANGE1 = "([0-8])";
+  // Unused character range 2 : decimal codes 11 - 12
+  private static final String UNUSED_CHAR_RANGE2 = "(1[12])";
+  // Unused character range 3 : decimal codes 14 - 31
+  private static final String UNUSED_CHAR_RANGE3 = "(1[4-9]|2[0-9]|3[01])";
+  // Unused character range 4 : decimal codes 127 - 159
+  private static final String UNUSED_CHAR_RANGE4 = "(1(2[7-9]|[3-5][0-9]))";
+
+  /**
+   * Matches decimal references to unused codes that make XML parsing fail.
+   */
+  private static final Pattern BINARY_UNUSED_CHAR_PATTERN
+      = Pattern.compile("&#0*(" + UNUSED_CHAR_RANGE1 + "|" + UNUSED_CHAR_RANGE2
+          + "|" + UNUSED_CHAR_RANGE3 + "|" + UNUSED_CHAR_RANGE4 + ");");
+
   static {
     try {
       jaxbContext = JAXBContext.newInstance(
@@ -292,12 +309,13 @@
   @VisibleForTesting
   <T> T jaxbParse(String xml, Class<T> klass)
       throws XmlProcessingException {
-    if (xml.contains("&#31;")) {
-      // Unit separator is sometimes present in ows_MetaInfo of the response
-      // XML, but it prevents the XML from being parsed. Since we don't actually
-      // care about MetaInfo we strip it out.
-      xml = xml.replace("&#31;", "");
-    }
+
+    // Unsupported character codes, such as the unit separator &#31;, are
+    // sometimes present in the response XML and prevent it from being parsed.
+    // Since the GSA cannot handle these characters, we strip them out.
+
+    xml = BINARY_UNUSED_CHAR_PATTERN.matcher(xml).replaceAll("");
+
     Source source = new StreamSource(new StringReader(xml));
     try {
       Unmarshaller unmarshaller = jaxbContext.createUnmarshaller();
diff --git a/test/com/google/enterprise/adaptor/sharepoint/SharePointAdaptorTest.java b/test/com/google/enterprise/adaptor/sharepoint/SharePointAdaptorTest.java
index ae5d523..4fcfda9 100644
--- a/test/com/google/enterprise/adaptor/sharepoint/SharePointAdaptorTest.java
+++ b/test/com/google/enterprise/adaptor/sharepoint/SharePointAdaptorTest.java
@@ -2216,6 +2216,35 @@
   }
 
   @Test
+  public void testUnusedCharCodeStripping() throws Exception {
+    SiteDataClient client = new SiteDataClient(
+        new UnsupportedSiteData(), true);
+    String xml = loadTestString("sites-SiteCollection-Lists-CustomList-1-f.xml")
+        .replace("<Folder>",
+            "<Folder xmlns='http://schemas.microsoft.com/sharepoint/soap/'>")
+        .replace("MetaInfo='2;#'", "MetaInfo='2;#&#00;'");
+    assertNotNull(client.jaxbParse(xml, ItemData.class));
+    
+    xml = loadTestString("sites-SiteCollection-Lists-CustomList-1-f.xml")
+        .replace("<Folder>",
+            "<Folder xmlns='http://schemas.microsoft.com/sharepoint/soap/'>")
+        .replace("MetaInfo='2;#'", "MetaInfo='2;#&#11;'");
+    assertNotNull(client.jaxbParse(xml, ItemData.class));
+    
+    xml = loadTestString("sites-SiteCollection-Lists-CustomList-1-f.xml")
+        .replace("<Folder>",
+            "<Folder xmlns='http://schemas.microsoft.com/sharepoint/soap/'>")
+        .replace("MetaInfo='2;#'", "MetaInfo='2;#&#21;'");
+    assertNotNull(client.jaxbParse(xml, ItemData.class));
+    
+    xml = loadTestString("sites-SiteCollection-Lists-CustomList-1-f.xml")
+        .replace("<Folder>",
+            "<Folder xmlns='http://schemas.microsoft.com/sharepoint/soap/'>")
+        .replace("MetaInfo='2;#'", "MetaInfo='2;#&#128;'");
+    assertNotNull(client.jaxbParse(xml, ItemData.class));
+  }
+
+  @Test
   public void testParseUnknownXml() throws Exception {
     SiteDataClient client = new SiteDataClient(
         new UnsupportedSiteData(), true);