Remove unsupported character codes from XML response
b/16505221 Binary character references failed to parse in SP connector v4
Code Review : https://codereview.appspot.com/113520043/
diff --git a/src/com/google/enterprise/adaptor/sharepoint/SiteDataClient.java b/src/com/google/enterprise/adaptor/sharepoint/SiteDataClient.java
index 19b073c..70c9ce8 100644
--- a/src/com/google/enterprise/adaptor/sharepoint/SiteDataClient.java
+++ b/src/com/google/enterprise/adaptor/sharepoint/SiteDataClient.java
@@ -34,6 +34,7 @@
import java.io.IOException;
import java.io.StringReader;
import java.util.logging.Logger;
+import java.util.regex.Pattern;
import javax.xml.XMLConstants;
import javax.xml.bind.JAXBContext;
@@ -69,6 +70,22 @@
*/
private static final Schema schema;
+ // Unused character range 1 : &#00; - &#08; (two-digit, zero-padded form only)
+ private static final String UNUSED_CHAR_RANGE1 = "(0[0-8])";
+ // Unused character range 2 : &#11; - &#12;
+ private static final String UNUSED_CHAR_RANGE2 = "(1[12])";
+ // Unused character range 3 : &#14; - &#31;
+ private static final String UNUSED_CHAR_RANGE3 = "(1[4-9]|2[0-9]|3[01])";
+ // Unused character range 4 : &#127; - &#159;
+ private static final String UNUSED_CHAR_RANGE4 = "(1(2[7-9]|[3-5][0-9]))";
+
+ /**
+ * Matches decimal character references in the ranges above that cause XML parsing to fail.
+ */
+ private static final Pattern BINARY_UNUSED_CHAR_PATTERN
+ = Pattern.compile("&#(" + UNUSED_CHAR_RANGE1 + "|" + UNUSED_CHAR_RANGE2
+ + "|" + UNUSED_CHAR_RANGE3 + "|" + UNUSED_CHAR_RANGE4 + ");");
+
static {
try {
jaxbContext = JAXBContext.newInstance(
@@ -292,12 +309,13 @@
@VisibleForTesting
<T> T jaxbParse(String xml, Class<T> klass)
throws XmlProcessingException {
- if (xml.contains("")) {
- // Unit separator is sometimes present in ows_MetaInfo of the response
- // XML, but it prevents the XML from being parsed. Since we don't actually
- // care about MetaInfo we strip it out.
- xml = xml.replace("", "");
- }
+
+ // Unsupported character references such as the unit separator (&#31;) are
+ // sometimes present in the response XML and prevent it from being parsed.
+ // Since the GSA cannot handle these characters, we strip them out.
+
+ xml = BINARY_UNUSED_CHAR_PATTERN.matcher(xml).replaceAll("");
+
Source source = new StreamSource(new StringReader(xml));
try {
Unmarshaller unmarshaller = jaxbContext.createUnmarshaller();
diff --git a/test/com/google/enterprise/adaptor/sharepoint/SharePointAdaptorTest.java b/test/com/google/enterprise/adaptor/sharepoint/SharePointAdaptorTest.java
index ae5d523..4fcfda9 100644
--- a/test/com/google/enterprise/adaptor/sharepoint/SharePointAdaptorTest.java
+++ b/test/com/google/enterprise/adaptor/sharepoint/SharePointAdaptorTest.java
@@ -2216,6 +2216,35 @@
}
@Test
+ public void testUnusedCharCodeStripping() throws Exception { // one case per range
+ SiteDataClient client = new SiteDataClient(
+ new UnsupportedSiteData(), true);
+ String xml = loadTestString("sites-SiteCollection-Lists-CustomList-1-f.xml")
+ .replace("<Folder>",
+ "<Folder xmlns='http://schemas.microsoft.com/sharepoint/soap/'>")
+ .replace("MetaInfo='2;#'", "MetaInfo='2;#&#00;'"); // range 1: 00-08
+ assertNotNull(client.jaxbParse(xml, ItemData.class));
+
+ xml = loadTestString("sites-SiteCollection-Lists-CustomList-1-f.xml")
+ .replace("<Folder>",
+ "<Folder xmlns='http://schemas.microsoft.com/sharepoint/soap/'>")
+ .replace("MetaInfo='2;#'", "MetaInfo='2;#&#11;'"); // range 2: 11-12
+ assertNotNull(client.jaxbParse(xml, ItemData.class));
+
+ xml = loadTestString("sites-SiteCollection-Lists-CustomList-1-f.xml")
+ .replace("<Folder>",
+ "<Folder xmlns='http://schemas.microsoft.com/sharepoint/soap/'>")
+ .replace("MetaInfo='2;#'", "MetaInfo='2;#&#31;'"); // range 3: 14-31
+ assertNotNull(client.jaxbParse(xml, ItemData.class));
+
+ xml = loadTestString("sites-SiteCollection-Lists-CustomList-1-f.xml")
+ .replace("<Folder>",
+ "<Folder xmlns='http://schemas.microsoft.com/sharepoint/soap/'>")
+ .replace("MetaInfo='2;#'", "MetaInfo='2;#&#128;'"); // range 4: 127-159
+ assertNotNull(client.jaxbParse(xml, ItemData.class));
+ }
+
+ @Test
public void testParseUnknownXml() throws Exception {
SiteDataClient client = new SiteDataClient(
new UnsupportedSiteData(), true);