Add metadata to list item contents for snippets

For list items that are not treated as files, we provide their metadata
as contents so that the GSA can form snippets for found words. Although
we mark the metadata in the content to not be indexed (because it is
provided via headers as well), it is still used when forming the
snippets.

SharePoint commonly has HTML in the metadata, but we can neither safely
treat it as HTML nor reliably detect it. So we always treat metadata
values as text and make only a minimal attempt to detect HTML. When we
detect HTML we just strip its tags and resolve a few HTML entities. This
is simply so the HTML does not show up in the snippets.

We go ahead and print all the metadata keys in addition to their values
simply because it makes the cached version marginally less random/ugly
and it is more helpful when debugging, since we are printing the
metadata values anyway.
diff --git a/src/com/google/enterprise/adaptor/sharepoint/HtmlResponseWriter.java b/src/com/google/enterprise/adaptor/sharepoint/HtmlResponseWriter.java
index 86be21c..00cf2df 100644
--- a/src/com/google/enterprise/adaptor/sharepoint/HtmlResponseWriter.java
+++ b/src/com/google/enterprise/adaptor/sharepoint/HtmlResponseWriter.java
@@ -15,6 +15,7 @@
 package com.google.enterprise.adaptor.sharepoint;
 
 import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.Multimap;
 import com.google.common.io.CountingOutputStream;
 import com.google.enterprise.adaptor.DocId;
 import com.google.enterprise.adaptor.DocIdEncoder;
@@ -30,6 +31,7 @@
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Locale;
+import java.util.Map;
 import java.util.concurrent.Executor;
 import java.util.logging.Level;
 import java.util.logging.Logger;
@@ -166,6 +168,37 @@
     writer.write("</a></li>");
   }
 
+  private void addComment(String comment) throws IOException {
+    writer.write("<!--");
+    writer.write(escapeContent(comment));
+    writer.write("-->");
+  }
+
+  private void googleoffIndex() throws IOException {
+    addComment("googleoff: index");
+  }
+
+  private void googleonIndex() throws IOException {
+    addComment("googleon: index");
+  }
+
+  public void addMetadata(Multimap<String, String> metadata)
+      throws IOException {
+    checkAndCloseSection();
+    googleoffIndex();
+    writer.write("<table style='border: none'>");
+    for (Map.Entry<String, String> me : metadata.entries()) {
+      writer.write("<tr><td>");
+      writer.write(escapeContent(me.getKey()));
+      writer.write("</td><td>");
+      writer.write(escapeContent(me.getValue()));
+      writer.write("</td></tr>");
+    }
+    writer.write("</table>");
+    googleonIndex();
+    state = State.STARTED;
+  }
+
   /**
    * Complete HTML body and flush.
    */
diff --git a/src/com/google/enterprise/adaptor/sharepoint/SharePointAdaptor.java b/src/com/google/enterprise/adaptor/sharepoint/SharePointAdaptor.java
index 3de9a5c..e2e203f 100644
--- a/src/com/google/enterprise/adaptor/sharepoint/SharePointAdaptor.java
+++ b/src/com/google/enterprise/adaptor/sharepoint/SharePointAdaptor.java
@@ -17,6 +17,8 @@
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.cache.CacheBuilder;
 import com.google.common.cache.LoadingCache;
+import com.google.common.collect.Multimap;
+import com.google.common.collect.TreeMultimap;
 import com.google.enterprise.adaptor.AbstractAdaptor;
 import com.google.enterprise.adaptor.Acl;
 import com.google.enterprise.adaptor.AdaptorContext;
@@ -208,6 +210,29 @@
   private static final Pattern METADATA_ESCAPE_PATTERN
       = Pattern.compile("_x([0-9a-f]{4})_");
 
+  private static final String HTML_NAME = "[a-zA-Z:_][a-zA-Z:_0-9.-]*";
+  private static final Pattern HTML_TAG_PATTERN
+      = Pattern.compile(
+          // Open tag name, then any number of name=value attributes
+          "<" + HTML_NAME + "(?:[ \n\t]+" + HTML_NAME + "="
+            // Attribute value: single-quoted, double-quoted, or unquoted
+            + "(?:'[^']*'|\"[^\"]*\"|[a-zA-Z0-9._:-]*))*[ \n\t]*/?>"
+          // Close tag, e.g. </div>
+          + "|</" + HTML_NAME + ">", Pattern.DOTALL);
+  private static final Pattern HTML_ENTITY_PATTERN
+      = Pattern.compile("&(#[0-9]+|[a-zA-Z0-9]+);");
+  private static final Map<String, String> HTML_ENTITIES;
+  static {
+    HashMap<String, String> map = new HashMap<String, String>();
+    map.put("quot", "\"");
+    map.put("amp", "&");
+    map.put("lt", "<");
+    map.put("gt", ">");
+    map.put("nbsp", "\u00a0");
+    map.put("apos", "'");
+    HTML_ENTITIES = Collections.unmodifiableMap(map);
+  }
+
   private static final String SITE_COLLECTION_ADMIN_FRAGMENT = "admin";
 
   private static final Logger log
@@ -883,6 +908,42 @@
         uri.getScheme(), uri.getAuthority(), null, null, null).toString();
   }
 
+  /**
+   * Convert from text/html to text/plain by removing tags and decoding
+   * entities; unknown entities are dropped. Perfect fidelity is not required.
+   */
+  @VisibleForTesting
+  static String stripHtml(String html) {
+    html = HTML_TAG_PATTERN.matcher(html).replaceAll("");
+    Matcher m = HTML_ENTITY_PATTERN.matcher(html);
+    StringBuffer sb = new StringBuffer();
+    while (m.find()) {
+      String entity = m.group(1);
+      String decodedEntity;
+      if (entity.startsWith("#")) {
+        entity = entity.substring(1);
+        try {
+          // The cast keeps only the low 16 bits, so references above U+FFFF
+          // are truncated instead of being converted to surrogate pairs.
+          char c = (char) Integer.parseInt(entity);
+          decodedEntity = Character.toString(c);
+        } catch (NumberFormatException ex) {
+          log.log(Level.FINE, "Could not decode entity", ex);
+          decodedEntity = "";
+        }
+      } else {
+        entity = entity.toLowerCase(Locale.ENGLISH);
+        decodedEntity = HTML_ENTITIES.get(entity);
+        if (decodedEntity == null) {
+          decodedEntity = "";
+        }
+      }
+      m.appendReplacement(sb, Matcher.quoteReplacement(decodedEntity));
+    }
+    m.appendTail(sb);
+    return sb.toString();
+  }
+
   @VisibleForTesting
   class SiteAdaptor {
     private final SiteDataClient siteDataClient;
@@ -1432,6 +1493,11 @@
     }
 
     private long addMetadata(Response response, String name, String value) {
+      return addMetadata(response, name, value, null);
+    }
+
+    private long addMetadata(Response response, String name, String value,
+        Multimap<String, String> addedMetadata) {
       long size = 0;
       if ("ows_MetaInfo".equals(name)) {
         // ows_MetaInfo is parsed out into other fields for us by SharePoint.
@@ -1451,6 +1517,9 @@
             continue;
           }
           response.addMetadata(name, parts[i]);
+          if (addedMetadata != null) {
+            addedMetadata.put(name, parts[i]);
+          }
           // +30 for per-metadata-possible overhead, just to make sure that we
           // don't count too few.
           size += name.length() + parts[i].length() + 30;
@@ -1463,12 +1532,18 @@
             continue;
           }
           response.addMetadata(name, part);
+          if (addedMetadata != null) {
+            addedMetadata.put(name, part);
+          }
           // +30 for per-metadata-possible overhead, just to make sure that we
           // don't count too few.
           size += name.length() + part.length() + 30;
         }
       } else {
         response.addMetadata(name, value);
+        if (addedMetadata != null) {
+          addedMetadata.put(name, value);
+        }
         // +30 for per-metadata-possible overhead, just to make sure that we
         // don't count too few.
         size += name.length() + value.length() + 30;
@@ -1807,10 +1882,11 @@
       String title = row.getAttribute(OWS_TITLE_ATTRIBUTE);
       String serverUrl = row.getAttribute(OWS_SERVERURL_ATTRIBUTE);
 
+      Multimap<String, String> metadata = TreeMultimap.create();
       long metadataLength = 0;
       for (Attr attribute : getAllAttributes(row)) {
-        metadataLength
-            += addMetadata(response, attribute.getName(), attribute.getValue());
+        metadataLength += addMetadata(response, attribute.getName(),
+            attribute.getValue(), metadata);
       }
       metadataLength += addMetadata(response,
           METADATA_PARENT_WEB_TITLE, w.webTitle);
@@ -1846,6 +1922,7 @@
         writer.start(request.getDocId(), ObjectType.FOLDER, null);
         processAttachments(listId, itemId, row, writer);
         processFolder(listId, folder.substring(root.length()), writer);
+        writeMetadataAsContent(writer, metadata);
         writer.finish();
         log.exiting("SiteAdaptor", "getListItemDocContent");
         return;
@@ -1874,6 +1951,7 @@
             = createHtmlResponseWriter(response, metadataLength);
         writer.start(request.getDocId(), ObjectType.LIST_ITEM, title);
         processAttachments(listId, itemId, row, writer);
+        writeMetadataAsContent(writer, metadata);
         writer.finish();
       }
       log.exiting("SiteAdaptor", "getListItemDocContent");
@@ -1894,6 +1972,26 @@
       }
     }
 
+    /**
+     * Write metadata as content for snippets, stripping HTML-looking values.
+     */
+    private void writeMetadataAsContent(HtmlResponseWriter writer,
+        Multimap<String, String> metadata) throws IOException {
+      Multimap<String, String> cleanedMetadata = TreeMultimap.create();
+      for (Map.Entry<String, String> me : metadata.entries()) {
+        String value = me.getValue();
+        if (value.startsWith("<") && value.endsWith(">")) {
+          // Assume it is HTML and remove the tags, since otherwise the HTML
+          // will be encoded and show up in snippets. If we assumed wrong, then
+          // we simply removed some content from showing up in snippets. In no
+          // way is this cleanup necessary for correctness.
+          value = stripHtml(value);
+        }
+        cleanedMetadata.put(me.getKey(), value);
+      }
+      writer.addMetadata(cleanedMetadata);
+    }
+
     private boolean getAttachmentDocContent(Request request, Response response)
         throws IOException {
       log.entering("SiteAdaptor", "getAttachmentDocContent", new Object[] {
diff --git a/test/com/google/enterprise/adaptor/sharepoint/SharePointAdaptorTest.java b/test/com/google/enterprise/adaptor/sharepoint/SharePointAdaptorTest.java
index fa5ad7c..be5b7df 100644
--- a/test/com/google/enterprise/adaptor/sharepoint/SharePointAdaptorTest.java
+++ b/test/com/google/enterprise/adaptor/sharepoint/SharePointAdaptorTest.java
@@ -367,6 +367,24 @@
   }
 
   @Test
+  public void testStripHtml() {
+    assertEquals("<testing@example.com>",
+        SharePointAdaptor.stripHtml("<testing@example.com>"));
+    assertEquals("some text",
+        SharePointAdaptor.stripHtml("<div><b>some</b> text</div>"));
+    assertEquals("a 0 ",
+        SharePointAdaptor.stripHtml("<br><a href=\"test's\" hover=none "
+          + "\nz='fo\" '>a &#0048;</a> "));
+    // The space isn't a space, but a no-break space.
+    // The \u0000 is simply to make sure we don't break. It is actually
+    // invalid input, so we don't care too much how we resolve it.
+    assertEquals(" &&<>\"'0\u2014\u0000$",
+        SharePointAdaptor.stripHtml(
+            "&nbsp;&&amp;&lt;&gt;&quot;&apos;&#0048;&#8212;&abcde;"
+            + "&#9999999999;&#65536;&#0036;"));
+  }
+
+  @Test
   public void testGetDocContentWrongServer() throws Exception {
     SoapFactory siteDataFactory = MockSoapFactory.blank()
         .endpoint(AUTH_ENDPOINT, new MockAuthenticationSoap())
@@ -901,7 +919,59 @@
         + "<body><h1>List Item Inside Folder</h1>"
         + "<p>Attachments</p><ul>"
         + "<li><a href=\"../Attachments/2/1046000.pdf\">1046000.pdf</a></li>"
-        + "</ul></body></html>";
+        + "</ul>"
+        + "<!--googleoff: index--><table style='border: none'>"
+        + "<tr><td>Attachments</td><td>1</td></tr>"
+        + "<tr><td>Author</td><td>System Account</td></tr>"
+        + "<tr><td>BaseName</td><td>2_</td></tr>"
+        + "<tr><td>ContentType</td><td>Item</td></tr>"
+        + "<tr><td>ContentTypeId</td>"
+        +   "<td>0x0100442459C9B5E59C4F9CFDC789A220FC92</td></tr>"
+        + "<tr><td>Created</td><td>2012-05-01T22:14:06Z</td></tr>"
+        + "<tr><td>Created Date</td><td>2012-05-01T22:14:06Z</td></tr>"
+        + "<tr><td>Editor</td><td>System Account</td></tr>"
+        + "<tr><td>EncodedAbsUrl</td>"
+        +   "<td>http://localhost:1/sites/SiteCollection/Lists/Custom%20List/"
+        +   "Test%20Folder/2_.000</td></tr>"
+        + "<tr><td>FSObjType</td><td>0</td></tr>"
+        + "<tr><td>FileDirRef</td>"
+        + "<td>sites/SiteCollection/Lists/Custom List/Test Folder</td></tr>"
+        + "<tr><td>FileLeafRef</td><td>2_.000</td></tr>"
+        + "<tr><td>FileRef</td>"
+        +   "<td>sites/SiteCollection/Lists/Custom List/Test Folder/2_.000</td>"
+        +   "</tr>"
+        + "<tr><td>GUID</td>"
+        +   "<td>{2C5BEF60-18FA-42CA-B472-7B5E1EC405A5}</td></tr>"
+        + "<tr><td>ID</td><td>2</td></tr>"
+        + "<tr><td>Last Modified</td><td>2012-05-01T22:14:06Z</td></tr>"
+        + "<tr><td>LinkFilename</td><td>2_.000</td></tr>"
+        + "<tr><td>LinkFilenameNoMenu</td><td>2_.000</td></tr>"
+        + "<tr><td>LinkTitle</td><td>Inside Folder</td></tr>"
+        + "<tr><td>LinkTitleNoMenu</td><td>Inside Folder</td></tr>"
+        + "<tr><td>Modified</td><td>2012-05-04T21:24:32Z</td></tr>"
+        + "<tr><td>Order</td><td>200.000000000000</td></tr>"
+        + "<tr><td>PermMask</td><td>0x7fffffffffffffff</td></tr>"
+        + "<tr><td>ScopeId</td>"
+        +   "<td>{2E29615C-59E7-493B-B08A-3642949CC069}</td></tr>"
+        + "<tr><td>SelectTitle</td><td>2</td></tr>"
+        + "<tr><td>ServerRedirected</td><td>0</td></tr>"
+        + "<tr><td>ServerUrl</td>"
+        +   "<td>/sites/SiteCollection/Lists/Custom List/Test Folder/2_.000"
+        +   "</td></tr>"
+        + "<tr><td>Title</td><td>Inside Folder</td></tr>"
+        + "<tr><td>UniqueId</td>"
+        +   "<td>{E7156244-AC2F-4402-AA74-7A365726CD02}</td></tr>"
+        + "<tr><td>WorkflowVersion</td><td>1</td></tr>"
+        + "<tr><td>_EditMenuTableEnd</td><td>2</td></tr>"
+        + "<tr><td>_EditMenuTableStart</td><td>2_.000</td></tr>"
+        + "<tr><td>_IsCurrentVersion</td><td>1</td></tr>"
+        + "<tr><td>_Level</td><td>1</td></tr>"
+        + "<tr><td>_ModerationStatus</td><td>0</td></tr>"
+        + "<tr><td>_UIVersion</td><td>512</td></tr>"
+        + "<tr><td>_UIVersionString</td><td>1.0</td></tr>"
+        + "<tr><td>owshiddenversion</td><td>4</td></tr>"
+        + "</table><!--googleon: index-->"
+        + "</body></html>";
     final Metadata goldenMetadata;
     {
       Metadata meta = new Metadata();
@@ -1053,6 +1123,57 @@
     final String golden = "<!DOCTYPE html>\n"
         + "<html><head><title>List Item Inside Folder</title></head>"
         + "<body><h1>List Item Inside Folder</h1>"
+        + "<!--googleoff: index--><table style='border: none'>"
+        + "<tr><td>Attachments</td><td>0</td></tr>"
+        + "<tr><td>Author</td><td>System Account</td></tr>"
+        + "<tr><td>BaseName</td><td>2_</td></tr>"
+        + "<tr><td>ContentType</td><td>Item</td></tr>"
+        + "<tr><td>ContentTypeId</td>"
+        +   "<td>0x0100442459C9B5E59C4F9CFDC789A220FC92</td></tr>"
+        + "<tr><td>Created</td><td>2012-05-01T22:14:06Z</td></tr>"
+        + "<tr><td>Created Date</td><td>2012-05-01T22:14:06Z</td></tr>"
+        + "<tr><td>Editor</td><td>System Account</td></tr>"
+        + "<tr><td>EncodedAbsUrl</td>"
+        +   "<td>http://localhost:1/sites/SiteCollection/Lists/Custom%20List/"
+        +   "Test%20Folder/2_.000</td></tr>"
+        + "<tr><td>FSObjType</td><td>0</td></tr>"
+        + "<tr><td>FileDirRef</td>"
+        + "<td>sites/SiteCollection/Lists/Custom List/Test Folder</td></tr>"
+        + "<tr><td>FileLeafRef</td><td>2_.000</td></tr>"
+        + "<tr><td>FileRef</td>"
+        +   "<td>sites/SiteCollection/Lists/Custom List/Test Folder/2_.000</td>"
+        +   "</tr>"
+        + "<tr><td>GUID</td>"
+        +   "<td>{2C5BEF60-18FA-42CA-B472-7B5E1EC405A5}</td></tr>"
+        + "<tr><td>ID</td><td>2</td></tr>"
+        + "<tr><td>Last Modified</td><td>2012-05-01T22:14:06Z</td></tr>"
+        + "<tr><td>LinkFilename</td><td>2_.000</td></tr>"
+        + "<tr><td>LinkFilenameNoMenu</td><td>2_.000</td></tr>"
+        + "<tr><td>LinkTitle</td><td>Inside Folder</td></tr>"
+        + "<tr><td>LinkTitleNoMenu</td><td>Inside Folder</td></tr>"
+        + "<tr><td>Modified</td><td>2012-05-04T21:24:32Z</td></tr>"
+        + "<tr><td>Order</td><td>200.000000000000</td></tr>"
+        + "<tr><td>PermMask</td><td>0x7fffffffffffffff</td></tr>"
+        + "<tr><td>ScopeId</td>"
+        +   "<td>{2E29615C-59E7-493B-B08A-3642949CC069}</td></tr>"
+        + "<tr><td>SelectTitle</td><td>2</td></tr>"
+        + "<tr><td>ServerRedirected</td><td>0</td></tr>"
+        + "<tr><td>ServerUrl</td>"
+        +   "<td>/sites/SiteCollection/Lists/Custom List/Test Folder/2_.000"
+        +   "</td></tr>"
+        + "<tr><td>Title</td><td>Inside Folder</td></tr>"
+        + "<tr><td>UniqueId</td>"
+        +   "<td>{E7156244-AC2F-4402-AA74-7A365726CD02}</td></tr>"
+        + "<tr><td>WorkflowVersion</td><td>1</td></tr>"
+        + "<tr><td>_EditMenuTableEnd</td><td>2</td></tr>"
+        + "<tr><td>_EditMenuTableStart</td><td>2_.000</td></tr>"
+        + "<tr><td>_IsCurrentVersion</td><td>1</td></tr>"
+        + "<tr><td>_Level</td><td>1</td></tr>"
+        + "<tr><td>_ModerationStatus</td><td>0</td></tr>"
+        + "<tr><td>_UIVersion</td><td>512</td></tr>"
+        + "<tr><td>_UIVersionString</td><td>1.0</td></tr>"
+        + "<tr><td>owshiddenversion</td><td>4</td></tr>"
+        + "</table><!--googleon: index-->"
         + "</body></html>";
 
     assertEquals(golden, responseString);
@@ -1153,7 +1274,57 @@
         + "<ul>"
         + "<li><a href=\"Test%20Folder/2_.000\">Inside Folder</a></li>"
         + "<li><a href=\"Test%20Folder/testing\">testing</a></li>"
-        + "</ul></body></html>";
+        + "</ul>"
+        + "<!--googleoff: index--><table style='border: none'>"
+        + "<tr><td>Attachments</td><td>0</td></tr>"
+        + "<tr><td>Author</td><td>System Account</td></tr>"
+        + "<tr><td>BaseName</td><td>Test Folder</td></tr>"
+        + "<tr><td>ContentType</td><td>Folder</td></tr>"
+        + "<tr><td>ContentTypeId</td>"
+        +   "<td>0x01200077DD29735CE61148A73F540231F24430</td></tr>"
+        + "<tr><td>Created</td><td>2012-05-01T22:13:47Z</td></tr>"
+        + "<tr><td>Created Date</td><td>2012-05-01T22:13:47Z</td></tr>"
+        + "<tr><td>Editor</td><td>System Account</td></tr>"
+        + "<tr><td>EncodedAbsUrl</td>"
+        + "<td>http://localhost:1/sites/SiteCollection/Lists/Custom%20List/"
+        +   "Test%20Folder</td></tr>"
+        + "<tr><td>FSObjType</td><td>1</td></tr>"
+        + "<tr><td>FileDirRef</td><td>sites/SiteCollection/Lists/Custom List"
+        +   "</td></tr>"
+        + "<tr><td>FileLeafRef</td><td>Test Folder</td></tr>"
+        + "<tr><td>FileRef</td><td>sites/SiteCollection/Lists/Custom List/"
+        +   "Test Folder</td></tr>"
+        + "<tr><td>GUID</td><td>{C099F4ED-6E96-4A00-B94A-EE443061EE49}</td>"
+        +   "</tr>"
+        + "<tr><td>ID</td><td>1</td></tr>"
+        + "<tr><td>Last Modified</td><td>2012-05-02T21:13:17Z</td></tr>"
+        + "<tr><td>LinkFilename</td><td>Test Folder</td></tr>"
+        + "<tr><td>LinkFilenameNoMenu</td><td>Test Folder</td></tr>"
+        + "<tr><td>LinkTitle</td><td>Test Folder</td></tr>"
+        + "<tr><td>LinkTitleNoMenu</td><td>Test Folder</td></tr>"
+        + "<tr><td>Modified</td><td>2012-05-01T22:13:47Z</td></tr>"
+        + "<tr><td>Order</td><td>100.000000000000</td></tr>"
+        + "<tr><td>PermMask</td><td>0x7fffffffffffffff</td></tr>"
+        + "<tr><td>ScopeId</td><td>{2E29615C-59E7-493B-B08A-3642949CC069}</td>"
+        +   "</tr>"
+        + "<tr><td>SelectTitle</td><td>1</td></tr>"
+        + "<tr><td>ServerRedirected</td><td>0</td></tr>"
+        + "<tr><td>ServerUrl</td><td>/sites/SiteCollection/Lists/Custom List/"
+        +   "Test Folder</td></tr>"
+        + "<tr><td>Title</td><td>Test Folder</td></tr>"
+        + "<tr><td>UniqueId</td><td>{CE33B6B7-9F5E-4224-8D77-9C42E6290FE6}</td>"
+        +   "</tr>"
+        + "<tr><td>WorkflowVersion</td><td>1</td></tr>"
+        + "<tr><td>_EditMenuTableEnd</td><td>1</td></tr>"
+        + "<tr><td>_EditMenuTableStart</td><td>Test Folder</td></tr>"
+        + "<tr><td>_IsCurrentVersion</td><td>1</td></tr>"
+        + "<tr><td>_Level</td><td>1</td></tr>"
+        + "<tr><td>_ModerationStatus</td><td>0</td></tr>"
+        + "<tr><td>_UIVersion</td><td>512</td></tr>"
+        + "<tr><td>_UIVersionString</td><td>1.0</td></tr>"
+        + "<tr><td>owshiddenversion</td><td>1</td></tr>"
+        + "</table><!--googleon: index-->"
+        + "</body></html>";
     final Metadata goldenMetadata;
     {
       Metadata meta = new Metadata();