Add Config Options to Expire Content Based on LastAccess/LastModified Time

This change adds configuration options to expire content based on
LastAccessTime and/or LastModifiedTime. Each allows two different
forms of specification:
  - Expire content older than some specific date.  (Absolute)
  - Expire content older than some number of days. (Relative)

The configuration parameters are:
filesystemadaptor.lastAccessedDate - Absolute last accessed time expiration
filesystemadaptor.lastAccessedDays - Relative last accessed time expiration
filesystemadaptor.lastModifiedDate - Absolute last modified time expiration
filesystemadaptor.lastModifiedDays - Relative last modified time expiration

Values for the Relative forms of configuration are a positive integer
number of days.  For instance:
   filesystemadaptor.lastAccessedDays=365
would expire content that had not been accessed for a year.

Values for the Absolute forms of configuration are ISO 8601 date-only
strings in the format yyyy-MM-dd.  For instance:
   filesystemadaptor.lastModifiedDate=2010-06-01
would expire content that had not been modified since the beginning
of June 2010.

Code Review: http://codereview.appspot.com/96330043
diff --git a/src/com/google/enterprise/adaptor/fs/FsAdaptor.java b/src/com/google/enterprise/adaptor/fs/FsAdaptor.java
index 6360ec4..efd8fae 100644
--- a/src/com/google/enterprise/adaptor/fs/FsAdaptor.java
+++ b/src/com/google/enterprise/adaptor/fs/FsAdaptor.java
@@ -33,7 +33,6 @@
 import com.google.enterprise.adaptor.Request;
 import com.google.enterprise.adaptor.Response;
 import com.google.enterprise.adaptor.StartupException;
-import com.google.enterprise.adaptor.UnsupportedPlatformException;
 
 import java.io.IOException;
 import java.io.InputStream;
@@ -47,6 +46,7 @@
 import java.nio.file.attribute.FileTime;
 import java.text.SimpleDateFormat;
 import java.util.Arrays;
+import java.util.Calendar;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Date;
@@ -54,7 +54,9 @@
 import java.util.HashSet;
 import java.util.Locale;
 import java.util.Map;
+import java.text.ParseException;
 import java.util.Set;
+import java.text.SimpleDateFormat;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.TimeUnit;
@@ -92,6 +94,22 @@
   private static final String CONFIG_CRAWL_HIDDEN_FILES =
       "filesystemadaptor.crawlHiddenFiles";    
 
+  /** Relative config parameter name for earliest last accessed time allowed. */
+  private static final String CONFIG_LAST_ACCESSED_DAYS =
+      "filesystemadaptor.lastAccessedDays";
+ 
+  /** Absolute config parameter name for earliest last accessed time allowed. */
+  private static final String CONFIG_LAST_ACCESSED_DATE =
+      "filesystemadaptor.lastAccessedDate";
+
+  /** Relative config parameter name for earliest last modified time allowed. */
+  private static final String CONFIG_LAST_MODIFIED_DAYS =
+      "filesystemadaptor.lastModifiedDays";
+
+  /** Absolute config parameter name for earliest last modified time allowed. */
+  private static final String CONFIG_LAST_MODIFIED_DATE =
+      "filesystemadaptor.lastModifiedDate";
+
   private static final String ALL_FOLDER_INHERIT_ACL = "allFoldersAcl";
   private static final String ALL_FILE_INHERIT_ACL = "allFilesAcl";
   private static final String CHILD_FOLDER_INHERIT_ACL = "childFoldersAcl";
@@ -152,6 +170,10 @@
   private FileDelegate delegate;
   private ShareAcls lastPushedShareAcls = null;
 
+  /** Filter that may exclude files whose last modified time is too old. */
+  private FileTimeFilter lastModifiedTimeFilter;
+  private FileTimeFilter lastAccessTimeFilter;
+
   public FsAdaptor() {
     // At the moment, we only support Windows.
     if (System.getProperty("os.name").startsWith("Windows")) {
@@ -191,6 +213,10 @@
     config.addKey(CONFIG_BUILTIN_PREFIX, "BUILTIN\\");
     config.addKey(CONFIG_NAMESPACE, Principal.DEFAULT_NAMESPACE);
     config.addKey(CONFIG_CRAWL_HIDDEN_FILES, "false");
+    config.addKey(CONFIG_LAST_ACCESSED_DAYS, "");
+    config.addKey(CONFIG_LAST_ACCESSED_DATE, "");
+    config.addKey(CONFIG_LAST_MODIFIED_DAYS, "");
+    config.addKey(CONFIG_LAST_MODIFIED_DATE, "");
     config.overrideKey(CONFIG_MAX_INCREMENTAL_LATENCY, "300");
   }
 
@@ -272,6 +298,12 @@
           + "property \"filesystemadaptor.crawlHiddenFiles\" to \"true\".");
     }
 
+    // Add filters that may exclude older content.
+    lastAccessTimeFilter = getFileTimeFilter(context.getConfig(),
+        CONFIG_LAST_ACCESSED_DAYS, CONFIG_LAST_ACCESSED_DATE);
+    lastModifiedTimeFilter = getFileTimeFilter(context.getConfig(),
+        CONFIG_LAST_MODIFIED_DAYS, CONFIG_LAST_MODIFIED_DATE);
+
     // Verify that the adaptor has permission to read the Acl and share Acl.
     try {
       readShareAcls();
@@ -294,6 +326,44 @@
     delegate.destroy();
   }
 
+  /** Builds the expiration filter for one pair of days/date settings. */
+  private FileTimeFilter getFileTimeFilter(Config config, String configDaysKey,
+       String configDateKey) throws StartupException {
+    // Trim so stray whitespace around a value does not break parsing.
+    String configDays = config.getValue(configDaysKey).trim();
+    String configDate = config.getValue(configDateKey).trim();
+    if (!configDays.isEmpty() && !configDate.isEmpty()) {
+      throw new InvalidConfigurationException("Please specify only one of "
+          + configDaysKey + " or " + configDateKey + ".");
+    } else if (!configDays.isEmpty()) {
+      log.log(Level.CONFIG, configDaysKey + ": " + configDays);
+      try {
+        return new ExpiringFileTimeFilter(Integer.parseInt(configDays));
+      } catch (IllegalArgumentException e) {
+        // NumberFormatException is an IllegalArgumentException too.
+        throw new InvalidConfigurationException(configDaysKey
+            + " must be specified as a positive integer number of days.", e);
+      }
+    } else if (!configDate.isEmpty()) {
+      log.log(Level.CONFIG, configDateKey + ": " + configDate);
+      SimpleDateFormat iso8601DateFormat = new SimpleDateFormat("yyyy-MM-dd");
+      iso8601DateFormat.setCalendar(Calendar.getInstance());
+      iso8601DateFormat.setLenient(true);
+      try {
+        return new AbsoluteFileTimeFilter(FileTime.fromMillis(
+            iso8601DateFormat.parse(configDate).getTime()));
+      } catch (ParseException e) {
+        throw new InvalidConfigurationException(configDateKey
+            + " must be specified in the format \"YYYY-MM-DD\".", e);
+      } catch (IllegalArgumentException e) {
+        throw new InvalidConfigurationException(configDateKey
+            + " must be a date in the past.", e);
+      }
+    } else {
+      return new AlwaysAllowFileTimeFilter();
+    }
+  }
+
   private ShareAcls readShareAcls() throws IOException {
     Acl shareAcl;
     Acl dfsShareAcl;
@@ -408,6 +478,22 @@
     BasicFileAttributes attrs = delegate.readBasicAttributes(doc);
     final FileTime lastAccessTime = attrs.lastAccessTime();
 
+    if (!docIsDirectory) {
+      if (lastAccessTimeFilter.excluded(lastAccessTime)) {
+        log.log(Level.WARNING, "Skipping {0} because it was last accessed {1}.",
+            new Object[] {doc, lastAccessTime.toString().substring(0, 10)});
+        resp.respondNotFound();
+        return;
+      }
+      if (lastModifiedTimeFilter.excluded(attrs.lastModifiedTime())) {
+        log.log(Level.WARNING, "Skipping {0} because it was last modified {1}.",
+            new Object[] {doc, 
+                attrs.lastModifiedTime().toString().substring(0, 10)});
+        resp.respondNotFound();
+        return;
+      }
+    }
+
     resp.setDisplayUrl(doc.toUri());
     resp.setLastModified(new Date(attrs.lastModifiedTime().toMillis()));
     resp.addMetadata("Creation Time", dateFormatter.get().format(
@@ -590,6 +676,52 @@
     }
   }
 
+  private interface FileTimeFilter {
+    boolean excluded(FileTime fileTime);  // True if the file must be skipped.
+  }
+
+  /** Filter used when no limit is configured: nothing is excluded. */
+  private static class AlwaysAllowFileTimeFilter implements FileTimeFilter {
+    @Override public boolean excluded(FileTime fileTime) {
+      return false;
+    }
+  }
+
+  /** Excludes files whose time is earlier than a fixed cut-off. */
+  private static class AbsoluteFileTimeFilter implements FileTimeFilter {
+    private final FileTime oldestAllowed;
+
+    /** Rejects a cut-off that is not strictly before the current time. */
+    public AbsoluteFileTimeFilter(FileTime oldestAllowed) {
+      FileTime now = FileTime.fromMillis(System.currentTimeMillis());
+      Preconditions.checkArgument(oldestAllowed.compareTo(now) < 0,
+          oldestAllowed.toString().substring(0, 10) + " is in the future.");
+      this.oldestAllowed = oldestAllowed;
+    }
+
+    @Override public boolean excluded(FileTime fileTime) {
+      return oldestAllowed.compareTo(fileTime) > 0;
+    }
+  }
+
+  /** Excludes files whose time is more than a number of days before now. */
+  private static class ExpiringFileTimeFilter implements FileTimeFilter {
+    private final long relativeMillis;
+
+    /** Requires a strictly positive number of days. */
+    public ExpiringFileTimeFilter(int daysOld) {
+      Preconditions.checkArgument(daysOld > 0, "The number of days old for "
+          + "expired content must be greater than zero.");
+      relativeMillis = TimeUnit.DAYS.toMillis(daysOld);
+    }
+
+    @Override public boolean excluded(FileTime fileTime) {
+      // The cut-off is recomputed on every call so it tracks the clock.
+      long cutoff = System.currentTimeMillis() - relativeMillis;
+      return fileTime.compareTo(FileTime.fromMillis(cutoff)) < 0;
+    }
+  }
+
   /** Call default main for adaptors. */
   public static void main(String[] args) {
     AbstractAdaptor.main(new FsAdaptor(), args);
diff --git a/src/overview.html b/src/overview.html
index bd158d4..dbdc5f3 100644
--- a/src/overview.html
+++ b/src/overview.html
@@ -155,7 +155,7 @@
   folders is platform dependent. On Windows file sytems a file or
   folder is considered hidden if the DOS <code>hidden</code>
   attribute is set.
-  <br>
+  <p/>
   By default, hidden files are not indexed and the contents of
   hidden folders are not indexed. Setting
   <code>filesystemadaptor.crawlHiddenFiles</code> to <code>true</code>
@@ -165,7 +165,84 @@
   false
   </pre>
   </dd>
+  <dt>
+  <code>filesystemadaptor.lastAccessedDate</code>
+  </dt>
+  <dd>
+  This configuration property can be used to disable crawling of files
+  whose time of last access is earlier than a specific date.  The cut-off
+  date is specified in <a href="http://www.w3.org/TR/NOTE-datetime">
+  ISO8601</a> date format, <code>YYYY-MM-DD</code>.
+  <p/>
+  Setting <code>filesystemadaptor.lastAccessedDate</code> to
+  <code>2010-01-01</code> would only crawl content that has been accessed
+  since the beginning of 2010.
+  <p/>
+  By default, filtering content based upon last accessed time is disabled.
   <br>
+  Only one of <code>filesystemadaptor.lastAccessedDate</code> or
+  <code>filesystemadaptor.lastAccessedDays</code> may be specified.
+  </dd>
+  <dt>
+  <code>filesystemadaptor.lastAccessedDays</code>
+  </dt>
+  <dd>
+  This configuration property can be used to disable crawling of files
+  that have not been accessed within the specified number of days. Unlike the
+  absolute cut-off date used by <code>filesystemadaptor.lastAccessedDate</code>,
+  this property can be used to expire previously indexed content if it
+  has not been accessed in a while.
+  <p/>
+  The expiration window is specified as a positive integer number of days.
+  <br>
+  Setting <code>filesystemadaptor.lastAccessedDays</code> to
+  <code>365</code> would only crawl content that has been accessed
+  in the last year.
+  <p/>
+  By default, filtering content based upon last accessed time is disabled.
+  <br>
+  Only one of <code>filesystemadaptor.lastAccessedDate</code> or
+  <code>filesystemadaptor.lastAccessedDays</code> may be specified.
+  </dd>
+  <dt>
+  <code>filesystemadaptor.lastModifiedDate</code>
+  </dt>
+  <dd>
+  This configuration property can be used to disable crawling of files
+  whose time of last modification is earlier than a specific date.  The cut-off
+  date is specified in <a href="http://www.w3.org/TR/NOTE-datetime">
+  ISO8601</a> date format, <code>YYYY-MM-DD</code>.
+  <p/>
+  Setting <code>filesystemadaptor.lastModifiedDate</code> to
+  <code>2010-01-01</code> would only crawl content that has been modified
+  since the beginning of 2010.
+  <p/>
+  By default, filtering content based upon last modified time is disabled.
+  <br>
+  Only one of <code>filesystemadaptor.lastModifiedDate</code> or
+  <code>filesystemadaptor.lastModifiedDays</code> may be specified.
+  </dd>
+  <dt>
+  <code>filesystemadaptor.lastModifiedDays</code>
+  </dt>
+  <dd>
+  This configuration property can be used to disable crawling of files
+  that have not been modified within the specified number of days. Unlike the
+  absolute cut-off date used by <code>filesystemadaptor.lastModifiedDate</code>,
+  this property can be used to expire previously indexed content if it
+  has not been modified in a while.
+  <p/>
+  The expiration window is specified as a positive integer number of days.
+  <br>
+  Setting <code>filesystemadaptor.lastModifiedDays</code> to
+  <code>365</code> would only crawl content that has been modified
+  in the last year.
+  <p/>
+  By default, filtering content based upon last modified time is disabled.
+  <br>
+  Only one of <code>filesystemadaptor.lastModifiedDate</code> or
+  <code>filesystemadaptor.lastModifiedDays</code> may be specified.
+  </dd>
   <dt>
   <code>adaptor.incrementalPollPeriodSecs</code>
   </dt>
diff --git a/test/com/google/enterprise/adaptor/fs/FsAdaptorTest.java b/test/com/google/enterprise/adaptor/fs/FsAdaptorTest.java
index 569bc6c..c756af8 100644
--- a/test/com/google/enterprise/adaptor/fs/FsAdaptorTest.java
+++ b/test/com/google/enterprise/adaptor/fs/FsAdaptorTest.java
@@ -52,6 +52,8 @@
 import java.nio.file.Paths;
 import java.nio.file.attribute.AclFileAttributeView;
 import java.nio.file.attribute.FileTime;
+import java.text.SimpleDateFormat;
+import java.util.Calendar;
 import java.util.Collections;
 import java.util.Date;
 import java.util.List;
@@ -996,6 +998,225 @@
     adaptor.init(context);
   }
 
+  @Test
+  public void testAdaptorInitLastAccessDays() throws Exception {
+    config.overrideKey("filesystemadaptor.lastAccessedDays", "365");
+    adaptor.init(context);
+  }
+
+  @Test
+  public void testAdaptorInitInvalidLastAccessDaysNonNumeric()
+      throws Exception {
+    config.overrideKey("filesystemadaptor.lastAccessedDays", "ten");
+    thrown.expect(InvalidConfigurationException.class);
+    adaptor.init(context);
+  }
+
+  @Test
+  public void testAdaptorInitInvalidLastAccessDaysNegative() throws Exception {
+    config.overrideKey("filesystemadaptor.lastAccessedDays", "-365");
+    thrown.expect(InvalidConfigurationException.class);
+    adaptor.init(context);
+  }
+
+  @Test
+  public void testAdaptorInitLastAccessDate() throws Exception {
+    config.overrideKey("filesystemadaptor.lastAccessedDate", "2000-01-31");
+    adaptor.init(context);
+  }
+
+  @Test
+  public void testAdaptorInitInvalidLastAccessDate() throws Exception {
+    config.overrideKey("filesystemadaptor.lastAccessedDate", "01/31/2000");
+    thrown.expect(InvalidConfigurationException.class);
+    adaptor.init(context);
+  }
+
+  @Test
+  public void testAdaptorInitFutureLastAccessDate() throws Exception {
+    config.overrideKey("filesystemadaptor.lastAccessedDate", "2999-12-31");
+    thrown.expect(InvalidConfigurationException.class);
+    adaptor.init(context);
+  }
+
+  @Test
+  public void testAdaptorInitInvalidLastAccessDaysAndDate() throws Exception {
+    config.overrideKey("filesystemadaptor.lastAccessedDays", "365");
+    config.overrideKey("filesystemadaptor.lastAccessedDate", "2000-01-31");
+    thrown.expect(InvalidConfigurationException.class);
+    adaptor.init(context);
+  }
+
+  @Test
+  public void testAdaptorInitLastModifiedDays() throws Exception {
+    config.overrideKey("filesystemadaptor.lastModifiedDays", "365");
+    adaptor.init(context);
+  }
+
+  @Test
+  public void testAdaptorInitInvalidLastModifiedDaysNonNumeric()
+      throws Exception {
+    config.overrideKey("filesystemadaptor.lastModifiedDays", "ten");
+    thrown.expect(InvalidConfigurationException.class);
+    adaptor.init(context);
+  }
+
+  @Test
+  public void testAdaptorInitInvalidLastModifiedDaysNegative()
+      throws Exception {
+    config.overrideKey("filesystemadaptor.lastModifiedDays", "-365");
+    thrown.expect(InvalidConfigurationException.class);
+    adaptor.init(context);
+  }
+
+  @Test
+  public void testAdaptorInitLastModifiedDate() throws Exception {
+    config.overrideKey("filesystemadaptor.lastModifiedDate", "2000-01-31");
+    adaptor.init(context);
+  }
+
+  @Test
+  public void testAdaptorInitInvalidLastModifiedDate() throws Exception {
+    config.overrideKey("filesystemadaptor.lastModifiedDate", "01/31/2000");
+    thrown.expect(InvalidConfigurationException.class);
+    adaptor.init(context);
+  }
+
+  @Test
+  public void testAdaptorInitFutureLastModifiedDate() throws Exception {
+    config.overrideKey("filesystemadaptor.lastModifiedDate", "2999-12-31");
+    thrown.expect(InvalidConfigurationException.class);  // Not in the past.
+    adaptor.init(context);
+  }
+
+  @Test
+  public void testAdaptorInitInvalidLastModifiedDaysAndDate() throws Exception {
+    config.overrideKey("filesystemadaptor.lastModifiedDays", "365");
+    config.overrideKey("filesystemadaptor.lastModifiedDate", "2000-01-31");
+    thrown.expect(InvalidConfigurationException.class);
+    adaptor.init(context);
+  }
+
+  @Test
+  public void testAbsoluteLastAccessTimeFilterTooEarly() throws Exception {
+    config.overrideKey("filesystemadaptor.lastAccessedDate", "2000-01-31");
+    testLastAccessTimeFilter("2000-01-30", true);
+  }
+
+  @Test
+  public void testAbsoluteLastAccessTimeFilterStartDate() throws Exception {
+    config.overrideKey("filesystemadaptor.lastAccessedDate", "2000-01-31");
+    testLastAccessTimeFilter("2000-01-31", false);
+  }
+
+  @Test
+  public void testAbsoluteLastAccessTimeFilterMuchLater() throws Exception {
+    config.overrideKey("filesystemadaptor.lastAccessedDate", "2000-01-31");
+    testLastAccessTimeFilter("2014-01-31", false);
+  }
+
+  @Test
+  public void testRelativeLastAccessTimeFilterTooEarly() throws Exception {
+    config.overrideKey("filesystemadaptor.lastAccessedDays", "1");
+    long yesterday = System.currentTimeMillis() - (25 * 60 * 60 * 1000L);
+    testLastAccessTimeFilter(yesterday, true);
+  }
+
+  @Test
+  public void testRelativeLastAccessTimeFilterStartTime() throws Exception {
+    config.overrideKey("filesystemadaptor.lastAccessedDays", "1");
+    long squeekedBy = System.currentTimeMillis() - (24 * 59 * 60 * 1000L);
+    testLastAccessTimeFilter(squeekedBy, false);
+  }
+
+  @Test
+  public void testRelativeLastAccessTimeFilterMuchLater() throws Exception {
+    config.overrideKey("filesystemadaptor.lastAccessedDays", "1");
+    long now = System.currentTimeMillis();
+    testLastAccessTimeFilter(now, false);
+  }
+
+  private void testLastAccessTimeFilter(String date, boolean excluded)
+      throws Exception {
+    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
+    dateFormat.setCalendar(Calendar.getInstance());
+    dateFormat.setLenient(true);
+    testLastAccessTimeFilter(dateFormat.parse(date).getTime(), excluded);
+  }
+
+  private void testLastAccessTimeFilter(long fileTime, boolean excluded)
+      throws Exception {
+    MockFile file = new MockFile("test.html");
+    file.setLastAccessTime(FileTime.fromMillis(fileTime));
+    testFileTimeFilter(file, excluded);
+  }
+
+  private void testFileTimeFilter(MockFile file, boolean excluded)
+      throws Exception {
+    file.setFileContents("<html><title>Hello World</title></html>");
+    file.setContentType("text/html");
+    root.addChildren(file);
+    adaptor.init(context);
+    MockRequest request = new MockRequest(getDocId(file.getPath()));
+    MockResponse response = new MockResponse();
+    adaptor.getDocContent(request, response);
+    assertEquals(excluded, response.notFound);
+  }
+
+  @Test
+  public void testAbsoluteLastModifiedTimeFilterTooEarly() throws Exception {
+    config.overrideKey("filesystemadaptor.lastModifiedDate", "2000-01-31");
+    testLastModifiedTimeFilter("2000-01-30", true);
+  }
+
+  @Test
+  public void testAbsoluteLastModifiedTimeFilterStartDate() throws Exception {
+    config.overrideKey("filesystemadaptor.lastModifiedDate", "2000-01-31");
+    testLastModifiedTimeFilter("2000-01-31", false);
+  }
+
+  @Test
+  public void testAbsoluteLastModifiedTimeFilterMuchLater() throws Exception {
+    config.overrideKey("filesystemadaptor.lastModifiedDate", "2000-01-31");
+    testLastModifiedTimeFilter("2014-01-31", false);
+  }
+
+  @Test
+  public void testRelativeLastModifiedTimeFilterTooEarly() throws Exception {
+    config.overrideKey("filesystemadaptor.lastModifiedDays", "1");
+    long yesterday = System.currentTimeMillis() - (25 * 60 * 60 * 1000L);
+    testLastModifiedTimeFilter(yesterday, true);
+  }
+
+  @Test
+  public void testRelativeLastModifiedTimeFilterStartTime() throws Exception {
+    config.overrideKey("filesystemadaptor.lastModifiedDays", "1");
+    long squeekedBy = System.currentTimeMillis() - (24 * 59 * 60 * 1000L);
+    testLastModifiedTimeFilter(squeekedBy, false);
+  }
+
+  @Test
+  public void testRelativeLastModifiedTimeFilterMuchLater() throws Exception {
+    config.overrideKey("filesystemadaptor.lastModifiedDays", "1");
+    long now = System.currentTimeMillis();
+    testLastModifiedTimeFilter(now, false);
+  }
+
+  private void testLastModifiedTimeFilter(String date, boolean excluded)
+      throws Exception {
+    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
+    dateFormat.setCalendar(Calendar.getInstance());
+    dateFormat.setLenient(true);
+    testLastModifiedTimeFilter(dateFormat.parse(date).getTime(), excluded);
+  }
+
+  private void testLastModifiedTimeFilter(long fileTime, boolean excluded)
+      throws Exception {
+    MockFile file = new MockFile("test.html");
+    file.setLastModifiedTime(FileTime.fromMillis(fileTime));
+    testFileTimeFilter(file, excluded);
+  }
+
   private static class DenyShareAclAccessMockFile extends MockFile {
     DenyShareAclAccessMockFile(String name, boolean isDirectory) {
       super(name, isDirectory);