Add Config Options to Expire Content Based on LastAccess/LastModified Time This change adds configuration options to expire content based on LastAccessTime and/or LastModifiedTime. Each allows two different forms of specification: - Expire content older than some specific date. (Absolute) - Expire content older than some number of days. (Relative) The configuration parameters are: filesystemadaptor.lastAccessedDate - Absolute last accessed time expiration filesystemadaptor.lastAccessedDays - Relative last accessed time expiration filesystemadaptor.lastModifiedDate - Absolute last modified time expiration filesystemadaptor.lastModifiedDays - Relative last modified time expiration Values for the Relative forms of configuration are a positive integer number of days. For instance: filesystemadaptor.lastAccessedDays=365 would expire content that had not been accessed for a year. Values for the Absolute forms of configuration are ISO 8601 date-only format strings: yyyy-MM-dd. For instance: filesystemadaptor.lastModifiedDate=2010-06-01 would expire content that had not been modified since the beginning of June 2010. Code Review: http://codereview.appspot.com/96330043
diff --git a/src/com/google/enterprise/adaptor/fs/FsAdaptor.java b/src/com/google/enterprise/adaptor/fs/FsAdaptor.java index 6360ec4..efd8fae 100644 --- a/src/com/google/enterprise/adaptor/fs/FsAdaptor.java +++ b/src/com/google/enterprise/adaptor/fs/FsAdaptor.java
@@ -33,7 +33,6 @@ import com.google.enterprise.adaptor.Request; import com.google.enterprise.adaptor.Response; import com.google.enterprise.adaptor.StartupException; -import com.google.enterprise.adaptor.UnsupportedPlatformException; import java.io.IOException; import java.io.InputStream; @@ -47,6 +46,7 @@ import java.nio.file.attribute.FileTime; import java.text.SimpleDateFormat; import java.util.Arrays; +import java.util.Calendar; import java.util.Collection; import java.util.Collections; import java.util.Date; @@ -54,7 +54,9 @@ import java.util.HashSet; import java.util.Locale; import java.util.Map; +import java.text.ParseException; import java.util.Set; +import java.text.SimpleDateFormat; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; @@ -92,6 +94,22 @@ private static final String CONFIG_CRAWL_HIDDEN_FILES = "filesystemadaptor.crawlHiddenFiles"; + /** Relative config parameter name for earliest last accessed time allowed. */ + private static final String CONFIG_LAST_ACCESSED_DAYS = + "filesystemadaptor.lastAccessedDays"; + + /** Absolute config parameter name for earliest last accessed time allowed. */ + private static final String CONFIG_LAST_ACCESSED_DATE = + "filesystemadaptor.lastAccessedDate"; + + /** Relative config parameter name for earliest last modified time allowed. */ + private static final String CONFIG_LAST_MODIFIED_DAYS = + "filesystemadaptor.lastModifiedDays"; + + /** Absolute config parameter name for earliest last modified time allowed. 
*/ + private static final String CONFIG_LAST_MODIFIED_DATE = + "filesystemadaptor.lastModifiedDate"; + private static final String ALL_FOLDER_INHERIT_ACL = "allFoldersAcl"; private static final String ALL_FILE_INHERIT_ACL = "allFilesAcl"; private static final String CHILD_FOLDER_INHERIT_ACL = "childFoldersAcl"; @@ -152,6 +170,10 @@ private FileDelegate delegate; private ShareAcls lastPushedShareAcls = null; + /** Filter that may exclude files whose last modified time is too old. */ + private FileTimeFilter lastModifiedTimeFilter; + private FileTimeFilter lastAccessTimeFilter; + public FsAdaptor() { // At the moment, we only support Windows. if (System.getProperty("os.name").startsWith("Windows")) { @@ -191,6 +213,10 @@ config.addKey(CONFIG_BUILTIN_PREFIX, "BUILTIN\\"); config.addKey(CONFIG_NAMESPACE, Principal.DEFAULT_NAMESPACE); config.addKey(CONFIG_CRAWL_HIDDEN_FILES, "false"); + config.addKey(CONFIG_LAST_ACCESSED_DAYS, ""); + config.addKey(CONFIG_LAST_ACCESSED_DATE, ""); + config.addKey(CONFIG_LAST_MODIFIED_DAYS, ""); + config.addKey(CONFIG_LAST_MODIFIED_DATE, ""); config.overrideKey(CONFIG_MAX_INCREMENTAL_LATENCY, "300"); } @@ -272,6 +298,12 @@ + "property \"filesystemadaptor.crawlHiddenFiles\" to \"true\"."); } + // Add filters that may exclude older content. + lastAccessTimeFilter = getFileTimeFilter(context.getConfig(), + CONFIG_LAST_ACCESSED_DAYS, CONFIG_LAST_ACCESSED_DATE); + lastModifiedTimeFilter = getFileTimeFilter(context.getConfig(), + CONFIG_LAST_MODIFIED_DAYS, CONFIG_LAST_MODIFIED_DATE); + // Verify that the adaptor has permission to read the Acl and share Acl. 
try { readShareAcls(); @@ -294,6 +326,44 @@ delegate.destroy(); } + private FileTimeFilter getFileTimeFilter(Config config, String configDaysKey, + String configDateKey) throws StartupException { + String configDays = config.getValue(configDaysKey); + String configDate = config.getValue(configDateKey); + if (!configDays.isEmpty() && !configDate.isEmpty()) { + throw new InvalidConfigurationException("Please specify only one of " + + configDaysKey + " or " + configDateKey + "."); + } else if (!configDays.isEmpty()) { + log.log(Level.CONFIG, configDaysKey + ": " + configDays); + try { + return new ExpiringFileTimeFilter(Integer.parseInt(configDays)); + } catch (NumberFormatException e) { + throw new InvalidConfigurationException(configDaysKey + + " must be specified as a positive integer number of days.", e); + } catch (IllegalArgumentException e) { + throw new InvalidConfigurationException(configDaysKey + + " must be specified as a positive integer number of days.", e); + } + } else if (!configDate.isEmpty()) { + log.log(Level.CONFIG, configDateKey + ": " + configDate); + SimpleDateFormat ISO8601DateFormat = new SimpleDateFormat("yyyy-MM-dd"); + ISO8601DateFormat.setCalendar(Calendar.getInstance()); + ISO8601DateFormat.setLenient(true); + try { + return new AbsoluteFileTimeFilter(FileTime.fromMillis( + ISO8601DateFormat.parse(configDate).getTime())); + } catch (ParseException e) { + throw new InvalidConfigurationException(configDateKey + + " must be specified in the format \"YYYY-MM-DD\".", e); + } catch (IllegalArgumentException e) { + throw new InvalidConfigurationException(configDateKey + + " must be a date in the past.", e); + } + } else { + return new AlwaysAllowFileTimeFilter(); + } + } + private ShareAcls readShareAcls() throws IOException { Acl shareAcl; Acl dfsShareAcl; @@ -408,6 +478,22 @@ BasicFileAttributes attrs = delegate.readBasicAttributes(doc); final FileTime lastAccessTime = attrs.lastAccessTime(); + if (!docIsDirectory) { + if 
(lastAccessTimeFilter.excluded(lastAccessTime)) { + log.log(Level.WARNING, "Skipping {0} because it was last accessed {1}.", + new Object[] {doc, lastAccessTime.toString().substring(0, 10)}); + resp.respondNotFound(); + return; + } + if (lastModifiedTimeFilter.excluded(attrs.lastModifiedTime())) { + log.log(Level.WARNING, "Skipping {0} because it was last modified {1}.", + new Object[] {doc, + attrs.lastModifiedTime().toString().substring(0, 10)}); + resp.respondNotFound(); + return; + } + } + resp.setDisplayUrl(doc.toUri()); resp.setLastModified(new Date(attrs.lastModifiedTime().toMillis())); resp.addMetadata("Creation Time", dateFormatter.get().format( @@ -590,6 +676,52 @@ } } + private static interface FileTimeFilter { + public boolean excluded(FileTime fileTime); + } + + private static class AlwaysAllowFileTimeFilter implements FileTimeFilter { + @Override + public boolean excluded(FileTime fileTime) { + return false; + } + } + + private static class AbsoluteFileTimeFilter implements FileTimeFilter { + private final FileTime oldestAllowed; + + public AbsoluteFileTimeFilter(FileTime oldestAllowed) { + Preconditions.checkArgument(oldestAllowed.compareTo( + FileTime.fromMillis(System.currentTimeMillis())) < 0, + oldestAllowed.toString().substring(0, 10) + + " is in the future."); + this.oldestAllowed = oldestAllowed; + } + + @Override + public boolean excluded(FileTime fileTime) { + return fileTime.compareTo(oldestAllowed) < 0; + } + } + + private static class ExpiringFileTimeFilter implements FileTimeFilter { + private static final long MILLIS_PER_DAY = 24 * 60 * 60 * 1000L; + private final long relativeMillis; + + public ExpiringFileTimeFilter(int daysOld) { + Preconditions.checkArgument(daysOld > 0, "The number of days old for " + + "expired content must be greater than zero."); + this.relativeMillis = daysOld * MILLIS_PER_DAY; + } + + @Override + public boolean excluded(FileTime fileTime) { + FileTime oldestAllowed = + 
FileTime.fromMillis(System.currentTimeMillis() - relativeMillis); + return fileTime.compareTo(oldestAllowed) < 0; + } + } + /** Call default main for adaptors. */ public static void main(String[] args) { AbstractAdaptor.main(new FsAdaptor(), args);
diff --git a/src/overview.html b/src/overview.html index bd158d4..dbdc5f3 100644 --- a/src/overview.html +++ b/src/overview.html
@@ -155,7 +155,7 @@ folders is platform dependent. On Windows file sytems a file or folder is considered hidden if the DOS <code>hidden</code> attribute is set. - <br> + <p/> By default, hidden files are not indexed and the contents of hidden folders are not indexed. Setting <code>filesystemadaptor.crawlHiddenFiles</code> to <code>true</code> @@ -165,7 +165,84 @@ false </pre> </dd> + <dt> + <code>filesystemadaptor.lastAccessedDate</code> + </dt> + <dd> + This configuration property can be used to disable crawling of files + whose time of last access is earlier than a specific date. The cut-off + date is specified in <a href="http://www.w3.org/TR/NOTE-datetime"> + ISO8601</a> date format, <code>YYYY-MM-DD</code>. + <p/> + Setting <code>filesystemadaptor.lastAccessedDate</code> to + <code>2010-01-01</code> would only crawl content that has been accessed + since the beginning of 2010. + <p/> + By default, filtering content based upon last accessed time is disabled. <br> + Only one of <code>filesystemadaptor.lastAccessedDate</code> or + <code>filesystemadaptor.lastAccessedDays</code> may be specified. + </dd> + <dt> + <code>filesystemadaptor.lastAccessedDays</code> + </dt> + <dd> + This configuration property can be used to disable crawling of files + that have not been accessed within the specified number of days. Unlike the + absolute cut-off date used by <code>filesystemadaptor.lastAccessedDate</code>, + this property can be used to expire previously indexed content if it + has not been accessed in a while. + <p/> + The expiration window is specified as a positive integer number of days. + <br> + Setting <code>filesystemadaptor.lastAccessedDays</code> to + <code>365</code> would only crawl content that has been accessed + in the last year. + <p/> + By default, filtering content based upon last accessed time is disabled. + <br> + Only one of <code>filesystemadaptor.lastAccessedDate</code> or + <code>filesystemadaptor.lastAccessedDays</code> may be specified. 
+ </dd> + <dt> + <code>filesystemadaptor.lastModifiedDate</code> + </dt> + <dd> + This configuration property can be used to disable crawling of files + whose time of last modification is earlier than a specific date. The cut-off + date is specified in <a href="http://www.w3.org/TR/NOTE-datetime"> + ISO8601</a> date format, <code>YYYY-MM-DD</code>. + <p/> + Setting <code>filesystemadaptor.lastModifiedDate</code> to + <code>2010-01-01</code> would only crawl content that has been modified + since the beginning of 2010. + <p/> + By default, filtering content based upon last modified time is disabled. + <br> + Only one of <code>filesystemadaptor.lastModifiedDate</code> or + <code>filesystemadaptor.lastModifiedDays</code> may be specified. + </dd> + <dt> + <code>filesystemadaptor.lastModifiedDays</code> + </dt> + <dd> + This configuration property can be used to disable crawling of files + that have not been modified within the specified number of days. Unlike the + absolute cut-off date used by <code>filesystemadaptor.lastModifiedDate</code>, + this property can be used to expire previously indexed content if it + has not been modified in a while. + <p/> + The expiration window is specified as a positive integer number of days. + <br> + Setting <code>filesystemadaptor.lastModifiedDays</code> to + <code>365</code> would only crawl content that has been modified + in the last year. + <p/> + By default, filtering content based upon last modified time is disabled. + <br> + Only one of <code>filesystemadaptor.lastModifiedDate</code> or + <code>filesystemadaptor.lastModifiedDays</code> may be specified. + </dd> <dt> <code>adaptor.incrementalPollPeriodSecs</code> </dt>
diff --git a/test/com/google/enterprise/adaptor/fs/FsAdaptorTest.java b/test/com/google/enterprise/adaptor/fs/FsAdaptorTest.java index 569bc6c..c756af8 100644 --- a/test/com/google/enterprise/adaptor/fs/FsAdaptorTest.java +++ b/test/com/google/enterprise/adaptor/fs/FsAdaptorTest.java
@@ -52,6 +52,8 @@ import java.nio.file.Paths; import java.nio.file.attribute.AclFileAttributeView; import java.nio.file.attribute.FileTime; +import java.text.SimpleDateFormat; +import java.util.Calendar; import java.util.Collections; import java.util.Date; import java.util.List; @@ -996,6 +998,225 @@ adaptor.init(context); } + @Test + public void testAdaptorInitLastAccessDays() throws Exception { + config.overrideKey("filesystemadaptor.lastAccessedDays", "365"); + adaptor.init(context); + } + + @Test + public void testAdaptorInitInvalidLastAccessDaysNonNumeric() + throws Exception { + config.overrideKey("filesystemadaptor.lastAccessedDays", "ten"); + thrown.expect(InvalidConfigurationException.class); + adaptor.init(context); + } + + @Test + public void testAdaptorInitInvalidLastAccessDaysNegative() throws Exception { + config.overrideKey("filesystemadaptor.lastAccessedDays", "-365"); + thrown.expect(InvalidConfigurationException.class); + adaptor.init(context); + } + + @Test + public void testAdaptorInitLastAccessDate() throws Exception { + config.overrideKey("filesystemadaptor.lastAccessedDate", "2000-01-31"); + adaptor.init(context); + } + + @Test + public void testAdaptorInitInvalidLastAccessDate() throws Exception { + config.overrideKey("filesystemadaptor.lastAccessedDate", "01/31/2000"); + thrown.expect(InvalidConfigurationException.class); + adaptor.init(context); + } + + @Test + public void testAdaptorInitFutureLastAccessDate() throws Exception { + config.overrideKey("filesystemadaptor.lastAccessedDate", "2999-12-31"); + thrown.expect(InvalidConfigurationException.class); + adaptor.init(context); + } + + @Test + public void testAdaptorInitInvalidLastAccessDaysAndDate() throws Exception { + config.overrideKey("filesystemadaptor.lastAccessedDays", "365"); + config.overrideKey("filesystemadaptor.lastAccessedDate", "2000-01-31"); + thrown.expect(InvalidConfigurationException.class); + adaptor.init(context); + } + + @Test + public void 
testAdaptorInitLastModifiedDays() throws Exception { + config.overrideKey("filesystemadaptor.lastModifiedDays", "365"); + adaptor.init(context); + } + + @Test + public void testAdaptorInitInvalidLastModifiedDaysNonNumeric() + throws Exception { + config.overrideKey("filesystemadaptor.lastModifiedDays", "ten"); + thrown.expect(InvalidConfigurationException.class); + adaptor.init(context); + } + + @Test + public void testAdaptorInitInvalidLastModifiedDaysNegative() + throws Exception { + config.overrideKey("filesystemadaptor.lastModifiedDays", "-365"); + thrown.expect(InvalidConfigurationException.class); + adaptor.init(context); + } + + @Test + public void testAdaptorInitLastModifiedDate() throws Exception { + config.overrideKey("filesystemadaptor.lastModifiedDate", "2000-01-31"); + adaptor.init(context); + } + + @Test + public void testAdaptorInitInvalidLastModifiedDate() throws Exception { + config.overrideKey("filesystemadaptor.lastModifiedDate", "01/31/2000"); + thrown.expect(InvalidConfigurationException.class); + adaptor.init(context); + } + + @Test + public void testAdaptorInitFutureLastModifiedDate() throws Exception { + config.overrideKey("filesystemadaptor.lastModifiedDate", "2999-12-31"); + thrown.expect(InvalidConfigurationException.class); + adaptor.init(context); + } + + @Test + public void testAdaptorInitInvalidLastModifiedDaysAndDate() throws Exception { + config.overrideKey("filesystemadaptor.lastModifiedDays", "365"); + config.overrideKey("filesystemadaptor.lastModifiedDate", "2000-01-31"); + thrown.expect(InvalidConfigurationException.class); + adaptor.init(context); + } + + @Test + public void testAbsoluteLastAccessTimeFilterTooEarly() throws Exception { + config.overrideKey("filesystemadaptor.lastAccessedDate", "2000-01-31"); + testLastAccessTimeFilter("2000-01-30", true); + } + + @Test + public void testAbsoluteLastAccessTimeFilterStartDate() throws Exception { + config.overrideKey("filesystemadaptor.lastAccessedDate", "2000-01-31"); +
testLastAccessTimeFilter("2000-01-31", false); + } + + @Test + public void testAbsoluteLastAccessTimeFilterMuchLater() throws Exception { + config.overrideKey("filesystemadaptor.lastAccessedDate", "2000-01-31"); + testLastAccessTimeFilter("2014-01-31", false); + } + + @Test + public void testRelativeLastAccessTimeFilterTooEarly() throws Exception { + config.overrideKey("filesystemadaptor.lastAccessedDays", "1"); + long yesterday = System.currentTimeMillis() - (25 * 60 * 60 * 1000L); + testLastAccessTimeFilter(yesterday, true); + } + + @Test + public void testRelativeLastAccessTimeFilterStartTime() throws Exception { + config.overrideKey("filesystemadaptor.lastAccessedDays", "1"); + long squeekedBy = System.currentTimeMillis() - (24 * 59 * 60 * 1000L); + testLastAccessTimeFilter(squeekedBy, false); + } + + @Test + public void testRelativeLastAccessTimeFilterMuchLater() throws Exception { + config.overrideKey("filesystemadaptor.lastAccessedDays", "1"); + long now = System.currentTimeMillis(); + testLastAccessTimeFilter(now, false); + } + + private void testLastAccessTimeFilter(String date, boolean excluded) + throws Exception { + SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd"); + dateFormat.setCalendar(Calendar.getInstance()); + dateFormat.setLenient(true); + testLastAccessTimeFilter(dateFormat.parse(date).getTime(), excluded); + } + + private void testLastAccessTimeFilter(long fileTime, boolean excluded) + throws Exception { + MockFile file = new MockFile("test.html"); + file.setLastAccessTime(FileTime.fromMillis(fileTime)); + testFileTimeFilter(file, excluded); + } + + private void testFileTimeFilter(MockFile file, boolean excluded) + throws Exception { + file.setFileContents("<html><title>Hello World</title></html>"); + file.setContentType("text/html"); + root.addChildren(file); + adaptor.init(context); + MockRequest request = new MockRequest(getDocId(file.getPath())); + MockResponse response = new MockResponse(); + adaptor.getDocContent(request, 
response); + assertEquals(excluded, response.notFound); + } + + @Test + public void testAbsoluteLastModifiedTimeFilterTooEarly() throws Exception { + config.overrideKey("filesystemadaptor.lastModifiedDate", "2000-01-31"); + testLastModifiedTimeFilter("2000-01-30", true); + } + + @Test + public void testAbsoluteLastModifiedTimeFilterStartDate() throws Exception { + config.overrideKey("filesystemadaptor.lastModifiedDate", "2000-01-31"); + testLastModifiedTimeFilter("2000-01-31", false); + } + + @Test + public void testAbsoluteLastModifiedTimeFilterMuchLater() throws Exception { + config.overrideKey("filesystemadaptor.lastModifiedDate", "2000-01-31"); + testLastModifiedTimeFilter("2014-01-31", false); + } + + @Test + public void testRelativeLastModifiedTimeFilterTooEarly() throws Exception { + config.overrideKey("filesystemadaptor.lastModifiedDays", "1"); + long yesterday = System.currentTimeMillis() - (25 * 60 * 60 * 1000L); + testLastModifiedTimeFilter(yesterday, true); + } + + @Test + public void testRelativeLastModifiedTimeFilterStartTime() throws Exception { + config.overrideKey("filesystemadaptor.lastModifiedDays", "1"); + long squeekedBy = System.currentTimeMillis() - (24 * 59 * 60 * 1000L); + testLastModifiedTimeFilter(squeekedBy, false); + } + + @Test + public void testRelativeLastModifiedTimeFilterMuchLater() throws Exception { + config.overrideKey("filesystemadaptor.lastModifiedDays", "1"); + long now = System.currentTimeMillis(); + testLastModifiedTimeFilter(now, false); + } + + private void testLastModifiedTimeFilter(String date, boolean excluded) + throws Exception { + SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd"); + dateFormat.setCalendar(Calendar.getInstance()); + dateFormat.setLenient(true); + testLastModifiedTimeFilter(dateFormat.parse(date).getTime(), excluded); + } + + private void testLastModifiedTimeFilter(long fileTime, boolean excluded) + throws Exception { + MockFile file = new MockFile("test.html"); + 
file.setLastModifiedTime(FileTime.fromMillis(fileTime)); + testFileTimeFilter(file, excluded); + } + private static class DenyShareAclAccessMockFile extends MockFile { DenyShareAclAccessMockFile(String name, boolean isDirectory) { super(name, isDirectory);