Merge branch 'master' of https://code.google.com/p/plexi
diff --git a/src/com/google/enterprise/adaptor/Config.java b/src/com/google/enterprise/adaptor/Config.java
index 56db897..9d1cd53 100644
--- a/src/com/google/enterprise/adaptor/Config.java
+++ b/src/com/google/enterprise/adaptor/Config.java
@@ -75,13 +75,17 @@
* are already URLs and avoid them being inserted into adaptor
generated URLs. Defaults to false
* <tr><td> </td><td>feed.crawlImmediatelyBitEnabled </td><td> send bit telling
- * GSA to crawl immediately. Defaults to false
+ * GSA to crawl immediately. Defaults to empty, which leaves the
+ * adaptor's decision in place; adaptors typically send updates as
+ * crawl-immediately and let the GSA schedule the crawl of all other ids
+ * <tr><td> </td><td>feed.noRecrawlBitEnabled </td><td> send bit telling
+ * GSA to crawl your documents only once. Defaults to empty, which
+ * leaves the adaptor's decision in place; adaptors typically send
+ * all documents as recrawlable (as if the value were false)
* <tr><td> </td><td>feed.maxUrls </td><td> set max number of URLs included
* per feed file. Defaults to 5000
* <tr><td> </td><td>feed.name </td><td> source name used in feeds. Generated
* if not provided
- * <tr><td> </td><td>feed.noRecrawlBitEnabled </td><td> send bit telling
- * GSA to crawl your documents only once. Defaults to false
* <tr><td> </td><td>feed.archiveDirectory </td><td> specifies a directory in
* which all feeds sent to the GSA will be archived. Feeds that failed to
* be sent to the GSA will be tagged with "FAILED" in the file name.
@@ -235,8 +239,8 @@
return rawValue;
}
});
- addKey("feed.noRecrawlBitEnabled", "false");
- addKey("feed.crawlImmediatelyBitEnabled", "false");
+ addKey("feed.noRecrawlBitEnabled", "");
+ addKey("feed.crawlImmediatelyBitEnabled", "");
//addKey("feed.noFollowBitEnabled", "false");
addKey("feed.maxUrls", "5000");
addKey("adaptor.pushDocIdsOnStartup", "true");
@@ -442,21 +446,43 @@
return Boolean.parseBoolean(getValue("gsa.acceptsDocControlsHeader"));
}
- /**
- * Optional (default false): Adds no-recrawl bit with sent records in feed
- * file. If connector handles updates and deletes then GSA does not have to
- * recrawl periodically to notice that a document is changed or deleted.
- */
- boolean isFeedNoRecrawlBitEnabled() {
- return Boolean.getBoolean(getValue("feed.noRecrawlBitEnabled"));
+ static class OverridableBoolean {
+ final boolean isOverriden; // whether value is to be overridden
+ final boolean value; // the overriding value
+ private OverridableBoolean(boolean override) {
+ isOverriden = true;
+ value = override;
+ }
+ private OverridableBoolean() {
+ isOverriden = false;
+ value = false; // placeholder; ignored when isOverriden is false
+ }
}
/**
- * Optional (default false): Adds crawl-immediately bit with sent records in
+ * Optional: Overrides the crawl-immediately bit on records sent in
* feed file. This bit makes the sent URL get crawl priority.
*/
- boolean isCrawlImmediatelyBitEnabled() {
- return Boolean.parseBoolean(getValue("feed.crawlImmediatelyBitEnabled"));
+ OverridableBoolean isCrawlImmediatelyBitEnabled() {
+ String provided = getValue("feed.crawlImmediatelyBitEnabled");
+ if ("".equals(provided.trim())) {
+ return new OverridableBoolean();
+ }
+ return new OverridableBoolean(Boolean.parseBoolean(provided));
+ }
+
+ /**
+ * Optional: Overrides the no-recrawl (crawl-once) bit on records sent
+ * in feed file. If connector handles updates and deletes then GSA
+ * does not have to recrawl periodically to notice that a
+ * document is changed or deleted.
+ */
+ OverridableBoolean isFeedNoRecrawlBitEnabled() {
+ String provided = getValue("feed.noRecrawlBitEnabled");
+ if ("".equals(provided.trim())) {
+ return new OverridableBoolean();
+ }
+ return new OverridableBoolean(Boolean.parseBoolean(provided));
}
/**
diff --git a/src/com/google/enterprise/adaptor/GsaCommunicationHandler.java b/src/com/google/enterprise/adaptor/GsaCommunicationHandler.java
index dcd785e..db77785 100644
--- a/src/com/google/enterprise/adaptor/GsaCommunicationHandler.java
+++ b/src/com/google/enterprise/adaptor/GsaCommunicationHandler.java
@@ -208,7 +208,11 @@
aclTransform = createAclTransform();
GsaFeedFileMaker fileMaker = new GsaFeedFileMaker(docIdCodec, aclTransform,
config.isGsa614FeedWorkaroundEnabled(),
- config.isGsa70AuthMethodWorkaroundEnabled());
+ config.isGsa70AuthMethodWorkaroundEnabled(),
+ config.isCrawlImmediatelyBitEnabled().isOverriden,
+ config.isCrawlImmediatelyBitEnabled().value,
+ config.isFeedNoRecrawlBitEnabled().isOverriden,
+ config.isFeedNoRecrawlBitEnabled().value);
GsaFeedFileArchiver fileArchiver =
new GsaFeedFileArchiver(config.getFeedArchiveDirectory());
docIdSender = new DocIdSender(fileMaker, fileSender, fileArchiver, journal,
diff --git a/src/com/google/enterprise/adaptor/GsaFeedFileMaker.java b/src/com/google/enterprise/adaptor/GsaFeedFileMaker.java
index 37a329d..82056fe 100644
--- a/src/com/google/enterprise/adaptor/GsaFeedFileMaker.java
+++ b/src/com/google/enterprise/adaptor/GsaFeedFileMaker.java
@@ -65,6 +65,10 @@
private final AclTransform aclTransform;
private final boolean separateClosingRecordTagWorkaround;
private final boolean useAuthMethodWorkaround;
+ private final boolean crawlImmediatelyIsOverriden;
+ private final boolean crawlImmediatelyOverrideValue;
+ private final boolean crawlOnceIsOverriden;
+ private final boolean crawlOnceOverrideValue;
public GsaFeedFileMaker(DocIdEncoder encoder, AclTransform aclTransform) {
this(encoder, aclTransform, false, false);
@@ -73,11 +77,26 @@
public GsaFeedFileMaker(DocIdEncoder encoder, AclTransform aclTransform,
boolean separateClosingRecordTagWorkaround,
boolean useAuthMethodWorkaround) {
+ this(encoder, aclTransform, separateClosingRecordTagWorkaround,
+ useAuthMethodWorkaround, false, false, false, false);
+ }
+
+ public GsaFeedFileMaker(DocIdEncoder encoder, AclTransform aclTransform,
+ boolean separateClosingRecordTagWorkaround,
+ boolean useAuthMethodWorkaround,
+ boolean overrideCrawlImmediately,
+ boolean crawlImmediately,
+ boolean overrideCrawlOnce,
+ boolean crawlOnce) {
this.idEncoder = encoder;
this.aclTransform = aclTransform;
this.separateClosingRecordTagWorkaround
= separateClosingRecordTagWorkaround;
this.useAuthMethodWorkaround = useAuthMethodWorkaround;
+ this.crawlImmediatelyIsOverriden = overrideCrawlImmediately;
+ this.crawlImmediatelyOverrideValue = crawlImmediately;
+ this.crawlOnceIsOverriden = overrideCrawlOnce;
+ this.crawlOnceOverrideValue = crawlOnce;
}
/** Adds header to document's root.
@@ -123,10 +142,15 @@
if (docRecord.isToBeLocked()) {
record.setAttribute("lock", "true");
}
- if (docRecord.isToBeCrawledImmediately()) {
+ if (crawlImmediatelyIsOverriden) {
+ record.setAttribute("crawl-immediately",
+ "" + crawlImmediatelyOverrideValue);
+ } else if (docRecord.isToBeCrawledImmediately()) {
record.setAttribute("crawl-immediately", "true");
}
- if (docRecord.isToBeCrawledOnce()) {
+ if (crawlOnceIsOverriden) {
+ record.setAttribute("crawl-once", "" + crawlOnceOverrideValue);
+ } else if (docRecord.isToBeCrawledOnce()) {
record.setAttribute("crawl-once", "true");
}
if (useAuthMethodWorkaround) {
diff --git a/test/com/google/enterprise/adaptor/GsaFeedFileMakerTest.java b/test/com/google/enterprise/adaptor/GsaFeedFileMakerTest.java
index 7eefa33..787448b 100644
--- a/test/com/google/enterprise/adaptor/GsaFeedFileMakerTest.java
+++ b/test/com/google/enterprise/adaptor/GsaFeedFileMakerTest.java
@@ -489,4 +489,174 @@
xml = xml.replaceAll("\r\n", "\n");
assertEquals(golden, xml);
}
+
+ @Test
+ public void testCrawlImmediatelyOverride() throws java.net.URISyntaxException {
+ GsaFeedFileMaker lclMeker = new GsaFeedFileMaker(encoder, aclTransform,
+ false, false,
+ /*override crawl-immediately?*/ true,
+ /*crawl-immediately value*/ false,
+ /*override crawl-once?*/ false,
+ /*crawl-once value*/ false);
+ String golden =
+ "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n"
+ + "<!DOCTYPE gsafeed PUBLIC \"-//Google//DTD GSA Feeds//EN\" \"\">\n"
+ + "<gsafeed>\n"
+ + "<!--GSA EasyConnector-->\n"
+ + "<header>\n"
+ + "<datasource>t3sT</datasource>\n"
+ + "<feedtype>metadata-and-url</feedtype>\n"
+ + "</header>\n"
+ + "<group>\n"
+
+ // (1)
+ + "<record crawl-immediately=\"false\""
+ + " displayurl=\"http://f000nkey.net\" mimetype=\"text/plain\""
+ + " url=\"http://localhost/E11\"/>\n"
+
+ // (2)
+ + "<record crawl-immediately=\"false\""
+ + " displayurl=\"http://yankee.doodle.com\""
+ + " last-modified=\"Thu, 01 Jan 1970 00:00:00 +0000\""
+ + " mimetype=\"text/plain\" url=\"http://localhost/elefenta\"/>\n"
+
+ // (3)
+ + "<record crawl-immediately=\"false\""
+ + " displayurl=\"http://google.com/news\""
+ + " last-modified=\"Fri, 02 Jan 1970 00:00:00 +0000\""
+ + " mimetype=\"text/plain\" url=\"http://localhost/gone\"/>\n"
+
+ // (4)
+ + "<record crawl-immediately=\"false\" crawl-once=\"true\""
+ + " lock=\"true\" mimetype=\"text/plain\""
+ + " url=\"http://localhost/flagson\"/>\n"
+
+ // (5)
+ + "<record action=\"delete\" crawl-immediately=\"false\""
+ + " mimetype=\"text/plain\""
+ + " url=\"http://localhost/deleted\"/>\n"
+
+ + "</group>\n"
+ + "</gsafeed>\n";
+
+ ArrayList<DocIdPusher.Record> ids = new ArrayList<DocIdPusher.Record>();
+ DocIdPusher.Record.Builder attrBuilder
+ = new DocIdPusher.Record.Builder(new DocId("E11"));
+
+ // (1)
+ attrBuilder.setResultLink(new URI("http://f000nkey.net"));
+ ids.add(attrBuilder.build());
+
+ // (2)
+ attrBuilder.setResultLink(new URI("http://yankee.doodle.com"));
+ attrBuilder.setLastModified(new Date(0));
+ attrBuilder.setCrawlImmediately(true);
+ attrBuilder.setDocId(new DocId("elefenta"));
+ ids.add(attrBuilder.build());
+
+ // (3)
+ attrBuilder.setResultLink(new URI("http://google.com/news"));
+ attrBuilder.setLastModified(new Date(1000 * 60 * 60 * 24));
+ attrBuilder.setCrawlImmediately(false);
+ attrBuilder.setCrawlOnce(false);
+ attrBuilder.setDocId(new DocId("gone"));
+ ids.add(attrBuilder.build());
+
+ // (4)
+ ids.add(new DocIdPusher.Record.Builder(new DocId("flagson"))
+ .setLock(true).setCrawlImmediately(true).setCrawlOnce(true).build());
+
+ // (5)
+ ids.add(new DocIdPusher.Record.Builder(new DocId("deleted"))
+ .setDeleteFromIndex(true).build());
+
+ String xml = lclMeker.makeMetadataAndUrlXml("t3sT", ids);
+ xml = xml.replaceAll("\r\n", "\n");
+ assertEquals(golden, xml);
+ }
+
+ @Test
+ public void testCrawlOnceOverride() throws java.net.URISyntaxException {
+ GsaFeedFileMaker lclMeker = new GsaFeedFileMaker(encoder, aclTransform,
+ false, false,
+ /*override crawl-immediately?*/ false,
+ /*crawl-immediately value*/ false,
+ /*override crawl-once?*/ true,
+ /*crawl-once value*/ false);
+ String golden =
+ "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n"
+ + "<!DOCTYPE gsafeed PUBLIC \"-//Google//DTD GSA Feeds//EN\" \"\">\n"
+ + "<gsafeed>\n"
+ + "<!--GSA EasyConnector-->\n"
+ + "<header>\n"
+ + "<datasource>t3sT</datasource>\n"
+ + "<feedtype>metadata-and-url</feedtype>\n"
+ + "</header>\n"
+ + "<group>\n"
+
+ // (1)
+ + "<record crawl-once=\"false\""
+ + " displayurl=\"http://f000nkey.net\" mimetype=\"text/plain\""
+ + " url=\"http://localhost/E11\"/>\n"
+
+ // (2)
+ + "<record crawl-immediately=\"true\" crawl-once=\"false\""
+ + " displayurl=\"http://yankee.doodle.com\""
+ + " last-modified=\"Thu, 01 Jan 1970 00:00:00 +0000\""
+ + " mimetype=\"text/plain\" url=\"http://localhost/elefenta\"/>\n"
+
+ // (3)
+ + "<record crawl-once=\"false\""
+ + " displayurl=\"http://google.com/news\""
+ + " last-modified=\"Fri, 02 Jan 1970 00:00:00 +0000\""
+ + " mimetype=\"text/plain\" url=\"http://localhost/gone\"/>\n"
+
+ // (4)
+ + "<record crawl-immediately=\"true\" crawl-once=\"false\""
+ + " lock=\"true\" mimetype=\"text/plain\""
+ + " url=\"http://localhost/flagson\"/>\n"
+
+ // (5)
+ + "<record action=\"delete\" crawl-once=\"false\""
+ + " mimetype=\"text/plain\""
+ + " url=\"http://localhost/deleted\"/>\n"
+
+ + "</group>\n"
+ + "</gsafeed>\n";
+
+ ArrayList<DocIdPusher.Record> ids = new ArrayList<DocIdPusher.Record>();
+ DocIdPusher.Record.Builder attrBuilder
+ = new DocIdPusher.Record.Builder(new DocId("E11"));
+
+ // (1)
+ attrBuilder.setResultLink(new URI("http://f000nkey.net"));
+ ids.add(attrBuilder.build());
+
+ // (2)
+ attrBuilder.setResultLink(new URI("http://yankee.doodle.com"));
+ attrBuilder.setLastModified(new Date(0));
+ attrBuilder.setCrawlImmediately(true);
+ attrBuilder.setDocId(new DocId("elefenta"));
+ ids.add(attrBuilder.build());
+
+ // (3)
+ attrBuilder.setResultLink(new URI("http://google.com/news"));
+ attrBuilder.setLastModified(new Date(1000 * 60 * 60 * 24));
+ attrBuilder.setCrawlImmediately(false);
+ attrBuilder.setCrawlOnce(false);
+ attrBuilder.setDocId(new DocId("gone"));
+ ids.add(attrBuilder.build());
+
+ // (4)
+ ids.add(new DocIdPusher.Record.Builder(new DocId("flagson"))
+ .setLock(true).setCrawlImmediately(true).setCrawlOnce(true).build());
+
+ // (5)
+ ids.add(new DocIdPusher.Record.Builder(new DocId("deleted"))
+ .setDeleteFromIndex(true).build());
+
+ String xml = lclMeker.makeMetadataAndUrlXml("t3sT", ids);
+ xml = xml.replaceAll("\r\n", "\n");
+ assertEquals(golden, xml);
+ }
}