// blob: ce7bd567a477e87cd751230b3ef809766017ff0e [file] [log] [blame]
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.enterprise.adaptor.examples;
import com.google.enterprise.adaptor.AbstractDocumentTransform;
import com.google.enterprise.adaptor.Metadata;
import com.google.enterprise.adaptor.TransformException;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Comparator;
import java.util.Map;
import java.util.Scanner;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * The transform examines the document for regex patterns. For every pattern
 * that is found, the associated metadata is inserted at the end of the HEAD
 * section of the HTML, immediately before the first closing {@code </HEAD>} or
 * {@code </head>} tag. If no such closing tag exists, nothing gets inserted.
 *
 * <p>Patterns are loaded from a pattern file in which each line has the form
 * {@code <regex><space><metadata>}. The first space on the (trimmed) line
 * separates the regex from its metadata, so the regex itself cannot contain a
 * space. Lines that are empty or contain no space are skipped.
 */
public class MetaTaggerTransform extends AbstractDocumentTransform {
  private static final Logger log = Logger.getLogger(MetaTaggerTransform.class.getName());

  /** Separates the regex from its metadata on each line of a pattern file. */
  private static final char PATTERN_FILE_SEP = ' ';

  /**
   * Matches the beginning of a closing HEAD tag. Only the all-upper-case and
   * all-lower-case spellings are recognized. Compiled once instead of on every
   * {@link #transform} call.
   */
  private static final Pattern HEAD_CLOSE = Pattern.compile("</(HEAD|head)");

  /**
   * Maps Pattern to String representation of metadata.
   * The String is assumed to be a valid HTML fragment that is pasted into the
   * HEAD section of the HTML document.
   *
   * We use a SortedMap with this comparator to ensure we get the same metadata
   * ordering for each invocation. Serving different docs each time could lead
   * to unnecessary recrawls from the GSA.
   */
  private final SortedMap<Pattern, String> patternMappings =
      new TreeMap<Pattern, String>(new PatternComparator());

  /** Creates a transform with no patterns; it inserts no metadata. */
  public MetaTaggerTransform() {}

  /**
   * Creates a transform whose patterns are read from the given file.
   *
   * @param patternFile path of the pattern file to load
   * @throws IOException if the file cannot be opened or read
   */
  public MetaTaggerTransform(String patternFile) throws IOException {
    loadPatternFile(patternFile);
  }

  /**
   * Copies the document from {@code contentIn} to {@code contentOut}, inserting
   * the metadata of every matching pattern before the first closing HEAD tag.
   *
   * <p>When a closing HEAD tag is found, it is always written back as
   * {@code </HEAD} preceded by a newline, even if no pattern matched. When no
   * closing HEAD tag is found, the content is written out unchanged.
   *
   * @param contentIn the document content to examine
   * @param contentOut receives the (possibly modified) document content
   * @param metadata not used by this transform
   * @param params not used by this transform
   * @throws IOException if writing to {@code contentOut} fails
   */
  @Override
  public void transform(ByteArrayOutputStream contentIn, OutputStream contentOut,
                        Metadata metadata, Map<String, String> params)
      throws TransformException, IOException {
    // NOTE(review): decoding and re-encoding both use the platform default
    // charset; content in another encoding may not round-trip. Left unchanged
    // to preserve existing output -- confirm the intended encoding.
    String content = contentIn.toString();

    StringBuilder sb = new StringBuilder();
    for (Map.Entry<Pattern, String> entry : patternMappings.entrySet()) {
      if (entry.getKey().matcher(content).find()) {
        sb.append(entry.getValue());
      }
    }

    // This is a very simple insertion mechanism. It looks for the closing
    // </HEAD> element and inserts the metadata right before it.
    // The metadata is arbitrary text from the pattern file, so it must be
    // quoted: otherwise '$' and '\' would be interpreted as group references
    // and escapes by replaceFirst.
    String replacement = Matcher.quoteReplacement("\n" + sb.toString() + "</HEAD");
    content = HEAD_CLOSE.matcher(content).replaceFirst(replacement);
    contentOut.write(content.getBytes());
  }

  /**
   * Reads pattern/metadata pairs from {@code filename} into
   * {@link #patternMappings}. Metadata of lines that share the same regex is
   * concatenated, one fragment per line, in file order.
   *
   * @param filename path of the pattern file
   * @throws IOException if the file cannot be opened, or if reading it fails
   *     part-way through
   */
  private void loadPatternFile(String filename) throws IOException {
    Scanner sc = new Scanner(new File(filename));
    try {
      while (sc.hasNextLine()) {
        String line = sc.nextLine().trim();
        int sepIndex = line.indexOf(PATTERN_FILE_SEP);
        if (line.isEmpty() || sepIndex < 0) {
          // Blank line, or a line without a separator: nothing to register.
          continue;
        }
        Pattern pattern = Pattern.compile(line.substring(0, sepIndex));
        String metadata = line.substring(sepIndex + 1);
        String existing = patternMappings.get(pattern);
        if (existing == null) {
          patternMappings.put(pattern, metadata + "\n");
        } else {
          patternMappings.put(pattern, existing + metadata + "\n");
        }
      }
      // Scanner swallows read errors and simply stops iterating; surface them
      // so a partially read pattern file is not silently accepted.
      IOException readError = sc.ioException();
      if (readError != null) {
        throw readError;
      }
    } finally {
      // Closing the scanner also closes the underlying file.
      sc.close();
    }
  }

  /**
   * Orders patterns by their regex source string, so two patterns compiled
   * from the same regex are treated as the same map key.
   */
  private static final class PatternComparator implements Comparator<Pattern> {
    @Override
    public int compare(Pattern p1, Pattern p2) {
      return p1.toString().compareTo(p2.toString());
    }
  }

  /**
   * Factory that builds a transform from a configuration map.
   *
   * <p>If the map has a {@code "patternFile"} entry, patterns are loaded from
   * that file; otherwise the transform starts with no patterns. The whole map
   * is then passed to the inherited {@code configure} method.
   *
   * @param config configuration key/value pairs
   * @return the configured transform
   * @throws IOException if the pattern file cannot be opened or read
   */
  public static MetaTaggerTransform create(Map<String, String> config)
      throws IOException {
    String patternFile = config.get("patternFile");
    MetaTaggerTransform transform;
    if (patternFile == null) {
      transform = new MetaTaggerTransform();
    } else {
      transform = new MetaTaggerTransform(patternFile);
    }
    transform.configure(config);
    return transform;
  }
}