Extractor plugin modules are available in Confluence 1.4 and later |
Attachment content extractor plugins enable Confluence to index the contents of attachments that it may not otherwise understand. |
Attachment content extractor plugins must extend the bucket.search.lucene.extractor.BaseAttachmentContentExtractor base class. The skeleton of this class is:
package bucket.search.lucene.extractor;
import bucket.search.lucene.Extractor;
import bucket.search.lucene.SearchableAttachment;
import bucket.search.Searchable;
import org.apache.lucene.document.Document;
import com.opensymphony.util.TextUtils;
import java.io.InputStream;
import java.io.IOException;
/**
 * Base class that attachment content extractor plugins extend. This listing is a skeleton:
 * only the signatures and the overridable defaults are shown, not the full implementation.
 */
public abstract class BaseAttachmentContentExtractor implements Extractor
{
/** You should not have to override this method */
public void addFields(Document document, StringBuffer defaultSearchableText, Searchable searchable);
/** Override this method if you can not get the functionality you want by overriding
getMatchingContentTypes() and getMatchingFileExtensions() */
protected boolean shouldExtractFrom(String fileName, String contentType);
/** Override this method to return the MIME content-types that your plugin knows how to extract
text from. The default shown here returns an empty array. If you have already overridden
shouldExtractFrom(), this method is useless */
protected String[] getMatchingContentTypes()
{
return new String[0];
}
/** Override this method to return the filename extensions that your plugin knows how to extract
text from. The default shown here returns an empty array. If you have already overridden
shouldExtractFrom(), this method is useless */
protected String[] getMatchingFileExtensions()
{
return new String[0];
}
/** Override this method to do the actual work of extracting the content of the attachment. Your extractor
should return the text that is to be indexed.
@param is stream to read from (presumably the attachment's content -- confirm against the implementation)
@param attachment the attachment being indexed
@return the text that is to be indexed
@throws IOException if the extractor fails while reading */
protected abstract String extractText(InputStream is, SearchableAttachment attachment) throws IOException;
}
|
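The first attachment content extractor that returns text for an attachment is the only one whose result is used for that attachment; any remaining attachment content extractors are skipped. Other (non-attachment) content extractors will still run, regardless. |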
This is an example of a hypothetical extractor that extracts the contents of mp3 ID3 tags.
package com.example.extras.extractor;
import com.hypothetical.id3.Id3Tag;
import bucket.search.lucene.extractor.BaseAttachmentContentExtractor;
import bucket.search.lucene.SearchableAttachment;
import java.io.InputStream;
import java.io.IOException;
/**
 * Example extractor that turns the ID3 tag of an audio attachment into indexable text.
 */
public class Id3Extractor extends BaseAttachmentContentExtractor
{
    /** MIME content-types this extractor declares via getMatchingContentTypes(). */
    public static final String[] MIME_TYPES = {"audio/x-mp3", "audio/mpeg", "audio/mp4a-latm"};

    /** Filename extensions this extractor declares via getMatchingFileExtensions(). */
    public static final String[] FILE_EXTS = {"mp3", "m4a"};

    /** Returns the MIME content-types listed in MIME_TYPES. */
    protected String[] getMatchingContentTypes()
    {
        return MIME_TYPES;
    }

    /** Returns the filename extensions listed in FILE_EXTS. */
    protected String[] getMatchingFileExtensions()
    {
        return FILE_EXTS;
    }

    /**
     * Parses the ID3 tag from the stream and returns the title, artist, genre and album title,
     * in that order, separated by single spaces.
     */
    protected String extractText(InputStream is, SearchableAttachment attachment)
            throws IOException
    {
        Id3Tag id3 = Id3Tag.parse(is);

        // Fields are appended in the same left-to-right order as a plain concatenation would use.
        StringBuffer indexable = new StringBuffer();
        indexable.append(id3.getTitle()).append(" ");
        indexable.append(id3.getArtist()).append(" ");
        indexable.append(id3.getGenre()).append(" ");
        indexable.append(id3.getAlbumTitle());
        return indexable.toString();
    }
}
|