View Source

Extractor plugin modules are available in Confluence 1.4 and later

Attachment content extractor plugins enable Confluence to index the contents of attachments that it may not otherwise understand.

Before you read this document, you should be familiar with Extractor Plugins.

The BaseAttachmentContentExtractor class

Attachment content extractor plugins must extend the bucket.search.lucene.extractor.BaseAttachmentContentExtractor base class. The skeleton of this class is:

package bucket.search.lucene.extractor;

import bucket.search.lucene.Extractor;
import bucket.search.lucene.SearchableAttachment;
import bucket.search.Searchable;
import org.apache.lucene.document.Document;
import com.opensymphony.util.TextUtils;

import java.io.InputStream;
import java.io.IOException;

/**
 * Base class that attachment content extractor plugins must extend.
 * Only a skeleton is shown here: method bodies other than the trivial defaults are omitted.
 */
public abstract class BaseAttachmentContentExtractor implements Extractor
{
    /** You should not have to override this method. */
    public void addFields(Document document, StringBuffer defaultSearchableText, Searchable searchable);

    /** Override this method if you cannot get the functionality you want by overriding
        getMatchingContentTypes() and getMatchingFileExtensions(). */
    protected boolean shouldExtractFrom(String fileName, String contentType);

    /** Override this method to return the MIME content-types that your plugin knows how to extract
        text from. The default is an empty array. If you have already overridden shouldExtractFrom(),
        there is no need to override this method. */
    protected String[] getMatchingContentTypes()
    {
        return new String[0];
    }

    /** Override this method to return the filename extensions that your plugin knows how to extract
        text from. The default is an empty array. If you have already overridden shouldExtractFrom(),
        there is no need to override this method. */
    protected String[] getMatchingFileExtensions()
    {
        return new String[0];
    }

    /** Override this method to do the actual work of extracting the content of the attachment. Your
        extractor should return the text that is to be indexed. */
    protected abstract String extractText(InputStream is, SearchableAttachment attachment) throws IOException;
}

The first attachment content extractor that returns true from shouldExtractFrom() and a non-null, non-empty String from extractText() will prevent all remaining attachment content extractors from running against that file. It is therefore important to get the priority value for your plugin right, so that general but less accurate extractors run after specific, more accurate ones.

Other (non-attachment) content extractors will still run, regardless.

An Example

This is an example of a hypothetical extractor that extracts the contents of mp3 ID3 tags.

package com.example.extras.extractor;

import com.hypothetical.id3.Id3Tag;
import bucket.search.lucene.extractor.BaseAttachmentContentExtractor;
import bucket.search.lucene.SearchableAttachment;

import java.io.InputStream;
import java.io.IOException;

/**
 * Example attachment content extractor that indexes the title, artist, genre and
 * album title read from the ID3 tag of an audio attachment.
 */
public class Id3Extractor extends BaseAttachmentContentExtractor
{
    /** MIME content-types this extractor accepts. */
    public static final String[] MIME_TYPES = {"audio/x-mp3",
                                               "audio/mpeg",
                                               "audio/mp4a-latm"};

    /** Filename extensions this extractor accepts. */
    public static final String[] FILE_EXTS = {"mp3", "m4a"};


    /**
     * Reads the ID3 tag from the attachment stream and returns its fields as a single
     * space-separated string.
     *
     * Missing or blank fields are skipped. If nothing usable is found an empty string is
     * returned, so that this extractor does not claim the file and lower-priority
     * attachment extractors still get a chance to run.
     *
     * @param is         stream over the attachment data
     * @param attachment the attachment being indexed (not used by this example)
     * @return the text to index, or an empty string if the tag holds no usable fields
     * @throws IOException if the tag cannot be read from the stream
     */
    protected String extractText(InputStream is, SearchableAttachment attachment)
            throws IOException
    {
        Id3Tag tag = Id3Tag.parse(is);
        if (tag == null)
        {
            // Defensive: the null contract of the hypothetical parser is unknown.
            return "";
        }

        StringBuffer text = new StringBuffer();
        appendField(text, tag.getTitle());
        appendField(text, tag.getArtist());
        appendField(text, tag.getGenre());
        appendField(text, tag.getAlbumTitle());
        return text.toString();
    }

    /** Appends a field to the buffer, space-separated, unless it is null or blank. */
    private static void appendField(StringBuffer text, Object field)
    {
        if (field == null)
        {
            // Plain concatenation would have indexed the literal word "null".
            return;
        }
        String value = field.toString();
        if (value == null || value.trim().length() == 0)
        {
            return;
        }
        if (text.length() > 0)
        {
            text.append(' ');
        }
        text.append(value);
    }

    /** Returns the MIME content-types this extractor handles. */
    protected String[] getMatchingContentTypes()
    {
        return MIME_TYPES;
    }

    /** Returns the filename extensions this extractor handles. */
    protected String[] getMatchingFileExtensions()
    {
        return FILE_EXTS;
    }
}