Why does Lucene return no matching results if BM25 algorithm is used to compute Document similarity?

Question

Hello, I'm trying to do some document similarity calculations with the Okapi BM25 algorithm.

But I have problems with the query type. I can't get any results except when I'm using the default QueryParser.

The basic idea was to index target documents and compare them with source documents by building a query with the content of the document.

This is a very minimalistic approach, but I have to make it work. Please correct me if I'm doing something stupid.

My code looks as follows:

package de.paul.bm25;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class DocumentSimilarityBM25 {

    Analyzer analyzer;
    Directory index;
    IndexWriterConfig config;
    IndexWriter writer;
    IndexReader reader;
    IndexSearcher searcher;
    Similarity similarity = new DefaultSimilarity();
    String FIELD_CONTENT = "CONTENT";

    public DocumentSimilarityBM25() throws IOException {
        analyzer = new KeywordAnalyzer();
        index  = new RAMDirectory();
        config = new IndexWriterConfig(analyzer);
        writer = new IndexWriter(index, config);
        similarity = new BM25Similarity();
    }

    public void start() {
        try {
            index();
            List<TopDocs> candidates = search();
            printResults(candidates);
        } catch (IOException | ParseException e) {
            e.printStackTrace();
        }
    }

    String[] srcDocuments = new String[]{
        "apples are tastefull",
        "apples and oranges grow an trees",
        "banana are yellow and very sweet",
        "this is a zero"
    };

    String[] trgDocuments = new String[]{
        "apples oranges and banana",
        "apples grow on appletrees",
        "bananes have much suga. " +
        "so they are high caloric",
        "bananas have a curvy form",
        "oranges have the orangecolor and are bigger than apples"
    };

    private void index() throws IOException {
        for(String target :trgDocuments) {
            addDoc(createDoc(target));  
        }
        System.out.println("Number of indexed Files:" + writer.maxDoc());
        writer.close();
    }

    private Query createQuery(Document doc) {
        final DisjunctionMaxQuery qry = new DisjunctionMaxQuery(0.0f);
        BooleanQuery bQuery = new BooleanQuery();
        PhraseQuery pQuery = new PhraseQuery();
        //MultiPhraseQuery mPhrase = new MultiPhraseQuery();

        String content = doc.get(FIELD_CONTENT);
        String[] terms = content.split("\\s");
        for(String term : terms) {
            pQuery = new PhraseQuery();
            pQuery.add(new Term(FIELD_CONTENT, term));
            bQuery.add(pQuery, Occur.SHOULD);
        }

        qry.add(bQuery);
        return qry;
    }

    private List<TopDocs> search() throws IOException, ParseException {
        List<TopDocs> candidates = new ArrayList<>();
        //Query query = new org.apache.lucene.queryparser.classic.QueryParser(FIELD_CONTENT, analyzer).parse(srcDocument);
        reader = DirectoryReader.open(index);
        searcher = new IndexSearcher(reader);
        searcher.setSimilarity(similarity);

        for(String source : srcDocuments) {
            Query query = createQuery(createDoc(source));

            System.out.println("Query:"+query.toString());
            TopDocs candidate = searcher.search(query, reader.maxDoc());
            candidates.add(candidate);
        }

        return candidates;
    }

    private void printResults(List<TopDocs> candidates) throws IOException {
        for(TopDocs candidate : candidates) {
            prinCandidate(candidate);
        }
        reader.close();
    }

    private void prinCandidate(TopDocs candidate) throws IOException {
        float maxScore = candidate.getMaxScore();
        ScoreDoc[] hits = candidate.scoreDocs;

        System.out.println("Found " + hits.length + " hits.");
        System.out.println("MaxScore:" + maxScore);

        for (int i = 0; i < hits.length; ++i) {
            int docId = hits[i].doc;
            Document d = searcher.doc(docId);
            float score = hits[i].score;

            System.out.println((i + 1)
                    + ". Score: " + score
                    + " " + d.get(FIELD_CONTENT) + "\t"
            );
        }   
    }

    private void addDoc(Document doc) throws IOException {
        writer.addDocument(doc);
        writer.commit();
    }

    private Document createDoc(String content) throws IOException {
        Document doc = new Document();
        doc.add(new TextField(FIELD_CONTENT, content, Field.Store.YES));
        return doc;
    }

}

Answer 1

Your analyzer is the problem. KeywordAnalyzer indexes the entire field as a single token. It should be used for keywords, unique identifiers, part numbers, stuff like that.

You are attempting to search text, though. Use StandardAnalyzer instead, and you'll start seeing results:

public DocumentSimilarityBM25() throws IOException {
    analyzer = new StandardAnalyzer();
    index  = new RAMDirectory();
  ...

Why does Lucene return no matching results if BM25 algorithm is used to compute Document similarity?

Question

1 answer

solution1
1 2015-08-09 18:54:36

Why does Lucene return no matching results if BM25 algorithm is used to compute Document similarity?

Question

1 answer

solution1 1 2015-08-09 18:54:36

solution1
1 2015-08-09 18:54:36