[英]Searching for UUID in lucene not working
我有一個UUID字段,我將按以下格式添加到我的文檔中:372d325c-e01b-432f-98bd-bc4c949f15b8。 但是,當我嘗試通過UUID查詢文檔時,無論我如何嘗試轉義表達式,它都不會返回它們。 例如:
+uuid:372d325c-e01b-432f-98bd-bc4c949f15b8
+uuid:"372d325c-e01b-432f-98bd-bc4c949f15b8"
+uuid:372d325c\-e01b\-432f\-98bd\-bc4c949f15b8
+uuid:(372d325c-e01b-432f-98bd-bc4c949f15b8)
+uuid:("372d325c-e01b-432f-98bd-bc4c949f15b8")
甚至使用TermQuery完全跳過QueryParser,如下所示:
new TermQuery(new Term("uuid", uuid.toString()))
要么
new TermQuery(new Term("uuid", QueryParser.escape(uuid.toString())))
這些搜索都不會返回文檔,但如果我搜索UUID的某些部分,它將返回一個文檔。 例如,這些將返回一些東西:
+uuid:372d325c
+uuid:e01b
+uuid:432f
我應該怎么做索引這些文件,以便我可以通過他們的UUID拉回來? 我已經考慮重新格式化UUID以刪除連字符,但我還沒有實現它。
我使用它的唯一方法是使用WhitespaceAnalyzer而不是StandardAnalyzer。 然后使用如下的TermQuery:
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, new WhitespaceAnalyzer(Version.LUCENE_36))
.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
writer = new IndexWriter( directory, config);
然后搜索:
TopDocs docs = searcher.search(new TermQuery(new Term("uuid", uuid.toString())), 1);
WhitespaceAnalyzer阻止Lucene用連字符拆分UUID。 另一種選擇可能是從UUID中消除破折號,但使用WhitespaceAnalyzer也可以用於我的目的。
根據Lucene查詢語法規則 ,查詢
+uuid:372d325c\-e01b\-432f\-98bd\-bc4c949f15b8
應該管用。
我想如果不這樣做,那是因為當文檔插入索引時,未填充uuid
字段。 你能確定這個領域究竟插入了什么嗎? 您可以使用Luke抓取索引並查找為uuid
字段存儲的實際值。
如果您計划將UUID字段作為查找鍵,則需要讓Lucene將整個字段索引為單個字符串,而不進行標記化。 這是通過為UUID字段設置正確的FieldType來完成的。 在Lucene 4+中,您可以使用StringField。
import java.io.IOException;
import java.util.UUID;
import junit.framework.Assert;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
/**
* Using Lucene 4.7 on Java 7.
*/
public class LuceneUUIDFieldLookupTest {
private Directory directory;
private Analyzer analyzer;
@Test
public void testUsingUUIDAsLookupKey() throws IOException, ParseException {
directory = new RAMDirectory();
analyzer = new StandardAnalyzer(Version.LUCENE_47);
UUID docUUID = UUID.randomUUID();
String docContentText1 = "Stack Overflow is a question and answer site for professional and enthusiast programmers.";
index(docUUID, docContentText1);
QueryParser parser = new QueryParser(Version.LUCENE_47, MyIndexedFields.DOC_TEXT_FIELD.name(), analyzer);
Query queryForProgrammer = parser.parse("programmers");
IndexSearcher indexSearcher = getIndexSearcher();
TopDocs hits = indexSearcher.search(queryForProgrammer, Integer.MAX_VALUE);
Assert.assertTrue(hits.scoreDocs.length == 1);
Integer internalDocId1 = hits.scoreDocs[0].doc;
Document docRetrieved1 = indexSearcher.doc(internalDocId1);
indexSearcher.getIndexReader().close();
String docText1 = docRetrieved1.get(MyIndexedFields.DOC_TEXT_FIELD.name());
Assert.assertEquals(docText1, docContentText1);
String docContentText2 = "TechCrunch is a leading technology media property, dedicated to ... according to a new report from the Wall Street Journal confirmed by Google to TechCrunch.";
reindex(docUUID, docContentText2);
Query queryForTechCrunch = parser.parse("technology");
indexSearcher = getIndexSearcher(); //you must reopen directory because the previous IndexSearcher only sees a snapshoted directory.
hits = indexSearcher.search(queryForTechCrunch, Integer.MAX_VALUE);
Assert.assertTrue(hits.scoreDocs.length == 1);
Integer internalDocId2 = hits.scoreDocs[0].doc;
Document docRetrieved2 = indexSearcher.doc(internalDocId2);
indexSearcher.getIndexReader().close();
String docText2 = docRetrieved2.get(MyIndexedFields.DOC_TEXT_FIELD.name());
Assert.assertEquals(docText2, docContentText2);
}
private void reindex(UUID myUUID, String docContentText) throws IOException {
try (IndexWriter indexWriter = new IndexWriter(directory, getIndexWriterConfig())) {
Term term = new Term(MyIndexedFields.MY_UUID_FIELD.name(), myUUID.toString());
indexWriter.updateDocument(term, buildDoc(myUUID, docContentText));
}//auto-close
}
private void index(UUID myUUID, String docContentText) throws IOException {
try (IndexWriter indexWriter = new IndexWriter(directory, getIndexWriterConfig())) {
indexWriter.addDocument(buildDoc(myUUID, docContentText));
}//auto-close
}
private IndexWriterConfig getIndexWriterConfig() {
return new IndexWriterConfig(Version.LUCENE_47, analyzer).setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
}
private Document buildDoc(UUID myUUID, String docContentText) {
Document doc = new Document();
doc.add(new Field(
MyIndexedFields.MY_UUID_FIELD.name(),
myUUID.toString(),
StringField.TYPE_STORED));//use TYPE_STORED if you want to read it back in search result.
doc.add(new Field(
MyIndexedFields.DOC_TEXT_FIELD.name(),
docContentText,
TextField.TYPE_STORED));
return doc;
}
private IndexSearcher getIndexSearcher() throws IOException {
DirectoryReader ireader = DirectoryReader.open(directory);
IndexSearcher indexSearcher = new IndexSearcher(ireader);
return indexSearcher;
}
enum MyIndexedFields {
MY_UUID_FIELD,
DOC_TEXT_FIELD
}
}
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.