[英]How does regex query work on lucene?
我正在嘗試在我的應用程序中實現luecene搜索引擎。
我正在使用Lucene 5.4.1
我已經成功實現了Lucene的Wildequeques和常規查詢。
但是我的主要重點是在帶有正則表達式模式的文本文件中搜索特定文本。
索引作者代碼:
public IndexWriter generateIndex(String docsPath) throws IOException {
String indexPath = System.getProperty("java.io.tmpdir") +File.separator+"indexDirectory";
if (indexPath == null) {
throw new IOException("System property 'java.io.tmpdir' does not specify a tmp dir");
}
File tmpDir = new File(indexPath);
if (!tmpDir.exists()) {
boolean created = tmpDir.mkdirs();
if (!created) {
throw new IOException("Unable to create tmp dir " + tmpDir);
}
}
boolean create = true;
final Path docDir = Paths.get(docsPath);
if (!Files.isReadable(docDir)) {
System.out.println("Document directory '" + docDir.toAbsolutePath()
+ "' does not exist or is not readable, please check the path");
System.exit(1);
}
Date start = new Date();
try {
System.out.println("Indexing to directory '" + indexPath + "'...");
Directory dir = FSDirectory.open(Paths.get(indexPath));
Analyzer analyzer = new StandardAnalyzer();
IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
if (create) {
iwc.setOpenMode(OpenMode.CREATE);
} else {
iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
}
IndexWriter writer = new IndexWriter(dir, iwc);
indexDocs(writer, docDir);
setIndexWriter(writer);
Date end = new Date();
System.out.println(end.getTime() - start.getTime() + " total milliseconds");
writer.close();
} catch (IOException e) {
System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
}
return getIndexWriter();
}
static void indexDocs(final IndexWriter writer, Path path) throws IOException {
if (Files.isDirectory(path)) {
Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
@Override
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
try {
indexDoc(writer, file, attrs.lastModifiedTime().toMillis());
} catch (IOException ignore) {
// don't index files that can't be read.
}
return FileVisitResult.CONTINUE;
}
});
} else {
indexDoc(writer, path, Files.getLastModifiedTime(path).toMillis());
}
}
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
try (InputStream stream = Files.newInputStream(file)) {
Document doc = new Document();
Field pathField = new StringField("path", file.toString(), Field.Store.NO);
doc.add(pathField);
doc.add(new LongField("modified", lastModified, Field.Store.NO));
doc.add(new TextField("contents",
new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));
if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
System.out.println("adding " + file);
writer.addDocument(doc);
} else {
System.out.println("updating " + file);
writer.updateDocument(new Term("path", file.toString()), doc);
}
}
}
索引搜索代碼:
public IndexReader searchExecutor(String index, String queryString, RegexCapabilities capability) throws Exception {
String field = "contents";
String queries = null;
boolean raw = false;
int hitsPerPage = Integer.MAX_VALUE;
IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
IndexSearcher searcher = new IndexSearcher(reader);
Analyzer analyzer = new StandardAnalyzer();
BufferedReader in = null;
Query q = new RegexpQuery(new Term("text", queryString));
q = q.rewrite(reader);
RegexQuery query = new RegexQuery(new Term("\\s*(FIND|find)"));
if (capability != null)
query.setRegexImplementation(capability);
System.out.println("Searching for: " + query.toString(field));
searcher.search(query, null, 1000);
doSearch(in, searcher, query, hitsPerPage, raw, queries == null && queryString == null);
//reader.close();
return reader;
}
public static void doSearch(BufferedReader in, IndexSearcher searcher, Query query, int hitsPerPage, boolean raw,
boolean interactive)
throws IOException {
TopDocs results = searcher.search(query, 5 * hitsPerPage);
ScoreDoc[] hits = results.scoreDocs;
//generateIndex.deleteDocuments(query);
//generateIndex.getDirectory();
// TermsEnum.totalTermFreq();
int numTotalHits = results.totalHits;
System.out.println(numTotalHits + " total matching documents");
int start = 0;
int end = Math.min(numTotalHits, hitsPerPage);
for (int i = start; i < end; i++) {
Document doc = searcher.doc(hits[i].doc);
String path = doc.get("path");
File file = new File(path);
if (path != null) {
System.out.println((i + 1) + ". " + path);
String title = doc.get("title");
if (title != null) {
System.out.println(" Title: " + doc.get("title"));
}
} else {
System.out.println((i + 1) + ". " + "No path for this document");
}
}
}
請幫忙。
您的問題是關於在Lucene中使用正則表達式進行搜索。
RegexQuery
,因此請嘗試RegexpQuery
\\s*
開頭,但您沒有使用KeywordTokenizer 。 大多數其他令牌生成器將刪除(也稱為“拆分為”)空白 ->閱讀ES和TestRegexpRandom(測試類)中 支持的RegExp語法和語法,並在索引上使用https://github.com/DmitryKey/luke 。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.