[英]How to get more than 10K logs/results in Elasticsearch
如果使用最新版本的 Elasticsearch (7.13),假設我有超過 10K 的日志/結果,我該如何獲取所有日志? 我正在研究滾動搜索 (scroll API),但一開始文檔就顯示:
我們不再推薦使用滾動 API 進行深度分頁。 如果您需要在分頁超過 10,000 個點擊時保留索引狀態,請使用帶有時間點 (PIT) 的 search_after 參數。
但是使用search_after 時,它表示您可以訪問超過 10,000 次點擊,但您需要使用時間點 api來獲取 PIT(時間點)ID,然后將該 ID 傳遞給 search_after 參數。 在 kibana CLI 中,如果您輸入 cmd POST /YOUR PATTERN INDEX NAME*/_pit?keep_alive=1m
它將返回該 PIT ID。 但是您將如何在 NEST 中為 .net 客戶端執行該命令?
這只會告訴您如果您已經擁有 PIT ID 該怎么辦,但沒有向您展示如何執行 post 命令來獲取 PIT ID? 有沒有辦法不用去 Kibana -> Discover -> CLI 並運行命令POST /customer-simulation-es-app-logs*/_pit?keep_alive=1m
(customer-simulation-es-app-logs 是我的索引名稱)
在實施 Rob 的示例之前,我有以下內容:
[HttpGet("GetMonthlyLogs")]
public async Task<List<EsSource>> GetLogsByDate()
{
    string indexName = "customer-simulation-es-app-logs*";

    // NOTE(review): building a new client per request is wasteful; prefer a single
    // IElasticClient registered in DI. Kept here to preserve the existing interface.
    var connectionSettings = new ConnectionSettings(new Uri("http://localhost:9200"));
    connectionSettings.DefaultIndex(indexName);
    connectionSettings.EnableDebugMode();
    _elasticClient = new ElasticClient(connectionSettings);

    // Number of results in the index matching the criteria below.
    // Fixed: use CountAsync instead of the blocking Count() call inside an async method.
    var countResponse = await _elasticClient.CountAsync<EsSource>(c => c
        .Query(q => q
            .Bool(b => b
                .Should(
                    m => m
                        .Match(ma => ma
                            .Field(fa => fa.level)
                            .Query("Error")),
                    m => m
                        .Match(ma => ma
                            .Field(fa => fa.level)
                            .Query("Information")))
                .Filter(f => f.DateRange(dr => dr
                    .Field("@timestamp")
                    .GreaterThanOrEquals("2021-07-16T12:46:00.227-05:00")
                    .LessThanOrEquals("2021-07-16T12:55:00.227-05:00")))
                .MinimumShouldMatch(1))));
    var responseHits = countResponse.Count;

    var response = await _elasticClient.SearchAsync<EsSource>(s => s
        .Size(3000) // single page only; anything above 10k hits needs search_after + PIT
        .Source(src => src.Includes(i => i
            .Fields(f => f.timestamp,
                    f => f.level,
                    f => f.messageTemplate,
                    f => f.message)))
        .Index("customer-simulation-es-app-logs*")
        .Query(q => q
            .Bool(b => b
                .Should(
                    m => m
                        .Match(ma => ma
                            .Field(fa => fa.level)
                            .Query("Error")),
                    m => m
                        .Match(ma => ma
                            .Field(fa => fa.level)
                            .Query("Information")))
                .Filter(f => f.DateRange(dr => dr
                    .Field("@timestamp")
                    .GreaterThanOrEquals("2021-07-16T12:46:00.227-05:00")
                    .LessThanOrEquals("2021-07-16T12:55:00.227-05:00")))
                .MinimumShouldMatch(1))));

    // Fixed: never return null from a collection-returning action; return an empty list.
    return response?.Documents?.ToList() ?? new List<EsSource>();
}
/// <summary>Projection of a log document returned from Elasticsearch.</summary>
public class EsSource
{
    // Maps the "@timestamp" field of the indexed log document.
    [Date(Name = "@timestamp")]
    public DateTimeOffset timestamp { get; set; }

    // Log level, e.g. "Error" or "Information".
    public string level { get; set; }

    public string messageTemplate { get; set; }

    public string message { get; set; }
}
我試圖給 Rob 的示例實現一個嘗試,我所做的是以下。 但是,我的問題是,如果我沒有 EsDocuments 中的“ID”,我可以改用timestamp
嗎? 需要foreach
是因為這會將結果分組為 1000 對嗎? 我也可以按時間戳排序還是嚴格必須是結果 ID? 由於我沒有ID
我想制作另一個使用 searchapi 的var searchResponse
,然后創建一個名為 EsID 的通用變量,這樣我就可以循環遍歷 searchResponse 的命中結果,例如 foreach(var item in searchResponse.Hits()){ EsID = item.Id; }
然后將它用於具有批處理的foreach
( batches.Select(x => EsID)
) 並將其用於排序。但我覺得那將是重復的代碼......如果我是,請糾正我錯誤的?
請在此處查看我的實現:
private IElasticClient _elasticClient;

[HttpGet("GetMonthlyLogs")]
public async Task<List<EsSource>> GetLogsByDate()
{
    string indexName = "customer-simulation-es-app-logs*";

    // NOTE(review): building a new client per request is wasteful; prefer a single
    // IElasticClient registered in DI. Kept here to preserve the existing interface.
    var connectionSettings = new ConnectionSettings(new Uri("http://localhost:9200"));
    connectionSettings.DefaultIndex(indexName);
    connectionSettings.EnableDebugMode();
    _elasticClient = new ElasticClient(connectionSettings);

    // Fixed: removed the IndexManyAsync loop that re-indexed empty EsSource documents.
    // That loop came from the demo, which *creates* test data; a read endpoint must not
    // write into the index it queries.

    var allDocuments = new List<EsSource>();

    // Open a point-in-time so all pages see a consistent snapshot of the index.
    var openPit = await _elasticClient.OpenPointInTimeAsync(indexName, d => d.KeepAlive("1m"));
    var pit = openPit.Id;

    // Sort values of the last hit of the previous page; null on the first request.
    // Answer to the question in the original code: yes, sorting by @timestamp works —
    // when searching with a PIT, Elasticsearch adds an implicit _shard_doc tiebreaker,
    // so a unique "Id" field is not required.
    IReadOnlyCollection<object> searchAfter = null;

    try
    {
        while (true)
        {
            var response = await _elasticClient.SearchAsync<EsSource>(s => s
                .TrackTotalHits(false) // disable total-hit tracking to speed up pagination
                .Size(1000)
                // pass pit id & extend its lifetime by another minute
                .PointInTime(pit, d => d.KeepAlive("1m"))
                .Source(src => src.Includes(i => i
                    .Fields(f => f.timestamp,
                            f => f.level,
                            f => f.messageTemplate,
                            f => f.message)))
                .Query(q => q
                    .Bool(b => b
                        .Should(
                            m => m
                                .Match(ma => ma
                                    .Field(fa => fa.level)
                                    .Query("Error")),
                            m => m
                                .Match(ma => ma
                                    .Field(fa => fa.level)
                                    .Query("Information")))
                        .Filter(f => f.DateRange(dr => dr
                            .Field("@timestamp")
                            .GreaterThanOrEquals("2021-07-14T00:00:00.000-05:00")
                            .LessThanOrEquals("2021-07-14T23:59:59.999-05:00")))
                        .MinimumShouldMatch(1)))
                .Sort(srt => srt.Ascending(f => f.timestamp))
                .SearchAfter(searchAfter));

            if (response.Hits.Count == 0)
            {
                break;
            }

            // Fixed: accumulate every page instead of discarding it.
            allDocuments.AddRange(response.Documents);

            // Fixed: advance the cursor — the original never updated searchAfter and
            // looped forever. Reuse the raw sort values from the last hit so the
            // timestamp round-trips exactly instead of being re-formatted.
            searchAfter = response.Hits.Last().Sorts;
        }
    }
    finally
    {
        // Always release the point-in-time, even if a page request throws.
        var closePit = await _elasticClient.ClosePointInTimeAsync(d => d.Id(pit));
    }

    // Fixed: the original `return // ...` did not compile; return the accumulated docs.
    return allDocuments;
}
/// <summary>Projection of a log document returned from Elasticsearch.</summary>
public class EsSource
{
    // Bound to the "@timestamp" field of the indexed log document.
    [Date(Name = "@timestamp")]
    public DateTimeOffset timestamp { get; set; }

    // Log level, e.g. "Error" or "Information".
    public string level { get; set; }

    public string messageTemplate { get; set; }

    public string message { get; set; }
}
您需要將PointInTime
實例添加到您的搜索查詢中,如下所示:
esQuery.PointInTime = new PointInTime(PointInTimeId,KeepAlive);
您對 ES 的第一個請求,您的PointInTimeId
將為空,有關更多信息,請在此處查看 ES 官方文檔。
我准備了一個帶有注釋的示例應用程序,它演示了如何使用 PIT 從索引中檢索所有文檔並進行搜索。
class Program
{
    static async Task Main(string[] args)
    {
        const string indexName = "test";

        var settings = new ConnectionSettings(new Uri("http://localhost:9200"));
        settings.DefaultIndex(indexName);
        settings.EnableDebugMode();
        var client = new ElasticClient(settings);

        // Start from a clean slate: drop and recreate the test index.
        await client.Indices.DeleteAsync(indexName);
        var createResponse = await client.Indices.CreateAsync(indexName);

        // Index some test data in bulk. Batch comes from the morelinq nuget package.
        Console.WriteLine($"Index some data into index");
        foreach (var batch in Enumerable.Range(0, 20000).Batch(1000))
        {
            var bulkResponse = await client.IndexManyAsync(batch.Select(n => new EsDocument { Id = n }));
        }
        await client.Indices.RefreshAsync();

        var countResponse = await client.CountAsync<EsDocument>(d => d);
        Console.WriteLine($"Documents in index: {countResponse.Count}");

        Console.WriteLine($"Open new pit");
        var pitResponse = await client.OpenPointInTimeAsync(indexName, p => p.KeepAlive("1m"));
        var pitId = pitResponse.Id;

        Console.WriteLine($"Read all docs from index ..");
        // Cursor for search_after: the last Id seen; docs start at 0.
        var lastId = 0;
        try
        {
            for (;;)
            {
                var page = await client.SearchAsync<EsDocument>(s => s
                    // disable the tracking of total hits to speed up pagination.
                    .TrackTotalHits(false)
                    .Size(1000)
                    // pass pit id and extend lifetime of it by another minute
                    .PointInTime(pitId, p => p.KeepAlive("1m"))
                    .Query(q => q.MatchAll())
                    // sort by Id so the last retrieved Id can seed the next search
                    .Sort(sort => sort.Ascending(f => f.Id))
                    // resume after the last Id received from the previous page
                    .SearchAfter(lastId));

                // an empty page means every document has been read
                if (page.Documents.Count == 0)
                {
                    break;
                }

                Console.WriteLine(
                    $"Id [{page.Documents.FirstOrDefault()?.Id}..{page.Documents.LastOrDefault()?.Id}]");
                lastId = page.Documents.LastOrDefault()?.Id ?? 0;
            }
        }
        finally
        {
            Console.WriteLine($"Close pit");
            var closeResponse = await client.ClosePointInTimeAsync(p => p.Id(pitId));
        }
    }

    // Minimal test document with a unique, sortable Id.
    class EsDocument
    {
        public int Id { get; set; }
    }
}
輸出:
Index some data into index
Documents in index: 20000
Open new pit
Read all docs from index ..
Id [1..1000]
Id [1001..2000]
Id [2001..3000]
Id [3001..4000]
Id [4001..5000]
Id [5001..6000]
Id [6001..7000]
Id [7001..8000]
Id [8001..9000]
Id [9001..10000]
Id [10001..11000]
Id [11001..12000]
Id [12001..13000]
Id [13001..14000]
Id [14001..15000]
Id [15001..16000]
Id [16001..17000]
Id [17001..18000]
Id [18001..19000]
Id [19001..19999]
Close pit
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.