[英]elasticsearch scrolling using python client
在 elasticsearch 中滾動時,每次滾動都必須提供最新的 scroll_id,這一點很重要:
初始搜索請求和每個后續滾動請求都返回一個新的 scroll_id — 應僅使用最近的 scroll_id。
下面的例子(取自這里)讓我很困惑。首先是 scrolling 的初始化:
# Request body: return three fields for every tweet whose expanded URL
# matches the wildcard "*.ru".
search_body = {
    "fields": ["created_at", "entities.urls.expanded_url", "user.id_str"],
    "query": {
        "wildcard": {"entities.urls.expanded_url": "*.ru"},
    },
}

# Start a scan-type scroll over two daily tweet indices, 100 hits per
# batch; scroll='10s' is the keep-alive requested for the search context.
rs = es.search(
    index=['tweets-2014-04-12', 'tweets-2014-04-13'],
    scroll='10s',
    search_type='scan',
    size=100,
    preference='_primary_first',
    body=search_body,
)

# Scroll id returned by the initial search; the scroll loop reads it.
sid = rs['_scroll_id']
然后循環:
# Accumulate every hit returned by successive scroll requests.
tweets = []
while True:
    try:
        # NOTE: `sid` is never reassigned from rs['_scroll_id'] in this loop;
        # the text that follows this snippet discusses exactly that behaviour,
        # so it is left as written.
        rs = es.scroll(scroll_id=sid, scroll='10s')
        tweets += rs['hits']['hits']
    except Exception:
        # The loop has no other exit: it stops on the first error raised by
        # the scroll call. `Exception` (rather than a bare `except`) keeps
        # Ctrl-C and SystemExit working.
        break
它有效,但我沒有看到 sid 在哪里被更新......我相信這是在 python 客戶端內部完成的;但我不明白它是如何工作的...
使用 python 的 requests 庫:
import requests
import json
# Scroll through all matching documents using plain HTTP requests.
# The first URL opens the scroll (context kept alive for 1 minute); the
# second one is the scroll endpoint used for every following batch.
elastic_url = 'http://localhost:9200/my_index/_search?scroll=1m'
scroll_api_url = 'http://localhost:9200/_search/scroll'
headers = {'Content-Type': 'application/json'}

payload = {
    "size": 100,
    "sort": ["_doc"],
    "query": {
        "match": {
            "title": "elasticsearch"
        }
    }
}

r1 = requests.request(
    "POST",
    elastic_url,
    data=json.dumps(payload),
    headers=headers
)

# first batch data
try:
    res_json = r1.json()
    data = res_json['hits']['hits']
    _scroll_id = res_json['_scroll_id']
except KeyError:
    # The response lacked 'hits' or '_scroll_id' (typically an error body):
    # report it and fall through with nothing to iterate.
    data = []
    _scroll_id = None
    print('Error: Elastic Search: %s' % str(r1.json()))

# An empty batch means the scroll is exhausted (or an error occurred).
while data:
    print(data)
    # scroll to get next batch data, always sending the most recent scroll id
    scroll_payload = json.dumps({
        'scroll': '1m',
        'scroll_id': _scroll_id
    })
    scroll_res = requests.request(
        "POST", scroll_api_url,
        data=scroll_payload,
        headers=headers
    )
    try:
        res_json = scroll_res.json()
        data = res_json['hits']['hits']
        _scroll_id = res_json['_scroll_id']
    except KeyError:
        data = []
        _scroll_id = None
        err_msg = 'Error: Elastic Search Scroll: %s'
        print(err_msg % str(scroll_res.json()))
參考:https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-scroll.html#search-request-scroll
這是一個老問題,但由於某種原因,在搜索“elasticsearch python scroll”時首先出現了。 python 模塊提供了一個輔助方法來為您完成所有工作。 它是一個生成器函數,它將在管理底層滾動 ID 的同時將每個文檔返回給您。
https://elasticsearch-py.readthedocs.io/en/master/helpers.html#scan
下面是一個使用示例:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
# Match every document in the index.
match_all_query = {"query": {"match_all": {}}}

es = Elasticsearch(...)

# scan() is a generator that yields one hit at a time; print a single
# field from each document's _source.
for hit in scan(es, index="my-index", query=match_all_query):
    print(hit["_source"]["field"])
實際上,代碼中有一個錯誤 - 為了正確使用滾動功能,您應該在下一次調用 scroll() 時使用每次新調用返回的新 scroll_id,而不是重用第一個:
重要的
初始搜索請求和每個后續滾動請求都返回一個新的 scroll_id — 應僅使用最近的 scroll_id。
http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/search-request-scroll.html
之所以有效,是因為 Elasticsearch 並不總是在每次調用之間更改 scroll_id;對於較小的結果集,它可能在一段時間內一直返回與最初相同的 scroll_id。去年有一個討論,另外兩位用戶也遇到了同樣的現象:在一段時間內返回的 scroll_id 始終相同。
因此,雖然您的代碼適用於較小的結果集,但它是不正確的 - 您需要捕獲在每次對 scroll() 的新調用中返回的 scroll_id 並將其用於下一次調用。
# Initial search endpoint: all logstash-* indices, opening a scroll whose
# context is kept alive for 1 minute.
self._elkUrl = "http://Hostname:9200/logstash-*/_search?scroll=1m"
# Endpoint used to fetch every subsequent batch of the scroll.
self._scrollUrl = "http://Hostname:9200/_search/scroll"
"""
Function to get the data from ELK through scrolling mechanism
"""
def GetDataFromELK(self):
    """Fetch all matching records from ELK using the scroll API.

    Posts ``self.body`` to ``self._elkUrl`` to open a scroll, then keeps
    posting the most recent scroll id to ``self._scrollUrl`` until an empty
    batch comes back. Every non-empty batch, including the one returned by
    the initial search, is passed through ``self.DataClean`` and the
    results are concatenated.

    Returns:
        A pandas DataFrame with the cleaned records; an empty DataFrame
        when the search matches nothing.

    Any exception (missing configuration, malformed response, request
    failure) is logged and the process exits via ``sys.exit()``.
    """
    # implementing scroll and retrieving data from elk to get more than
    # 100000 records at one search
    # ref: https://www.elastic.co/guide/en/elasticsearch/reference/6.8/search-request-scroll.html
    try:
        # Validate all configuration before touching the network.
        if self._elkUrl is None:
            raise ValueError("_elkUrl is missing")
        if self._username is None:
            raise ValueError("_userNmae for elk is missing")
        if self._password is None:
            raise ValueError("_password for elk is missing")
        url = self._scrollUrl
        if url is None:
            raise ValueError("scroll url is missing")
        auth = (self._username, self._password)

        response = requests.post(self._elkUrl, json=self.body, auth=auth)
        response = response.json()
        if response is None:
            raise ValueError("response is missing")
        sid = response['_scroll_id']
        hits = response['hits']
        if hits["total"] is None:
            raise ValueError("total hits from ELK is none")

        # The initial search response already carries the first batch of
        # hits, so start from it instead of discarding it.
        data = hits['hits']
        # Collect per-batch frames and concatenate once at the end.
        # Assumes DataClean returns a DataFrame - TODO confirm.
        frames = []
        # start scrolling; an empty batch means the scroll is exhausted
        while data:
            frames.append(self.DataClean(data))
            # keep search context alive for 2m
            scroll_query = {"scroll": '2m', "scroll_id": sid}
            page = requests.post(url, json=scroll_query, auth=auth).json()
            # Each response includes a scroll_id, which must be passed to
            # the next scroll request to retrieve the following batch.
            sid = page['_scroll_id']
            data = page['hits']['hits']

        dataFrame = pd.concat(frames) if frames else pd.DataFrame()
        num = len(dataFrame)
        print('Total records recieved from ELK=', num)
        return dataFrame
    except Exception as e:
        logging.error('Error while getting the data from elk', exc_info=e)
        # NOTE(review): sys.exit() with no argument exits with status 0 even
        # though an error occurred - consider a non-zero status.
        sys.exit()
from elasticsearch import Elasticsearch
# Connection settings (placeholder credentials and index name).
elasticsearch_user_name = 'es_username'
elasticsearch_user_password = 'es_password'
es_index = "es_index"

es = Elasticsearch(
    ["127.0.0.1:9200"],
    http_auth=(elasticsearch_user_name, elasticsearch_user_password),
)

# Range filter on es_datetime covering one minute.
time_window = {
    "gte": "2021-06-21T09:00:00.356Z",
    "lte": "2021-06-21T09:01:00.356Z",
    "format": "strict_date_optional_time",
}

# Return all fields (but not _source) for documents inside the window,
# 2000 hits per batch.
query = {
    "query": {"bool": {"must": [{"range": {"es_datetime": time_window}}]}},
    "fields": ["*"],
    "_source": False,
    "size": 2000,
}

# Open the scroll and remember the id needed to fetch the next batch.
resp = es.search(index=es_index, body=query, scroll="1m")
old_scroll_id = resp['_scroll_id']
results = resp['hits']['hits']

# Keep scrolling until a batch comes back empty.
while results:
    for i, r in enumerate(results):
        # process each hit here
        pass

    result = es.scroll(
        scroll_id=old_scroll_id,
        scroll='1m',  # how long to keep the search context alive
    )

    # Report when the server hands back a different scroll id.
    new_id = result['_scroll_id']
    if new_id != old_scroll_id:
        print("NEW SCROLL ID:", new_id)

    # Always carry the most recent scroll id into the next request.
    old_scroll_id = new_id
    results = result['hits']['hits']
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.