[英]How to run multiple Azure Functions in parallel which scroll through Elasticsearch?
我有一個設置,我需要從 Elasticsearch 中提取數據並將其存儲在 Azure Blob 中。 現在要獲取我使用 Elasticsearch 的_search和_scroll API 的數據。 索引設計得非常好,格式類似於 game1.* 、 game2.* 、 game3. *等。
我創建了一個worker.py文件,我按照 Microsoft 的建議將它存儲在一個名為shared_code的文件夾中,並且我有幾個 Timer Trigger Functions 可以導入和調用worker.py 。 由於我們這邊設置 ES 的方式,我必須創建一個 VNET 和一個 static 出站 IP 地址,然后我們將其列入 ES 白名單。 相反,數據只能在端口 9200 上從 ES 中提取。所以我創建了一個 Azure Function 應用程序,它具有連接設置,我正在嘗試創建多個功能(game1-worker、game2-worker、game3- worker) 在第 5 分鍾從並行運行的 ES 中提取數據。我注意到如果我添加FUNCTIONS_WORKER_PROCESS_COUNT = 1
設置,那么這些函數將等到第一個觸發的完成其任務,然后第二個觸發。 如果我不添加此應用程序設置或增加數量,那么一旦 function 因為完成工作而停止,它將嘗試再次啟動它,然后我得到一個OSError: [WinError 10048] Only one usage of each socket address (protocol/network address/port) is normally permitted
錯誤。 有沒有辦法讓這些並行運行但沒有提到的錯誤?
這是worker.py的代碼:
#!/usr/bin/env python
# coding: utf-8
# # Elasticsearch to Azure Microservice
import json, datetime, gzip, importlib, os, re, logging
from elasticsearch import Elasticsearch
import azure.storage.blob as azsb
import azure.identity as azi
import os
import tempfile
def batch(game_name, env='prod'):
# #### Global Variables
env = env.lower()
connection_string = os.getenv('conn_storage')
lowerFormat = game_name.lower().replace(" ","_")
azFormat = re.sub(r'[^0-9a-zA-Z]+', '-', game_name).lower()
storageContainerName = azFormat
stateStorageContainerName = "azure-webjobs-state"
minutesOffset = 5
tempFilePath = tempfile.gettempdir()
curFileName = f"{lowerFormat}_cursor.py"
curTempFilePath = os.path.join(tempFilePath,curFileName)
curBlobFilePath = f"cursors/{curFileName}"
esUrl = os.getenv('esUrl')
# #### Connections
es = Elasticsearch(
esUrl,
port=9200,
timeout=300)
def uploadJsonGzipBlob(filePathAndName, jsonBody):
blob = azsb.BlobClient.from_connection_string(
conn_str=connection_string,
container_name=storageContainerName,
blob_name=filePathAndName
)
blob.upload_blob(gzip.compress(bytes(json.dumps(jsonBody), encoding='utf-8')))
def getAndLoadCursor(filePathAndName):
# Get cursor from blob
blob = azsb.BlobClient.from_connection_string(
conn_str=os.getenv('AzureWebJobsStorage'),
container_name=stateStorageContainerName,
blob_name=filePathAndName
)
# Stream it to Temp file
with open(curTempFilePath, "wb") as f:
data = blob.download_blob()
data.readinto(f)
# Load it by path
spec = importlib.util.spec_from_file_location("cursor", curTempFilePath)
cur = importlib.util.module_from_spec(spec)
spec.loader.exec_module(cur)
return cur
def writeCursor(filePathAndName, body):
blob = azsb.BlobClient.from_connection_string(
conn_str=os.getenv('AzureWebJobsStorage'),
container_name=stateStorageContainerName,
blob_name=filePathAndName
)
blob.upload_blob(body, overwrite=True)
# Parameter and state settings
if os.getenv(f"{lowerFormat}_maxSizeMB") is None:
maxSizeMB = 10 # Default to 10 MB
else:
maxSizeMB = int(os.getenv(f"{lowerFormat}_maxSizeMB"))
if os.getenv(f"{lowerFormat}_maxProcessTimeSeconds") is None:
maxProcessTimeSeconds = 300 # Default to 300 seconds
else:
maxProcessTimeSeconds = int(os.getenv(f"{lowerFormat}_maxProcessTimeSeconds"))
try:
cur = getAndLoadCursor(curBlobFilePath)
except Exception as e:
dtStr = f"{datetime.datetime.utcnow():%Y/%m/%d %H:%M:00}"
writeCursor(curBlobFilePath, f"# Please use format YYYY/MM/DD HH24:MI:SS\nlastPolled = '{dtStr}'")
logging.info(f"No cursor file. Generated {curFileName} file with date {dtStr}")
return 0
# # Scrolling and Batching Engine
lastRowDateOffset = cur.lastPolled
nrFilesThisInstance = 0
while 1:
# Offset the current time by -5 minutes to account for the 2-3 min delay in Elasticsearch
initTime = datetime.datetime.utcnow()
## Filter lt (less than) endDate to avoid infinite loops.
## Filter lt manually when compiling historical based on
endDate = initTime-datetime.timedelta(minutes=minutesOffset)
endDate = f"{endDate:%Y/%m/%d %H:%M:%S}"
doc = {
"query": {
"range": {
"baseCtx.date": {
"gt": lastRowDateOffset,
"lt": endDate
}
}
}
}
Index = lowerFormat + ".*"
if env == 'dev': Index = 'dev.' + Index
if nrFilesThisInstance == 0:
page = es.search(
index = Index,
sort = "baseCtx.date:asc",
scroll = "2m",
size = 10000,
body = doc
)
else:
page = es.scroll(scroll_id = sid, scroll = "10m")
pageSize = len(page["hits"]["hits"])
data = page["hits"]["hits"]
sid = page["_scroll_id"]
totalSize = page["hits"]["total"]
print(f"Total Size: {totalSize}")
cnt = 0
# totalSize might be flawed as it returns at times an integer > 0 but array is empty
# To overcome this, I've added the below check for the array size instead
if pageSize == 0: break
while 1:
cnt += 1
page = es.scroll(scroll_id = sid, scroll = "10m")
pageSize = len(page["hits"]["hits"])
sid = page["_scroll_id"]
data += page["hits"]["hits"]
sizeMB = len(gzip.compress(bytes(json.dumps(data), encoding='utf-8'))) / (1024**2)
loopTime = datetime.datetime.utcnow()
processTimeSeconds = (loopTime-initTime).seconds
print(f"{cnt} Results pulled: {pageSize} -- Cumulative Results: {len(data)} -- Gzip Size MB: {sizeMB} -- processTimeSeconds: {processTimeSeconds} -- pageSize: {pageSize} -- startDate: {lastRowDateOffset} -- endDate: {endDate}")
if sizeMB > maxSizeMB: break
if processTimeSeconds > maxProcessTimeSeconds: break
if pageSize < 10000: break
lastRowDateOffset = max([x['_source']['baseCtx']['date'] for x in data])
lastRowDateOffsetDT = datetime.datetime.strptime(lastRowDateOffset, '%Y/%m/%d %H:%M:%S')
outFile = f"elasticsearch/live/{lastRowDateOffsetDT:%Y/%m/%d/%H}/{lowerFormat}_live_{lastRowDateOffsetDT:%Y%m%d%H%M%S}.json.gz"
uploadJsonGzipBlob(outFile, data)
writeCursor(curBlobFilePath, f"# Please use format YYYY/MM/DD HH24:MI:SS\nlastPolled = '{lastRowDateOffset}'")
nrFilesThisInstance += 1
logging.info(f"File compiled: {outFile} -- {sizeMB} MB\n")
# If the while loop ran for more than maxProcessTimeSeconds then end it
if processTimeSeconds > maxProcessTimeSeconds: break
if pageSize < 10000: break
logging.info(f"Closing Connection to {esUrl}")
es.close()
return 0
這些是我調用的兩個定時觸發器:
game1-worker
import logging
import datetime
import azure.functions as func
#from shared_code import worker
import importlib
def main(mytimer: func.TimerRequest) -> None:
utc_timestamp = datetime.datetime.utcnow().replace(
tzinfo=datetime.timezone.utc).isoformat()
if mytimer.past_due:
logging.info('The timer is past due!')
# Load a new instance of worker.py
spec = importlib.util.spec_from_file_location("worker", "shared_code/worker.py")
worker = importlib.util.module_from_spec(spec)
spec.loader.exec_module(worker)
worker.batch('game1name')
logging.info('Python timer trigger function ran at %s', utc_timestamp)
game2-worker
import logging
import datetime
import azure.functions as func
#from shared_code import worker
import importlib
def main(mytimer: func.TimerRequest) -> None:
utc_timestamp = datetime.datetime.utcnow().replace(
tzinfo=datetime.timezone.utc).isoformat()
if mytimer.past_due:
logging.info('The timer is past due!')
# Load a new instance of worker.py
spec = importlib.util.spec_from_file_location("worker", "shared_code/worker.py")
worker = importlib.util.module_from_spec(spec)
spec.loader.exec_module(worker)
worker.batch('game2name')
logging.info('Python timer trigger function ran at %s', utc_timestamp)
根據您的描述,多個工作進程共享底層運行時的資源(套接字)。
對於您的用例,您只需將FUNCTIONS_WORKER_PROCESS_COUNT
保留為1
。 默認值應該是1
,因此不指定它應該與將其設置為1
相同。
您需要了解 Azure 函數如何擴展。 這是非常不自然/令人困惑的。
假設消費計划。
編碼:您編寫函數。 說 F1 和 F2。 如何組織取決於您。
供應:
運行:
FUNCTIONS_WORKER_PROCESS_COUNT
更改為10
,那么 Host 將生成 10 個進程並在每個進程中運行您的應用程序。 根據有限的信息,創建 3 個功能似乎是糟糕的設計。 您可以改為創建一個計時器觸發的 function,它向 Q 發送 3 條消息(對於這種微小的流量,存儲 Q 應該綽綽有余),進而觸發您的實際功能/實現(這是存儲 Q 觸發的功能) . 消息類似於{"game_name": "game1"}
。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.