[英]Run a for loop in parallel in batches in Python
I have this code which runs for a long time because it processes about 6000 entries.我有这段代码运行了很长时间,因为它处理了大约 6000 个条目。 I am trying to run it in parallel so that it takes less time.
我正在尝试并行运行它,因此需要更少的时间。 The multiprocessing code that I wrote is not working as expected.
我编写的多处理代码没有按预期工作。 The for loop in the function lambda_handler is what needs to be parallelized.
function lambda_handler 中的 for 循环是需要并行化的。 I am still new to python, so any help will be appreciated.
我还是 python 的新手,所以任何帮助将不胜感激。
Here's my code这是我的代码
import json
import boto3
from pprint import pprint
import os
import botocore
from datetime import datetime
import csv
from botocore.client import Config
import logging
import urllib3
import sys
import time
from boto3.session import Session
from botocore.client import Config
import multiprocessing as mp
# Suppress the warning triggered by verify=False on the boto3 clients below.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Shared botocore client config: long connect timeout, 5 retries, proxies from env.
config = Config(connect_timeout=7200,
retries={'max_attempts': 5},
proxies={'http': os.environ.get('HTTP_PROXY'),'https': os.environ.get('HTTPS_PROXY'),}
)
LOGGER=logging.getLogger()
LOGGER.setLevel(logging.INFO)
# Runtime configuration read from environment variables at import time.
instance_id = os.environ.get('instance_id')
region_name = os.environ.get('Region')
s3_bucket = os.environ.get('s3_bucket')
env = os.environ.get('export_environment')
# Page size for list_users pagination.
MaxResults = 4
# Multiprocessing
# NOTE(review): creating the pool at module import time is unsafe under the
# 'spawn' start method (Windows/macOS) — each child re-imports this module and
# re-creates the pool. Move it under an `if __name__ == '__main__':` guard.
pool = mp.Pool(4)
def getUserList(instance_id: str, region_name: str, env: str):
    """Return the complete UserSummaryList for an Amazon Connect instance,
    following NextToken pagination.

    `env` is currently unused but retained for interface compatibility.
    """
    client = boto3.client('connect', region_name=region_name, verify=False, config=config)
    # First page of Amazon Connect users.
    response = client.list_users(
        InstanceId=instance_id,
        MaxResults=MaxResults
    )
    users = response['UserSummaryList']
    # Keep paging while the service returns a continuation token.
    while "NextToken" in response:
        response = client.list_users(InstanceId=instance_id, MaxResults=MaxResults, NextToken=response["NextToken"])
        users.extend(response['UserSummaryList'])
    # FIX: the original did `users_all = [*users_all, *users]` starting from
    # `users_all = ''`, which is just a copy of `users` — return it directly.
    return users
def lambda_handler(instance_id: str, region_name: str, env: str, s3_bucket):
    """Describe every Amazon Connect user and print one CSV line per user.

    NOTE(review): `env` and `s3_bucket` are unused in this body — presumably
    intended for a later S3 upload step; confirm before removing.
    """
    client = boto3.client('connect', region_name="us-east-1", verify=False, config=config)
    # Get the full list of Amazon Connect users.
    users = getUserList(instance_id, region_name, env)
    # This loop is the part the author wants parallelized.
    for user in users:
        user_id = user['Id']
        response = client.describe_user(
            InstanceId=instance_id,
            UserId=user_id
        )
        details = response['User']
        user_firstname = details['IdentityInfo']['FirstName']
        user_lastname = details['IdentityInfo']['LastName']
        user_name = details['Username']
        user_routing_profile_id = details['RoutingProfileId']
        user_security_profile_ids = details['SecurityProfileIds']
        user_phonetype = details['PhoneConfig']['PhoneType']
        user_desk_number = details['PhoneConfig']['DeskPhoneNumber']
        user_autoaccept = details['PhoneConfig']['AutoAccept']
        user_acw_timeout = details['PhoneConfig']['AfterContactWorkTimeLimit']
        # HierarchyGroupId may be absent from the DescribeUser response.
        user_hierarchy_group_id = details.get('HierarchyGroupId', "")
        # Placeholders the original code never filled in.
        security_profile_name_list = ''
        hierarchy_group_name = ""
        # FIX: the original referenced undefined `routing_profile_name`
        # (NameError at runtime); emit the routing profile id until a name
        # lookup is implemented.
        user_info = ",".join([
            user_firstname, user_lastname, user_name, user_routing_profile_id,
            security_profile_name_list[1:], user_phonetype, user_desk_number,
            str(user_autoaccept), str(user_acw_timeout), hierarchy_group_name,
        ]) + "\r\n"
        print(user_info)
# NOTE(review): this runs at import time with no `if __name__ == '__main__':`
# guard — under the 'spawn' start method the child processes re-import this
# module. Also, `starmap` with a single argument tuple submits exactly ONE
# task, so the per-user loop inside lambda_handler is NOT parallelized here.
results = pool.starmap(lambda_handler, [(instance_id,region_name,env,s3_bucket)])
pool.close()
from threading import Thread  # NOTE(review): unused — ThreadPoolExecutor does the work
from concurrent import futures  # FIX: `futures` was referenced below but never imported

threadsToRun = 2  # How many threads?
# FIX: the original wrapped this in `while(True)` with no break, so it looped
# forever, and used a bare `except:` that swallowed even KeyboardInterrupt.
try:
    with futures.ThreadPoolExecutor() as executor:
        # Run the same test multiple times, using threadsToRun as parameter.
        future_test_results = [executor.submit(LoopWhichYouWantToRunMultipleTimes)
                               for i in range(threadsToRun)]
        for future_test_result in future_test_results:
            try:
                # `result(timeout=...)` can bound the wait for each thread.
                test_result = future_test_result.result()
            except Exception as exc:
                # One thread may raise; report it and keep collecting the rest.
                print(exc)
except Exception:
    print("Error...")
I would rearrange the code as follows:我将重新排列代码如下:
Create a function process_user
that will process a single user. If your pool size is 4 then 4 users can be processed concurrently, each by a process in the multiprocessing pool. process_user
will now access what were previously arguments, ie instance_id
, region_name
, env
and s3_bucket
, as global variables, which have been defined in each pool process using the initializer and initargs arguments of the multiprocessing.pool.Pool.__init__
method. process_user
现在将访问以前的 arguments,即instance_id
、 region_name
、 env
和s3_bucket
,作为全局变量,这些变量已使用multiprocessing.pool.Pool.__init__
方法的初始化程序和initargs arguments 在每个池进程中定义。 Likewise, a single global client
instance that can be reused by process_user
will be created for each pool process.process_user
重用的单个全局client
实例。 This is the general idea, which obviously I cannot actually test.这是一般的想法,显然我无法实际测试。 You need to study this and understand what is being done with the help of the Python docs so that you can make whatever changes are required to get this to run.
您需要研究这一点并了解在 Python 文档的帮助下正在做什么,以便您可以进行所需的任何更改以使其运行。
import json
import boto3
from pprint import pprint
import os
import botocore
from datetime import datetime
import csv
from botocore.client import Config
import logging
import urllib3
import sys
import time
from boto3.session import Session
from botocore.client import Config
import multiprocessing as mp
# Root logger configured at import time.
LOGGER=logging.getLogger()
LOGGER.setLevel(logging.INFO)
# Page size for Connect list_users pagination.
MaxResults = 4
# Shared botocore client config: long connect timeout, 5 retries, proxies from env.
config = Config(connect_timeout=7200,
retries={'max_attempts': 5},
proxies={'http': os.environ.get('HTTP_PROXY'),'https': os.environ.get('HTTPS_PROXY'),}
)
def init_pool_processes(*args):
    """Run once in each pool worker: install the handler arguments and a
    per-process boto3 Connect client as module globals for process_user."""
    # verify=False on the client below would otherwise spam InsecureRequestWarning.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    global instance_id, region_name, env, s3_bucket, client
    (instance_id, region_name, env, s3_bucket) = args
    # One client per worker process, reused by every process_user call.
    client = boto3.client('connect', region_name="us-east-1", verify=False, config=config)
def process_user(user):
    """Describe one Connect user and print a CSV-formatted line of its
    attributes.

    Relies on globals set by init_pool_processes in each worker process:
    `client` and `instance_id`.
    """
    user_id = user['Id']
    response = client.describe_user(
        InstanceId=instance_id,
        UserId=user_id
    )
    details = response['User']
    user_firstname = details['IdentityInfo']['FirstName']
    user_lastname = details['IdentityInfo']['LastName']
    user_name = details['Username']
    user_routing_profile_id = details['RoutingProfileId']
    user_security_profile_ids = details['SecurityProfileIds']
    user_phonetype = details['PhoneConfig']['PhoneType']
    user_desk_number = details['PhoneConfig']['DeskPhoneNumber']
    user_autoaccept = details['PhoneConfig']['AutoAccept']
    user_acw_timeout = details['PhoneConfig']['AfterContactWorkTimeLimit']
    # HierarchyGroupId may be absent from the DescribeUser response.
    user_hierarchy_group_id = details.get('HierarchyGroupId', "")
    # Placeholders the original code never filled in.
    security_profile_name_list = ''
    hierarchy_group_name = ""
    # FIX: the original referenced undefined `routing_profile_name`
    # (NameError at runtime); emit the routing profile id until a name
    # lookup is implemented.
    user_info = ",".join([
        user_firstname, user_lastname, user_name, user_routing_profile_id,
        security_profile_name_list[1:], user_phonetype, user_desk_number,
        str(user_autoaccept), str(user_acw_timeout), hierarchy_group_name,
    ]) + "\r\n"
    print(user_info)
def getUserList(instance_id: str, region_name: str, env: str):
    """Return the complete UserSummaryList for an Amazon Connect instance,
    following NextToken pagination.

    `env` is currently unused but retained for interface compatibility.
    """
    client = boto3.client('connect', region_name=region_name, verify=False, config=config)
    # First page of Amazon Connect users.
    response = client.list_users(
        InstanceId=instance_id,
        MaxResults=MaxResults
    )
    users = response['UserSummaryList']
    # Keep paging while the service returns a continuation token.
    while "NextToken" in response:
        response = client.list_users(InstanceId=instance_id, MaxResults=MaxResults, NextToken=response["NextToken"])
        users.extend(response['UserSummaryList'])
    # FIX: the original did `users_all = [*users_all, *users]` starting from
    # `users_all = ''`, which is just a copy of `users` — return it directly.
    return users
def lambda_handler(instance_id: str, region_name: str, env: str, s3_bucket):
    """Fan the per-user describe/print work out over a 4-process pool.

    The handler arguments reach the workers through the pool initializer,
    which installs them (plus a boto3 client) as globals in each process.
    """
    # Fetch the full list of Amazon Connect users up front.
    users = getUserList(instance_id, region_name, env)
    worker_pool = mp.Pool(
        4,
        initializer=init_pool_processes,
        initargs=(instance_id, region_name, env, s3_bucket),
    )
    worker_pool.map(process_user, users)
    worker_pool.close()
    worker_pool.join()
if __name__ == '__main__':
    # Gather runtime settings from the environment in one pass.
    settings = {name: os.environ.get(name)
                for name in ('instance_id', 'Region', 's3_bucket', 'export_environment')}
    instance_id = settings['instance_id']
    region_name = settings['Region']
    s3_bucket = settings['s3_bucket']
    env = settings['export_environment']
    lambda_handler(instance_id, region_name, env, s3_bucket)
You also need to look at:您还需要查看:
users_all = ''
... # code elided
users = response['UserSummaryList']
... # code elided
users_all = [*users_all, *users]
return users_all
Isn't the above equivalent to the following?上面的不就等价于下面的吗?
users = response['UserSummaryList']
... # code elided
return users
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.