I have an infinite loop that waits for messages. Messages accumulate for about 30 seconds; I then divide the accumulated messages into chunks and process the chunks in parallel.
def process_data(data, i):
    """Parse one chunk of messages and write it to ``sample_{i}.json``.

    Args:
        data: the chunk (list of message dicts) to process.
        i: the chunk index; used to build a unique output file name
           (the original referenced an undefined ``rnd``).

    Returns:
        True on success.
    """
    import json  # local import keeps this snippet self-contained

    parsed_records = []
    # BUG FIX: iterate the chunk passed in (`data`); the original iterated
    # an undefined global `msgs`.
    for msg in data:
        # TODO: delete unnecessary keys / manipulate fields as required;
        # the original appended an undefined `record_data`.
        parsed_records.append(msg)
    name = f"sample_{i}.json"
    with open(name, "w") as outfile:
        # json.dump serializes the list; file.write() on a list raises TypeError.
        json.dump(parsed_records, outfile)
    return True
while True:  # BUG FIX: lowercase `true` is a NameError in Python
    threads = []
    # One worker thread per chunk.
    for n in range(len(num_of_chunks)):
        t = threading.Thread(target=process_data, args=(num_of_chunks[n], n))
        threads.append(t)
        t.start()
    # Wait for every worker to finish.
    # BUG FIX: the original called t.join() inside this loop, which joined
    # only the last-created thread len(threads) times.
    for x in threads:
        x.join()
But this leads to a MemoryError after a few iterations.
What should be changed to avoid the memory issue and make it run smoothly?
I even tried the code below:
import multiprocessing
import random

from faker import Faker
# Create Faker object to generate fake data for Producer
# Module-level Faker instance; used by myrandomdata below.
fake = Faker()
def myrandomdata(i, j):
    """Return a pseudo-random integer in [1, j] (inclusive).

    ``i`` is unused (the original ignored it too); it is kept only for
    call-site compatibility. Uses the stdlib ``random`` module instead of
    the heavyweight third-party Faker, which produced an equivalent result
    via ``fake.random_int(min=1, max=j)``.
    """
    return random.randint(1, j)
def divide_chunks(l, n):
    """Split sequence *l* into consecutive chunks of at most *n* elements."""
    return [l[start:start + n] for start in range(0, len(l), n)]
def process_data(data, i):
    """Parse one chunk of messages and dump it to a randomly named JSON file.

    NOTE(review): the random file name can collide across pool workers,
    silently overwriting output -- consider a sequential counter instead.

    Args:
        data: the chunk (list of message dicts) to process.
        i: chunk index (currently unused beyond the call signature).

    Returns:
        True on success.
    """
    import json  # local import keeps this snippet self-contained

    parsed_records = []
    # BUG FIX: iterate the chunk passed in (`data`); the original iterated
    # an undefined global `msgs`.
    for msg in data:
        # TODO: delete unnecessary keys / manipulate fields as required;
        # the original appended an undefined `record_data`.
        parsed_records.append(msg)
    rnd = myrandomdata(1, 2000)
    name = f"sample_{rnd}.json"
    with open(name, "w") as outfile:
        # json.dump serializes the list; file.write() on a list raises TypeError.
        json.dump(parsed_records, outfile)
    return True
if __name__ == "__main__":
    # BUG FIX: create the Pool ONCE, outside the loop. Re-creating a pool of
    # processes on every iteration (and sizing it to the chunk count) is what
    # leaks resources and ends in MemoryError after a few iterations.
    with multiprocessing.Pool() as pool:
        while True:  # BUG FIX: lowercase `true` was a NameError
            # Sample data; in production msgs accumulates continuously from
            # the server. All five sample records share one shape and differ
            # only in "id", so they are built with a comprehension (each
            # iteration creates fresh, unshared dicts -- no aliasing).
            # The broken multi-line string literal from the scrape is repaired.
            msgs = [
                {
                    "id": rec_id,
                    "min_position": 7,
                    "has_more_items": "true",
                    "items_html": "Bike",
                    "new_latent_count": 3,
                    "data": {
                        "length": 28,
                        "text": (
                            "Lorem ipsum dolor sit amet, consectetur adipiscing "
                            "elit, sed do eiusmod tempor incididunt ut labore et "
                            "dolore magna aliqua. Ut enim ad minim veniam, quis "
                            "nostrud exercitation ullamco laboris nisi ut aliquip "
                            "ex ea commodo consequat."
                        ),
                    },
                    "numericalArray": [29, 32, 32, 25, 31],
                    "StringArray": ["Nitrogen", "Carbon", "Carbon", "Carbon"],
                    "multipleTypesArray": "true",
                    "objArray": [
                        {"class": "middle", "age": 7},
                        {"class": "middle", "age": 5},
                        {"class": "lower", "age": 6},
                        {"class": "upper", "age": 0},
                        {"class": "middle", "age": 7},
                    ],
                }
                for rec_id in (123, 173, 173, 123, 123)
            ]
            chunk_msgs = divide_chunks(msgs, 3)
            # Clear msgs so the next batch can accumulate after chunking.
            msgs.clear()
            pool.starmap(
                process_data,
                [(chunk_msgs[n], n) for n in range(len(chunk_msgs))],
            )
no luck:(
You should be creating the pool only once to avoid the repetitive creation and destruction of processes.
Assuming your processing is CPU-intensive, you should be creating a pool whose size is the number of CPU cores you have. Let's call this n_cores. Then you should split your msgs list into n_cores chunks where each chunk has approximately len(msgs) // n_cores messages. Your current divide_chunks method's n argument determines how many elements are in each chunk but it would be more convenient for it to specify the total number of chunks and let it figure out how many elements needs to be in each chunk.
import multiprocessing
def divide_chunks(iterable, n):
    """Split *iterable* into at most *n* nearly equal consecutive chunks.

    The first ``len(iterable) % n`` chunks get one extra element. Slicing
    preserves the input's type (lists yield lists, ranges yield ranges);
    a range with step != 1 is materialized first because the boundary
    arithmetic assumes unit steps.
    """
    if type(iterable) is range and iterable.step != 1:
        # Boundary math below assumes a unit step.
        iterable = list(iterable)
    total = len(iterable)
    n = min(total, n)
    base, extra = divmod(total, n)
    chunks = []
    start = 0
    for idx in range(n):
        end = start + base + (1 if idx < extra else 0)
        chunks.append(iterable[start:end])
        start = end
    return chunks
def process_data(chunk_number, msg_chunk):
    """Process every message in one chunk; always returns True.

    Any exception is caught and printed so that a failing chunk cannot
    kill the pool worker.
    """
    try:
        for message in msg_chunk:
            # Per-message processing (roughly 20-25 s per chunk) goes here.
            ...
    except Exception as e:
        print("exception", e)
    return True
if __name__ == "__main__":  # only imports and function/class defs before this line.
    # Pool sized to the CPU core count (assumes CPU-bound processing),
    # created ONCE and reused for every batch. Indentation restored --
    # the scrape had flattened this block into invalid Python.
    n_cores = multiprocessing.cpu_count()
    with multiprocessing.Pool(n_cores) as pool:
        while True:
            # Fetch/accumulate the next batch of messages here (placeholder):
            msgs = [...]
            chunks = divide_chunks(msgs, n_cores)
            msgs.clear()
            results = pool.starmap(process_data, enumerate(chunks))
Update to Use Multithreading
Read all the comments in the code and make sure you understand them!!!
from multiprocessing.pool import ThreadPool
from threading import Lock
def divide_chunks(iterable, n):
    """Split *iterable* into at most *n* nearly equal consecutive chunks.

    The leading ``len(iterable) % n`` chunks receive one extra element.
    A range with a non-unit step is converted to a list first because the
    boundary arithmetic assumes step == 1.
    """
    if type(iterable) is range and iterable.step != 1:
        # Boundary computation below only works for unit steps.
        iterable = list(iterable)
    size = len(iterable)
    n = min(size, n)
    q, r = divmod(size, n)
    # Precompute all chunk boundaries, then slice between neighbors.
    bounds = [i * q + min(i, r) for i in range(n + 1)]
    return [iterable[bounds[i]:bounds[i + 1]] for i in range(n)]
import json  # needed to serialize parsed records (file.write(list) raises TypeError)

# Next file number to hand out; shared by every worker thread.
FILE_NO = 1
# Serializes read-and-increment of FILE_NO so no two threads get the same number.
lock = Lock()

# What purpose does argument i serve? As long as you know ...
def process_data(i, msgs):  # arguments must be in this order
    """Parse one chunk of messages and write it to a uniquely numbered JSON file.

    File numbers are taken from the shared FILE_NO counter under `lock`
    (sequential, not random, so names never collide across threads).

    Args:
        i: chunk index (unused beyond the call signature).
        msgs: the chunk (list of message dicts) to process.

    Returns:
        True on success.
    """
    global FILE_NO
    parsed_records = []
    for msg in msgs:
        # TODO: delete unnecessary keys / manipulate fields as required;
        # the original appended an undefined `record_data`.
        parsed_records.append(msg)
    # Get the next file number atomically.
    with lock:
        file_no = FILE_NO
        FILE_NO += 1
    name = f"sample_{file_no}.json"
    with open(name, "w") as outfile:
        # BUG FIX: json.dump serializes the list;
        # outfile.write(parsed_records) was a TypeError.
        json.dump(parsed_records, outfile)
    return True
if __name__ == "__main__":  # only imports and function/class defs before this line.
    # N_CHUNKS is how many chunks msgs is split into (and therefore how many
    # files each invocation of process_data produces). A fixed value of 10 is
    # assumed here; otherwise set POOL_SIZE to the maximum number of chunks
    # you expect, or recompute N_CHUNKS each iteration from len(msgs).
    # NOTE: writing many files concurrently can hurt performance on
    # non-solid-state drives.
    # Indentation restored -- the scrape had flattened this block.
    N_CHUNKS = 10
    POOL_SIZE = N_CHUNKS
    # Thread pool is created once and reused for every batch.
    with ThreadPool(POOL_SIZE) as pool:
        while True:
            # Fetch/accumulate the next batch of messages here (placeholder):
            msgs = [...]
            chunks = divide_chunks(msgs, N_CHUNKS)
            msgs.clear()
            results = pool.starmap(process_data, enumerate(chunks))
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.