
Multi-thread memory error issue with infinite while loop

I have an infinite loop waiting for messages. Messages accumulate for, say, 30 seconds; I then divide them into chunks and process the chunks in parallel.

import json
import threading

def process_data(data, i):
    # process data for this chunk
    parsed_records = []
    for msg in data:
        # just deleting unnecessary keys; a few key values are manipulated
        parsed_records.append(record_data)

    name = f"sample_{rnd}.json"
    with open(name, "w") as outfile:
        json.dump(parsed_records, outfile)
    return True

while True:
    threads = []
    for n in range(len(num_of_chunks)):
        t = threading.Thread(target=process_data, args=(num_of_chunks[n], n))
        threads.append(t)
        t.start()

    # Wait for all the threads to finish
    for t in threads:
        t.join()

But this runs into a MemoryError after a few iterations.

Is there anything I should change to avoid the memory issue and make this run smoothly?

I even tried the code below:

import json
import multiprocessing
from faker import Faker
# Create Faker object to generate fake data for Producer
fake = Faker()

def myrandomdata(i, j):
    return fake.random_int(min=1, max=j)

def divide_chunks(l, n):
    small_msgs = []
    for i in range(0, len(l), n):
        small_msgs.append(l[i:i + n])
    return small_msgs



def process_data(data, i):
    # process data for this chunk
    parsed_records = []

    for msg in data:
        # just deleting unnecessary keys; a few key values are manipulated
        parsed_records.append(record_data)

    rnd = myrandomdata(1, 2000)
    name = f"sample_{rnd}.json"
    with open(name, "w") as outfile:
        json.dump(parsed_records, outfile)
    return True

if __name__ == "__main__":
    while True:

        # sample data
        msgs = [{"id":123,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]},{"id":173,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]},{"id":173,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]},{"id":123,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]},{"id":123,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]}] 
        # msgs is bulk data received continuously from the server and appended to msgs
        chunk_msgs = divide_chunks(msgs, 3)

        # clear msgs so the next batch can be appended after chunking the previous one
        msgs.clear()

        with multiprocessing.Pool(len(chunk_msgs)) as pool:
            pool.starmap(process_data, [(chunk_msgs[n], n) for n in range(len(chunk_msgs))])

no luck:(

  1. You should be creating the pool only once to avoid the repetitive creation and destruction of processes.

  2. Assuming your processing is CPU-intensive, you should be creating a pool whose size is the number of CPU cores you have. Let's call this n_cores. Then you should split your msgs list into n_cores chunks where each chunk has approximately len(msgs) // n_cores messages. Your current divide_chunks method's n argument determines how many elements are in each chunk, but it would be more convenient for n to specify the total number of chunks and let the method figure out how many elements need to be in each chunk.
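As a quick numerical illustration of point 2 (my own example, not part of the original answer): with 100 messages split across 8 cores, the chunk sizes differ by at most one element, which is exactly what the divide_chunks shown below produces.

l, n = 100, 8
k, m = divmod(l, n)                  # k = 12, m = 4
sizes = [k + 1] * m + [k] * (n - m)  # first m chunks get one extra element
print(sizes)                         # [13, 13, 13, 13, 12, 12, 12, 12] -> sums to 100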

import multiprocessing

def divide_chunks(iterable, n):
    if type(iterable) is range and iterable.step != 1:
        # algorithm doesn't work with steps other than 1:
        iterable = list(iterable)
    l = len(iterable)
    n = min(l, n)
    k, m = divmod(l, n)
    return [iterable[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]

def process_data(chunk_number, msg_chunk):

    #process data for chunks
    try:
        for msg in msg_chunk:
            # data processing here according to my requirement
            # it may take 20-25 seconds of process that is why am planning for parallel
            # processing
            ...
    except Exception as e:
        print("exception", e)
    return True

if __name__ == "__main__":  # only imports and function/class defs before this line.
    n_cores = multiprocessing.cpu_count()
    with multiprocessing.Pool(n_cores) as pool:
        while True:
            # Process next list of messages:
            msgs = [...]
            chunks = divide_chunks(msgs, n_cores)
            msgs.clear()
            results = pool.starmap(process_data, enumerate(chunks))
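To make the argument passing concrete (again my own illustration, not part of the answer): enumerate pairs each chunk with its index, and starmap unpacks each (index, chunk) tuple into the two parameters of process_data.

chunks = divide_chunks(list(range(10)), 3)
print(chunks)                   # [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]
print(list(enumerate(chunks)))  # [(0, [0, 1, 2, 3]), (1, [4, 5, 6]), (2, [7, 8, 9])]
# pool.starmap(process_data, enumerate(chunks)) therefore runs
# process_data(0, [0, 1, 2, 3]), process_data(1, [4, 5, 6]) and process_data(2, [7, 8, 9])
# in parallel, one call per worker process.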

Update to Use Multithreading

Read all the comments in the code and make sure you understand them!!!

import json

from multiprocessing.pool import ThreadPool
from threading import Lock

def divide_chunks(iterable, n):
    if type(iterable) is range and iterable.step != 1:
        # algorithm doesn't work with steps other than 1:
        iterable = list(iterable)
    l = len(iterable)
    n = min(l, n)
    k, m = divmod(l, n)
    return [iterable[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]

FILE_NO = 1
lock = Lock()

# What purpose does argument i serve? As long as you know ...
def process_data(i, msgs): # arguments must be in this order
    global FILE_NO

    #process data for chunks
    parsed_records = []

    for msg in msgs:
        #just deleting unnecessary keys and few key data manipulated
        parsed_records.append(record_data)

    # Get next file number
    # Do not use random number generator:
    with lock:
        file_no = FILE_NO
        FILE_NO += 1

    name = f"sample_{file_no}.json"
    with open(name, "w") as outfile:
        json.dump(parsed_records, outfile)

    return True

if __name__ == "__main__":  # only imports and function/class defs before this line.
    # The number of chunks you want msgs split into
    # (this will be the number of files created for each invocation of process_data)
    # For now I will assume a fixed value of 10. If this is not true, then set
    # POOL_SIZE to be what you think the maximum number of chunks you will have.
    # But note this: depending upon where you are creating your files, writing more than
    # one concurrently could hurt performance. This would be the case if you were, for example,
    # writing to a non-solid state drive.

    # Or recompute N_CHUNKS on each iteration based on size
    # of msgs:
    N_CHUNKS = 10
    POOL_SIZE = N_CHUNKS
    with ThreadPool(POOL_SIZE) as pool:
        while True:
            # Process next list of messages:
            msgs = [...]
            chunks = divide_chunks(msgs, N_CHUNKS)
            msgs.clear()
            results = pool.starmap(process_data, enumerate(chunks))
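If you want to follow the comment about recomputing N_CHUNKS on each iteration, it could look something like the sketch below. This is my own variation, not part of the answer above: MSGS_PER_CHUNK and chunks_for are hypothetical names, and 500 messages per file is an assumed target.

MSGS_PER_CHUNK = 500  # assumed target number of messages per output file

def chunks_for(msgs, pool_size, msgs_per_chunk=MSGS_PER_CHUNK):
    # At least 1 chunk, never more chunks than workers in the pool.
    # Assumes msgs is non-empty, as in the loop above.
    n_chunks = max(1, min(pool_size, len(msgs) // msgs_per_chunk))
    return divide_chunks(msgs, n_chunks)

# Inside the while True loop this would replace the fixed N_CHUNKS:
#     chunks = chunks_for(msgs, POOL_SIZE)

Capping n_chunks at POOL_SIZE keeps one chunk per idle worker thread, and the floor division avoids creating many near-empty files when only a few messages have arrived.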
