
Multi-thread memory error issue with infinite while loop

I have an infinite loop waiting for messages; once I receive them (roughly 30 seconds' worth of messages accumulate), they are split into chunks and processed in parallel.

def process_data(data, i):
    # process data for this chunk
    parsed_records = []
    for msg in data:
        # just deleting unnecessary keys and manipulating a few key values
        parsed_records.append(record_data)

    name = f"sample_{rnd}.json"
    with open(name, "w") as outfile:
        outfile.write(parsed_records)
    return True

while True:
    threads = []
    for n in range(len(num_of_chunks)):
        t = threading.Thread(target=process_data, args=(num_of_chunks[n], n))
        threads.append(t)
        t.start()

    # Wait for the threads to finish
    for x in threads:
        x.join()

However, after a few iterations this results in a MemoryError.

Is there anything I should change so it avoids the memory issue and runs smoothly?

I even tried the code below,

import multiprocessing
from faker import Faker
# Create Faker object to generate fake data for Producer
fake = Faker()

def myrandomdata(i,j):
    return fake.random_int(min = 1, max = j)

def divide_chunks(l, n):
    small_msgs = []
    for i in range(0, len(l), n):
        small_msgs.append(l[i:i + n])
    return small_msgs



def process_data(data, i):
    # process data for this chunk
    parsed_records = []

    for msg in data:
        # just deleting unnecessary keys and manipulating a few key values
        parsed_records.append(record_data)
    rnd = myrandomdata(1, 2000)
    name = f"sample_{rnd}.json"
    with open(name, "w") as outfile:
        outfile.write(parsed_records)
    return True

if __name__ == "__main__":
    while True:

        # sample data
        msgs = [{"id":123,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]},{"id":173,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]},{"id":173,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]},{"id":123,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]},{"id":123,"min_position":7,"has_more_items":"true","items_html":"Bike","new_latent_count":3,"data":{"length":28,"text":"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."},"numericalArray":[29,32,32,25,31],"StringArray":["Nitrogen","Carbon","Carbon","Carbon"],"multipleTypesArray":"true","objArray":[{"class":"middle","age":7},{"class":"middle","age":5},{"class":"lower","age":6},{"class":"upper","age":0},{"class":"middle","age":7}]}] 
        # msgs is just the bulk data continuously received from the server; I keep appending to it
        chunk_msgs = divide_chunks(msgs, 3)

        # clear msgs so the next batch of data can be appended after chunking the previous one
        msgs.clear()
        with multiprocessing.Pool(len(chunk_msgs)) as pool:
            pool.starmap(process_data, [(chunk_msgs[n], n) for n in range(len(chunk_msgs))])

No luck :(

  1. You should create the pool only once, to avoid repeatedly creating and destroying processes.

  2. Assuming your processing is CPU-intensive, you should create a pool whose size is the number of CPU cores you have; call it n_cores. You should then split your msgs list into n_cores chunks, where each chunk has approximately len(msgs) // n_cores messages. Your current divide_chunks method's n argument determines how many elements go into each chunk, but it is more convenient to specify the total number of chunks and let the function work out how many elements each chunk needs.

import multiprocessing

def divide_chunks(iterable, n):
    if type(iterable) is range and iterable.step != 1:
        # algorithm doesn't work with steps other than 1:
        iterable = list(iterable)
    l = len(iterable)
    n = min(l, n)
    k, m = divmod(l, n)
    return [iterable[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]

def process_data(chunk_number, msg_chunk):

    #process data for chunks
    try:
        for msg in msg_chunk:
            # data processing here according to my requirement
            # it may take 20-25 seconds of process that is why am planning for parallel
            # processing
            ...
    except Exception as e:
        print("exception", e)
    return True

if __name__ == "__main__":  # only imports and function/class defs before this line.
    n_cores = multiprocessing.cpu_count()
    with multiprocessing.Pool(n_cores) as pool:
        while True:
            # Process next list of messages:
            msgs = [...]
            chunks = divide_chunks(msgs, n_cores)
            msgs.clear()
            results = pool.starmap(process_data, enumerate(chunks))
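
To see what that chunking produces, here is a quick standalone check of divide_chunks (a minimal sketch run against the helper defined above; the commented values are the lists it returns for these inputs):

print(divide_chunks(list(range(10)), 3))  # [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]] -- chunk sizes differ by at most one
print(divide_chunks([1, 2], 5))           # [[1], [2]] -- asking for more chunks than items just caps the chunk count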

Update to use multithreading

Read all the comments in the code and make sure you understand them!!!

import json
from multiprocessing.pool import ThreadPool
from threading import Lock

def divide_chunks(iterable, n):
    if type(iterable) is range and iterable.step != 1:
        # algorithm doesn't work with steps other than 1:
        iterable = list(iterable)
    l = len(iterable)
    n = min(l, n)
    k, m = divmod(l, n)
    return [iterable[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]

FILE_NO = 1
lock = Lock()

# What purpose does argument i serve? As long as you know ...
def process_data(i, msgs): # arguments must be in this order
    global FILE_NO

    #process data for chunks
    parsed_records = []

    for msg in msgs:
        #just deleting unnecessary keys and few key data manipulated
        parsed_records.append(record_data)

    # Get next file number
    # Do not use random number generator:
    with lock:
        file_no = FILE_NO
        FILE_NO += 1

    name = f"sample_{file_no}.json"
    with open(name, "w") as outfile:
        # parsed_records is a list, so serialize it as JSON rather than calling write() on it
        json.dump(parsed_records, outfile)

    return True

if __name__ == "__main__":  # only imports and function/class defs before this line.
    # The number of chunks you want msgs split into
    # (this will be the number of files created for each invocation of process_data)
    # For now I will assume a fixed value of 10. If this is not true, then set
    # POOL_SIZE to be what you think the maximum number of chunks you will have.
    # But note this: depending upon where you are creating your files, writing more than
    # one concurrently could hurt performance. This would be the case if you were, for example,
    # writing to a non-solid state drive.

    # Or recompute N_CHUNKS on each iteration based on size
    # of msgs:
    N_CHUNKS = 10
    POOL_SIZE = N_CHUNKS
    with ThreadPool(POOL_SIZE) as pool:
        while True:
            # Process next list of messages:
            msgs = [...]
            chunks = divide_chunks(msgs, N_CHUNKS)
            msgs.clear()
            results = pool.starmap(process_data, enumerate(chunks))
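
If it helps to see the file-numbering idea on its own, below is a minimal standalone sketch of the same lock-protected counter pattern (the helper name next_file_no exists only for this illustration), demonstrating that concurrent threads always receive distinct numbers:

from threading import Lock, Thread

FILE_NO = 1
lock = Lock()

def next_file_no():
    # the lock makes the read-then-increment atomic, so no two threads can get the same number
    global FILE_NO
    with lock:
        file_no = FILE_NO
        FILE_NO += 1
    return file_no

if __name__ == "__main__":
    seen = []
    threads = [Thread(target=lambda: seen.append(next_file_no())) for _ in range(100)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print(len(seen), len(set(seen)))  # 100 100 -> every thread got a unique number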
