简体   繁体   English

函数的 Joblib 并行计算

[英]Joblib parallel computation for function

在此处输入图片说明

How do I parallelize this function using Joblib? 如何使用 Joblib 并行化此函数? The computation happens inside a for loop. 计算发生在 for 循环内。

# Lot/run identifiers whose sensor values are looked up by get_data_to_dict().
# Written as one bracketed literal: a bare `lotrunnums=` followed by the
# list on the next line is a syntax error.
lotrunnums = [
    'RX9OE_29756162', 'S009K_29952685', '1P32G_29692263', '721YA_29780543',
    '6S3R6_29759571', 'RX9D0_29865357', 'RX9PV_29751006', 'RX9QM_29794268',
]

#function

def get_data_to_dict(data_train, lotrunnums):
    """Map every lot id to a NumPy array of its sensor values.

    For each entry of ``lotrunnums`` the rows of ``data_train`` whose
    ``LOT_RUNNUM`` column equals that entry are selected and their
    ``SENSOR_VALUE`` column is copied into a NumPy array.

    The elapsed wall-clock time is printed before returning.

    Args:
        data_train: table indexable by the column names ``"SENSOR_VALUE"``
            and ``"LOT_RUNNUM"`` (presumably a pandas DataFrame).
        lotrunnums: iterable of lot ids to look up.

    Returns:
        dict mapping each lot id to a ``numpy.ndarray`` of its values.
    """
    started_at = time.time()

    data = {
        lot: np.array(data_train["SENSOR_VALUE"][data_train["LOT_RUNNUM"] == lot])
        for lot in lotrunnums
    }

    elapsed = time.time() - started_at
    print('{:.4f} s'.format(elapsed))
    return data

You would have to create a function which takes `i` and `data_train` and returns `trace`. 您必须创建一个接收 i 和 data_train 并返回 trace 的函数。

    def func(data):
        """Return ``(lot_id, values)`` for one ``(data_train, lot_id)`` pair.

        ``data`` is a 2-tuple so the function can be handed to single-argument
        mappers; ``values`` is the list of ``SENSOR_VALUE`` entries whose
        ``LOT_RUNNUM`` equals ``lot_id``.
        """
        frame, lot_id = data
        sensor_values = frame["SENSOR_VALUE"]
        matching = sensor_values[frame["LOT_RUNNUM"] == lot_id]
        return lot_id, matching.to_list()

And then you can run it in a `for`-loop or with `ThreadPool`, `Joblib`, etc. 然后您可以在 for 循环、ThreadPool、Joblib 等中运行它。

And after running you can convert all results to dictionary.运行后,您可以将所有结果转换为字典。


Minimal code which tests different methods: 测试不同方法的最少代码:

import pandas as pd
import numpy as np
import time
import random

# Fixed seed so the generated sensor values are the same on every run.
random.seed(0)

# The eight lot ids that the extraction functions are asked for.
lotrunnums = [
    'RX9OE_29756162', 'S009K_29952685', '1P32G_29692263', '721YA_29780543',
    '6S3R6_29759571', 'RX9D0_29865357', 'RX9PV_29751006', 'RX9QM_29794268',
]

# Sample table: the eight lot ids repeated ten times (80 rows), each row
# paired with a random integer reading in the range 0..9.
data_train = pd.DataFrame({
    'LOT_RUNNUM': lotrunnums * 10,
    'SENSOR_VALUE': [random.randint(0, 9) for _ in range(8 * 10)],
})

def get_data_to_dict_1(data_train, lotrunnums):
    """Baseline variant: one sequential boolean-mask lookup per lot id.

    For each entry of ``lotrunnums`` the ``SENSOR_VALUE`` entries whose
    ``LOT_RUNNUM`` equals that entry are collected into a plain list.
    The elapsed wall-clock time is printed before returning.

    Args:
        data_train: table with ``"SENSOR_VALUE"`` and ``"LOT_RUNNUM"`` columns.
        lotrunnums: iterable of lot ids to look up.

    Returns:
        dict mapping each lot id to the list of its sensor values.
    """
    t0 = time.time()

    data = {
        lot: data_train["SENSOR_VALUE"][data_train["LOT_RUNNUM"] == lot].to_list()
        for lot in lotrunnums
    }

    elapsed = time.time() - t0
    print('{:.4f} s'.format(elapsed))

    return data

def get_data_to_dict_2(data_train, lotrunnums):
    """Vectorised variant using ``isin()``, ``groupby()`` and ``apply()``.

    The table is filtered once to the rows whose ``LOT_RUNNUM`` is in
    ``lotrunnums`` and then grouped by lot id, instead of scanning the
    table once per lot id.

    The result is aligned with the per-lot variants in this file: keys
    appear in ``lotrunnums`` order, and a lot id with no matching rows
    maps to an empty list rather than being left out.

    The elapsed wall-clock time is printed before returning.

    Args:
        data_train: pandas DataFrame with ``"SENSOR_VALUE"`` and
            ``"LOT_RUNNUM"`` columns.
        lotrunnums: list of (hashable) lot ids to look up.

    Returns:
        dict mapping each lot id to the list of its sensor values.
    """
    start = time.time()

    wanted = data_train["LOT_RUNNUM"].isin(lotrunnums)
    trace = data_train.loc[wanted, ["SENSOR_VALUE", "LOT_RUNNUM"]]

    # sort=False: key order is imposed below, so sorting the groups is wasted work.
    grouped = (
        trace.groupby("LOT_RUNNUM", sort=False)["SENSOR_VALUE"]
        .apply(list)
        .to_dict()
    )

    # groupby only yields ids that actually occur; fill in the rest with []
    # and restore the caller's ordering.
    data = {lot: grouped.get(lot, []) for lot in lotrunnums}

    end = time.time()

    print('{:.4f} s'.format(end-start))

    return data

def get_data_to_dict_threadpool(data_train, lotrunnums):
    """Thread-pool variant built on ``concurrent.futures.ThreadPoolExecutor``.

    One job per lot id is submitted to a pool of at most four worker
    threads; each job returns ``(lot_id, values)`` and the pairs are
    collected into a dict. The elapsed wall-clock time is printed before
    returning.

    Args:
        data_train: table with ``"SENSOR_VALUE"`` and ``"LOT_RUNNUM"`` columns.
        lotrunnums: iterable of lot ids to look up.

    Returns:
        dict mapping each lot id to the list of its sensor values.
    """

    from concurrent.futures import ThreadPoolExecutor

    def extract(job):
        # Each job is a (table, lot id) pair because map() passes one argument.
        frame, lot = job
        values = frame["SENSOR_VALUE"][frame["LOT_RUNNUM"] == lot]
        return lot, values.to_list()

    t0 = time.time()

    with ThreadPoolExecutor(max_workers=4) as pool:
        jobs = [(data_train, lot) for lot in lotrunnums]
        # map() yields results in submission order, so keys follow lotrunnums.
        data = dict(pool.map(extract, jobs))

    elapsed = time.time() - t0

    print('{:.4f} s'.format(elapsed))

    return data

def get_data_to_dict_joblib(data_train, lotrunnums):
    """Joblib variant that asks for thread-based workers.

    One delayed call per lot id is handed to ``Parallel(n_jobs=4,
    prefer="threads")``; each call returns ``(lot_id, values)`` and the
    pairs are collected into a dict. The elapsed wall-clock time is
    printed before returning.

    Args:
        data_train: table with ``"SENSOR_VALUE"`` and ``"LOT_RUNNUM"`` columns.
        lotrunnums: iterable of lot ids to look up.

    Returns:
        dict mapping each lot id to the list of its sensor values.
    """

    from joblib import Parallel, delayed

    def extract(frame, lot):
        values = frame["SENSOR_VALUE"][frame["LOT_RUNNUM"] == lot]
        return lot, values.to_list()

    t0 = time.time()

    runner = Parallel(n_jobs=4, prefer="threads")
    pairs = runner(delayed(extract)(data_train, lot) for lot in lotrunnums)
    data = dict(pairs)

    elapsed = time.time() - t0

    print('{:.4f} s'.format(elapsed))

    return data

def get_data_to_dict_joblib_process(data_train, lotrunnums):
    """Joblib variant using the default backend of ``Parallel(n_jobs=4)``.

    No ``prefer`` hint is given, so joblib picks its default backend
    (process-based, per the original author's description). One delayed
    call per lot id returns ``(lot_id, values)`` and the pairs are
    collected into a dict. The elapsed wall-clock time is printed before
    returning.

    Args:
        data_train: table with ``"SENSOR_VALUE"`` and ``"LOT_RUNNUM"`` columns.
        lotrunnums: iterable of lot ids to look up.

    Returns:
        dict mapping each lot id to the list of its sensor values.
    """

    from joblib import Parallel, delayed

    def extract(frame, lot):
        values = frame["SENSOR_VALUE"][frame["LOT_RUNNUM"] == lot]
        return lot, values.to_list()

    t0 = time.time()

    runner = Parallel(n_jobs=4)
    pairs = runner(delayed(extract)(data_train, lot) for lot in lotrunnums)
    data = dict(pairs)

    elapsed = time.time() - t0

    print('{:.4f} s'.format(elapsed))

    return data

# --- main ---

# Run every variant on the same sample data, printing a banner, the timing
# line emitted by the variant itself, and the resulting dict.
for banner, variant in (
    ('--- normal 1 ---', get_data_to_dict_1),
    ('--- normal 2 ---', get_data_to_dict_2),
    ('--- threadpool ---', get_data_to_dict_threadpool),
    ('--- joblib - thread ---', get_data_to_dict_joblib),
    ('--- joblib - process ---', get_data_to_dict_joblib_process),
):
    print(banner)
    print(variant(data_train, lotrunnums))

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM