
Python multiprocessing output result

Given a list of data to process and a 64-core CPU (plus 500 GB RAM). The list should sort strings and store data in a result set of millions of records, which runs just fine and takes a few seconds with multiprocessing. But I also need to store the result somehow, either as txt or csv output or in a database. So far I haven't found a viable solution, because after the first part (process), the insert method either gives an error when trying it with MySQL pooling, or takes an insanely long time when producing the txt output. What I've tried so far: simple txt output, printing to a txt file, and using the csv, pandas and numpy libs. Nothing seems to speed it up. Any help would be greatly appreciated. My code right now:

import os
import sys
import re
import datetime
import time
import csv

import mysql.connector as connector
from mysql.connector.pooling import MySQLConnectionPool

import mysql

import numpy as np
from tqdm import tqdm
from time import sleep
import multiprocessing as mp

import numpy

pool = MySQLConnectionPool( pool_name="sql_pool",
                            pool_size=32,
                            pool_reset_session=True,
                            host="localhost",
                            port="3306",
                            user="homestead",
                            password="secret",
                            database="homestead")

# # sql connection
db = mysql.connector.connect(
  host="localhost",
  port="3306",
  user="homestead",
  password="secret",
  database="homestead"
)

sql_cursor = db.cursor()
delete_statement = "DELETE FROM statistics"
sql_cursor.execute(delete_statement)

db.commit()

sql_statement = "INSERT INTO statistics (name, cnt) VALUES (%s, %s)"

list = []
domains = mp.Manager().list()
unique_list = mp.Manager().list()
invalid_emails = mp.Manager().list()
result = mp.Manager().list()
regex_email = r'^(\w|\.|\_|\-)+[@](\w|\_|\-|\.)+[.]\w{2,3}$'

# check email validity
def check(list, email):
    if(re.search(regex_email, email)):
        domains.append(email.lower().split('@')[1])
        return True
    else:
        invalid_emails.append(email)
        return False
#end of check email validity

# execution time converter
def convertTime(seconds):
    seconds = seconds % (24 * 3600)
    hour = seconds // 3600
    seconds %= 3600
    minutes = seconds // 60
    seconds %= 60

    if(hour == 0):
        if(minutes == 0):
            return "{0} sec".format(seconds)
        else:
            return "{0}min {1}sec".format(minutes, seconds)
    else:
        return "{0}hr {1}min {2}sec".format(hour, minutes, seconds)
# execution time converter end

#process
def process(list):
    for item in tqdm(list):
        if(check(list, item)):
            item = item.lower().split('@')[1]
            if item not in unique_list:
                unique_list.append(item)
# end of process

def insert(list):
    global sql_statement

    # Add to db
    con = pool.get_connection()
    cur = con.cursor()

    print("PID %d: using connection %s" % (os.getpid(), con))
    #cur.executemany(sql_statement, sorted(map(set_result, list)))
    for item in list:

        cur.execute(sql_statement, (item, domains.count(item)))
    con.commit()
    cur.close()
    con.close()

# def insert_into_database(list):
    #sql_cursor.execute(sql_statement, (unique_list, 1), multi=True)

    # sql_cursor.executemany(sql_statement, sorted(map(set_result, list)))
    # db.commit()

# statistics
def statistics(list):
    for item in tqdm(list):
        if(domains.count(item) > 0):
            result.append([domains.count(item), item])
# end of statistics

params = sys.argv
filename = ''
process_count = -1
for i, item in enumerate(params):
    if(item.endswith('.txt')):
        filename = item
    if(item == '--top'):
        process_count = int(params[i+1])


def set_result(item):
    return item, domains.count(item)

# main
if(filename):
    try:
        start_time = time.time()
        now = datetime.datetime.now()
        dirname = "email_stats_{0}".format(now.strftime("%Y%m%d_%H%M%S"))
        os.mkdir(dirname)

        list = open(filename).read().split()

        if(process_count == -1):
            process_count = len(list)

        if(process_count > 0):
            list = list[:process_count]

        #chunking list
        n = int(len(list) /  mp.cpu_count())
        chunks = [list[i:i + n] for i in range(0, len(list), n)]

        processes = []
        print('Processing list on {0} cores...'.format(mp.cpu_count()))
        for chunk in chunks:
            p = mp.Process(target=process, args=[chunk])
            p.start()
            processes.append(p)

        for p in processes:
            p.join()

        # insert(unique_list)

        ## step 2 - write sql

        ##  Clearing out db before new data insert
        con = pool.get_connection()
        cur = con.cursor()

        delete_statement = "DELETE FROM statistics"
        cur.execute(delete_statement)

        u_processes = []

        #Maximum pool size for sql is 32, so maximum chunk number should be that too.
        if(mp.cpu_count() < 32):
            n2 = int(len(unique_list) /  mp.cpu_count())
        else:
            n2 = int(len(unique_list) /  32)

        u_chunks = [unique_list[i:i + n2] for i in range(0, len(unique_list), n2)]
        for u_chunk in u_chunks:
            p = mp.Process(target=insert, args=[u_chunk])
            p.start()
            u_processes.append(p)

        for p in u_processes:
            p.join()

        for p in u_processes:
            p.close()

        # sql_cursor.executemany(sql_statement, sorted(map(set_result, unique_list)))
        # db.commit()
        # for item in tqdm(unique_list):
        #     sql_val = (item, domains.count(item))
        #     sql_cursor.execute(sql_statement, sql_val)
        #
        #     db.commit()

        ## numpy.savetxt('saved.txt', sorted(map(set_result, unique_list)), fmt='%s')


        # with(mp.Pool(mp.cpu_count(), initializer = db) as Pool:
        #     Pool.map_async(insert_into_database(),set(unique_list))
        #     Pool.close()
        #     Pool.join()

        print('Creating statistics for {0} individual domains...'.format(len(unique_list)))

        # unique_list = set(unique_list)
        # with open("{0}/result.txt".format(dirname), "w+") as f:
        #     csv.writer(f).writerows(sorted(map(set_result, unique_list), reverse=True))

        print('Writing final statistics...')
        print('OK.')
        f = open("{0}/stat.txt".format(dirname),"w+")
        f.write("Number of processed emails: {0}\r\n".format(process_count))
        f.write("Number of valid emails: {0}\r\n".format(len(list) - len(invalid_emails)))
        f.write("Number of invalid emails: {0}\r\n".format(len(invalid_emails)))
        f.write("Execution time: {0}".format(convertTime(int(time.time() - start_time))))
        f.close()

    except FileNotFoundError:
        print('File not found, path or file broken.')
else:
    print('Wrong file format, should be a txt file.')
# main

See my comments regarding some changes you might wish to make, one of which might improve performance. But I think one area where performance could really be improved is in your use of managed lists. These are represented by proxies, and each operation on such a list is essentially a remote procedure call and thus very slow. You cannot avoid this given that you need multiple processes updating a common, shared list (or dict, if you take my suggestion). But in the main process you might be trying, for example, to construct a set from a shared list as follows:

Pool.map_async(insert_into_database(),set(unique_list))

(by the way, that should be Pool.map(insert_into_database, set(unique_list)), i.e. you have an extra set of (), and you can then get rid of the calls to pool.close() and pool.join() if you wish)
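
A minimal sketch of that corrected call, assuming insert_into_database takes a single domain string and manages its own database connection (an assumption, since that function is only commented out in the question), and reusing the mp alias and unique_list from the script above:

# Sketch only: pass the function object itself (no parentheses) to map().
# Unlike map_async(), map() blocks until all items are processed, so
# explicit close()/join() calls are not required afterwards.
with mp.Pool(mp.cpu_count()) as worker_pool:
    worker_pool.map(insert_into_database, set(unique_list))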

The problem is that you are iterating every element of unique_list through a proxy, which might be what is taking a very long time. I say "might" because I would think the use of managed lists would prevent the code as is, i.e. without outputting the results, from completing in "a few seconds" if we are talking about "millions" of records and thus millions of remote procedure calls. But this number could certainly be reduced if you could somehow get the underlying list as a native list.

First, you need to heed my comment about having declared a variable named list, which makes it impossible to create native lists or subclasses of list. Once you have renamed that variable to something more reasonable, we can create our own managed class MyList that will expose the underlying list on which it is built. Note that you can do the same thing with a MyDict class that subclasses dict. I have defined both classes for you. Here is a benchmark showing the difference between constructing a native list from a managed list versus creating a native list from a MyList:

import multiprocessing as mp
from multiprocessing.managers import BaseManager
import time

class MyManager(BaseManager):
    pass

class MyList(list):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def get_underlying_list(self):
        return self

class MyDict(dict):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def get_underlying_dict(self):
        return self

# required for windows, which I am running on:
if __name__ == '__main__':
    l = mp.Manager().list()
    for i in range(100_000):
        l.append(i)
    t = time.time()
    l2 = list(l)
    print(time.time() - t, l2[0:5], l2[-5:])


    MyManager.register('MyList', MyList)
    MyManager.register('MyDict', MyDict)
    my_manager = MyManager()
    # must explicitly start the manager or use: with MyManager() as manager:
    my_manager.start()
    l = my_manager.MyList()
    for i in range(100_000):
        l.append(i)
    t = time.time()
    l2 = list(l.get_underlying_list())
    print(time.time() - t, l2[0:5], l2[-5:])

Prints:

7.3949973583221436 [0, 1, 2, 3, 4] [99995, 99996, 99997, 99998, 99999]
0.007997751235961914 [0, 1, 2, 3, 4] [99995, 99996, 99997, 99998, 99999]
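
As a rough sketch of how this could be wired into the original script (assuming the variable named list has been renamed so the list builtin is usable again, and reusing pool and sql_statement from the question; the Counter/executemany step is my own substitution for the repeated domains.count() calls, not something from the original code):

from collections import Counter

# Create the shared lists from the custom manager instead of mp.Manager()
MyManager.register('MyList', MyList)
my_manager = MyManager()
my_manager.start()

domains = my_manager.MyList()
unique_list = my_manager.MyList()
invalid_emails = my_manager.MyList()

# ... start and join the worker processes exactly as before ...

# One remote call per list to pull everything back as a native list,
# instead of millions of per-element proxy operations.
native_domains = list(domains.get_underlying_list())
native_unique = list(unique_list.get_underlying_list())

# Count each domain once locally and bulk-insert the rows.
domain_counts = Counter(native_domains)
rows = [(name, domain_counts[name]) for name in native_unique]

con = pool.get_connection()
cur = con.cursor()
cur.executemany(sql_statement, rows)
con.commit()
cur.close()
con.close()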
