Python multicore CSV short program, advice/help needed

I'm a hobby coder who started with AHK, then some Java, and now I'm trying to learn Python. I have searched and found some tips, but I have not yet been able to implement them in my own code. Hopefully someone here can help me; it's a very short program. I'm using a .txt CSV database with ";" as the separator. DATABASE EXAMPLE:

Which color is normally a cat?;Black

How tall was the longest man on earth?;272 cm

Is the earth round?;Yes

The database now consists of 20,000 lines, which makes the program too slow; it only uses 25% CPU (1 core).

If I can make it use all 4 cores (100%) I guess it would perform the task a lot faster. The task is basically to compare the CLIPBOARD with the database and, if there is a match, return an answer. Perhaps I can also separate the database into 4 pieces?

The code right now looks like this! Not more than 65 lines, and it does its job (but too slowly). Advice on how I can make this process multi-core is needed.

    import time
    import pyperclip as pp
    import pandas as pd
    import pymsgbox as pmb
    from fuzzywuzzy import fuzz
    import numpy


    ratio_threshold = 90
    fall_back_time = 1
    db_file_path = 'database.txt'
    db_separator = ';'
    db_encoding = 'latin-1'

    def load_db():
        while True:
            try:
                # Read and create database
                db = pd.read_csv(db_file_path, sep=db_separator, encoding=db_encoding)
                db = db.drop_duplicates()
                return db
            except:
                print("Error in load_db(). Will sleep for %i seconds..." % fall_back_time)
                time.sleep(fall_back_time)


    def top_answers(db, question):
        db['ratio'] = db['question'].apply(lambda q: fuzz.ratio(q, question))
        db_sorted = db.sort_values(by='ratio', ascending=False)
        db_sorted = db_sorted[db_sorted['ratio'] >= ratio_threshold]
        return db_sorted


    def write_txt(top):
        result = top.apply(lambda row: "%s" % (row['answer']), axis=1).tolist()
        result = '\n'.join(result)
        fileHandle = open("svar.txt", "w")
        fileHandle.write(result)
        fileHandle.close()
        pp.copy("")


    def main():
        try:
            db = load_db()
            last_db_reload = time.time()

            while True:
                # Get contents of clipboard
                question = pp.paste()

                # Rank answer
                top = top_answers(db, question)

                # If answer was found, show results
                if len(top) > 0:
                    write_txt(top)
                time.sleep(fall_back_time)
        except:
            print("Error in main(). Will sleep for %i seconds..." % fall_back_time)
            time.sleep(fall_back_time)


    if __name__ == '__main__':
        main()
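One thing worth checking before parallelizing: fuzzywuzzy falls back to the pure-Python difflib.SequenceMatcher unless the python-Levenshtein C extension is installed, and that fallback is much slower, so installing it is often the cheapest speedup. A minimal sketch to time the hot call (the strings are just samples; the 20000-call count is arbitrary):

# Time the hot call; if python-Levenshtein is missing, fuzzywuzzy
# warns "Using slow pure-python SequenceMatcher" and runs much slower.
import timeit
from fuzzywuzzy import fuzz

t = timeit.timeit(
    lambda: fuzz.ratio("Which color is normally a cat?",
                       "Which colour is a cat, normally?"),
    number=20000)
print("20000 ratio calls took %.2f s" % t)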

If you could divide the db into four equally large parts, you could process them in parallel like this:

import time
import pyperclip as pp
import pandas as pd
import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy
import threading

ratio_threshold = 90
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'


def worker(thread_id, question):
    thread_id = str(thread_id)
    db = pd.read_csv(db_file_path + thread_id, sep=db_separator, encoding=db_encoding)
    db = db.drop_duplicates()
    db['ratio'] = db['question'].apply(lambda q: fuzz.ratio(q, question))
    db_sorted = db.sort_values(by='ratio', ascending=False)
    db_sorted = db_sorted[db_sorted['ratio'] >= ratio_threshold]
    top = db_sorted
    result = top.apply(lambda row: "%s" % (row['answer']), axis=1).tolist()
    result = '\n'.join(result)
    fileHandle = open("svar" + thread_id + ".txt", "w")
    fileHandle.write(result)
    fileHandle.close()
    pp.copy("")
    return


def main():
    question = pp.paste()
    threads = []
    # four chunk files: database.txt1 .. database.txt4
    for i in range(1, 5):
        t = threading.Thread(target=worker, args=(i, question))
        t.start()
        threads.append(t)
    # join only after all threads have started, otherwise they run one at a time
    for t in threads:
        t.join()


if __name__ == '__main__':
    main()
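Note that worker() above reads pre-split files named database.txt1 through database.txt4, which you have to create first. A minimal sketch to produce them, assuming the same pandas/numpy stack and that database.txt carries a question;answer header row:

import numpy as np
import pandas as pd

db = pd.read_csv('database.txt', sep=';', encoding='latin-1')
# split into four roughly equal row chunks; the header is rewritten into
# each file so worker() can still address the 'question'/'answer' columns
for i, chunk in enumerate(np.array_split(db, 4), start=1):
    chunk.to_csv('database.txt%d' % i, sep=';', index=False, encoding='latin-1')

Also keep in mind that CPython threads share the GIL: the CPU-bound fuzz.ratio loop will still mostly run on one core at a time, so threads mainly overlap file I/O here. The multiprocessing solution below is what actually spreads the scoring across cores.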

The solution with multiprocessing:

import time
import pyperclip as pp
import pandas as pd
#import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy as np

# pathos uses a better pickle (dill) to transfer more complicated objects
from pathos.multiprocessing import Pool

import sys
import os
from contextlib import closing

ratio_threshold = 70
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'

chunked_db = []
NUM_PROCESSES = os.cpu_count()

def load_db():
    while True:
        try:
            # Read and create database
            db = pd.read_csv(db_file_path, sep=db_separator, encoding=db_encoding)
            db.columns = ['question', 'answer']
            #db = db.drop_duplicates()  # dropped for this experiment
            break
        except:
            print("Error in load_db(). Will sleep for %i seconds..." % fall_back_time)
            time.sleep(fall_back_time)
    # split the database into equal chunks
    # (fine if you have a lot of RAM; otherwise you
    # need to compute ranges in db, something like
    # chunk_size = len(db)//NUM_PROCESSES
    # ranges[i] = (i*chunk_size, (i+1)*chunk_size)
    # and pass ranges into the original db to the processes)
    chunked_db = np.array_split(db, NUM_PROCESSES)
    return chunked_db




def top_answers_multiprocessed(question, chunked_db):

    # on unix, python uses 'fork' mode by default
    # so the process has 'copy-on-change' access to all global variables
    # i.e. if process will change something in db, it will be copied to it
    # with a lot of overhead
    # Unfortunately, I've heard that on Windows only 'spawn' mode with full
    # copy of everything is used

    # Process pipeline uses pickle, it's quite slow.
    # so on small database you may not have benefit from multiprocessing
    # If you are going to transfer big objects in or out, look
    # in the direction of multiprocessing.Array

    # this solution is not fully efficient,
    # as pool is recreated each time

    # You can create daemon processes which will monitor
    # Queue for incoming questions, but it's harder to implement
    def top_answers(idx):
        # question is in the scope of parent function, 
        chunked_db[idx]['ratio'] = chunked_db[idx]['question'].apply(lambda q: fuzz.ratio(q, question))
        db_sorted = chunked_db[idx].sort_values(by='ratio', ascending=False)
        db_sorted = db_sorted[db_sorted['ratio'] >= ratio_threshold]
        return db_sorted



    with closing(Pool(processes=NUM_PROCESSES)) as pool:
        # chunked_db is a list of databases living in global scope;
        # we send only the index, because otherwise the whole data
        # set would be pickled for every call
        num_chunks = len(chunked_db)
        # apply function top_answers across generator range(num_chunks)
        res = pool.imap_unordered(top_answers, range(num_chunks))
        res = list(res) 
        # now res is a list of dataframes; concatenate them into one
        res_final = pd.concat(res)
    return res_final





def write_txt(top):
    result = top.apply(lambda row: "%s" % (row['answer']), axis=1).tolist()
    result = '\n'.join(result)
    fileHandle = open("svar.txt", "w")
    fileHandle.write(result)
    fileHandle.close()
    pp.copy("")


def mainfunc():
    global chunked_db
    chunked_db = load_db()
    last_db_reload = time.time()
    print('db loaded')

    last_clip = ""
    while True:
        # Get contents of clipboard
        try:
            new_clip = pp.paste()
        except:
            continue

        if (new_clip != last_clip) and (len(new_clip) > 0):
            print(new_clip)
            last_clip = new_clip
            question = new_clip.strip()
        else:
            continue

        # Rank answers
        top = top_answers_multiprocessed(question, chunked_db)

        # If an answer was found, show results
        if len(top) > 0:
            #write_txt(top)
            print(top)

if __name__ == '__main__':
    mainfunc()
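One refinement hinted at in the comments above: the Pool is recreated for every question, which adds startup cost each time. Below is a minimal sketch of keeping one long-lived pool instead, under the same assumptions as the solution above (pathos can pickle the closure, and on fork-based platforms the workers inherit chunked_db created before the pool). make_scorer and mainfunc_pooled are hypothetical names, not part of the original:

# Sketch: one long-lived pool, reused for every clipboard question.
# Reuses load_db(), chunked_db, ratio_threshold and NUM_PROCESSES
# from the solution above.
from contextlib import closing
from fuzzywuzzy import fuzz
from pathos.multiprocessing import Pool
import pandas as pd
import pyperclip as pp

def make_scorer(question):
    # the returned closure is pickled by pathos (via dill) and shipped
    # to the workers; only the question string and a chunk index travel
    def score_chunk(idx):
        chunk = chunked_db[idx].copy()
        chunk['ratio'] = chunk['question'].apply(lambda q: fuzz.ratio(q, question))
        return chunk[chunk['ratio'] >= ratio_threshold]
    return score_chunk

def mainfunc_pooled():
    global chunked_db
    chunked_db = load_db()
    # create the pool once, after chunked_db exists, so forked workers see it
    with closing(Pool(processes=NUM_PROCESSES)) as pool:
        last_clip = ""
        while True:
            try:
                new_clip = pp.paste()
            except:
                continue
            if (new_clip != last_clip) and (len(new_clip) > 0):
                last_clip = new_clip
                scorer = make_scorer(new_clip.strip())
                res = pool.imap_unordered(scorer, range(len(chunked_db)))
                top = pd.concat(list(res)).sort_values(by='ratio', ascending=False)
                if len(top) > 0:
                    print(top)

Whether this beats the single-process version still depends on the database size; for only 20,000 rows, pickling the result frames back to the parent can eat much of the gain, as the comments above warn.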
