How to use vectorize or apply instead of iterrows on a pandas DataFrame in Python
I have 2000+ dataframes, each with two columns. I want to generate ngrams from one of the columns and then create a new dataframe with the ngrams. Here is my code. It works fine; it just takes a lot of time.

I am currently using iterrows to iterate through each row of each dataframe in each file. Is there an easier way to do this using vectorization or apply?
import logging
import os
from os import listdir
from os.path import isfile, join
import math
import pickle
import itertools
import multiprocessing
import psutil
import numpy as np
import pandas as pd
import time

def create_combinations(file):
    initial_path = './to_process/'
    final_path = './processed/'
    custom = pd.read_pickle(initial_path + file, compression='gzip')
    custom = custom.drop_duplicates(subset=['category', 'element'])
    # split camelCase: insert a space at each lower-to-upper transition
    custom['element'] = custom['element'].str.replace(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ')
    total_rows = len(custom.index)
    logging.warning('Processing element: ' + file + ' Number of rows to combine: ' + str(total_rows))
    # if total_rows > cores:
    #     partitions = math.floor(total_rows/cores)
    #     logging.warning('Number of partitions : ' + str(partitions))
    if total_rows > 0:
        combined_df = pd.DataFrame(columns=['category', 'element'])
        logging.warning('creating combinations')
        for key, data in custom.iterrows():
            words = data['element']  # .split()
            logging.warning(words)
            # turn every space into a %s placeholder for the product below
            words2 = words.replace('%', '%%').replace(' ', '%s')
            logging.warning('Number of words to combine: ' + str(len(words.split())))
            k = 0
            df1 = pd.DataFrame(columns=['category', 'element'])
            # every combination of keeping or dropping each space
            for i in itertools.product((' ', ''), repeat=words.count(' ')):
                df1.loc[k, 'element'] = words2 % i
                df1.loc[k, 'category'] = data['category']
                k += 1
            combined_df = pd.concat([combined_df, df1], axis=0)
            del df1
        combined_df.to_pickle(final_path + file, compression='gzip')
        combined_df.to_csv(final_path + os.path.splitext(file)[0] + '.csv')
        del combined_df
        del custom
        # partitions = 1
        logging.warning('completed ' + file)
    else:
        logging.warning('No rows to process')

if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    partitions = 1  # number of partitions to split dataframe
    cores = 2  # number of cores on your machine
    path = './to_process/'
    combi_path = './processed/'
    files = [f for f in listdir(path) if isfile(join(path, f))]
    pickle_files = []
    for any_file in files:
        if any_file.endswith('.pickle'):
            if os.path.isfile(combi_path + any_file):
                logging.warning(any_file + ' already processed.')
            else:
                pickle_files.append(any_file)
    p = multiprocessing.Pool(processes=len(pickle_files))
    start = time.time()
    async_result = p.map_async(create_combinations, pickle_files)
    p.close()
    p.join()
    print("Complete")
    end = time.time()
    print('total time (s) = ' + str(end - start))
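I suspect the bottleneck is growing DataFrames cell by cell with .loc and re-concatenating on every row, rather than the iteration itself. Here is a rough sketch of what I think an iterrows-free version of the inner work could look like; the helper names spacing_variants and combine are just placeholders I made up:

import itertools
import pandas as pd

def spacing_variants(text):
    # every way of keeping or dropping each space in the string
    template = text.replace('%', '%%').replace(' ', '%s')
    return [template % seps
            for seps in itertools.product((' ', ''), repeat=text.count(' '))]

def combine(custom):
    # build plain tuples first, create the DataFrame once at the end
    records = [(row.category, variant)
               for row in custom.itertuples(index=False)
               for variant in spacing_variants(row.element)]
    return pd.DataFrame(records, columns=['category', 'element'])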
Update: here is a revised version that appends to plain Python lists and writes the combinations out to disk in chunks of 200,000 rows instead of growing one DataFrame row by row:
# pylint: disable=I0011
# pylint: disable=C0111
# pylint: disable=C0301
# pylint: disable=C0103
# pylint: disable=W0612
# pylint: disable=W0611
import logging
import os
from os import listdir
from os.path import isfile, join
import math
import pickle
import itertools
import multiprocessing
import time
import gc
import numpy as np
import pandas as pd

def create_combinations(file):
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    initial_path = './training/pickles/standard and documentation/custom_elements/trial/'
    final_path = './training/pickles/standard and documentation/custom_element_combination_trial/'
    completed_file_path = './training/pickles/standard and documentation/custom_elements_processed_trial/'
    custom = pd.read_pickle(initial_path + file, compression='gzip')
    custom = custom.drop_duplicates(subset=['category', 'element'])
    # split camelCase: insert a space at each lower-to-upper transition
    custom['element'] = custom['element'].str.replace(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ')
    total_rows = len(custom.index)
    logging.warning('Processing element: ' + file + ' Number of rows to combine: ' + str(total_rows))
    cat = []
    ele = []
    logging.warning('creating combinations')
    k = 1
    for key, data in custom.iterrows():
        words = data['element']  # .split()
        logging.warning(words)
        # turn every space into a %s placeholder for the product below
        words2 = words.replace('%', '%%').replace(' ', '%s')
        logging.warning('Number of words to combine: ' + str(len(words.split())))
        for i in itertools.product((' ', ''), repeat=words.count(' ')):
            ele.append(words2 % i)
            cat.append(data['category'])
        # flush a chunk to disk once it grows past 200,000 rows
        # (len() on a zip object fails in Python 3, so check the list instead)
        if len(ele) > 200000:
            combined_df = pd.DataFrame.from_records(zip(cat, ele), columns=['category', 'element'])
            cat = []  # reset the buffers for the next chunk
            ele = []
            combined_df.to_pickle(final_path + os.path.splitext(file)[0] + str(k) + '.pickle', compression='gzip')
            combined_df.to_csv(final_path + os.path.splitext(file)[0] + str(k) + '.csv')
            del combined_df
            gc.collect()
            k += 1
    # write out whatever is left after the last full chunk
    combined_df = pd.DataFrame.from_records(zip(cat, ele), columns=['category', 'element'])
    del cat
    del ele
    combined_df.to_pickle(final_path + os.path.splitext(file)[0] + str(k) + '.pickle', compression='gzip')
    combined_df.to_csv(final_path + os.path.splitext(file)[0] + str(k) + '.csv')
    del combined_df
    gc.collect()
    del custom
    logging.warning('completed ' + file)
    os.rename(initial_path + file, completed_file_path + file)
    os.rename(initial_path + os.path.splitext(file)[0] + '.csv', completed_file_path + os.path.splitext(file)[0] + '.csv')
    return True

if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    partitions = 1  # number of partitions to split dataframe
    cores = 6  # number of cores on your machine
    path = './training/pickles/standard and documentation/custom_elements/trial/'
    combi_path = './training/pickles/standard and documentation/custom_element_combination_trial/'
    processed_file_path = './training/pickles/standard and documentation/custom_elements_processed_trial/'
    files = [f for f in listdir(path) if isfile(join(path, f))]
    pickle_files = []
    for any_file in files:
        if any_file.endswith('.pickle'):
            if os.path.isfile(combi_path + any_file):
                os.rename(path + any_file, processed_file_path + any_file)
                os.rename(path + os.path.splitext(any_file)[0] + '.csv', processed_file_path + os.path.splitext(any_file)[0] + '.csv')
                logging.warning(any_file + ' already processed.')
            else:
                df = pd.read_pickle(path + any_file, compression='gzip')
                rows = len(df.index)
                if rows > 0:
                    # if rows < 500:
                    pickle_files.append(any_file)
                    # else:
                    #     continue
                else:
                    os.rename(path + any_file, processed_file_path + any_file)
                    os.rename(path + os.path.splitext(any_file)[0] + '.csv', processed_file_path + os.path.splitext(any_file)[0] + '.csv')
                del df
                gc.collect()
                del rows
                gc.collect()
    ctx = multiprocessing.get_context('spawn')
    p = ctx.Pool(processes=cores, maxtasksperchild=1000)
    start = time.time()
    async_result = p.map_async(create_combinations, pickle_files)
    p.close()
    p.join()
    print("Complete")
    end = time.time()
    print('total time (s) = ' + str(end - start))
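I have also been wondering whether apply plus explode would express this more directly. The expansion cannot be fully vectorized, since each row yields a variable number of output rows, but something like the following seems possible (again using the made-up spacing_variants helper from the sketch above; explode needs pandas 0.25+):

# each element becomes a list of its spacing variants ...
custom['element'] = custom['element'].apply(spacing_variants)
# ... and explode turns each list item into its own row (pandas >= 0.25)
combined_df = custom.explode('element').reset_index(drop=True)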