How to use vectorize or apply instead of iterrows on a pandas DataFrame in Python
I have 2000+ dataframes, each with two columns. I want to generate ngrams from one of the columns and then create a new dataframe with the ngrams. Here is my code. It works fine; it just takes a lot of time.

I am currently using iterrows to iterate through each row of each dataframe in each file. Is there an easier way to do this using vectorization or apply?
import logging
import os
from os import listdir
from os.path import isfile, join
import math
import pickle
import itertools
import multiprocessing
import psutil
import numpy as np
import pandas as pd
import time

def create_combinations(file):
    initial_path = './to_process/'
    final_path = './processed/'
    custom = pd.read_pickle(initial_path + file, compression='gzip')
    custom = custom.drop_duplicates(subset=['category', 'element'])
    # split camelCase: insert a space at each lower-to-upper transition
    custom['element'] = custom['element'].str.replace(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ')
    total_rows = len(custom.index)
    logging.warning('Processing element: ' + file + ' Number of rows to combine: ' + str(total_rows))
    # if total_rows > cores:
    #     partitions = math.floor(total_rows/cores)
    #     logging.warning('Number of partitions : ' + str(partitions))
    if total_rows > 0:
        combined_df = pd.DataFrame(columns=['category', 'element'])
        logging.warning('creating combinations')
        for key, data in custom.iterrows():
            words = data['element']  # .split()
            logging.warning(words)
            # turn every space into a %s placeholder for the product below
            words2 = words.replace('%', '%%').replace(' ', '%s')
            logging.warning('Number of words to combine: ' + str(len(words.split())))
            k = 0
            df1 = pd.DataFrame(columns=['category', 'element'])
            # every combination of keeping or dropping each space
            for i in itertools.product((' ', ''), repeat=words.count(' ')):
                df1.loc[k, 'element'] = words2 % i
                df1.loc[k, 'category'] = data['category']
                k += 1
            combined_df = pd.concat([combined_df, df1], axis=0)
            del df1
        combined_df.to_pickle(final_path + file, compression='gzip')
        combined_df.to_csv(final_path + os.path.splitext(file)[0] + '.csv')
        del combined_df
        del custom
        # partitions = 1
        logging.warning('completed ' + file)
    else:
        logging.warning('No rows to process')

if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    partitions = 1  # number of partitions to split dataframe
    cores = 2  # number of cores on your machine
    path = './to_process/'
    combi_path = './processed/'
    files = [f for f in listdir(path) if isfile(join(path, f))]
    pickle_files = []
    for any_file in files:
        if any_file.endswith('.pickle'):
            if os.path.isfile(combi_path + any_file):
                logging.warning(any_file + ' already processed.')
            else:
                pickle_files.append(any_file)
    p = multiprocessing.Pool(processes=len(pickle_files))
    start = time.time()
    async_result = p.map_async(create_combinations, pickle_files)
    p.close()
    p.join()
    print("Complete")
    end = time.time()
    print('total time (s) = ' + str(end - start))
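I suspect the bottleneck is growing DataFrames cell by cell with .loc and re-concatenating on every row, rather than the iteration itself. Here is a rough sketch of what I think an iterrows-free version of the inner work could look like; the helper names spacing_variants and combine are just placeholders I made up:

import itertools
import pandas as pd

def spacing_variants(text):
    # every way of keeping or dropping each space in the string
    template = text.replace('%', '%%').replace(' ', '%s')
    return [template % seps
            for seps in itertools.product((' ', ''), repeat=text.count(' '))]

def combine(custom):
    # build plain tuples first, create the DataFrame once at the end
    records = [(row.category, variant)
               for row in custom.itertuples(index=False)
               for variant in spacing_variants(row.element)]
    return pd.DataFrame(records, columns=['category', 'element'])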
Update: here is a revised version that appends to plain Python lists and writes the combinations out to disk in chunks of 200,000 rows instead of growing one DataFrame row by row:
# pylint: disable=I0011
# pylint: disable=C0111
# pylint: disable=C0301
# pylint: disable=C0103
# pylint: disable=W0612
# pylint: disable=W0611
import logging
import os
from os import listdir
from os.path import isfile, join
import math
import pickle
import itertools
import multiprocessing
import time
import gc
import numpy as np
import pandas as pd

def create_combinations(file):
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    initial_path = './training/pickles/standard and documentation/custom_elements/trial/'
    final_path = './training/pickles/standard and documentation/custom_element_combination_trial/'
    completed_file_path = './training/pickles/standard and documentation/custom_elements_processed_trial/'
    custom = pd.read_pickle(initial_path + file, compression='gzip')
    custom = custom.drop_duplicates(subset=['category', 'element'])
    # split camelCase: insert a space at each lower-to-upper transition
    custom['element'] = custom['element'].str.replace(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ')
    total_rows = len(custom.index)
    logging.warning('Processing element: ' + file + ' Number of rows to combine: ' + str(total_rows))
    cat = []
    ele = []
    logging.warning('creating combinations')
    k = 1
    for key, data in custom.iterrows():
        words = data['element']  # .split()
        logging.warning(words)
        # turn every space into a %s placeholder for the product below
        words2 = words.replace('%', '%%').replace(' ', '%s')
        logging.warning('Number of words to combine: ' + str(len(words.split())))
        for i in itertools.product((' ', ''), repeat=words.count(' ')):
            ele.append(words2 % i)
            cat.append(data['category'])
        # flush a chunk to disk once it grows past 200,000 rows
        # (len() on a zip object fails in Python 3, so check the list instead)
        if len(ele) > 200000:
            combined_df = pd.DataFrame.from_records(zip(cat, ele), columns=['category', 'element'])
            cat = []  # reset the buffers for the next chunk
            ele = []
            combined_df.to_pickle(final_path + os.path.splitext(file)[0] + str(k) + '.pickle', compression='gzip')
            combined_df.to_csv(final_path + os.path.splitext(file)[0] + str(k) + '.csv')
            del combined_df
            gc.collect()
            k += 1
    # write out whatever is left after the last full chunk
    combined_df = pd.DataFrame.from_records(zip(cat, ele), columns=['category', 'element'])
    del cat
    del ele
    combined_df.to_pickle(final_path + os.path.splitext(file)[0] + str(k) + '.pickle', compression='gzip')
    combined_df.to_csv(final_path + os.path.splitext(file)[0] + str(k) + '.csv')
    del combined_df
    gc.collect()
    del custom
    logging.warning('completed ' + file)
    os.rename(initial_path + file, completed_file_path + file)
    os.rename(initial_path + os.path.splitext(file)[0] + '.csv', completed_file_path + os.path.splitext(file)[0] + '.csv')
    return True

if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    partitions = 1  # number of partitions to split dataframe
    cores = 6  # number of cores on your machine
    path = './training/pickles/standard and documentation/custom_elements/trial/'
    combi_path = './training/pickles/standard and documentation/custom_element_combination_trial/'
    processed_file_path = './training/pickles/standard and documentation/custom_elements_processed_trial/'
    files = [f for f in listdir(path) if isfile(join(path, f))]
    pickle_files = []
    for any_file in files:
        if any_file.endswith('.pickle'):
            if os.path.isfile(combi_path + any_file):
                os.rename(path + any_file, processed_file_path + any_file)
                os.rename(path + os.path.splitext(any_file)[0] + '.csv', processed_file_path + os.path.splitext(any_file)[0] + '.csv')
                logging.warning(any_file + ' already processed.')
            else:
                df = pd.read_pickle(path + any_file, compression='gzip')
                rows = len(df.index)
                if rows > 0:
                    # if rows < 500:
                    pickle_files.append(any_file)
                    # else:
                    #     continue
                else:
                    os.rename(path + any_file, processed_file_path + any_file)
                    os.rename(path + os.path.splitext(any_file)[0] + '.csv', processed_file_path + os.path.splitext(any_file)[0] + '.csv')
                del df
                gc.collect()
                del rows
                gc.collect()
    ctx = multiprocessing.get_context('spawn')
    p = ctx.Pool(processes=cores, maxtasksperchild=1000)
    start = time.time()
    async_result = p.map_async(create_combinations, pickle_files)
    p.close()
    p.join()
    print("Complete")
    end = time.time()
    print('total time (s) = ' + str(end - start))
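I have also been wondering whether apply plus explode would express this more directly. The expansion cannot be fully vectorized, since each row yields a variable number of output rows, but something like the following seems possible (again using the made-up spacing_variants helper from the sketch above; explode needs pandas 0.25+):

# each element becomes a list of its spacing variants ...
custom['element'] = custom['element'].apply(spacing_variants)
# ... and explode turns each list item into its own row (pandas >= 0.25)
combined_df = custom.explode('element').reset_index(drop=True)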