[英]How to efficiently interpolate data in a Pandas DataFrame row-wise?
[英]How to efficiently perform row-wise operations using pandas?
我想從一些csv文件中獲取一些基本的統計信息,而不將整個文件加載到內存中。我用兩種方式實現了這一點:一種是看似“聰明”的、使用 pandas 的方式,另一種是使用 csv 模塊的簡單方式。我原以為 pandas 的方式會更快,但實際上 csv 的方式更快。我想知道為什么。
這是我的代碼:
import pandas as pd
import csv
# Load the movie catalogue once; the ratings pass only needs its IDs.
movies = pd.read_csv('movies.csv')  # columns: movieId,title,genres
movie_count = movies.shape[0]  # 9742 in the author's data set
# Smallest / largest movie ID, read from the catalogue that was just loaded
# (no `ratings` frame exists at this point).
movieId_min = movies.movieId.min()
movieId_max = movies.movieId.max()
# Row index label -> movieId, in ascending movieId order.
movieId_disperse = movies.movieId.sort_values().to_dict()
# Inverse mapping: movieId -> row index label, used as a dense list index.
movieId_squeeze = {v: k for k, v in movieId_disperse.items()}
def get_ratings_stats():
    """Stream ``ratings.csv`` once and collect basic rating statistics.

    Uses the module-level ``movie_count``, ``movieId_squeeze`` and
    ``movieId_disperse`` to translate between movie IDs and dense indices.

    Returns:
        A tuple ``(rating_count, top_rator, top_rated)`` where
        ``rating_count`` is the number of data rows read, ``top_rator`` is
        ``(userId, number_of_ratings)`` for the first user to reach the
        highest count, and ``top_rated`` is ``(movieId, number_of_ratings)``
        for the first movie to reach the highest count.

    Raises:
        KeyError: if a row references a movieId missing from the catalogue.
    """
    # A dict (rather than a list indexed by userId - 1) keeps the counting
    # correct even when user IDs have gaps or rows are not grouped by user.
    ratings_per_user = {}
    ratings_per_movie = [0] * movie_count
    top_rator = (0, 0)  # (zero-based user index, rating count)
    top_rated = (0, 0)  # (dense movie index, rating count)
    rating_count = 0

    # newline='' is what the csv module expects for files it parses itself.
    with open('ratings.csv', newline='') as ratings_file:
        for row in csv.DictReader(ratings_file):
            user = int(row['userId']) - 1
            movie = movieId_squeeze[int(row['movieId'])]
            rating_count += 1

            user_total = ratings_per_user.get(user, 0) + 1
            ratings_per_user[user] = user_total
            ratings_per_movie[movie] += 1
            movie_total = ratings_per_movie[movie]

            # Strict '>' keeps the first user/movie that reached the maximum.
            if user_total > top_rator[1]:
                top_rator = (user, user_total)
            if movie_total > top_rated[1]:
                top_rated = (movie, movie_total)

    # Convert the internal indices back to the IDs used in the CSV files.
    top_rator = (top_rator[0] + 1, top_rator[1])
    top_rated = (movieId_disperse[top_rated[0]], top_rated[1])
    return rating_count, top_rator, top_rated
現在,如果我把下面這一行:
for row in csv.DictReader(open('ratings.csv')):
替換為:
for chunk in pd.read_csv('ratings.csv', chunksize=1000):
for _,row in chunk.iterrows():
代碼實際上變慢了10倍。
以下是時間結果:
> %timeit get_ratings_stats() # with csv
325 ms ± 9.98 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
> %timeit get_ratings_stats() # with pandas
3.45 s ± 67.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
關於如何使這些代碼更好/更快/更易讀的任何評論將非常感激
我認為重點在於:如果你打算把龐大而昂貴的數據結構當作dict來用,就不應該使用pandas。問題不應該是如何讓pandas變得更快,而應該是如何用pandas編寫代碼來完成你想做的事情。
import pandas as pd
def get_ratings_stats():
    """Load ``ratings.csv`` in one go and rank movies and users by rating.

    Returns:
        A tuple ``(row_count, top_movie, top_user)``. ``top_movie`` is a
        Series of each movie's highest rating, indexed by ``movieId`` and
        sorted from highest to lowest; ``top_user`` is the same per
        ``userId``.
    """
    ratings = pd.read_csv('ratings.csv')

    def best_rating_per(key):
        # Highest rating within each group of `key`, best first.
        pairs = ratings.loc[:, [key, 'rating']]
        best = pairs.groupby(key).agg('max')
        ranked = best.sort_values(by='rating', ascending=False)
        return ranked.iloc[:, 0]

    top_movie = best_rating_per('movieId')
    top_user = best_rating_per('userId')
    return ratings.shape[0], top_movie, top_user
def _merge_partial_maxima(partials):
    """Combine per-chunk maxima into one Series of overall maxima, best first."""
    if not partials:
        # No chunk was read at all: return an empty result instead of letting
        # pd.concat raise on an empty list.
        return pd.Series(name='rating', dtype='float64')
    merged = pd.concat(partials)
    # The same key can appear in several chunks, so reduce once more.
    return merged.groupby(level=0).max().sort_values(ascending=False)


def get_ratings_stats_slowly(chunksize=1000):
    """Rank movies and users by rating without loading the whole file.

    ``ratings.csv`` is read ``chunksize`` rows at a time; each chunk is
    reduced to per-movie and per-user maxima, and the partial results are
    merged at the end.

    Args:
        chunksize: Number of CSV rows to hold in memory at once.

    Returns:
        A tuple ``(row_count, top_movie, top_user)``. ``top_movie`` is a
        Series of each movie's highest rating indexed by ``movieId`` and
        sorted from highest to lowest; ``top_user`` is the same per
        ``userId``.
    """
    movie_partials = []
    user_partials = []
    data_size = 0
    for chunk in pd.read_csv('ratings.csv', chunksize=chunksize):
        movie_partials.append(chunk.groupby('movieId')['rating'].max())
        user_partials.append(chunk.groupby('userId')['rating'].max())
        data_size += chunk.shape[0]
    top_movie = _merge_partial_maxima(movie_partials)
    top_user = _merge_partial_maxima(user_partials)
    return data_size, top_movie, top_user
我不確定這是否就是你想要做的全部,但你的代碼難以理解——這應該是一個好的開始(如果你感興趣的是評級數量等,可以把 .agg('max') 換成 .count())。
我認為並行處理是您問題的答案。 我已嘗試對您的問題進行一些並行處理,但我不得不將評級文件拆分為多個文件進行處理。
我最初做的是將CSV文件中的評級數據復制10份,然后執行你的腳本以獲得初始執行時間,對我來說大約是 3.6 seconds。現在,將數據拆分為多個文件后,就可以交給多個子進程處理,例如用 -k 2(即2個工作進程)運行我的腳本,總執行時間減少到 1.87 seconds。如果使用 -k 4(4個工作進程),執行時間為 1.13 seconds。
我不確定能否以塊的形式讀取CSV,即對單個大文件做並行的定位(seek)讀取;如果可以,速度會更快,唯一的缺點是需要先統計大CSV文件的行數,才能確定每個worker要處理多少行。
分裂腳本:
import csv
# Writes ten copies of the ratings file (big_ratings_0.csv ... big_ratings_9.csv)
# so that they can be handed out to several worker processes.
file_path = "data/ratings.csv"
out_path = "data/big_ratings_{}.csv"
for i in range(10):
    print("Iteration #{}".format(i+1))
    # `with` closes both files even if a row fails to parse or write;
    # newline="" is what the csv module expects for files it reads/writes.
    with open(file_path, "r", newline="") as pin, \
            open(out_path.format(i), "w", newline="") as pout:
        in_csv = csv.DictReader(pin)
        out_csv = csv.DictWriter(pout, fieldnames=in_csv.fieldnames)
        out_csv.writeheader()
        out_csv.writerows(in_csv)
實際評級處理腳本
import time
import csv
import argparse
import os
import sys
from multiprocessing import Process, Queue, Value
import pandas as pd
# Queues handed to every worker process.
# NOTE(review): nothing in the visible code puts to or reads from these
# queues -- confirm how results are meant to be reported.
top_rator_queue = Queue()
top_rated_queue = Queue()
# Worker count used when -k is not given on the command line.
DEFAULT_NO_OF_WORKERS = 1
# Filename template of the input files; formatted with a file index.
RATINGS_FILE_PATH = "data/big_ratings_{}.csv"
# Number of input files that are divided among the workers.
NUMBER_OF_FILES = 10
class ProcessRatings(Process):
    """Worker process that scans a range of rating CSV files.

    For every file index in ``file_index_range`` the worker reads
    ``RATINGS_FILE_PATH.format(file_index)``, counts ratings per user and
    per movie, and tracks the user and movie with the most ratings.

    Args:
        file_index_range: Iterable of file indices this worker handles.
        top_rator_queue: Queue stored on the instance for reporting users.
        top_rated_queue: Queue stored on the instance for reporting movies.
        movie_id_squeeze: Mapping of movieId -> dense list index.
    """

    def __init__(self, file_index_range, top_rator_queue, top_rated_queue, movie_id_squeeze):
        super(ProcessRatings, self).__init__()
        self.file_index_range = file_index_range
        self.top_rator_queue = top_rator_queue
        self.top_rated_queue = top_rated_queue
        self.movie_id_squeeze = movie_id_squeeze

    def run(self):
        """Process every assigned file and print per-file timing."""
        # Size the per-movie counters from the mapping carried by this
        # instance. A global set under the parent's __main__ guard is not
        # defined in the child when processes are spawned rather than forked.
        movie_slots = max(self.movie_id_squeeze.values(), default=-1) + 1

        for file_index in self.file_index_range:
            print("[PID: {}] Processing file index {} .".format(os.getpid(), file_index))
            start = time.time()

            # A dict tolerates gaps in user IDs and rows not grouped by user.
            ratings_per_user = {}
            ratings_per_movie = [0] * movie_slots
            top_rator = (0, 0)  # (zero-based user index, rating count)
            top_rated = (0, 0)  # (dense movie index, rating count)

            with open(RATINGS_FILE_PATH.format(file_index), newline='') as ratings_file:
                for row in csv.DictReader(ratings_file):
                    user = int(row['userId']) - 1
                    movie = self.movie_id_squeeze[int(row['movieId'])]

                    user_total = ratings_per_user.get(user, 0) + 1
                    ratings_per_user[user] = user_total
                    ratings_per_movie[movie] += 1
                    movie_total = ratings_per_movie[movie]

                    # Strict '>' keeps the first user/movie to reach the maximum.
                    if user_total > top_rator[1]:
                        top_rator = (user, user_total)
                    if movie_total > top_rated[1]:
                        top_rated = (movie, movie_total)

            # NOTE(review): top_rator / top_rated are computed per file but are
            # not sent anywhere; presumably they belong on the two queues --
            # confirm the intended result format before wiring that up.
            end = time.time()
            print("[PID: {}] Processing time for file index {} : {}s!".format(os.getpid(), file_index, end-start))

        print("[PID: {}] WORKER DONE!".format(os.getpid()))
def partition_file_indices(number_of_files, number_of_workers):
    """Split ``range(number_of_files)`` into one contiguous range per worker.

    The ranges do not overlap and together cover every index exactly once.
    When the files do not divide evenly, the first workers get one extra
    file; with more workers than files the surplus ranges are empty.

    Raises:
        ValueError: if ``number_of_workers`` is smaller than 1.
    """
    if number_of_workers < 1:
        raise ValueError("number_of_workers must be at least 1")
    base, extra = divmod(number_of_files, number_of_workers)
    ranges = []
    first = 0
    for worker in range(number_of_workers):
        size = base + (1 if worker < extra else 0)
        ranges.append(range(first, first + size))
        first += size
    return ranges


if __name__ == "__main__":
    print("Processing ratings in multiple worker processes.")
    start = time.time()

    # script arguments handling
    parser = argparse.ArgumentParser()
    parser.add_argument("-k", dest="workers", action="store", type=int)
    args_space = parser.parse_args()

    # determine the number of workers
    number_of_workers = DEFAULT_NO_OF_WORKERS
    if args_space.workers is not None:
        number_of_workers = args_space.workers
    else:
        print("Number of workers not specified. Assuming: {}".format(number_of_workers))
    if number_of_workers < 1:
        parser.error("-k must be a positive integer")

    # rating data
    # NOTE(review): the workers in this file never report a count back, so
    # this value is printed unchanged at the end.
    rating_count = 0
    movies = pd.read_csv('data/movies.csv')  # movieId,title,genres
    movie_count = movies.shape[0]  # 9742
    movieId_min = movies.movieId.min()
    movieId_max = movies.movieId.max()
    movieId_disperse = movies.movieId.sort_values().to_dict()
    movieId_squeeze = {v: k for k, v in movieId_disperse.items()}

    # initialize the worker processes; each gets its own disjoint slice of
    # the file indices so that no file is processed twice or skipped
    processes = []
    for file_indices in partition_file_indices(NUMBER_OF_FILES, number_of_workers):
        if not file_indices:
            continue  # more workers requested than there are files
        p = ProcessRatings(
            file_indices,
            top_rator_queue,
            top_rated_queue,
            movieId_squeeze
        )
        p.start()
        processes.append(p)

    print("MAIN: Wait for processes to finish ...")
    # join() blocks until each worker exits instead of spinning on is_alive()
    for p in processes:
        p.join()

    # gather the data and do a final processing
    end = time.time()
    print("Processing time: {}s".format(end - start))
    print("Rating count: {}".format(rating_count))
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.