[英]Slow data loading to MongoDB from pandas Dataframe
我有2個大型CSV文件需要加載到Mongo集合中。 首先,我將數據讀入pandas Dataframe中,進行一些預處理,然后將結果字典插入Mongo集合中。 問題在於性能很慢,因為它是按順序執行的,並且應該在第一個集合已填充完畢(用外鍵更新行)之后才將數據加載到第二個集合中。 如何加快加載過程?
import pymongo
import config
import pandas as pd
import numpy as np
from datetime import datetime
from config import logger
# Connect to the MongoDB server configured in the project config module.
client = pymongo.MongoClient(config.IP)
try:
    # Force a round-trip to the server to verify it is actually reachable;
    # MongoClient itself connects lazily and would not fail here otherwise.
    client.server_info()
except pymongo.errors.ServerSelectionTimeoutError as e:
    logger.error("Unable to connect to %s. Error: %s" % (config.IP, e))
    # NOTE(review): client is set to None on failure, but the subscript
    # lookups below will then raise TypeError anyway — confirm whether the
    # intent was to abort the script here instead.
    client = None
# connect to database (or create if not exists)
mydb = client[config.DB_NAME]
# connect to collections (or create if not exists)
movie_collection = mydb[config.DB_MOVIE_COLLECTION]
actors_collection = mydb[config.DB_ACTOR_COLLECTION]
def read_data(file):
    '''
    Load a tab-separated file into a DataFrame.

    IMDb datasets use the literal string '\\N' for missing values; those
    cells are mapped to NaN so the rest of the pipeline can use pd.isnull().
    '''
    frame = pd.read_csv(file, sep='\t')
    return frame.replace('\\N', np.nan)
def insert_to_collection(collection, data):
    '''
    Insert data into a MongoDB collection.

    Accepts either a single document (dict) or a batch of documents (list),
    so callers can switch to bulk loading without an interface change.
    Uses insert_one/insert_many instead of the deprecated Collection.insert().
    '''
    if isinstance(data, dict):
        collection.insert_one(data)
    else:
        # a list of documents: one network round-trip for the whole batch
        collection.insert_many(data)
def fill_movie_data(batch_size=1000):
    '''
    Iterates over the movie DataFrame, processes values and creates a dict
    structure with specific attributes to insert into the MongoDB movie
    collection.

    Documents are accumulated and written in batches of `batch_size`
    (one bulk insert per batch instead of one round-trip per row), which
    is the main speed-up for large CSV files.
    '''
    # load data to pandas Dataframe
    logger.info("Reading movie data to Dataframe")
    data = read_data('datasets/title.basics.tsv')
    batch = []
    for index, row in data.iterrows():
        result_dict = {}
        id_ = row['tconst']
        title = row['primaryTitle']
        # check value of movie year (if not NaN)
        if not pd.isnull(row['endYear']) and not pd.isnull(row['startYear']):
            year = [row['startYear'], row['endYear']]
        elif not pd.isnull(row['startYear']):
            year = int(row['startYear'])
        else:
            year = None
        # check value of movie duration (if not NaN)
        if not pd.isnull(row['runtimeMinutes']):
            try:
                duration = int(row['runtimeMinutes'])
            except ValueError:
                # non-numeric runtime in the raw data -> treat as missing
                duration = None
        else:
            duration = None
        # check value of genres (if not NaN)
        if not pd.isnull(row['genres']):
            genres = row['genres'].split(',')
        else:
            genres = None
        result_dict['_id'] = id_
        result_dict['primary_title'] = title
        # if both years have values (a series: start and end year)
        if isinstance(year, list):
            result_dict['year_start'] = int(year[0])
            result_dict['year_end'] = int(year[1])
        # if only start_year has a value
        elif year:
            result_dict['year'] = year
        if duration:
            result_dict['duration'] = duration
        if genres:
            result_dict['genres'] = genres
        batch.append(result_dict)
        # flush a full batch with a single bulk insert
        if len(batch) >= batch_size:
            insert_to_collection(movie_collection, batch)
            batch = []
    # flush the last (partial) batch so no rows are lost
    if batch:
        insert_to_collection(movie_collection, batch)
def fill_actors_data(batch_size=1000):
    '''
    Iterates over the actors DataFrame, processes values and creates a dict
    structure with new fields to insert into the MongoDB actors collection.

    Also pushes each actor's id onto the `people` array of every movie the
    actor is known for.  Actor documents are inserted in batches of
    `batch_size` to cut down on network round-trips.
    '''
    # load data to pandas Dataframe
    logger.info("Reading actors data to Dataframe")
    data = read_data('datasets/name.basics.tsv')
    logger.info("Inserting data to actors collection")
    batch = []
    for index, row in data.iterrows():
        result_dict = {}
        id_ = row['nconst']
        name = row['primaryName']
        # reset per-row values: the original code reused `age`/`movies`
        # from the previous row when a branch below did not assign them
        age = None
        movies = None
        # if no birth year value
        if pd.isnull(row['birthYear']):
            yob = None
            alive = False
        # if both birth and death year have values
        elif not pd.isnull(row['deathYear']):
            yob = int(row['birthYear'])
            death = int(row['deathYear'])
            age = death - yob
            alive = False
        # if only birth year has a value
        else:
            yob = int(row['birthYear'])
            age = datetime.now().year - yob
            alive = True
        if not pd.isnull(row['knownForTitles']):
            movies = row['knownForTitles'].split(',')
        result_dict['_id'] = id_
        result_dict['name'] = name
        result_dict['yob'] = yob
        result_dict['alive'] = alive
        result_dict['age'] = age
        result_dict['movies'] = movies
        batch.append(result_dict)
        # flush a full batch with a single bulk insert
        if len(batch) >= batch_size:
            insert_to_collection(actors_collection, batch)
            batch = []
        # update movie documents with this actor's id
        # (skip when the actor has no known titles)
        if movies:
            movie_collection.update_many(
                {"_id": {"$in": movies}}, {"$push": {"people": id_}})
    # flush the last (partial) batch so no rows are lost
    if batch:
        insert_to_collection(actors_collection, batch)
# If the collections are empty (first run), fill them with the CSV data.
# count_documents({}) replaces the deprecated Collection.count().
if movie_collection.count_documents({}) == 0:
    fill_movie_data()
if actors_collection.count_documents({}) == 0:
    fill_actors_data()
批量插入無需一次插入一個記錄。
insert_many
目前,您有:
# The original helper quoted from the question:
# one document per call, via the deprecated Collection.insert().
def insert_to_collection(collection: pymongo.collection.Collection, data: dict):
    collection.insert(data)
順便說一下,您正在使用不推薦使用的insert()
。
您想要擁有的是:
# Batch variant: takes a list of documents and writes them all in a single
# insert_many round-trip instead of one call per document.
def insert_to_collection(collection: pymongo.collection.Collection, data: list):
    collection.insert_many(data)
因此,在您的兩個函數fill_movie_data
和fill_actors_data
中，您可以不必在循環中逐條調用insert_to_collection()，而是每積累滿一批記錄後調用一次insert_to_collection()進行批量插入。
以下是您發布的代碼,並進行了一些修改:
添加一個max_bulk_size
,越大越適合您的速度,只需確保它不超過RAM。
max_bulk_size = 500
添加一個results_list
並將result_dict
附加到它。 一旦列表的大小達到max_bulk_size
，就將其批量插入並清空列表。循環結束後，記得把剩餘不足max_bulk_size的記錄也再插入一次，以免丟失最後一批數據。
def fill_movie_data():
    '''
    Iterates over the movie DataFrame, processes values and creates a dict
    structure with specific attributes to insert into the MongoDB movie
    collection.

    Documents are buffered in results_list and written with one bulk
    insert per max_bulk_size rows; the final partial batch is flushed
    after the loop so no rows are lost.
    '''
    # load data to pandas Dataframe
    logger.info("Reading movie data to Dataframe")
    data = read_data('datasets/title.basics.tsv')
    results_list = []
    for index, row in data.iterrows():
        result_dict = {}
        id_ = row['tconst']
        title = row['primaryTitle']
        # check value of movie year (if not NaN)
        if not pd.isnull(row['endYear']) and not pd.isnull(row['startYear']):
            year = [row['startYear'], row['endYear']]
        elif not pd.isnull(row['startYear']):
            year = int(row['startYear'])
        else:
            year = None
        # check value of movie duration (if not NaN)
        if not pd.isnull(row['runtimeMinutes']):
            try:
                duration = int(row['runtimeMinutes'])
            except ValueError:
                duration = None
        else:
            duration = None
        # check value of genres (if not NaN)
        if not pd.isnull(row['genres']):
            genres = row['genres'].split(',')
        else:
            genres = None
        result_dict['_id'] = id_
        result_dict['primary_title'] = title
        # if both years have values
        if isinstance(year, list):
            result_dict['year_start'] = int(year[0])
            result_dict['year_end'] = int(year[1])
        # if start_year has value
        elif year:
            result_dict['year'] = year
        if duration:
            result_dict['duration'] = duration
        if genres:
            result_dict['genres'] = genres
        results_list.append(result_dict)
        # a full batch: write it with one insert_many round-trip
        if len(results_list) >= max_bulk_size:
            insert_to_collection(movie_collection, results_list)
            results_list = []
    # don't lose the tail: flush whatever is left after the loop
    if results_list:
        insert_to_collection(movie_collection, results_list)
與您的其他循環相同。
def fill_actors_data():
    '''
    Iterates over the actors DataFrame, processes values and creates a dict
    structure with new fields to insert into the MongoDB actors collection.

    Documents are buffered in results_list and written with one bulk
    insert per max_bulk_size rows; the final partial batch is flushed
    after the loop so no rows are lost.
    '''
    # load data to pandas Dataframe
    logger.info("Reading actors data to Dataframe")
    data = read_data('datasets/name.basics.tsv')
    logger.info("Inserting data to actors collection")
    results_list = []
    for index, row in data.iterrows():
        result_dict = {}
        id_ = row['nconst']
        name = row['primaryName']
        # reset per-row values: the original code reused `age`/`movies`
        # from the previous row when a branch below did not assign them
        age = None
        movies = None
        # if no birth year value
        if pd.isnull(row['birthYear']):
            yob = None
            alive = False
        # if both birth and death year have value
        elif not pd.isnull(row['deathYear']):
            yob = int(row['birthYear'])
            death = int(row['deathYear'])
            age = death - yob
            alive = False
        # if only birth year has value
        else:
            yob = int(row['birthYear'])
            current_year = datetime.now().year
            age = current_year - yob
            alive = True
        if not pd.isnull(row['knownForTitles']):
            movies = row['knownForTitles'].split(',')
        result_dict['_id'] = id_
        result_dict['name'] = name
        result_dict['yob'] = yob
        result_dict['alive'] = alive
        result_dict['age'] = age
        result_dict['movies'] = movies
        results_list.append(result_dict)
        # a full batch: write it with one insert_many round-trip
        if len(results_list) >= max_bulk_size:
            insert_to_collection(actors_collection, results_list)
            results_list = []
        # update movie documents with this actor's id
        # (skip when the actor has no known titles)
        if movies:
            movie_collection.update_many(
                {"_id": {"$in": movies}}, {"$push": {"people": id_}})
    # don't lose the tail: flush whatever is left after the loop
    if results_list:
        insert_to_collection(actors_collection, results_list)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.