[英]Python Loop Performance too slow
我有许多来自数据库的记录,想把这些记录的结构转换为类似“父记录—子记录”的层级结构。
其中 forecast_data 中的每条记录有如下属性:
component_plan_id, region, planning_item, cfg, measure, period_str, currency, forecast_value, forecast_currency
我的想法是把它们转换为父记录,每个父记录具有如下属性:
component_plan_id, region, planning_item, cfg, measure, currency
每个父记录的子记录将是
period_str, forecast_value, forecast_currency
所以我在我的代码中所做的是
1. Get the list of unique property in parent's records
2. For each record in (1), get records with the same attribute, create child record with period_str, forecast_value, forecast_currency
下面的代码已经可以工作了,但不知何故它太慢了。 有什么方法可以提高性能吗?
data = []
# Format the forecast data: build one parent DTO per distinct combination of
# the six parent attributes found in forecast_data.
for rec in set((row.component_plan_id, row.region,
                row.planning_item, row.cfg, row.measure, row.currency)
               for row in forecast_data):
    new_rec = ComponentForecastReadDto(component_plan_id=rec[0],
                                       region=rec[1], planning_item=rec[2],
                                       cfg=rec[3], measure=rec[4], currency=rec[5])
    # Get forecast value: collect the child rows that belong to this parent.
    # NOTE: this filter walks all of forecast_data once per parent, so the
    # total work is (number of parents) x (number of rows).
    new_rec.forecast = []
    for rec_forecast in [x for x in forecast_data if
                         x.component_plan_id == new_rec.component_plan_id and
                         x.region == new_rec.region and
                         x.planning_item == new_rec.planning_item and
                         x.cfg == new_rec.cfg and
                         x.measure == new_rec.measure and
                         x.currency == new_rec.currency]:
        new_forecast = ComponentForecastValueReadDto(
            period_str=rec_forecast.period_str,
            forecast_value=rec_forecast.forecast_value,
            forecast_currency=rec_forecast.forecast_currency)
        new_rec.forecast.append(new_forecast)
    data.append(new_rec)
ComponentForecastReadDto 和 ComponentForecastValueReadDto 都继承自 BaseModel(原文此处的库名已丢失,可能是 pydantic 的 BaseModel)。
样本输入:
| component_plan_id | region | planning_item | cfg | measure | period_str | currency | forecast_value | forecast_currency |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 1 | America | Item 1 | cfg A | unit | 2022-06 | 2 | 100 | 200 |
| 1 | America | Item 1 | cfg A | unit | 2022-07 | 2 | 150 | 300 |
| 1 | America | Item 1 | cfg A | unit | 2022-08 | 2 | 200 | 400 |
| 1 | Asia | Item 1 | cfg A | unit | 2022-06 | 3 | 150 | 450 |
输出
记录#1
component_plan_id = 1
region = America
planning_item = Item 1
cfg = cfg A
measure = unit
currency = 2
children:
1. period_str = 2022-06
forecast_value = 100
forecast_currency = 200
2. period_str = 2022-07
forecast_value = 150
forecast_currency = 300
3. period_str = 2022-08
forecast_value = 200
forecast_currency = 400
记录#2
component_plan_id = 1
region = Asia
planning_item = Item 1
cfg = cfg A
measure = unit
currency = 3
children:
1. period_str = 2022-06
forecast_value = 150
forecast_currency = 450
我最终把代码改成了下面的样子:
先按父记录的属性排序,再在一次遍历中完成分组,这将运行时间从 O(N^2) 降低到 O(N log N),运行得非常快。
from itertools import groupby
from operator import attrgetter

# The six attributes that identify a parent record; rows that agree on all of
# them become children of the same parent.
parent_key = attrgetter('component_plan_id', 'region', 'planning_item',
                        'cfg', 'measure', 'currency')

# groupby() only merges *adjacent* items with equal keys, so the rows must be
# sorted by the same key first. The sort dominates the cost: O(N log N).
forecast_data = sorted(forecast_data, key=parent_key)
data = []
# Format the forecast data: one pass over the sorted rows, one parent DTO per
# run of equal keys. An empty forecast_data simply yields an empty `data`.
for (component_plan_id, region, planning_item,
     cfg, measure, currency), rows in groupby(forecast_data, key=parent_key):
    new_rec = ComponentForecastReadDto(component_plan_id=component_plan_id,
                                       region=region, planning_item=planning_item,
                                       cfg=cfg, measure=measure, currency=currency)
    # NOTE(review): assumes ComponentForecastReadDto initialises `forecast`
    # to an empty list on construction -- confirm against the DTO definition.
    for rec in rows:
        new_forecast = ComponentForecastValueReadDto(
            period_str=rec.period_str,
            forecast_value=rec.forecast_value,
            forecast_currency=rec.forecast_currency)
        new_rec.forecast.append(new_forecast)
    data.append(new_rec)
假设您的数据为 CSV 格式,文件名为 input.csv:
component_plan_id,region,planning_item,cfg,measure,period_str,currency,forecast_value,forecast_currency
1,America,Item1,cfgA,unit,2022-06,2,100,200
1,America,Item1,cfgA,unit,2022-07,2,150,300
1,America,Item1,cfgA,unit,2022-08,2,200,400
1,Asia,Item1,cfgA,unit,2022-06,3,150,450
我用 pandas.DataFrame.groupby 重写了这段逻辑:
import pandas as pd
from pprint import pprint
# Columns that identify a parent record: one output dict is produced per
# distinct combination of these values.
GROUP_COLUMNS = [
    'component_plan_id',
    'region',
    'planning_item',
    'cfg',
    'measure',
    'currency',
]
# Columns copied into each entry of a parent's 'children' list.
CHILD_COLUMNS = [
    'period_str',
    'forecast_value',
    'forecast_currency',
]

df = pd.read_csv('input.csv')
groups = df.groupby(GROUP_COLUMNS)
results = []
for name, group in groups:
    # `name` holds the group's key values in GROUP_COLUMNS order, so pairing
    # the two sequences gives the parent attributes directly.
    record = dict(zip(GROUP_COLUMNS, name))
    record['children'] = group[CHILD_COLUMNS].to_dict(orient='records')
    results.append(record)
pprint(results)
如果您想要更快,joblib 的 Parallel 可能会有所帮助:
import pandas as pd
from pprint import pprint
from joblib import Parallel, delayed
# Columns that identify a parent record: one output dict is produced per
# distinct combination of these values.
GROUP_COLUMNS = [
    'component_plan_id',
    'region',
    'planning_item',
    'cfg',
    'measure',
    'currency',
]
# Columns copied into each entry of a parent's 'children' list.
CHILD_COLUMNS = [
    'period_str',
    'forecast_value',
    'forecast_currency',
]


def handle(name, group):
    """Build the parent/children record for one group.

    Args:
        name: The group's key values, in GROUP_COLUMNS order (the first item
            of each pair yielded when iterating ``df.groupby(GROUP_COLUMNS)``).
        group: The rows of that group; indexed with CHILD_COLUMNS and
            converted via ``to_dict(orient='records')``.

    Returns:
        A dict mapping each GROUP_COLUMNS name to its key value, plus a
        'children' entry holding one dict per row with the CHILD_COLUMNS.
    """
    record = dict(zip(GROUP_COLUMNS, name))
    record['children'] = group[CHILD_COLUMNS].to_dict(orient='records')
    return record
# Load the CSV and split it into one sub-frame per GROUP_COLUMNS combination.
df = pd.read_csv('input.csv')
groups = df.groupby(GROUP_COLUMNS)
# Queue one deferred handle() call per group; Parallel consumes the lazy
# generator and returns the list of per-group records.
# NOTE(review): n_jobs=-1 is presumably "use all available cores" -- confirm
# against the joblib documentation.
tasks = (delayed(handle)(key, frame) for key, frame in groups)
results = Parallel(n_jobs=-1)(tasks)
pprint(results)
两种方式都将返回相同的结果:
[
{
'cfg': 'cfgA',
'children': [
{'forecast_currency': 200,
'forecast_value': 100,
'period_str': '2022-06'},
{'forecast_currency': 300,
'forecast_value': 150,
'period_str': '2022-07'},
{'forecast_currency': 400,
'forecast_value': 200,
'period_str': '2022-08'}
],
'component_plan_id': 1,
'currency': 2,
'measure': 'unit',
'planning_item': 'Item1',
'region': 'America'
},
{
'cfg': 'cfgA',
'children': [
{'forecast_currency': 450,
'forecast_value': 150,
'period_str': '2022-06'}
],
'component_plan_id': 1,
'currency': 3,
'measure': 'unit',
'planning_item': 'Item1',
'region': 'Asia'
}
]
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.