I have a number of records that come from a database. I want to convert the structure of the DB records into something more like parent and child classes.
So forecast_data
has properties as follows:
component_plan_id, region, planning_item, cfg, measure, period_str, currency, forecast_value, forecast_currency
The idea is to convert these into a parent record with the properties
component_plan_id, region, planning_item, cfg, measure, currency
and the child record for each parent record will be
period_str, forecast_value, forecast_currency
So what my code does is:
1. Get the list of unique combinations of the parent properties across all records.
2. For each combination from (1), find the records with the same parent properties and create a child record with period_str, forecast_value, forecast_currency for each of them.
The code below already works, but somehow it is too slow. Is there any way to enhance the performance?
# Attributes that identify a parent record; the remaining ones go to children.
parent_fields = ('component_plan_id', 'region', 'planning_item',
                 'cfg', 'measure', 'currency')

data = []
# Format the forecast data: one parent DTO per distinct parent-key combination.
unique_keys = {
    tuple(getattr(row, field) for field in parent_fields)
    for row in forecast_data
}
for key in unique_keys:
    parent = ComponentForecastReadDto(**dict(zip(parent_fields, key)))
    # Get forecast value
    parent.forecast = []
    # NOTE: every parent triggers a full scan of forecast_data, so the total
    # work grows with (number of parents) x (number of rows).
    for row in forecast_data:
        same_parent = all(
            getattr(row, field) == getattr(parent, field)
            for field in parent_fields
        )
        if not same_parent:
            continue
        child = ComponentForecastValueReadDto(
            period_str=row.period_str,
            forecast_value=row.forecast_value,
            forecast_currency=row.forecast_currency,
        )
        parent.forecast.append(child)
    data.append(parent)
ComponentForecastReadDto
and ComponentForecastValueReadDto
are inherited from BaseModel
in pydantic.
Sample input:
| component_plan_id | region | planning_item | cfg | measure | period_str | currency | forecast_value | forecast_currency |
| 1 | America | Item 1 | cfg A | unit | 2022-06 | 2 | 100 | 200 |
| 1 | America | Item 1 | cfg A | unit | 2022-07 | 2 | 150 | 300 |
| 1 | America | Item 1 | cfg A | unit | 2022-08 | 2 | 200 | 400 |
| 1 | Asia | Item 1 | cfg A | unit | 2022-06 | 3 | 150 | 450 |
Output
Record #1
component_plan_id = 1
region = America
planning_item = Item 1
cfg = cfg A
measure = unit
currency = 2
children:
1. period_str = 2022-06
forecast_value = 100
forecast_currency = 200
2. period_str = 2022-07
forecast_value = 150
forecast_currency = 300
3. period_str = 2022-08
forecast_value = 200
forecast_currency = 400
Record #2
component_plan_id = 1
region = Asia
planning_item = Item 1
cfg = cfg A
measure = unit
currency = 3
children:
1. period_str = 2022-06
forecast_value = 150
forecast_currency = 450
I ended up changing the function as follows.
Sorting the records first and then grouping them in a single pass reduces the running time from O(N^2) to O(N log N), and it runs really fast.
from itertools import groupby
from operator import attrgetter

# Attributes that identify a parent record; used as both sort and group key.
parent_fields = ('component_plan_id', 'region', 'planning_item',
                 'cfg', 'measure', 'currency')
parent_key = attrgetter(*parent_fields)

# groupby() only merges *adjacent* rows with equal keys, so rows belonging to
# the same parent must be made contiguous by sorting on that key first.
forecast_data = sorted(forecast_data, key=parent_key)

# Format the forecast data: one parent DTO per run of equal keys, with one
# child DTO per row in that run. Empty input simply yields an empty list.
data = []
for key_values, rows in groupby(forecast_data, key=parent_key):
    new_rec = ComponentForecastReadDto(**dict(zip(parent_fields, key_values)))
    # NOTE(review): relies on ComponentForecastReadDto providing an empty
    # `forecast` list by default, exactly as the previous version did - confirm.
    for rec in rows:
        new_rec.forecast.append(
            ComponentForecastValueReadDto(
                period_str=rec.period_str,
                forecast_value=rec.forecast_value,
                forecast_currency=rec.forecast_currency,
            )
        )
    data.append(new_rec)
Assuming your data is in CSV format, in a file named input.csv:
component_plan_id,region,planning_item,cfg,measure,period_str,currency,forecast_value,forecast_currency
1,America,Item1,cfgA,unit,2022-06,2,100,200
1,America,Item1,cfgA,unit,2022-07,2,150,300
1,America,Item1,cfgA,unit,2022-08,2,200,400
1,Asia,Item1,cfgA,unit,2022-06,3,150,450
I used pandas.DataFrame.groupby
to rewrite this:
import pandas as pd
from pprint import pprint
# Columns whose combined values form the grouping key (the parent record).
GROUP_COLUMNS = [
    'component_plan_id',
    'region',
    'planning_item',
    'cfg',
    'measure',
    'currency',
]

# Columns selected from each group to build its child records.
CHILD_COLUMNS = ['period_str', 'forecast_value', 'forecast_currency']
# Load the flat rows; each row carries both the parent and the child columns.
df = pd.read_csv('input.csv')
groups = df.groupby(GROUP_COLUMNS)

results = []
for name, group in groups:
    # `name` carries the group's key values in GROUP_COLUMNS order, so the
    # two can be paired directly instead of being indexed in lockstep.
    record = dict(zip(GROUP_COLUMNS, name))
    # One plain dict per row of the group, restricted to the child columns.
    record['children'] = group[CHILD_COLUMNS].to_dict(orient='records')
    results.append(record)

pprint(results)
In case you want it faster, joblib
Parallel might help a bit:
import pandas as pd
from pprint import pprint
from joblib import Parallel, delayed
# Columns whose combined values form the grouping key (the parent record).
GROUP_COLUMNS = [
    'component_plan_id',
    'region',
    'planning_item',
    'cfg',
    'measure',
    'currency',
]

# Columns selected from each group to build its child records.
CHILD_COLUMNS = ['period_str', 'forecast_value', 'forecast_currency']
def handle(name, group, group_columns=None, child_columns=None):
    """Build one parent record, with nested children, from a single group.

    Args:
        name: Key values of the group, ordered like ``group_columns``.
        group: DataFrame holding the rows that belong to this group.
        group_columns: List of names to pair with the values in ``name``.
            Defaults to the module-level ``GROUP_COLUMNS``.
        child_columns: List of columns copied into each child record.
            Defaults to the module-level ``CHILD_COLUMNS``.

    Returns:
        A dict mapping every group column to its key value, plus a
        ``'children'`` entry holding one dict per row of ``group``.
    """
    # Resolve the defaults at call time so existing two-argument callers
    # keep using the module-level column lists.
    if group_columns is None:
        group_columns = GROUP_COLUMNS
    if child_columns is None:
        child_columns = CHILD_COLUMNS

    record = dict(zip(group_columns, name))
    record['children'] = group[child_columns].to_dict(orient='records')
    return record
# Load the flat rows and split them by the parent key columns.
df = pd.read_csv('input.csv')
groups = df.groupby(GROUP_COLUMNS)

# Lazily describe one handle() call per group; joblib consumes the generator
# and returns the per-group records as a list.
pending_calls = (delayed(handle)(key, rows) for key, rows in groups)
results = Parallel(n_jobs=-1)(pending_calls)

pprint(results)
Both ways will return the same result:
[
{
'cfg': 'cfgA',
'children': [
{'forecast_currency': 200,
'forecast_value': 100,
'period_str': '2022-06'},
{'forecast_currency': 300,
'forecast_value': 150,
'period_str': '2022-07'},
{'forecast_currency': 400,
'forecast_value': 200,
'period_str': '2022-08'}
],
'component_plan_id': 1,
'currency': 2,
'measure': 'unit',
'planning_item': 'Item1',
'region': 'America'
},
{
'cfg': 'cfgA',
'children': [
{'forecast_currency': 450,
'forecast_value': 150,
'period_str': '2022-06'}
],
'component_plan_id': 1,
'currency': 3,
'measure': 'unit',
'planning_item': 'Item1',
'region': 'Asia'
}
]
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.