[英]Flatten a list of dictionaries in python
我有一个字典列表:
# Sample input: each record carries exactly one metric ("average" or "sum")
# for one employee on one day.
data = [
    {"average": 2, "day": "2022-01-01", "name": "joe", "employee_id": 1},
    {"average": 3, "day": "2022-01-02", "name": "joe", "employee_id": 1},
    {"average": 9, "day": "2022-01-03", "name": "joe", "employee_id": 1},
    {"sum": 13, "day": "2022-01-01", "name": "joe", "employee_id": 1},
    {"sum": 15, "day": "2022-01-02", "name": "joe", "employee_id": 1},
    {"sum": 0, "day": "2022-01-03", "name": "joe", "employee_id": 1},
    {"average": 1, "day": "2022-01-01", "name": "bob", "employee_id": 2},
    {"average": 3, "day": "2022-01-02", "name": "bob", "employee_id": 2},
    {"sum": 9, "day": "2022-01-01", "name": "bob", "employee_id": 2},
    {"sum": 8, "day": "2022-01-02", "name": "bob", "employee_id": 2},
]
我希望得到的 output 如下:
# Desired result: one record per (name, employee_id, day) holding both metrics.
output = [
    {"name": "joe", "employee_id": 1, "day": "2022-01-01", "average": 2, "sum": 13},
    {"name": "joe", "employee_id": 1, "day": "2022-01-02", "average": 3, "sum": 15},
    {"name": "joe", "employee_id": 1, "day": "2022-01-03", "average": 9, "sum": 0},
    {"name": "bob", "employee_id": 2, "day": "2022-01-01", "average": 1, "sum": 9},
    {"name": "bob", "employee_id": 2, "day": "2022-01-02", "average": 3, "sum": 8},
]
目标是把日期、姓名和 employee_id 都相同的记录合并成 output 中的一条记录。
我试过了:
# Attempt from the question: group the records by day and merge every record
# that shares a day into one dict.
# NOTE: "day" is the only grouping key, so records of different employees
# that fall on the same day are merged into a single entry.
output = {}
for item in data:
    if item["day"] not in output:
        # The stored dict is the input record itself (not a copy), so the
        # update() below also modifies the corresponding element of `data`.
        output[item["day"]] = item
    else:
        output[item["day"]].update(item)
print(list(output.values()))
这样可以把 "average"、"sum" 和 "day" 合并到一起,但最终结果没有包含所有员工及其 ID。
非常感谢任何帮助。
将 collections.defaultdict 与 dict 一起使用。这里取每个字典的 'day' 和 'name' 的值组成的元组作为 key:
>>> from collections import defaultdict
>>> defdict = defaultdict(dict)
>>> for mp in data:
...     defdict[mp['day'], mp['name']].update(mp)
...
>>> keys = ('name', 'employee_id', 'day', 'average', 'sum')
>>> [{k: mp[k] for k in keys} for mp in defdict.values()]
[{'name': 'joe', 'employee_id': 1, 'day': '2022-01-01', 'average': 2, 'sum': 13},
{'name': 'joe', 'employee_id': 1, 'day': '2022-01-02', 'average': 3, 'sum': 15},
{'name': 'joe', 'employee_id': 1, 'day': '2022-01-03', 'average': 9, 'sum': 0},
{'name': 'bob', 'employee_id': 2, 'day': '2022-01-01', 'average': 1, 'sum': 9},
{'name': 'bob', 'employee_id': 2, 'day': '2022-01-02', 'average': 3, 'sum': 8}]
对于约 150 万条数据,这个方案的性能仍然优于 pandas(pandas 仅把数据转换成 DataFrame 所花的时间里,for 循环方案就已经完成了全部工作):
In [451]: random.seed(0)
...: names = [''.join(random.choices(string.ascii_lowercase, k=random.randrange(3, 7))) for _ in range(10000)]
...: dates = [str(datetime.date(2022, i, j)) for i in range(7, 10) for j in range(1, 31)]
...: keys = ['sum', 'average']
...:
...: data = [{k: random.randrange(10), 'day': date, 'name': name, 'employee_id': i}
...: for i, name in enumerate(names, 1)
...: for date in sorted(random.sample(dates, random.randrange(60, 90)))
...: for k in keys]
...:
In [452]: len(data)
Out[452]: 1492286
In [453]: %%timeit
...: defdict = defaultdict(dict)
...: for mp in data:
...:     defdict[mp['day'], mp['name']].update(mp)
...: keys = ('name', 'employee_id', 'day', 'average', 'sum')
...: [{k: mp[k] for k in keys} for mp in defdict.values()]
...:
...:
926 ms ± 6.38 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [454]: %%timeit
...: df = pd.DataFrame(data)
...: pd.merge(df.loc[df['average'].notna()][[ 'name','day','employee_id','average']],
...: df.loc[df['sum'].notna()][['name','day','employee_id','sum']],
...: how='outer'
...: ).to_dict(orient= 'records')
...:
...:
3.58 s ± 19.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [455]: %timeit pd.DataFrame(data)
1.26 s ± 17.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
下面是一种使用 pandas 的方法:
# Split the frame into the rows that have an "average" and the rows that have
# a "sum", keeping the identifying columns plus that one metric in each part.
id_columns = ['name', 'day', 'employee_id']
average_rows = df.loc[df['average'].notna(), id_columns + ['average']]
sum_rows = df.loc[df['sum'].notna(), id_columns + ['sum']]

# Outer-merge the two parts (no explicit `on`, so the columns they share are
# the join keys) and emit the result as a list with one dict per row.
pd.merge(average_rows, sum_rows, how='outer').to_dict(orient='records')
[{'name': 'joe',
'day': '2022-01-01',
'employee_id': 1,
'average': 2.0,
'sum': 13.0},
{'name': 'joe',
'day': '2022-01-02',
'employee_id': 1,
'average': 3.0,
'sum': 15.0},
{'name': 'joe',
'day': '2022-01-03',
'employee_id': 1,
'average': 9.0,
'sum': 0.0},
{'name': 'bob',
'day': '2022-01-01',
'employee_id': 2,
'average': 1.0,
'sum': 9.0},
{'name': 'bob',
'day': '2022-01-02',
'employee_id': 2,
'average': 3.0,
'sum': 8.0}]
根据问题描述,"day"、"name"、"employee_id" 三者的组合可以唯一标识一条记录,其余两个字段("average" 和 "sum")应当合并到这条记录上。每个传入的字典都包含这三个字段,因此可以把它们用作新字典的键;但字典本身不可散列,需要先转换成可散列的值,例如 JSON 字符串,并且在生成字符串时按键排序(sort_keys=True),这样相同的组合才会得到相同的字符串。
from json import dumps

data = [
    {"average": 2, "day": "2022-01-01", "employee_id": 1, "name": "joe"},
    {"average": 3, "day": "2022-01-02", "name": "joe", "employee_id": 1},
    {"average": 9, "day": "2022-01-03", "name": "joe", "employee_id": 1},
    {"sum": 13, "day": "2022-01-01", "name": "joe", "employee_id": 1},
    {"sum": 15, "day": "2022-01-02", "name": "joe", "employee_id": 1},
    {"sum": 0, "day": "2022-01-03", "name": "joe", "employee_id": 1},
    {"average": 1, "day": "2022-01-01", "name": "bob", "employee_id": 2},
    {"average": 3, "day": "2022-01-02", "name": "bob", "employee_id": 2},
    {"sum": 9, "day": "2022-01-01", "name": "bob", "employee_id": 2},
    {"sum": 8, "day": "2022-01-02", "name": "bob", "employee_id": 2},
]


def merge_employee_summaries(records, metric_fields=("average", "sum")):
    """Merge records that agree on every non-metric field into one dict.

    Args:
        records: iterable of dicts; the input dicts are not modified.
        metric_fields: names of the fields that are merged into a record
            rather than used to identify it.

    Returns:
        A list of merged dicts, ordered by the first appearance of each
        identifying combination. When the same metric occurs more than once
        for a combination, the last value wins.
    """
    merged = {}
    for record in records:
        identity = {
            field: value
            for field, value in record.items()
            if field not in metric_fields
        }
        # A dict is not hashable, so it cannot be a dict key itself; a JSON
        # string with sorted keys is, and it does not depend on the order in
        # which the fields were written in the record.
        key = dumps(identity, sort_keys=True)
        # All metric fields are excluded from the key up front, so a record
        # carrying several metrics at once still maps to a single entry.
        merged.setdefault(key, {}).update(record)
    return list(merged.values())


flattend_employee_summaries = merge_employee_summaries(data)
print(f'{flattend_employee_summaries=}')
这个问题已经有了回答,我推荐 Mechanic Pig 的解决方案;下面的写法恐怕只是用更冗长的方式重复了它。另外,我认为所有这些解决方案都假设每个员工每天只有一条 average 记录。
# Nested index: employee_id -> day -> {'name', 'average', 'sum'}.
employees = {}
for data_row in data:
    employee_days = employees.setdefault(data_row['employee_id'], {})
    day_details = employee_days.setdefault(
        data_row['day'],
        {'name': data_row.get('name', 0), 'average': 0, 'sum': 0},
    )
    # Take the average from whichever row carries it, so the result does not
    # depend on whether the "average" row or the "sum" row comes first.
    # If several rows carry an average for the same day, the last one wins.
    if 'average' in data_row:
        day_details['average'] = data_row['average']
    # Sums of all rows for the same employee and day are added together.
    day_details['sum'] += data_row.get('sum', 0)

# Flatten the nested index into one record per employee and day.
employee_output = [
    {
        "name": employee_details['name'],
        "employee_id": employee_id,
        "day": employee_date,
        "average": employee_details['average'],
        "sum": employee_details['sum'],
    }
    for employee_id, employee_dates in employees.items()
    for employee_date, employee_details in employee_dates.items()
]
employee_output 将包含:
[{'name': 'joe',
'employee_id': 1,
'day': '2022-01-01',
'average': 2,
'sum': 13},
{'name': 'joe',
'employee_id': 1,
'day': '2022-01-02',
'average': 3,
'sum': 15},
{'name': 'joe',
'employee_id': 1,
'day': '2022-01-03',
'average': 9,
'sum': 0},
{'name': 'bob',
'employee_id': 2,
'day': '2022-01-01',
'average': 1,
'sum': 9},
{'name': 'bob',
'employee_id': 2,
'day': '2022-01-02',
'average': 3,
'sum': 8}]
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.