繁体   English   中英

扁平化字典列表 python

[英]Flatten a list of dictionaries in python

我有一个字典列表:

# Input records: each dict holds the identifying fields ("day", "name",
# "employee_id") plus exactly one value field, either "average" or "sum".
data = [{"average": 2, "day": "2022-01-01", "name": "joe", "employee_id": 1},
        {"average": 3, "day": "2022-01-02", "name": "joe", "employee_id": 1},
        {"average": 9, "day": "2022-01-03", "name": "joe", "employee_id": 1},
        {"sum": 13,    "day": "2022-01-01", "name": "joe", "employee_id": 1},
        {"sum": 15,    "day": "2022-01-02", "name": "joe", "employee_id": 1},
        {"sum": 0,     "day": "2022-01-03", "name": "joe", "employee_id": 1},
        {"average": 1, "day": "2022-01-01", "name": "bob", "employee_id": 2},
        {"average": 3, "day": "2022-01-02", "name": "bob", "employee_id": 2},
        {"sum":     9, "day": "2022-01-01", "name": "bob", "employee_id": 2},
        {"sum":     8, "day": "2022-01-02", "name": "bob", "employee_id": 2}]

我想要我的 output 作为:

# Desired result: one dict per (name, employee_id, day) combination that
# carries both the "average" and the "sum" value for that combination.
output = [{"name": "joe", "employee_id": 1, "day": "2022-01-01", "average": 2, "sum": 13},
          {"name": "joe", "employee_id": 1, "day": "2022-01-02", "average": 3, "sum": 15},
          {"name": "joe", "employee_id": 1, "day": "2022-01-03", "average": 9, "sum": 0},
          {"name": "bob", "employee_id": 2, "day": "2022-01-01", "average": 1, "sum": 9},
          {"name": "bob", "employee_id": 2, "day": "2022-01-02", "average": 3, "sum": 8}]

目标是将 output 值按日期、姓名和 employee_id 放在一起。

我试过了:

# Attempt: collapse the records into one dict per day.
output = {}
for item in data:
    # The grouping key is the day alone, so records that belong to different
    # employees but share a day all land in the same slot.
    if item["day"] not in output:
        # Stores the record itself (no copy); later update() calls therefore
        # also modify this element of `data`.
        output[item["day"]] = item
    else:
        # update() overwrites "name" and "employee_id" with the values of the
        # most recent record for this day, so earlier employees are lost.
        output[item["day"]].update(item)
print(list(output.values()))

这有助于将“平均值”、“总和”和“日期”放在一起,但最终不包括所有员工及其 ID。

任何帮助表示赞赏

将 collections.defaultdict 与 dict 一起使用。这里,取每个字典中 'day' 和 'name' 的值组成元组作为键:

>>> from collections import defaultdict
>>> defdict = defaultdict(dict)
>>> for mp in data:
...     defdict[mp['day'], mp['name']].update(mp)
...
>>> keys = ('name', 'employee_id', 'day', 'average', 'sum')
>>> [{k: mp[k] for k in keys} for mp in defdict.values()]
[{'name': 'joe', 'employee_id': 1, 'day': '2022-01-01', 'average': 2, 'sum': 13},
 {'name': 'joe', 'employee_id': 1, 'day': '2022-01-02', 'average': 3, 'sum': 15},
 {'name': 'joe', 'employee_id': 1, 'day': '2022-01-03', 'average': 9, 'sum': 0},
 {'name': 'bob', 'employee_id': 2, 'day': '2022-01-01', 'average': 1, 'sum': 9},
 {'name': 'bob', 'employee_id': 2, 'day': '2022-01-02', 'average': 3, 'sum': 8}]

对于约150万条数据,这个方案的性能仍然优于pandas(仅仅把数据转换成DataFrame所花的时间,就已经足够for循环完成全部工作了):

In [451]: random.seed(0)
     ...: names = [''.join(random.choices(string.ascii_lowercase, k=random.randrange(3, 7))) for _ in range(10000)]
     ...: dates = [str(datetime.date(2022, i, j)) for i in range(7, 10) for j in range(1, 31)]
     ...: keys = ['sum', 'average']
     ...:
     ...: data = [{k: random.randrange(10), 'day': date, 'name': name, 'employee_id': i}
     ...:         for i, name in enumerate(names, 1)
     ...:         for date in sorted(random.sample(dates, random.randrange(60, 90)))
     ...:         for k in keys]
     ...:

In [452]: len(data)
Out[452]: 1492286

In [453]: %%timeit
     ...: defdict = defaultdict(dict)
     ...: for mp in data:
     ...:     defdict[mp['day'], mp['name']].update(mp)
     ...: keys = ('name', 'employee_id', 'day', 'average', 'sum')
     ...: [{k: mp[k] for k in keys} for mp in defdict.values()]
     ...:
     ...:
926 ms ± 6.38 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [454]: %%timeit
     ...: df = pd.DataFrame(data)
     ...: pd.merge(df.loc[df['average'].notna()][[ 'name','day','employee_id','average']],
     ...:          df.loc[df['sum'].notna()][['name','day','employee_id','sum']],
     ...:          how='outer'
     ...: ).to_dict(orient= 'records')
     ...:
     ...:
3.58 s ± 19.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [455]: %timeit pd.DataFrame(data)
1.26 s ± 17.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

这是一种方法

# Build two views of the frame: the rows that carry an "average" and the rows
# that carry a "sum", each reduced to the identifying columns plus its value.
# An outer merge on the columns they share puts both values on one row, and
# the merged frame is exported as a list of per-row dicts.

id_columns = ['name', 'day', 'employee_id']
has_average = df['average'].notna()
has_sum = df['sum'].notna()

pd.merge(
    df.loc[has_average, id_columns + ['average']],
    df.loc[has_sum, id_columns + ['sum']],
    how='outer',
).to_dict(orient='records')

[{'name': 'joe',
  'day': '2022-01-01',
  'employee_id': 1,
  'average': 2.0,
  'sum': 13.0},
 {'name': 'joe',
  'day': '2022-01-02',
  'employee_id': 1,
  'average': 3.0,
  'sum': 15.0},
 {'name': 'joe',
  'day': '2022-01-03',
  'employee_id': 1,
  'average': 9.0,
  'sum': 0.0},
 {'name': 'bob',
  'day': '2022-01-01',
  'employee_id': 2,
  'average': 1.0,
  'sum': 9.0},
 {'name': 'bob',
  'day': '2022-01-02',
  'employee_id': 2,
  'average': 3.0,
  'sum': 8.0}]

从给出的描述来看,“day”、“name”、“employee_id”的组合相当于一个唯一键,另外两个字段(“average”和“sum”)应该合并到该键对应的记录中。每个传入的字典都包含这三个字段,因此可以用它们作为新字典的键;但字典本身不可散列,需要先把它们转换成可散列的字符串(例如 JSON 字符串),并且在生成字符串时对键排序,这样相同的组合无论字段顺序如何都会得到同一个字符串。

from json import dumps

# The first record lists its keys in a different order on purpose: the
# grouping key below must not depend on key order.
data = [{"average": 2, "day": "2022-01-01",  "employee_id": 1, "name": "joe"},
        {"average": 3, "day": "2022-01-02", "name": "joe", "employee_id": 1},
        {"average": 9, "day": "2022-01-03", "name": "joe", "employee_id": 1},
        {"sum": 13,    "day": "2022-01-01", "name": "joe", "employee_id": 1},
        {"sum": 15,    "day": "2022-01-02", "name": "joe", "employee_id": 1},
        {"sum": 0,     "day": "2022-01-03", "name": "joe", "employee_id": 1},
        {"average": 1, "day": "2022-01-01", "name": "bob", "employee_id": 2},
        {"average": 3, "day": "2022-01-02", "name": "bob", "employee_id": 2},
        {"sum":     9, "day": "2022-01-01", "name": "bob", "employee_id": 2},
        {"sum":     8, "day": "2022-01-02", "name": "bob", "employee_id": 2}]


def flatten_employee_summaries(records, value_fields=("average", "sum")):
    """Merge records that share the same identifying fields into one dict.

    Every field of a record that is not listed in ``value_fields`` is treated
    as part of its identity. Records with equal identity are combined into a
    single dict holding the identity fields plus every value field seen.

    Args:
        records: Iterable of dicts whose identity fields are JSON-serializable.
        value_fields: Names of the fields that are merged rather than used
            for grouping.

    Returns:
        A list of merged dicts, in order of first appearance of each identity.
        The input dicts are not modified. Records that contain none of the
        ``value_fields`` are skipped.
    """
    merged = {}
    for record in records:
        if not any(field in record for field in value_fields):
            continue
        # Strip *all* value fields before building the key, so a record that
        # carries several of them still maps to a single entry.
        identity = {
            name: value
            for name, value in record.items()
            if name not in value_fields
        }
        # Dicts are unhashable; a JSON string with sorted keys is a stable,
        # order-independent stand-in.
        key = dumps(identity, sort_keys=True)
        merged.setdefault(key, {}).update(record)
    return list(merged.values())


flattend_employee_summaries = flatten_employee_summaries(data)

print(f'{flattend_employee_summaries=}')

这个问题已经有人回答了,下面的写法恐怕只是把 Mechanic Pig 的解决方案(我推荐该方案)用更冗长的方式重写了一遍。对于所有这些解决方案,我认为我们都假设每个员工每天只有一条平均值记录。

# Nested index: employee_id -> day -> {'name', 'average', 'sum'}.
employees = dict()
for data_row in data:
    employee_days = employees.setdefault(data_row['employee_id'], {})
    # The first record seen for an employee/day creates the entry with zeroed
    # values; later records for the same employee/day reuse it.
    day_details = employee_days.setdefault(
        data_row['day'],
        {'name': data_row.get('name', 0), 'average': 0, 'sum': 0},
    )
    # Sums accumulate across all records of the same employee/day.
    day_details['sum'] += data_row.get('sum', 0)
    # Store the average whenever a record carries one, so the result does not
    # depend on whether the "average" or the "sum" record arrives first.
    # If several averages arrive for one employee/day, the last one wins.
    if 'average' in data_row:
        day_details['average'] = data_row['average']

# Flatten the nested index back into one dict per employee/day.
employee_output = [
    {
        "name": employee_details['name'],
        "employee_id": employee_id,
        "day": employee_date,
        "average": employee_details['average'],
        "sum": employee_details['sum'],
    }
    for employee_id, employee_dates in employees.items()
    for employee_date, employee_details in employee_dates.items()
]

employee_output将包含:

[{'name': 'joe',
  'employee_id': 1,
  'day': '2022-01-01',
  'average': 2,
  'sum': 13},
 {'name': 'joe',
  'employee_id': 1,
  'day': '2022-01-02',
  'average': 3,
  'sum': 15},
 {'name': 'joe',
  'employee_id': 1,
  'day': '2022-01-03',
  'average': 9,
  'sum': 0},
 {'name': 'bob',
  'employee_id': 2,
  'day': '2022-01-01',
  'average': 1,
  'sum': 9},
 {'name': 'bob',
  'employee_id': 2,
  'day': '2022-01-02',
  'average': 3,
  'sum': 8}]

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM