[英]Convert a 5 level dictionary (with pd.Series as value) into a pandas DataFrame
原始問題:我正在使用python3。我有一些4級字典和5級字典。 我想用遞歸函數將這種多級字典轉換為pandas DataFrame
為了簡化我的問題並測試我的函數,我生成了如下所示的3級字典並嘗試了遞歸函數。 我了解到,使用這3個級別的嵌套字典,還有許多其他方法可以解決問題。 但是,我覺得只有遞歸函數才能輕松地解決4層,5層或更多層字典上的問題
要創建簡化的三級詞典:
from collections import defaultdict
def ddict():
    """Return an auto-vivifying nested dictionary.

    Any missing key yields another ``ddict()``, so arbitrarily deep paths
    such as ``tree['a']['b']['c'] = value`` can be assigned directly.
    """
    return defaultdict(ddict)
tree = ddict()
tree['level1_1']['level2_1']['level3_1'] = <pd.Series1>
tree['level1_1']['level2_1']['level3_2'] = <pd.Series2>
tree['level1_1']['level2_2']['level3_1'] = <pd.Series3>
tree['level1_1']['level2_2']['level3_2'] = <pd.Series4>
tree['level1_2']['level2_1']['level3_1'] = <pd.Series5>
tree['level1_2']['level2_1']['level3_2'] = <pd.Series6>
tree['level1_2']['level2_2']['level3_1'] = <pd.Series7>
tree['level1_2']['level2_2']['level3_2'] = <pd.Series8>
受以下Bart Cubrich的回答啟發,我修改了他的代碼並將解決方案放在此處
import collections
def tree2df(d, colname):
    """Flatten a nested dict of pandas Series into a long-format DataFrame.

    Inputs:
        1. d (a nested dict, or a tree, all values are pd.Series).
           Keys are converted to ``str`` for the level columns.
        2. colname (a list) with one column name per nesting level,
           outermost level first. The list is not modified.
    Return:
        1. a pd.DataFrame with one column per level, an 'Old index' column
           holding each Series' original index, and a 'Value' column.
           An empty tree gives an empty DataFrame with those columns.
           If ``len(colname)`` differs from the deepest nesting level, a
           message is printed and ``None`` is returned.
    """
    # ``collections.MutableMapping`` was removed in Python 3.10; the ABC
    # lives in ``collections.abc``.
    from collections.abc import MutableMapping

    def flatten(node, path=()):
        """Return ``{key_path_tuple: leaf}`` for every leaf below *node*."""
        flat = {}
        for k, v in node.items():
            # Keep the path as a tuple instead of a '-'-joined string so
            # keys that themselves contain '-' are not split apart later.
            new_path = path + (str(k),)
            if isinstance(v, MutableMapping):
                flat.update(flatten(v, new_path))
            else:
                flat[new_path] = v
        return flat

    flat_dict = flatten(d)
    if not flat_dict:
        # Nothing to concatenate; return the expected (empty) layout.
        return pd.DataFrame(columns=list(colname) + ['Old index', 'Value'])

    levels = list(flat_dict.keys())
    vals = list(flat_dict.values())
    max_level = max(len(level) for level in levels)
    if len(colname) != max_level:
        print("The number of column names is invalid: it must equal the maximum level: %s.\nNothing will be returned. Please revise the colname!" % max_level)
        return None

    # Build a new list so the caller's ``colname`` is left untouched.
    names = list(colname) + ['Old index']
    s = pd.concat(vals, keys=levels, names=names)
    s = pd.DataFrame(s)
    s.reset_index(inplace=True)
    s.rename(columns={0: 'Value'}, inplace=True)
    return s
# Example: pass one column name per nesting level of the tree (five here).
# NOTE(review): BlockEvent_TS_tree is not defined in this snippet; it is
# presumably the author's own 5-level tree of pd.Series.
BlockEvent_TS_df = tree2df (BlockEvent_TS_tree, ['ID','Session','Trial type','Block', 'Event name'])
5級嵌套字典與3級嵌套字典具有相同的概念:
tree['level1_1']['level2_1']['level3_1']['level4_1']['level5_1'] = <pd.Series1>
...
tree['level1_2']['level2_2']['level3_2']['level4_2']['level5_2'] = <pd.Series32>
因為我有一個很大的數據集,所以在這里顯示整個嵌套字典非常復雜。 但是,想法是這樣的。 然后,我想得到6列:其中5列分別存儲每個級別的鍵,另外一列存儲值。
我已經嘗試了上面的代碼,並且對我來說效果很好。 速度也很不錯。
感謝你的幫助!
你需要:
# Collect every leaf of the 3-level tree under its (level1, level2, level3)
# key path, then turn the paths into index columns next to a 'Value' column.
format_ = {}
for level1_key, level2_dict in tree.items():
    for level2_key, level3_dict in level2_dict.items():
        for level3_key, values in level3_dict.items():
            format_[(level1_key, level2_key, level3_key)] = values

wide = pd.DataFrame(format_, index=['Value'])
df = wide.T.reset_index()
輸出:
level_0 level_1 level_2 Value
0 level1_1 level2_1 level3_1 1
1 level1_1 level2_1 level3_2 2
2 level1_1 level2_2 level3_1 3
3 level1_1 level2_2 level3_2 4
4 level1_2 level2_1 level3_1 5
5 level1_2 level2_1 level3_2 6
6 level1_2 level2_2 level3_1 7
7 level1_2 level2_2 level3_2 8
因此,我的解決方案是遍歷樹,查看所有鍵,並將每個元素路徑構建為數組,然后從記錄創建DataFrame。 我將這些步驟分為各自的方法。
可能有一種更有效的方法,但這應該可以完成工作。 希望這可以幫助。
def traverse_tree(d, prefix='', results=None):
    """Walk a nested mapping and collect one record per leaf.

    Args:
        d: Nested mapping to traverse. Any value that is not a mapping
            (int, float, str, pd.Series, ...) is treated as a leaf.
        prefix: Comma-separated key path that is prepended to every record.
        results: Optional list to append the records to. A fresh list is
            created when omitted, so separate calls do not share state.

    Returns:
        The ``results`` list. Each record is ``[key1, key2, ..., leaf]``
        with the keys converted to ``str``.
    """
    from collections.abc import Mapping

    if results is None:
        results = []

    if not isinstance(d, Mapping):
        # Called directly on a leaf: the record is the prefix path + value.
        record = str(prefix).split(',')
        record.append(d)
        results.append(record)
        return results

    def walk(node, path):
        # Carry the path as a list so keys containing ',' stay intact.
        if not isinstance(node, Mapping):
            results.append(path + [node])
            return
        for key, child in node.items():
            walk(child, path + [str(key)])

    start_path = str(prefix).split(',') if prefix != '' else []
    walk(d, start_path)
    return results
def dict_to_df(d):
    """Convert a nested dict into a DataFrame with one row per leaf.

    Args:
        d: Nested mapping accepted by ``traverse_tree``.

    Returns:
        A pd.DataFrame whose columns are ``L1 .. Ln`` (one per key level,
        sized from the first record) followed by ``Value``. An empty tree
        gives an empty DataFrame with only the ``Value`` column.
    """
    # Traverse the argument (not a global) and pass a fresh accumulator so
    # repeated calls never see records from an earlier call.
    res = traverse_tree(d, results=[])
    if not res:
        return pd.DataFrame(columns=['Value'])

    depth = len(res[0]) - 1
    labels = ['L' + str(i + 1) for i in range(depth)]
    labels.append('Value')
    return pd.DataFrame.from_records(res, columns=labels)
if __name__ == '__main__':
    # Demo: fill a 2 x 2 x 2 tree with the leaf values 1..8 (in insertion
    # order) and print the resulting DataFrame.
    tree = ddict()
    leaf_value = 0
    for first in ('level1_1', 'level1_2'):
        for second in ('level2_1', 'level2_2'):
            for third in ('level3_1', 'level3_2'):
                leaf_value += 1
                tree[first][second][third] = leaf_value
    df = dict_to_df(tree)
    print(df)
盡管看起來有些混亂,但該版本在各種級別的深度下都可以使用。
import pandas as pd
from collections import defaultdict
def ddict():
    """Return a defaultdict whose missing keys are themselves ``ddict()``.

    This lets nested paths of any depth be assigned without creating the
    intermediate dictionaries first.
    """
    return defaultdict(ddict)
# Sample data: seven leaves sit three levels deep.
tree = ddict()
tree['level1_1']['level2_1']['level3_1'] = 1
tree['level1_1']['level2_1']['level3_2'] = 2
tree['level1_1']['level2_2']['level3_1'] = 3
tree['level1_1']['level2_2']['level3_2'] = 4
tree['level1_2']['level2_1']['level3_1'] = 5
tree['level1_2']['level2_1']['level3_2'] = 6
tree['level1_2']['level2_2']['level3_1'] = 7
# The last leaf is one level deeper (four keys), giving a tree of uneven depth.
tree['level1_2']['level2_2']['level3_2']['Level4_1'] = 8
import collections
def flatten(d, parent_key='', sep='-'):
    """Flatten a nested mapping into a single-level dict.

    Args:
        d: Nested mapping to flatten.
        parent_key: Key path accumulated so far (empty at the top level).
        sep: Separator placed between the keys of a path.

    Returns:
        A dict mapping each joined key path (e.g. ``'a-b-c'``) to its leaf
        value. Top-level keys are kept as they are; nested keys are
        converted to ``str`` when joined.
    """
    # ``collections.MutableMapping`` was removed in Python 3.10; the ABC
    # lives in ``collections.abc``.
    from collections.abc import MutableMapping

    flat = {}
    for k, v in d.items():
        # str() lets non-string keys (ints, tuples, ...) be joined as well.
        new_key = str(parent_key) + str(sep) + str(k) if parent_key else k
        if isinstance(v, MutableMapping):
            flat.update(flatten(v, new_key, sep=sep))
        else:
            flat[new_key] = v
    return flat
# Flatten the tree to {'k1-k2-...': value}, then split each joined key back
# into its individual levels.
flat_dict = flatten(tree)
levels = [key.split('-') for key in flat_dict]
vals = list(flat_dict.values())

# The deepest path decides how many level columns the frame needs.
max_level = max((len(level) for level in levels), default=0)

# Build every row up front instead of growing the frame cell by cell with
# df.loc: paths shorter than the deepest one are padded with NaN, and the
# value always goes in the last column (index max_level).
rows = [
    level + [float('nan')] * (max_level - len(level)) + [val]
    for level, val in zip(levels, vals)
]
df = pd.DataFrame(rows, columns=range(max_level + 1))
df
Out:
0 1 2 3 4
0 level1_1 level2_1 level3_1 NaN 1
1 level1_1 level2_1 level3_2 NaN 2
2 level1_1 level2_2 level3_1 NaN 3
3 level1_1 level2_2 level3_2 NaN 4
4 level1_2 level2_1 level3_1 NaN 5
5 level1_2 level2_1 level3_2 NaN 6
6 level1_2 level2_2 level3_1 NaN 7
7 level1_2 level2_2 level3_2 Level4_1 8
我從這里得到了扁平化的想法
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.