[英]creating nested dictionaries and lists from parsed CSV
我一直在從事一個涉及解析 CSV 文件的項目,以便按照復雜的模式將所有數據轉換為格式非常明確的 JSON。 我必須定制這個程序,因為 JSON 所需的復雜性使現有轉換器失敗。 我大部分時間都在那里,但我遇到了最后一個障礙:
我有嵌套的字典,偶爾其中必須有一個列表,這個列表將包含更多的字典。 這很好,我已經能夠完成它,但是現在我需要找到一種方法在其中添加更多嵌套字典。 下面是對該概念的簡化分解。
CSV 看起來像這樣,其中標簽前的 @ 表示它是一個列表
x.a, x.b.z, x.b.y, x.@c.z.nest1, x.@c.z.nest2, x.@c.yy, x.d, x.e.z, x.e.y
ab, cd, ef, gh, ij, kl, mn, op, qr
這應該導致以下 JSON
{
"x": {
"a": "ab",
"b": {
"z": "cd",
"y": "ef"
},
"c": [
{
"z": {
"nest1": "gh",
"nest2": "ij"
}
},
{
"yy": "kl"
}
],
"d": "mn",
"e": {
"z": "op",
"y": "qr"
}
}
}
這是我無法解決的一個問題,我當前的代碼只能在列表項之后執行一個字典,不能更進一步。 我還需要能夠以某種方式在字典列表中執行以下操作:
"c": [
{
"z": {
"nest1": "gh"
},
"zz": {
"nest2": "ij"
}
},
{
"yy": "kl"
}
即以某種方式在列表中的字典中添加多個嵌套字典。 這個問題發生在這些不能通過名稱引用的事實中,所以我不知道我如何可能指示在 CSV 格式中這樣做。
這是我擁有的代碼,它適用於嵌套在列表中的第一個字典:
import json
import pandas as pd
from os.path import exists
# df1 = pd.read_csv("excelTestFacilities.csv", header = 1, sep=",", keep_default_na=False, engine="python")
# df2 = pd.read_csv("excelTestFacilityContacts.csv", header = 1, sep=",", keep_default_na=False, engine="python")
# df = pd.merge(df1, df2, how = 'inner')
df = pd.read_csv("csvTestFile.csv", header = 1, sep=", ", keep_default_na=False, engine="python")
#print(df) # uncomment to see the transformation
json_data = df.to_dict(orient="records")
#print(json_data)
def unflatten_dic(dic):
"""
Unflattens a CSV list into a set of nested dictionaries
"""
ini = {}
for k,v in list(dic.items()):
node = ini
list_bool = False
*parents, key = k.split('.')
for parent in parents:
if parent[0] == '@':
list_bool = True
if list_bool:
for parent in parents:
if parent[0] == '@':
node[parent[1:]] = node = node.get(parent[1:], [])
else:
node[parent] = node = node.get(parent, {})
node.append({key : v})
else:
for parent in parents:
node[parent] = node = node.get(parent, {})
node[key] = v
return ini
def merge_lists(dic):
"""
Removes duplicates within sets
"""
for k,v in list(dic.items()):
if isinstance(v, dict):
keys = list(v.keys())
vals = list(v.values())
if all(isinstance(l, list) and len(l)==len(vals[0]) for l in vals):
dic[k] = []
val_tuple = set(zip(*vals)) # removing duplicates with set()
for t in val_tuple:
dic[k].append({subkey: t[i] for i, subkey in enumerate(keys)})
else:
merge_lists(v)
elif isinstance(v, list):
dic[k] = list(set(v)) # removing list duplicates
def clean_blanks(value):
"""
Recursively remove all None values from dictionaries and lists, and returns
the result as a new dictionary or list.
"""
if isinstance(value, list):
return [clean_blanks(x) for x in value if x != ""]
elif isinstance(value, dict):
return {
key: clean_blanks(val)
for key, val in value.items()
if val != "" and val != {}
}
else:
return value
def add_to_dict(section_added_to, section_to_add, value, reportNum):
"""
Adds a value to a given spot within a dictionary set.
section_added_to is optional for adding the set to a deeper section such as facility
section_to_add is the name that the new dictionary entry will have
value is the item to be added
reportNum is the number indicating which report to add to, starting at 0
"""
if section_added_to != '':
end_list[reportNum][section_added_to][section_to_add] = value
else:
end_list[reportNum][section_to_add] = value
def read_add_vals(filename_prefix, added_to, section):
for i in range(len(end_list)):
temp_list = []
filename = filename_prefix + str(i+1) + ".csv"
if not exists(filename):
continue;
temp_df = pd.read_csv(filename, header = 1, sep=",", keep_default_na=False, engine="python")
temp_json = temp_df.to_dict(orient="records")
for y in temp_json:
return_ini = unflatten_dic(y)
temp_list.append(return_ini)
add_to_dict(added_to, section, temp_list, i)
global end_list
end_list = []
for x in json_data:
return_ini = unflatten_dic(x)
end_list.append(return_ini)
#read_add_vals('excelTestPermitsFac', 'facility', 'permits');
json_data = clean_blanks(end_list)
final_json = {"year":2021, "version":"2022-02-14", "reports":json_data}
print(json.dumps(final_json, indent=4))
此代碼的某些部分涉及到整體端 JSON 的其他組件,但我主要關心如何更改 unflatten_dic() 這是我當前用於更改 unflatten_dic() 的工作代碼,即使它不起作用...
def list_get(list, list_item):
i = 0
for dict in list:
if list_item in dict:
return dict.get(list_item, {})
i += 1
return {}
def check_in_list(list, list_item):
i = 0
for dict in list:
if list_item in dict:
return i
i += 1
return -1
def unflatten_dic(dic):
"""
Unflattens a CSV list into a set of nested dictionaries
"""
ini = {}
for k,v in list(dic.items()):
node = ini
list_bool = False
*parents, key = k.split('.')
for parent in parents:
if parent[0] == '@':
list_bool = True
previous_node_list = False
if list_bool:
for parent in parents:
print(parent)
if parent[0] == '@':
node[parent[1:]] = node = node.get(parent[1:], [])
ends_with_dict = False
previous_node_list = True
else:
print("else")
if previous_node_list:
print("prev list")
i = check_in_list(node, parent)
if i >= 0:
node[i] = node = list_get(node, parent)
else:
node.append({parent : {}})
previous_node_list = False
ends_with_dict = True
else:
print("not prev list")
node[parent] = node = node.get(parent, {})
previous_node_list = False
if ends_with_dict:
node[key] = v
else:
node.append({key : v})
else:
for parent in parents:
node[parent] = node = node.get(parent, {})
node[key] = v
#print(node)
return ini
任何,即使是少量的幫助將不勝感激。
最簡單的方法是使用遞歸和collections.defaultdict
對父項上的子條目進行分組(每個條目在 csv 數據中由.
分隔):
from collections import defaultdict
def to_dict(vals, is_list = 0):
def form_child(a, b):
return b[0][0] if len(b[0]) == 1 else to_dict(b, a[0] == '@')
d = defaultdict(list)
for a, *b in vals:
d[a].append(b)
if not is_list:
return {a[a[0] == '@':]:form_child(a, b) for a, b in d.items()}
return [{a[a[0] == '@':]:form_child(a, b)} for a, b in d.items()]
import csv, json
with open('filename.csv') as f:
data = list(csv.reader(f))
r = [a.split('.')+[b] for i in range(0, len(data), 2) for a, b in zip(data[i], data[i+1])]
print(json.dumps(to_dict(r), indent=4))
輸出:
{
"x": {
"a": "ab",
"b": {
"z": "cd",
"y": "ef"
},
"c": [
{
"z": {
"nest1": "gh",
"nest2": "ij"
}
},
{
"yy": "kl"
}
],
"d": "mn",
"e": {
"z": "op",
"y": "qr"
}
}
}
我設法讓它在似乎所有情況下都能正常工作。 這是我為 unflatten_dic() 函數編寫的代碼。
def unflatten_dic(dic):
"""
Unflattens a CSV list into a set of nested dictionaries
"""
ini = {}
for k,v in list(dic.items()):
node = ini
list_bool = False
*parents, key = k.split('.')
# print("parents")
# print(parents)
for parent in parents:
if parent[0] == '@':
list_bool = True
if list_bool:
for parent in parents:
if parent[0] == '@':
node[parent[1:]] = node = node.get(parent[1:], [])
elif parent.isnumeric():
# print("numeric parent")
# print("length of node")
# print(len(node))
if len(node) > int(parent):
# print("node length good")
node = node[int(parent)]
else:
node.append({})
node = node[int(parent)]
else:
node[parent] = node = node.get(parent, {})
try:
node.append({key : v})
except AttributeError:
node[key] = v
else:
for parent in parents:
node[parent] = node = node.get(parent, {})
node[key] = v
return ini
到目前為止,我還沒有遇到問題,這是基於 CSV 的以下規則:
@ 在任何名稱之前都會導致該項目成為列表
如果 CSV 中列表之后的部分是數字,則會在列表中創建多個字典。 這是一個例子
x.a, x.b.z, x.b.y, x.@c.0.zz, x.@c.1.zz, x.@c.2.zz, x.d, x.e.z, x.e.y, x.@c.1.yy.l, x.@c.1.yy.@m.q, x.@c.1.yy.@m.r
ab, cd, ef, gh, , kl, mn, op, qr, st, uv, wx
12, 34, 56, 78, 90, 09, , 65, 43, 21, , 92
這將在格式化后產生以下 JSON
"reports": [
{
"x": {
"a": "ab",
"b": {
"z": "cd",
"y": "ef"
},
"c": [
{
"zz": "gh"
},
{
"yy": {
"l": "st",
"m": [
{
"q": "uv"
},
{
"r": "wx"
}
]
}
},
{
"zz": "kl"
}
],
"d": "mn",
"e": {
"z": "op",
"y": "qr"
}
}
},
{
"x": {
"a": "12",
"b": {
"z": "34",
"y": "56"
},
"c": [
{
"zz": "78"
},
{
"zz": "90",
"yy": {
"l": "21",
"m": [
{
"r": "92"
}
]
}
},
{
"zz": "09"
}
],
"e": {
"z": "65",
"y": "43"
}
}
}
]
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.