[英]How to convert a nested JSON to CSV
我想將嵌套的 json 轉換為 csv 格式,包括分組列表/字典的子行。
這是我的 json
# Sample document: top-level scalar fields, one nested object ("item") and a
# list of packages that each carry their own list of products.
data = {
    "id": "1",
    "name": "HIGHLEVEL",
    "description": "HLD",
    "item": {
        "id": "11",
        "description": "description",
    },
    "packages": [
        {
            "id": "1",
            "label": "Package 1",
            "products": [
                {"id": "1", "price": 5},
                {"id": "2", "price": 3},
            ],
        },
        {
            "id": "2",
            "label": "Package 3",
            "products": [
                {"id": "1", "price": 5},
                {"id": "2", "price": 3},
            ],
        },
    ],
}

import pandas as pd

# Called without record_path, json_normalize expands the nested "item" dict
# into dotted columns but keeps the whole "packages" list in a single cell.
df = pd.json_normalize(data)
# display(df)
description id name packages item.description item.id
0 HLD 1 HIGHLEVEL [{'id': '1', 'label': 'Package 1', 'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}, {'id': '2', 'label': 'Package 3', 'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}] description 11
JSON 轉 CSV 轉換器的輸出(我想要的結果):
"id","name","description","item__id","item__description","packages__id","packages__label","packages__products__id","packages__products__price"
"1","HIGHLEVEL","HLD","11","description","1","Package 1","1","5"
"","","","","","","","2","3"
"","","","","","2","Package 3","1","5"
"","","","","","","","2","3"
我嘗試了 pandas 的 json_normalize,但結果與想要的不一樣:JSON 數組在 csv 中沒有轉換成子行。我想在 csv 中保留空字符串(如上面的期望輸出所示)。
我想做同樣的事情,但使用 Python 腳本。
這應該適合你:
from copy import deepcopy
import pandas
def cross_join(left, right):
    """Return the cross product of two lists of row dicts.

    Every row of ``left`` is combined with every row of ``right``; on a key
    clash the value from the right-hand row wins. Rows are produced with
    ``left`` as the outer loop and ``right`` as the inner loop.

    Args:
        left: List of row dicts.
        right: List of row dicts.

    Returns:
        A new list of merged rows, each an independent deep copy of its
        inputs. When ``right`` is empty, ``left`` itself is returned (the
        same list object, not a copy).
    """
    if not right:
        return left
    # Merge first, then copy once: a single deepcopy of the merged dict is
    # enough to detach the new row from both source rows.
    return [
        deepcopy({**left_row, **right_row})
        for left_row in left
        for right_row in right
    ]
def flatten_list(data):
    """Lazily yield the items of ``data``, descending into nested lists.

    Only ``list`` instances are expanded; any other element (including
    tuples and dicts) is yielded unchanged, in depth-first, left-to-right
    order.
    """
    for item in data:
        if not isinstance(item, list):
            yield item
            continue
        for nested in flatten_list(item):
            yield nested
def json_to_dataframe(data_in):
    """Flatten a nested JSON-like object into a ``pandas.DataFrame``.

    Dicts contribute columns (nested keys are joined with ``_``) and lists
    contribute rows, so each combination of list elements along a path
    becomes one row. The input is not modified.

    Args:
        data_in: A dict, list or scalar as produced by ``json.load``.

    Returns:
        A ``pandas.DataFrame`` built from the flattened row dicts.
    """

    def flatten_json(data, prev_heading=''):
        """Return the list of flat row dicts that ``data`` expands to."""
        if isinstance(data, dict):
            rows = [{}]
            for key, value in data.items():
                rows = cross_join(rows, flatten_json(value, prev_heading + '_' + key))
            return rows

        if isinstance(data, list):
            # An empty list still yields one row holding an empty string for
            # this heading. Use a stand-in list instead of appending to the
            # caller's data, which would silently modify the input.
            elements = data if data else [""]
            rows = []
            for element in elements:
                rows.extend(flatten_list(flatten_json(element, prev_heading)))
            return rows

        # Scalar leaf: headings are accumulated as '_key', so strip the
        # leading underscore to get the column name.
        return [{prev_heading[1:]: data}]

    return pandas.DataFrame(flatten_json(data_in))
def remove_duplicates(df, columns=None):
    """Blank out repeated values in the given columns of ``df``.

    Within each selected column, every value that already occurred in an
    earlier row is replaced by an empty string (first occurrence is kept).

    Args:
        df: The DataFrame to process. It is modified in place.
        columns: Iterable of column labels to process. Defaults to the first
            seven columns of ``df``.

    Returns:
        The same ``df`` object, for call chaining.
    """
    if columns is None:
        # Default kept for backward compatibility with existing callers.
        columns = list(df)[:7]
    for column in columns:
        df[column] = df[column].mask(df[column].duplicated(), "")
    return df
if __name__ == '__main__':
    # Flatten the sample document, blank the repeated parent values, then
    # show the table and save it as CSV without the index column.
    df = remove_duplicates(json_to_dataframe(data))
    print(df)
    df.to_csv('data.csv', index=False)
輸入 01:
# Input 01: two packages, each listing three products.
data = {
    "id": "1",
    "name": "HIGHLEVEL",
    "description": "HLD",
    "item": {
        "id": "11",
        "description": "description",
    },
    "packages": [
        {
            "id": "1",
            "label": "Package 1",
            "products": [
                {"id": "1", "price": 5},
                {"id": "2", "price": 3},
                {"id": "3", "price": 9},
            ],
        },
        {
            "id": "2",
            "label": "Package 3",
            "products": [
                {"id": "1", "price": 5},
                {"id": "2", "price": 3},
                {"id": "3", "price": 9},
            ],
        },
    ],
}
輸出 01:
輸入 02:
# Input 02: same document, but every package has an empty product list.
data = {
    "id": "1",
    "name": "HIGHLEVEL",
    "description": "HLD",
    "item": {
        "id": "11",
        "description": "description",
    },
    "packages": [
        {
            "id": "1",
            "label": "Package 1",
            "products": [],
        },
        {
            "id": "2",
            "label": "Package 3",
            "products": [],
        },
    ],
}
希望它能解決您的問題。 如果您需要任何解釋,請告訴我。
謝謝
謝謝@Trenton McKinney
import pandas as pd
import json
# Sample document: top-level scalars, a nested "item" object and a list of
# packages that each carry a list of products.
data = {
    'description': 'HLD',
    'id': '1',
    'item': {'description': 'description', 'id': '11'},
    'name': 'HIGHLEVEL',
    'packages': [
        {
            'id': '1',
            'label': 'Package 1',
            'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}],
        },
        {
            'id': '2',
            'label': 'Package 3',
            'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}],
        },
    ],
}

# One row per package; the top-level scalars and the "item" fields are
# carried along as meta columns prefixed with "top_".
df = pd.json_normalize(
    data,
    record_path=['packages'],
    meta=['id', 'name', 'description', ['item', 'id'], ['item', 'description']],
    meta_prefix='top_',
    sep='_',
)

# One row per product. explode() repeats the package's index label on every
# product row, so renumber the index: the join below aligns on index labels
# and would otherwise attach the wrong product to each row.
df = df.explode('products').reset_index(drop=True)
df.rename({'id': 'packages_id', 'label': 'packages_label'}, axis=1, inplace=True)

# Turn each product dict into its own columns and attach them row by row.
df = df.join(pd.DataFrame(df.pop('products').values.tolist()))
df.rename({'id': 'packages_products_id', 'price': 'packages_products_price'}, axis=1, inplace=True)

# Drop the temporary meta prefix and put the columns in the target order.
df.columns = df.columns.str.replace('top_', '')
df = df[['id', 'name', 'description', 'item_id', 'item_description', 'packages_id', 'packages_label', 'packages_products_id', 'packages_products_price']]

# Blank repeated parent values so that only the first row of each group shows
# them; "id" is included so the top-level id is blanked in sub-rows as well.
columns_to_group = ["id", "name", "description", "item_id", "item_description", "packages_id", "packages_label"]
for c in columns_to_group:
    df[c] = df[c].mask(df[c].duplicated(), "")

print(df)
df.to_csv('data.csv', index=False)
現在我必須讓它更通用,以便它可以用於任何結構化的 json。
import json # For JSON loading
import csv # For CSV dict writer
def get_leaves(item, key=None, key_prefix=""):
    """Flatten a nested dict/list structure into a single-level dict.

    Args:
        item: The value to flatten: a dict, a list, or a scalar leaf.
        key: The dict key under which ``item`` was found (``None`` at the
            top level). Used only to name a list of scalars when there is
            no prefix.
        key_prefix: The ancestor keys joined with ``_``; it becomes the
            output key of a leaf.

    Returns:
        A dict mapping flattened key names to leaf values. Scalars found
        directly inside a list are collected into one list value. When
        several dicts in a list produce the same flattened key, the last
        one wins.
    """
    if isinstance(item, dict):
        leaves = {}
        for item_key, value in item.items():
            # Join ancestor keys with "_" so that equal key names at
            # different nesting levels map to distinct output keys.
            child_prefix = (
                item_key if (key_prefix == "") else (key_prefix + "_" + str(item_key))
            )
            leaves.update(get_leaves(value, item_key, child_prefix))
        return leaves

    if isinstance(item, list):
        leaves = {}
        scalars = []
        for element in item:
            if isinstance(element, (dict, list)):
                # Containers are flattened under the list's own name.
                leaves.update(get_leaves(element, key, key_prefix))
            else:
                scalars.append(element)
        if scalars:
            # Name the scalar list by its full prefix, like every other leaf,
            # so same-named lists under different parents do not overwrite
            # each other. Fall back to ``key`` when there is no prefix.
            leaves[key_prefix if key_prefix != "" else key] = scalars
        return leaves

    return {key_prefix: item}
with open("./campaign-summary.json") as f_input, open("./finalised_output.csv", "w", newline="") as f_output:
    json_data = json.load(f_input, strict=False)

    # Two passes over the loaded entries. The first pass only collects the
    # flattened key names, because the CSV header must list every column
    # before any row is written. The second pass flattens each entry again
    # while writing, so the flattened rows are never all held in a list.
    fieldnames = {name for entry in json_data for name in get_leaves(entry)}

    csv_output = csv.DictWriter(f_output, delimiter=";", fieldnames=sorted(fieldnames))
    csv_output.writeheader()
    for entry in json_data:
        csv_output.writerow(get_leaves(entry))
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.