簡體   English   中英

如何將嵌套的 JSON 轉換為 CSV

[英]How to convert a nested JSON to CSV

我想將嵌套的 json 轉換為 csv 格式,包括分組列表/字典的子行。

這是我的 json

# Sample nested JSON document: top-level scalar fields, a nested dict ("item")
# and a list of packages, each of which holds its own list of products.
data =\
{
    "id": "1",
    "name": "HIGHLEVEL",
    "description": "HLD",
    "item": {
        "id": "11",
        "description": "description"
    },
    "packages": [{
            "id": "1",
            "label": "Package 1",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }
            ]
        }, {
            "id": "2",
            "label": "Package 3",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }
            ]
        }
    ]
}
import pandas as pd

# A plain json_normalize call flattens the nested dict into item.* columns but
# leaves the whole "packages" list in a single cell (see the output below).
df = pd.json_normalize(data)

# display(df)
  description id       name                                                                                                                                                                                                packages item.description item.id
0         HLD  1  HIGHLEVEL  [{'id': '1', 'label': 'Package 1', 'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}, {'id': '2', 'label': 'Package 3', 'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}]      description      11

JSON 轉 CSV 轉換器的輸出(我想要的結果):

"id","name","description","item__id","item__description","packages__id","packages__label","packages__products__id","packages__products__price"
"1","HIGHLEVEL","HLD","11","description","1","Package 1","1","5"
"","","","","","","","2","3"
"","","","","","2","Package 3","1","5"
"","","","","","","","2","3"

我嘗試了 pandas 的 json_normalize,但結果與我想要的不一樣:JSON 數組在 CSV 中沒有被展開成子行。另外,我希望在 CSV 中保留空字符串。

我想做同樣的事情,但使用 Python 腳本。

這應該適合你:

from copy import deepcopy
import pandas


def cross_join(left, right):
    """Combine every row of ``left`` with every row of ``right``.

    Both arguments are lists of dict rows. Each output row holds the keys of
    one left row followed by the keys of one right row; when a key occurs in
    both, the right value wins. Output rows are deep copies, so changing them
    never affects the input rows.

    If ``right`` is empty there is nothing to combine with, and ``left``
    itself (the same list object) is returned unchanged.
    """
    if not right:
        return left
    # One deep copy per side is enough to make the merged row independent of
    # both inputs; copying the merged row a second time would be wasted work.
    return [
        {**deepcopy(left_row), **deepcopy(right_row)}
        for left_row in left
        for right_row in right
    ]


def flatten_list(data):
    """Lazily yield the non-list items of ``data``, descending into nested lists.

    Items are produced in their original left-to-right order. Only ``list``
    instances are unpacked; every other value is yielded as-is.
    """
    for item in data:
        if not isinstance(item, list):
            yield item
            continue
        # Nested list: hand out its flattened contents one by one.
        for nested in flatten_list(item):
            yield nested


def json_to_dataframe(data_in):
    """Flatten a nested JSON-like structure into a ``pandas.DataFrame``.

    Dict keys are joined with ``_`` to form column names. Every element of a
    list becomes its own row (or rows), and the rows produced by the different
    keys of a dict are combined with ``cross_join``. An empty list contributes
    a single empty-string cell so that the parent row is still emitted.

    The input is only read; it is never modified.
    """
    def flatten_json(data, prev_heading=''):
        # Returns a list of flat dict rows for ``data``.
        if isinstance(data, dict):
            rows = [{}]
            for key, value in data.items():
                rows = cross_join(rows, flatten_json(value, f'{prev_heading}_{key}'))
        elif isinstance(data, list):
            # Substitute a one-element stand-in for an empty list rather than
            # appending "" to the caller's list, which would alter the input.
            items = data if data else [""]
            rows = []
            for item in items:
                rows.extend(flatten_list(flatten_json(item, prev_heading)))
        else:
            # Scalar leaf: drop the leading "_" that the key joining added.
            rows = [{prev_heading[1:]: data}]
        return rows

    return pandas.DataFrame(flatten_json(data_in))

def remove_duplicates(df, columns=None):
    """Blank out repeated values in the given columns of ``df``.

    Within each selected column, every value that has already appeared in an
    earlier row is replaced by an empty string, so only its first occurrence
    stays visible.

    Args:
        df: The DataFrame to modify. It is changed in place.
        columns: Column labels to process. Defaults to the first seven
            columns of ``df``, which is what the original hard-coded slice
            selected.

    Returns:
        The same DataFrame object that was passed in.
    """
    if columns is None:
        columns = list(df)[:7]
    for column in columns:
        df[column] = df[column].mask(df[column].duplicated(), "")

    return df


if __name__ == '__main__':
    # Flatten the sample document and blank the repeated parent values,
    # then show the table and save it as CSV without the index column.
    df = remove_duplicates(json_to_dataframe(data))

    print(df)
    df.to_csv('data.csv', index=False)

輸入 01:

# Input 01: two packages, each with three products.
data = {
    "id": "1",
    "name": "HIGHLEVEL",
    "description": "HLD",
    "item": {
        "id": "11",
        "description": "description"
    },
    "packages": [{
            "id": "1",
            "label": "Package 1",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }, {
                    "id": "3",
                    "price": 9
                }
            ]
        }, {
            "id": "2",
            "label": "Package 3",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }, {
                    "id": "3",
                    "price": 9
                }
            ]
        }
    ]
}

Output 01:

在此處輸入圖像描述

輸入 02:

# Input 02: both packages have an empty "products" list.
data = {
    "id": "1",
    "name": "HIGHLEVEL",
    "description": "HLD",
    "item": {
        "id": "11",
        "description": "description"
    },
    "packages": [{
            "id": "1",
            "label": "Package 1",
            "products": []
        }, {
            "id": "2",
            "label": "Package 3",
            "products": []
        }
    ]
}

Output 02: 在此處輸入圖像描述

希望它能解決您的問題。 如果您需要任何解釋,請告訴我。

謝謝

謝謝@Trenton McKinney

import pandas as pd
import json

data =\
{'description': 'HLD',
 'id': '1',
 'item': {'description': 'description', 'id': '11'},
 'name': 'HIGHLEVEL',
 'packages': [{'id': '1',
               'label': 'Package 1',
               'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]},
              {'id': '2',
               'label': 'Package 3',
               'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}]}


# One row per package; the top-level fields are carried along as "top_*" columns.
df = pd.json_normalize(data, record_path=['packages'], meta=['id', 'name', 'description', ['item', 'id'], ['item', 'description']], meta_prefix='top_', sep='_')
# explode() gives one row per product but repeats the package's index label on
# each of them. Reset to a unique 0..n-1 index so the index-based join below
# pairs every row with its own product instead of the first product that
# shares the label.
df = df.explode('products').reset_index(drop=True)

df.rename({'id': 'packages_id', 'label': 'packages_label'}, axis=1, inplace=True)
# Turn the column of product dicts into real columns and attach them row by row.
df = df.join(pd.DataFrame(df.pop('products').values.tolist()))

df.rename({'id': 'packages_products_id', 'price': 'packages_products_price'}, axis=1, inplace=True)

# Strip the temporary meta prefix (literal text, not a regular expression).
df.columns = df.columns.str.replace('top_', '', regex=False)

df = df[['id', 'name', 'description', 'item_id', 'item_description', 'packages_id', 'packages_label', 'packages_products_id', 'packages_products_price']]

# NOTE(review): "id" is not in this list, so the top-level id is repeated on
# every row; add it here if it should be blanked like the other parent columns.
columns_to_group = ["name", "description", "item_id", "item_description", "packages_id", "packages_label"]

# Keep only the first occurrence of each repeated parent value; blank the rest.
for c in columns_to_group:
  df[c] = df[c].mask(
    df[c].duplicated(), ""
  )

print(df)

df.to_csv('data.csv', index=False)

現在我必須讓它更通用,以便它可以用於任何結構化的 json。

import json  # For JSON loading
import csv  # For CSV dict writer


def get_leaves(item, key=None, key_prefix=""):
    """Flatten a nested dict/list structure into a single flat dict.

    Args:
        item: The value to flatten (dict, list or scalar).
        key: The key under which ``item`` was found in its parent. It is
            passed along on recursive calls and kept for backward
            compatibility; result keys are always built from ``key_prefix``.
        key_prefix: The ``_``-joined path of keys leading to ``item``.

    Returns:
        A dict mapping each ``_``-joined key path to its leaf value. Scalars
        found directly inside a list are collected into one list stored under
        the path of that list. Dicts and lists inside a list are flattened
        under the same path, so a later element overwrites an earlier one that
        produces the same key.
    """
    if isinstance(item, dict):
        leaves = {}
        for item_key, value in item.items():
            # Leaves at different depths can share a key name, so every result
            # key carries the full path from the root to stay distinct.
            temp_key_prefix = (
                item_key if (key_prefix == "") else f"{key_prefix}_{item_key}"
            )
            leaves.update(get_leaves(value, item_key, temp_key_prefix))
        return leaves
    elif isinstance(item, list):
        leaves = {}
        elements = []
        for element in item:
            if isinstance(element, (dict, list)):
                leaves.update(get_leaves(element, key, key_prefix))
            else:
                elements.append(element)
        if elements:
            # Store under the full path, consistent with scalar leaves; the
            # bare key would collide with same-named lists elsewhere.
            leaves[key_prefix] = elements
        return leaves
    else:
        return {key_prefix: item}


# JSON text is UTF-8 by specification, so read it as such rather than with the
# platform's default encoding.
with open("./campaign-summary.json", encoding="utf-8") as f_input, open("./finalised_output.csv", "w", newline="") as f_output:
    json_data = json.load(f_input, strict=False)
    # Two passes over the entries: first collect the union of all flattened
    # field names, which DictWriter needs up front. Then flatten each entry
    # again while writing, so the flattened rows are produced one at a time
    # instead of all being held in a list.
    fieldnames = set()
    for entry in json_data:
        fieldnames.update(get_leaves(entry).keys())
    csv_output = csv.DictWriter(f_output, delimiter=";", fieldnames=sorted(fieldnames))
    csv_output.writeheader()
    csv_output.writerows(get_leaves(entry) for entry in json_data)

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM