简体   繁体   中英

How to dynamically flatten a deeply nested json file using python?

I've tried using the flatten_json library to flatten it, but I'm having trouble figuring out how to flatten the "project" and "phase" keys so that they do not result in additional columns.

I get something similar to the following JSON out of a rest API:

# Fetch the findings from the REST API; the response body is expected to be
# a JSON array of finding records.
results = requests.get(apiUrl, verify = rootCaCert, headers = headers, params = httpParams)
# Build the list here rather than appending to a name that this snippet never
# initialises (the original raised NameError unless outputJson already existed).
outputJson = [flatten(record) for record in results.json()]
# One flattened dict per row, held in a single-column frame.
findingsFrame = pandas.Series(outputJson).to_frame()

# Obtain the Spark session for this feed, creating it if none exists yet.
sparkSession = (
    SparkSession.builder
    .appName('DEV Canopy Feed')
    .getOrCreate()
)

# Hand the pandas frame of flattened findings over to Spark.
df = sparkSession.createDataFrame(findingsFrame)

Input JSON looks like this:

[
    {
        "status": "Open",
        "project_finding__id": 23,
        "rating__type": "Medium",
        "title": "Some string title",
        "date_modified": "2017-04-10T15:04:32.527000Z",
        "cvss3_score": null,
        "template_finding_id": null,
        "project_finding__uuid": "ba42302e-b879-11e9-a2a3-2a2ae2dbcce4",
        "references": [
            {
                "type": "system",
                "reference": "id-123",
                "title": "some reference title"
            }
        ],
        "date_created": "2017-04-10T15:04:32.527000Z",
        "phase": {
            "status": "Completed",
            "end_date": "2017-03-31",
            "uuid": "ba423506-b879-11e9-a2a3-2a2ae2dbcce4",
            "reference": "2017-1234",
            "title": "some title for phase",
            "notes":  "<p><strong>Some Text</strong>: in HTML</p><br/>\n<br/>\n<strong>this doesn't matter</strong><br/>\nfreeform html representation of notes<br/>\n<br/>\n<strong>Special needs:</strong><br/>",
            "contacts": [
                {
                    "notes": "<p>Random Note about contact</p>",
                    "role": "Primary Requestor",
                    "email": "user@domain.tld"
                },
                {
                    "notes": "<p>Random Note about contact</p>",
                    "role": "other role",
                    "email": "user2@domain.tld"
                }
            ],
            "start_date": "2017-02-20",
            "project": {
                "description": "&quot;some description of project&quot;",
                "reference": "123",
                "title": "Title of Project",
                "company": {
                    "abbreviation": "",
                    "description": "",
                    "id": 345,
                    "name": "BUSU",
                    "uuid": "ba4238ee-b879-11e9-a2a3-2a2ae2dbcce4"
                },
                "id": 567,
                "uuid": "20e56bc4-b87b-11e9-a2a3-2a2ae2dbcce4"
            },
            "type": "Client Server",
            "id": 12312312,
            "description": "Block of text string"
        },
        "cvss2_score": "2.0",
        "first_date_created": "2017-04-10T15:04:32.527000Z",
        "first_phase_reference": "2017-1234"
    }
]

Here's a function I've used in the past to flatten out nested JSON.

def flatten_json(y):
    """Flatten nested dicts and lists into a single-level dict.

    Keys of nested dicts and positions of list items are joined with ``_``
    to form the flat key, e.g. ``{"a": {"b": [5]}}`` becomes
    ``{"a_b_0": 5}``.

    Args:
        y: The decoded JSON value (dict, list or scalar) to flatten.

    Returns:
        A dict mapping each joined path to its leaf value. Empty dicts and
        empty lists contribute no entries. A scalar ``y`` is stored under
        the empty-string key.
    """
    out = {}

    def flatten(x, name=''):
        # isinstance (not an exact type check) so dict/list subclasses such
        # as OrderedDict are descended into instead of stored as leaves.
        if isinstance(x, dict):
            for key, value in x.items():
                # str() keeps non-string keys from raising TypeError.
                flatten(value, name + str(key) + '_')
        elif isinstance(x, list):
            for index, value in enumerate(x):
                flatten(value, name + str(index) + '_')
        else:
            # Drop the trailing '_' that was appended while descending.
            out[name[:-1]] = x

    flatten(y)
    return out

You could try it out and see if it gives the output you are looking for. (I have no idea what your expected output is; as suggested in the comments, you might want to include that.)

# Sample input: a list holding one finding record, written as a Python
# literal (JSON null is spelled None here). The record nests a "references"
# list, and a "phase" dict that itself contains a "contacts" list and a
# "project" dict.
data = [
    {
        "status": "Open",
        "project_finding__id": 23,
        "rating__type": "Medium",
        "title": "Some string title",
        "date_modified": "2017-04-10T15:04:32.527000Z",
        "cvss3_score": None,
        "template_finding_id": None,
        "project_finding__uuid": "ba42302e-b879-11e9-a2a3-2a2ae2dbcce4",
        "references": [
            {
                "type": "system",
                "reference": "id-123",
                "title": "some reference title"
            }
        ],
        "date_created": "2017-04-10T15:04:32.527000Z",
        "phase": {
            "status": "Completed",
            "end_date": "2017-03-31",
            "uuid": "ba423506-b879-11e9-a2a3-2a2ae2dbcce4",
            "reference": "2017-1234",
            "title": "some title for phase",
            "notes":  "<p><strong>Some Text</strong>: in HTML</p><br/>\n<br/>\n<strong>this doesn't matter</strong><br/>\nfreeform html representation of notes<br/>\n<br/>\n<strong>Special needs:</strong><br/>",
            "contacts": [
                {
                    "notes": "<p>Random Note about contact</p>",
                    "role": "Primary Requestor",
                    "email": "user@domain.tld"
                },
                {
                    "notes": "<p>Random Note about contact</p>",
                    "role": "other role",
                    "email": "user2@domain.tld"
                }
            ],
            "start_date": "2017-02-20",
            "project": {
                "description": "&quot;some description of project&quot;",
                "reference": "123",
                "title": "Title of Project",
                "company": {
                    "abbreviation": "",
                    "description": "",
                    "id": 345,
                    "name": "BUSU",
                    "uuid": "ba4238ee-b879-11e9-a2a3-2a2ae2dbcce4"
                },
                "id": 567,
                "uuid": "20e56bc4-b87b-11e9-a2a3-2a2ae2dbcce4"
            },
            "type": "Client Server",
            "id": 12312312,
            "description": "Block of text string"
        },
        "cvss2_score": "2.0",
        "first_date_created": "2017-04-10T15:04:32.527000Z",
        "first_phase_reference": "2017-1234"}]




def flatten_json(y):
    """Flatten nested dicts and lists into a single-level dict.

    Keys of nested dicts and positions of list items are joined with ``_``
    to form the flat key, e.g. ``{"a": {"b": [5]}}`` becomes
    ``{"a_b_0": 5}``.

    Args:
        y: The decoded JSON value (dict, list or scalar) to flatten.

    Returns:
        A dict mapping each joined path to its leaf value. Empty dicts and
        empty lists contribute no entries. A scalar ``y`` is stored under
        the empty-string key.
    """
    out = {}

    def flatten(x, name=''):
        # isinstance (not an exact type check) so dict/list subclasses such
        # as OrderedDict are descended into instead of stored as leaves.
        if isinstance(x, dict):
            for key, value in x.items():
                # str() keeps non-string keys from raising TypeError.
                flatten(value, name + str(key) + '_')
        elif isinstance(x, list):
            for index, value in enumerate(x):
                flatten(value, name + str(index) + '_')
        else:
            # Drop the trailing '_' that was appended while descending.
            out[name[:-1]] = x

    flatten(y)
    return out


# Collapse the nested sample into one flat {path: value} dict.
flat = flatten_json(data)

import pandas as pd
import re

# A flat key that embeds a list position looks like "<prefix>_<n>_<field>".
# Group 1 is the position <n>, group 2 is everything after it. Compiled once
# instead of running two separate findall() calls per key.
indexed_key = re.compile(r'_(\d+)_(.*)')

results = pd.DataFrame()
special_cols = []

columns_list = list(flat.keys())
for item in columns_list:
    found = indexed_key.search(item)
    if found is None:
        # No embedded list position: a scalar that applies to every row.
        # An explicit check replaces the former bare `except:`, which also
        # hid unrelated errors.
        special_cols.append(item)
        continue

    row_idx = int(found.group(1))
    # The column name is the text after the list position, with underscores removed.
    column = found.group(2).replace('_', '')
    value = flat[item]

    # Setting with enlargement: creates the row and/or column on demand.
    results.loc[row_idx, column] = value

if special_cols and results.empty:
    # No list-derived rows exist; without at least one row the scalar
    # assignments below would broadcast to nothing and drop every value.
    results = pd.DataFrame(index=[0])

# Broadcast each scalar field across all rows.
for item in special_cols:
    results[item] = flat[item]

Output:

# Show the assembled frame as plain text.
print(results.to_string())
     type reference                 title                             notes               role             email 0_status  0_project_finding__id 0_rating__type            0_title              0_date_modified 0_cvss3_score 0_template_finding_id               0_project_finding__uuid               0_date_created 0_phase_status 0_phase_end_date                          0_phase_uuid 0_phase_reference         0_phase_title                                      0_phase_notes 0_phase_start_date              0_phase_project_description 0_phase_project_reference 0_phase_project_title 0_phase_project_company_abbreviation 0_phase_project_company_description  0_phase_project_company_id 0_phase_project_company_name          0_phase_project_company_uuid  0_phase_project_id                  0_phase_project_uuid   0_phase_type  0_phase_id   0_phase_description 0_cvss2_score         0_first_date_created 0_first_phase_reference
0  system    id-123  some reference title  <p>Random Note about contact</p>  Primary Requestor   user@domain.tld     Open                     23         Medium  Some string title  2017-04-10T15:04:32.527000Z          None                  None  ba42302e-b879-11e9-a2a3-2a2ae2dbcce4  2017-04-10T15:04:32.527000Z      Completed       2017-03-31  ba423506-b879-11e9-a2a3-2a2ae2dbcce4         2017-1234  some title for phase  <p><strong>Some Text</strong>: in HTML</p><br/...         2017-02-20  &quot;some description of project&quot;                       123      Title of Project                                                                                                  345                         BUSU  ba4238ee-b879-11e9-a2a3-2a2ae2dbcce4                 567  20e56bc4-b87b-11e9-a2a3-2a2ae2dbcce4  Client Server    12312312  Block of text string           2.0  2017-04-10T15:04:32.527000Z               2017-1234
1     NaN       NaN                   NaN  <p>Random Note about contact</p>         other role  user2@domain.tld     Open                     23         Medium  Some string title  2017-04-10T15:04:32.527000Z          None                  None  ba42302e-b879-11e9-a2a3-2a2ae2dbcce4  2017-04-10T15:04:32.527000Z      Completed       2017-03-31  ba423506-b879-11e9-a2a3-2a2ae2dbcce4         2017-1234  some title for phase  <p><strong>Some Text</strong>: in HTML</p><br/...         2017-02-20  &quot;some description of project&quot;                       123      Title of Project                                                                                                  345                         BUSU  ba4238ee-b879-11e9-a2a3-2a2ae2dbcce4                 567  20e56bc4-b87b-11e9-a2a3-2a2ae2dbcce4  Client Server    12312312  Block of text string           2.0  2017-04-10T15:04:32.527000Z               2017-1234

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM