[英]How to dynamically flatten a deeply nested json file using python?
I've tried using the flatten_json library to flatten it, but I'm having trouble figuring out how to flatten the "project" and "phase" keys so that they do not result in additional columns. 我尝试使用flatten_json库对其进行展平,但在弄清楚如何展平"project"和"phase"键、使它们不会产生额外的列时遇到了麻烦。
I get something similar to the following JSON out of a rest API: 我从rest API中得到类似于以下JSON的内容:
# Query the REST endpoint; apiUrl, rootCaCert, headers and httpParams are
# expected to be defined earlier in the script (not shown here).
results = requests.get(
    apiUrl,
    verify=rootCaCert,
    headers=headers,
    params=httpParams,
)

# Flatten every record of the response and collect it in outputJson.
# NOTE(review): `flatten` is presumably flatten_json.flatten and `outputJson`
# an existing list -- confirm against the rest of the script.
for finding in list(results.json()):
    outputJson.append(flatten(finding))

# Wrap the collected records in a Series and turn it into a one-column frame.
findingsFrame = pandas.Series(outputJson).to_frame()

# Obtain (or create) the Spark session and hand the pandas frame to Spark.
sparkSession = (
    SparkSession.builder
    .appName('DEV Canopy Feed')
    .getOrCreate()
)
df = sparkSession.createDataFrame(findingsFrame)
Input JSON looks like this: 输入JSON如下所示:
[
{
"status": "Open",
"project_finding__id": 23,
"rating__type": "Medium",
"title": "Some string title",
"date_modified": "2017-04-10T15:04:32.527000Z",
"cvss3_score": null,
"template_finding_id": null,
"project_finding__uuid": "ba42302e-b879-11e9-a2a3-2a2ae2dbcce4",
"references": [
{
"type": "system",
"reference": "id-123",
"title": "some reference title"
}
],
"date_created": "2017-04-10T15:04:32.527000Z",
"phase": {
"status": "Completed",
"end_date": "2017-03-31",
"uuid": "ba423506-b879-11e9-a2a3-2a2ae2dbcce4",
"reference": "2017-1234",
"title": "some title for phase",
"notes": "<p><strong>Some Text</strong>: in HTML</p><br/>\n<br/>\n<strong>this doesn't matter</strong><br/>\nfreeform html representation of notes<br/>\n<br/>\n<strong>Special needs:</strong><br/>",
"contacts": [
{
"notes": "<p>Random Note about contact</p>",
"role": "Primary Requestor",
"email": "user@domain.tld"
},
{
"notes": "<p>Random Note about contact</p>",
"role": "other role",
"email": "user2@domain.tld"
}
],
"start_date": "2017-02-20",
"project": {
"description": "\"some description of project\"",
"reference": "123",
"title": "Title of Project",
"company": {
"abbreviation": "",
"description": "",
"id": 345,
"name": "BUSU",
"uuid": "ba4238ee-b879-11e9-a2a3-2a2ae2dbcce4"
},
"id": 567,
"uuid": "20e56bc4-b87b-11e9-a2a3-2a2ae2dbcce4"
},
"type": "Client Server",
"id": 12312312,
"description": "Block of text string"
},
"cvss2_score": "2.0",
"first_date_created": "2017-04-10T15:04:32.527000Z",
"first_phase_reference": "2017-1234"
}
]
here's a function I've used in the past to flatten out nested json. 这是我过去用来拉平嵌套json的函数。
def flatten_json(y):
    """Flatten a nested structure of dicts and lists into a one-level dict.

    Each leaf value is stored under a key built from the path that leads to
    it, joined with underscores: dict keys are used as-is and list items
    contribute their position, e.g. ``{"a": [{"b": 1}]}`` becomes
    ``{"a_0_b": 1}``.

    Args:
        y: A dict, a list, or a scalar. Dicts and lists (including
            subclasses such as ``collections.OrderedDict``) are descended
            into; any other value is treated as a leaf. Dict keys must be
            strings because they are concatenated into the result key.

    Returns:
        A new dict mapping each underscore-joined path to its leaf value.
        A bare scalar input is stored under the empty-string key; empty
        dicts and lists contribute no entries.
    """
    out = {}

    def flatten(x, name=''):
        # isinstance (rather than an exact type comparison) so that dict and
        # list subclasses are flattened too instead of being kept as leaves.
        if isinstance(x, dict):
            for key in x:
                flatten(x[key], name + key + '_')
        elif isinstance(x, list):
            for index, item in enumerate(x):
                flatten(item, name + str(index) + '_')
        else:
            # Every prefix ends with the separator; drop it for the final key.
            out[name[:-1]] = x

    flatten(y)
    return out
You could try it out and see if it gives the output you are looking for (I have no idea what your expected output is; as suggested in the comments, you might want to include that): 您可以尝试一下,看看它是否提供了您想要的输出(我不知道您期望的输出是什么,正如评论中所建议的那样,您可能希望包括在内):
# Sample payload: the JSON input from the question written as a Python
# literal (JSON null becomes None).
data = [
    {
        "status": "Open",
        "project_finding__id": 23,
        "rating__type": "Medium",
        "title": "Some string title",
        "date_modified": "2017-04-10T15:04:32.527000Z",
        "cvss3_score": None,
        "template_finding_id": None,
        "project_finding__uuid": "ba42302e-b879-11e9-a2a3-2a2ae2dbcce4",
        "references": [
            {
                "type": "system",
                "reference": "id-123",
                "title": "some reference title"
            }
        ],
        "date_created": "2017-04-10T15:04:32.527000Z",
        "phase": {
            "status": "Completed",
            "end_date": "2017-03-31",
            "uuid": "ba423506-b879-11e9-a2a3-2a2ae2dbcce4",
            "reference": "2017-1234",
            "title": "some title for phase",
            "notes": "<p><strong>Some Text</strong>: in HTML</p><br/>\n<br/>\n<strong>this doesn't matter</strong><br/>\nfreeform html representation of notes<br/>\n<br/>\n<strong>Special needs:</strong><br/>",
            "contacts": [
                {
                    "notes": "<p>Random Note about contact</p>",
                    "role": "Primary Requestor",
                    "email": "user@domain.tld"
                },
                {
                    "notes": "<p>Random Note about contact</p>",
                    "role": "other role",
                    "email": "user2@domain.tld"
                }
            ],
            "start_date": "2017-02-20",
            "project": {
                # The value itself contains literal double quotes, so they
                # must be escaped inside the string.
                "description": "\"some description of project\"",
                "reference": "123",
                "title": "Title of Project",
                "company": {
                    "abbreviation": "",
                    "description": "",
                    "id": 345,
                    "name": "BUSU",
                    "uuid": "ba4238ee-b879-11e9-a2a3-2a2ae2dbcce4"
                },
                "id": 567,
                "uuid": "20e56bc4-b87b-11e9-a2a3-2a2ae2dbcce4"
            },
            "type": "Client Server",
            "id": 12312312,
            "description": "Block of text string"
        },
        "cvss2_score": "2.0",
        "first_date_created": "2017-04-10T15:04:32.527000Z",
        "first_phase_reference": "2017-1234"
    }
]
def flatten_json(y):
    """Collapse nested dicts and lists into a single flat dict.

    The key of every leaf is the underscore-joined path to it: dict keys
    appear verbatim and list elements appear as their index. For example
    ``[{"a": {"b": 1}}]`` yields ``{"0_a_b": 1}``.

    Args:
        y: The value to flatten. Dicts and lists, including subclasses
            such as ``collections.OrderedDict``, are walked recursively;
            every other value is a leaf. Dict keys must be strings since
            they are concatenated into the output key.

    Returns:
        A new dict of path -> leaf value. A scalar passed directly is
        stored under the empty-string key, and empty containers produce no
        entries.
    """
    out = {}

    def flatten(x, name=''):
        # isinstance instead of an exact type match: subclasses of dict and
        # list must be descended into, not stored whole as a leaf.
        if isinstance(x, dict):
            for key in x:
                flatten(x[key], name + key + '_')
        elif isinstance(x, list):
            for position, element in enumerate(x):
                flatten(element, name + str(position) + '_')
        else:
            # Strip the trailing separator that every prefix carries.
            out[name[:-1]] = x

    flatten(y)
    return out
import re

import pandas as pd

# `data` is a list, so every flattened key starts with the index of its
# top-level element, e.g. "0_status" or "0_phase_contacts_1_email".
flat = flatten_json(data)

# Finds the first "_<digits>_" segment of a key: group 1 is the list index
# (used as the row number) and group 2 is whatever follows it (used as the
# column name).
indexed_key = re.compile(r'_(\d+)_(.*)')

results = pd.DataFrame()
special_cols = []

columns_list = list(flat)
for item in columns_list:
    match = indexed_key.search(item)
    if match is None:
        # No nested list index in the key: handle it after the loop.
        special_cols.append(item)
        continue

    row_idx = int(match.group(1))
    column = match.group(2).replace('_', '')
    results.loc[row_idx, column] = flat[item]

# Keys without a nested list index hold one value per record; assigning the
# scalar repeats it on every row created above.
for item in special_cols:
    results[item] = flat[item]
Output: 输出:
# Print the text rendering of the results frame.
print(results.to_string())
type reference title notes role email 0_status 0_project_finding__id 0_rating__type 0_title 0_date_modified 0_cvss3_score 0_template_finding_id 0_project_finding__uuid 0_date_created 0_phase_status 0_phase_end_date 0_phase_uuid 0_phase_reference 0_phase_title 0_phase_notes 0_phase_start_date 0_phase_project_description 0_phase_project_reference 0_phase_project_title 0_phase_project_company_abbreviation 0_phase_project_company_description 0_phase_project_company_id 0_phase_project_company_name 0_phase_project_company_uuid 0_phase_project_id 0_phase_project_uuid 0_phase_type 0_phase_id 0_phase_description 0_cvss2_score 0_first_date_created 0_first_phase_reference
0 system id-123 some reference title <p>Random Note about contact</p> Primary Requestor user@domain.tld Open 23 Medium Some string title 2017-04-10T15:04:32.527000Z None None ba42302e-b879-11e9-a2a3-2a2ae2dbcce4 2017-04-10T15:04:32.527000Z Completed 2017-03-31 ba423506-b879-11e9-a2a3-2a2ae2dbcce4 2017-1234 some title for phase <p><strong>Some Text</strong>: in HTML</p><br/... 2017-02-20 "some description of project" 123 Title of Project 345 BUSU ba4238ee-b879-11e9-a2a3-2a2ae2dbcce4 567 20e56bc4-b87b-11e9-a2a3-2a2ae2dbcce4 Client Server 12312312 Block of text string 2.0 2017-04-10T15:04:32.527000Z 2017-1234
1 NaN NaN NaN <p>Random Note about contact</p> other role user2@domain.tld Open 23 Medium Some string title 2017-04-10T15:04:32.527000Z None None ba42302e-b879-11e9-a2a3-2a2ae2dbcce4 2017-04-10T15:04:32.527000Z Completed 2017-03-31 ba423506-b879-11e9-a2a3-2a2ae2dbcce4 2017-1234 some title for phase <p><strong>Some Text</strong>: in HTML</p><br/... 2017-02-20 "some description of project" 123 Title of Project 345 BUSU ba4238ee-b879-11e9-a2a3-2a2ae2dbcce4 567 20e56bc4-b87b-11e9-a2a3-2a2ae2dbcce4 Client Server 12312312 Block of text string 2.0 2017-04-10T15:04:32.527000Z 2017-1234
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.