I've tried using the flatten_json library to flatten it, but I was having trouble figuring out how to flatten the "project" and "phase" keys so that they do not result in additional "Columns".
I get something similar to the following JSON out of a REST API:
# Fetch the findings from the REST API. `apiUrl`, `rootCaCert`, `headers` and
# `httpParams` are not defined in this snippet and must already be in scope.
results = requests.get(apiUrl, verify=rootCaCert, headers=headers, params=httpParams)

# Flatten each element of the decoded response and collect the results.
# NOTE(review): `flatten` is presumably flatten_json.flatten -- confirm the import.
outputJson = []  # was never initialised before being appended to
for i in results.json():
    outputContent = flatten(i)
    outputJson.append(outputContent)

# Wrap the flattened records in a Series and expose it as a one-column frame.
findingsFrame = pandas.Series(outputJson).to_frame()

sparkSession = SparkSession.builder \
    .appName('DEV Canopy Feed') \
    .getOrCreate()
df = sparkSession.createDataFrame(findingsFrame)
Input JSON looks like this:
[
{
"status": "Open",
"project_finding__id": 23,
"rating__type": "Medium",
"title": "Some string title"
"date_modified": "2017-04-10T15:04:32.527000Z",
"cvss3_score": null,
"template_finding_id": null,
"project_finding__uuid": "ba42302e-b879-11e9-a2a3-2a2ae2dbcce4",
"references": [
{
"type": "system",
"reference": "id-123",
"title": "some reference title"
}
],
"date_created": "2017-04-10T15:04:32.527000Z",
"phase": {
"status": "Completed",
"end_date": "2017-03-31",
"uuid": "ba423506-b879-11e9-a2a3-2a2ae2dbcce4",
"reference": "2017-1234",
"title": "some title for phase",
"notes": "<p><strong>Some Text</strong>: in HTML</p><br/>\n<br/>\n<strong>this doesn't matter</strong><br/>\nfreeform html representation of notes<br/>\n<br/>\n<strong>Special needs:</strong><br/>",
"contacts": [
{
"notes": "<p>Random Note about contact</p>",
"role": "Primary Requestor",
"email": "user@domain.tld"
},
{
"notes": "<p>Random Note about contact</p>",
"role": "other role",
"email": "user2@domain.tld"
}
],
"start_date": "2017-02-20",
"project": {
"description": ""some description of project"",
"reference": "123",
"title": "Title of Project",
"company": {
"abbreviation": "",
"description": "",
"id": 345,
"name": "BUSU",
"uuid": "ba4238ee-b879-11e9-a2a3-2a2ae2dbcce4"
},
"id": 567,
"uuid": "20e56bc4-b87b-11e9-a2a3-2a2ae2dbcce4"
},
"type": "Client Server",
"id": 12312312,
"description": "Block of text string"
},
"cvss2_score": "2.0",
"first_date_created": "2017-04-10T15:04:32.527000Z",
"first_phase_reference": "2017-1234"
Here's a function I've used in the past to flatten out nested JSON.
def flatten_json(y):
    """Flatten nested JSON-like data into a single-level dict.

    The dict keys and list positions met on the way down to each leaf are
    joined with ``_`` to form that leaf's key in the result, e.g.
    ``{"a": {"b": [10, 20]}}`` becomes ``{"a_b_0": 10, "a_b_1": 20}``.

    Args:
        y: A dict, a list, or a scalar, nested to any depth. Subclasses of
            dict and list are traversed too; non-string dict keys are
            converted with ``str()``.

    Returns:
        A new dict mapping each underscore-joined path to its leaf value.
        Empty dicts and lists contribute no keys, and a bare scalar input
        is stored under the empty-string key. If two different paths join
        to the same string, the one visited last wins.
    """
    out = {}

    def flatten(x, name=''):
        if isinstance(x, dict):
            for key, value in x.items():
                flatten(value, name + str(key) + '_')
        elif isinstance(x, list):
            for index, element in enumerate(x):
                flatten(element, name + str(index) + '_')
        else:
            # Strip the trailing separator appended by the parent level.
            out[name[:-1]] = x

    flatten(y)
    return out
You could try it out and see if it gives the output you are looking for (I have no idea what your expected output is; as suggested in the comments, you might want to include that):
# Sample API response: a list holding one finding record, written as a Python
# literal (JSON `null` becomes `None`).
data = [
    {
        "status": "Open",
        "project_finding__id": 23,
        "rating__type": "Medium",
        "title": "Some string title",
        "date_modified": "2017-04-10T15:04:32.527000Z",
        "cvss3_score": None,
        "template_finding_id": None,
        "project_finding__uuid": "ba42302e-b879-11e9-a2a3-2a2ae2dbcce4",
        "references": [
            {
                "type": "system",
                "reference": "id-123",
                "title": "some reference title"
            }
        ],
        "date_created": "2017-04-10T15:04:32.527000Z",
        "phase": {
            "status": "Completed",
            "end_date": "2017-03-31",
            "uuid": "ba423506-b879-11e9-a2a3-2a2ae2dbcce4",
            "reference": "2017-1234",
            "title": "some title for phase",
            "notes": "<p><strong>Some Text</strong>: in HTML</p><br/>\n<br/>\n<strong>this doesn't matter</strong><br/>\nfreeform html representation of notes<br/>\n<br/>\n<strong>Special needs:</strong><br/>",
            "contacts": [
                {
                    "notes": "<p>Random Note about contact</p>",
                    "role": "Primary Requestor",
                    "email": "user@domain.tld"
                },
                {
                    "notes": "<p>Random Note about contact</p>",
                    "role": "other role",
                    "email": "user2@domain.tld"
                }
            ],
            "start_date": "2017-02-20",
            "project": {
                # The value itself contains literal double quotes, so it is
                # wrapped in single quotes to remain a valid string literal.
                "description": '"some description of project"',
                "reference": "123",
                "title": "Title of Project",
                "company": {
                    "abbreviation": "",
                    "description": "",
                    "id": 345,
                    "name": "BUSU",
                    "uuid": "ba4238ee-b879-11e9-a2a3-2a2ae2dbcce4"
                },
                "id": 567,
                "uuid": "20e56bc4-b87b-11e9-a2a3-2a2ae2dbcce4"
            },
            "type": "Client Server",
            "id": 12312312,
            "description": "Block of text string"
        },
        "cvss2_score": "2.0",
        "first_date_created": "2017-04-10T15:04:32.527000Z",
        "first_phase_reference": "2017-1234"
    }
]
def flatten_json(y):
    """Collapse nested dicts/lists into one flat dict of leaf values.

    Each leaf is stored under a key built by joining, with ``_``, every
    dict key and list index on the path leading to it. For example
    ``[{"id": 7, "tags": ["a", "b"]}]`` yields
    ``{"0_id": 7, "0_tags_0": "a", "0_tags_1": "b"}``.

    Args:
        y: The value to flatten: a dict, a list, or a scalar. dict and list
            subclasses are descended into as well, and dict keys that are
            not strings are passed through ``str()``.

    Returns:
        A fresh dict of ``path -> leaf value``. Empty containers add
        nothing to it; a scalar passed directly ends up under the key
        ``''``. When two paths produce the same joined key, the later one
        overwrites the earlier one.
    """
    out = {}

    def flatten(x, name=''):
        if isinstance(x, dict):
            for key, value in x.items():
                flatten(value, name + str(key) + '_')
        elif isinstance(x, list):
            for index, element in enumerate(x):
                flatten(element, name + str(index) + '_')
        else:
            # `name` always ends with the separator added one level up.
            out[name[:-1]] = x

    flatten(y)
    return out
import re

import pandas as pd

flat = flatten_json(data)

# A flattened key such as "0_phase_contacts_1_email" carries a list position
# between underscores ("_1_"). Group 1 captures the first such position and
# group 2 captures everything that follows it.
indexed_key = re.compile(r'_(\d+)_(.*)')

results = pd.DataFrame()
special_cols = []
columns_list = list(flat.keys())
for item in columns_list:
    match = indexed_key.search(item)
    if match is None:
        # No embedded list position: handled after the loop.
        special_cols.append(item)
        continue
    row_idx = int(match.group(1))
    column = match.group(2).replace('_', '')
    value = flat[item]
    # Assigning through .loc creates the row and/or column when missing.
    results.loc[row_idx, column] = value

# Assign each remaining value to a whole column so it repeats on every row.
for item in special_cols:
    results[item] = flat[item]
Output:
# Render the whole frame as text and print it.
print(results.to_string())
type reference title notes role email 0_status 0_project_finding__id 0_rating__type 0_title 0_date_modified 0_cvss3_score 0_template_finding_id 0_project_finding__uuid 0_date_created 0_phase_status 0_phase_end_date 0_phase_uuid 0_phase_reference 0_phase_title 0_phase_notes 0_phase_start_date 0_phase_project_description 0_phase_project_reference 0_phase_project_title 0_phase_project_company_abbreviation 0_phase_project_company_description 0_phase_project_company_id 0_phase_project_company_name 0_phase_project_company_uuid 0_phase_project_id 0_phase_project_uuid 0_phase_type 0_phase_id 0_phase_description 0_cvss2_score 0_first_date_created 0_first_phase_reference
0 system id-123 some reference title <p>Random Note about contact</p> Primary Requestor user@domain.tld Open 23 Medium Some string title 2017-04-10T15:04:32.527000Z None None ba42302e-b879-11e9-a2a3-2a2ae2dbcce4 2017-04-10T15:04:32.527000Z Completed 2017-03-31 ba423506-b879-11e9-a2a3-2a2ae2dbcce4 2017-1234 some title for phase <p><strong>Some Text</strong>: in HTML</p><br/... 2017-02-20 "some description of project" 123 Title of Project 345 BUSU ba4238ee-b879-11e9-a2a3-2a2ae2dbcce4 567 20e56bc4-b87b-11e9-a2a3-2a2ae2dbcce4 Client Server 12312312 Block of text string 2.0 2017-04-10T15:04:32.527000Z 2017-1234
1 NaN NaN NaN <p>Random Note about contact</p> other role user2@domain.tld Open 23 Medium Some string title 2017-04-10T15:04:32.527000Z None None ba42302e-b879-11e9-a2a3-2a2ae2dbcce4 2017-04-10T15:04:32.527000Z Completed 2017-03-31 ba423506-b879-11e9-a2a3-2a2ae2dbcce4 2017-1234 some title for phase <p><strong>Some Text</strong>: in HTML</p><br/... 2017-02-20 "some description of project" 123 Title of Project 345 BUSU ba4238ee-b879-11e9-a2a3-2a2ae2dbcce4 567 20e56bc4-b87b-11e9-a2a3-2a2ae2dbcce4 Client Server 12312312 Block of text string 2.0 2017-04-10T15:04:32.527000Z 2017-1234
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.