[英]pandas.io.json.json_normalize with very nested json
I have been trying to normalize
a very nested json file I will later analyze.我一直在尝试normalize
一个非常嵌套的 json 文件,稍后我将对其进行分析。 What I am struggling with is how to go more than one level deep to normalize.我正在努力解决的是如何深入不止一层来进行规范化。
I went through the pandas.io.json.json_normalize documentation, since it does exactly what I want it to do.我浏览了pandas.io.json.json_normalize文档,因为它完全符合我的要求。
I have been able to normalize part of it and now understand how dictionaries work, but I am still not there.我已经能够规范化其中的一部分,现在了解字典是如何工作的,但我仍然不在那里。
With below code I am able to get only the first level.使用以下代码,我只能获得第一级。
import json
import pandas as pd
from pandas.io.json import json_normalize
with open('authors_sample.json') as f:
d = json.load(f)
raw = json_normalize(d['hits']['hits'])
authors = json_normalize(data = d['hits']['hits'],
record_path = '_source',
meta = ['_id', ['_source', 'journal'], ['_source', 'title'],
['_source', 'normalized_venue_name']
])
I am trying to 'dig' into the 'authors' dictionary with below code, but the record_path = ['_source', 'authors']
throws me TypeError: string indices must be integers
.我正在尝试使用以下代码“挖掘”“作者”字典,但record_path = ['_source', 'authors']
抛出TypeError: string indices must be integers
。 As far as I understand json_normalize
the logic should be good, but I still don't quite understand how to dive into a json with dict
vs list
.据我了解json_normalize
逻辑应该是好的,但我仍然不太明白如何使用dict
与list
深入研究 json 。
I even went through this simple example.我甚至浏览了这个简单的例子。
authors = json_normalize(data = d['hits']['hits'],
record_path = ['_source', 'authors'],
meta = ['_id', ['_source', 'journal'], ['_source', 'title'],
['_source', 'normalized_venue_name']
])
Below is a chunk of the json file (5 records).下面是 json 文件的一部分(5 条记录)。
{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
u'hits': {u'hits': [{u'_id': u'7CB3F2AD',
u'_index': u'scibase_listings',
u'_score': 1.0,
u'_source': {u'authors': None,
u'deleted': 0,
u'description': None,
u'doi': u'',
u'is_valid': 1,
u'issue': None,
u'journal': u'Physical Review Letters',
u'link': None,
u'meta_description': None,
u'meta_keywords': None,
u'normalized_venue_name': u'phys rev lett',
u'pages': None,
u'parent_keywords': [u'Chromatography',
u'Quantum mechanics',
u'Particle physics',
u'Quantum field theory',
u'Analytical chemistry',
u'Quantum chromodynamics',
u'Physics',
u'Mass spectrometry',
u'Chemistry'],
u'pub_date': u'1987-03-02 00:00:00',
u'pubtype': None,
u'rating_avg_weighted': 0,
u'rating_clarity': 0.0,
u'rating_clarity_weighted': 0.0,
u'rating_innovation': 0.0,
u'rating_innovation_weighted': 0.0,
u'rating_num_weighted': 0,
u'rating_reproducability': 0,
u'rating_reproducibility_weighted': 0.0,
u'rating_versatility': 0.0,
u'rating_versatility_weighted': 0.0,
u'review_count': 0,
u'tag': [u'mass spectra', u'elementary particles', u'bound states'],
u'title': u'Evidence for a new meson: A quasinuclear NN-bar bound state',
u'userAvg': 0.0,
u'user_id': None,
u'venue_name': u'Physical Review Letters',
u'views_count': 0,
u'volume': None},
u'_type': u'listing'},
{u'_id': u'7AF8EBC3',
u'_index': u'scibase_listings',
u'_score': 1.0,
u'_source': {u'authors': [{u'affiliations': [u'Punjabi University'],
u'author_id': u'780E3459',
u'author_name': u'munish puri'},
{u'affiliations': [u'Punjabi University'],
u'author_id': u'48D92C79',
u'author_name': u'rajesh dhaliwal'},
{u'affiliations': [u'Punjabi University'],
u'author_id': u'7D9BD37C',
u'author_name': u'r s singh'}],
u'deleted': 0,
u'description': None,
u'doi': u'',
u'is_valid': 1,
u'issue': None,
u'journal': u'Journal of Industrial Microbiology & Biotechnology',
u'link': None,
u'meta_description': None,
u'meta_keywords': None,
u'normalized_venue_name': u'j ind microbiol biotechnol',
u'pages': None,
u'parent_keywords': [u'Nuclear medicine',
u'Psychology',
u'Hydrology',
u'Chromatography',
u'X-ray crystallography',
u'Nuclear fusion',
u'Medicine',
u'Fluid dynamics',
u'Thermodynamics',
u'Physics',
u'Gas chromatography',
u'Radiobiology',
u'Engineering',
u'Organic chemistry',
u'High-performance liquid chromatography',
u'Chemistry',
u'Organic synthesis',
u'Psychotherapist'],
u'pub_date': u'2008-04-04 00:00:00',
u'pubtype': None,
u'rating_avg_weighted': 0,
u'rating_clarity': 0.0,
u'rating_clarity_weighted': 0.0,
u'rating_innovation': 0.0,
u'rating_innovation_weighted': 0.0,
u'rating_num_weighted': 0,
u'rating_reproducability': 0,
u'rating_reproducibility_weighted': 0.0,
u'rating_versatility': 0.0,
u'rating_versatility_weighted': 0.0,
u'review_count': 0,
u'tag': [u'flow rate',
u'operant conditioning',
u'packed bed reactor',
u'immobilized enzyme',
u'specific activity'],
u'title': u'Development of a stable continuous flow immobilized enzyme reactor for the hydrolysis of inulin',
u'userAvg': 0.0,
u'user_id': None,
u'venue_name': u'Journal of Industrial Microbiology & Biotechnology',
u'views_count': 0,
u'volume': None},
u'_type': u'listing'},
{u'_id': u'7521A721',
u'_index': u'scibase_listings',
u'_score': 1.0,
u'_source': {u'authors': [{u'author_id': u'7FF872BC',
u'author_name': u'barbara eileen ryan'}],
u'deleted': 0,
u'description': None,
u'doi': u'',
u'is_valid': 1,
u'issue': None,
u'journal': u'The American Historical Review',
u'link': None,
u'meta_description': None,
u'meta_keywords': None,
u'normalized_venue_name': u'american historical review',
u'pages': None,
u'parent_keywords': [u'Social science',
u'Politics',
u'Sociology',
u'Law'],
u'pub_date': u'1992-01-01 00:00:00',
u'pubtype': None,
u'rating_avg_weighted': 0,
u'rating_clarity': 0.0,
u'rating_clarity_weighted': 0.0,
u'rating_innovation': 0.0,
u'rating_innovation_weighted': 0.0,
u'rating_num_weighted': 0,
u'rating_reproducability': 0,
u'rating_reproducibility_weighted': 0.0,
u'rating_versatility': 0.0,
u'rating_versatility_weighted': 0.0,
u'review_count': 0,
u'tag': [u'social movements'],
u'title': u"Feminism and the women's movement : dynamics of change in social movement ideology, and activism",
u'userAvg': 0.0,
u'user_id': None,
u'venue_name': u'The American Historical Review',
u'views_count': 0,
u'volume': None},
u'_type': u'listing'},
{u'_id': u'7DAEB9A4',
u'_index': u'scibase_listings',
u'_score': 1.0,
u'_source': {u'authors': [{u'author_id': u'0299B8E9',
u'author_name': u'fraser j harbutt'}],
u'deleted': 0,
u'description': None,
u'doi': u'',
u'is_valid': 1,
u'issue': None,
u'journal': u'The American Historical Review',
u'link': None,
u'meta_description': None,
u'meta_keywords': None,
u'normalized_venue_name': u'american historical review',
u'pages': None,
u'parent_keywords': [u'Superconductivity',
u'Nuclear fusion',
u'Geology',
u'Chemistry',
u'Metallurgy'],
u'pub_date': u'1988-01-01 00:00:00',
u'pubtype': None,
u'rating_avg_weighted': 0,
u'rating_clarity': 0.0,
u'rating_clarity_weighted': 0.0,
u'rating_innovation': 0.0,
u'rating_innovation_weighted': 0.0,
u'rating_num_weighted': 0,
u'rating_reproducability': 0,
u'rating_reproducibility_weighted': 0.0,
u'rating_versatility': 0.0,
u'rating_versatility_weighted': 0.0,
u'review_count': 0,
u'tag': [u'iron'],
u'title': u'The iron curtain : Churchill, America, and the origins of the Cold War',
u'userAvg': 0.0,
u'user_id': None,
u'venue_name': u'The American Historical Review',
u'views_count': 0,
u'volume': None},
u'_type': u'listing'},
{u'_id': u'7B3236C5',
u'_index': u'scibase_listings',
u'_score': 1.0,
u'_source': {u'authors': [{u'author_id': u'7DAB7B72',
u'author_name': u'richard m freeland'}],
u'deleted': 0,
u'description': None,
u'doi': u'',
u'is_valid': 1,
u'issue': None,
u'journal': u'The American Historical Review',
u'link': None,
u'meta_description': None,
u'meta_keywords': None,
u'normalized_venue_name': u'american historical review',
u'pages': None,
u'parent_keywords': [u'Political Science', u'Economics'],
u'pub_date': u'1985-01-01 00:00:00',
u'pubtype': None,
u'rating_avg_weighted': 0,
u'rating_clarity': 0.0,
u'rating_clarity_weighted': 0.0,
u'rating_innovation': 0.0,
u'rating_innovation_weighted': 0.0,
u'rating_num_weighted': 0,
u'rating_reproducability': 0,
u'rating_reproducibility_weighted': 0.0,
u'rating_versatility': 0.0,
u'rating_versatility_weighted': 0.0,
u'review_count': 0,
u'tag': [u'foreign policy'],
u'title': u'The Truman Doctrine and the origins of McCarthyism : foreign policy, domestic politics, and internal security, 1946-1948',
u'userAvg': 0.0,
u'user_id': None,
u'venue_name': u'The American Historical Review',
u'views_count': 0,
u'volume': None},
u'_type': u'listing'}],
u'max_score': 1.0,
u'total': 36429433},
u'timed_out': False,
u'took': 170}
In the pandas example (below) what do the brackets mean?在 pandas 示例(如下)中,括号是什么意思? Is there a logic to be followed to go deeper with the []?是否有一个逻辑可以遵循,以便使用 [] 更深入一层?
result = json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']])
Each string or list of strings in the ['state', 'shortname', ['info', 'governor']]
value is a path to an element to include, in addition to the selected rows . ['state', 'shortname', ['info', 'governor']]
值中的每个字符串或字符串列表都是要包含的元素的路径,除了所选行。 The second json_normalize()
argument ( record_path
, set to 'counties'
in the documentation example) tells the function how to select elements from the input data structure that make up the rows in the output, and the meta
paths adds further metadata that will be included with each of the rows.第二个参数json_normalize()
参数( record_path
,在文档示例中设置为'counties'
)告诉函数如何从构成输出行的输入数据结构中选择元素,并且meta
路径添加了进一步的元数据包含在每一行中。 Think of these as table joins in a database, if you will.如果愿意,可以将这些视为数据库中的表连接。
The input for the US States documentation example has two dictionaries in a list, and both of these dictionaries have a counties
key that references another list of dicts: US States文档示例的输入在一个列表中有两个字典,并且这两个字典都有一个引用另一个字典列表的counties
键:
>>> data = [{'state': 'Florida',
... 'shortname': 'FL',
... 'info': {'governor': 'Rick Scott'},
... 'counties': [{'name': 'Dade', 'population': 12345},
... {'name': 'Broward', 'population': 40000},
... {'name': 'Palm Beach', 'population': 60000}]},
... {'state': 'Ohio',
... 'shortname': 'OH',
... 'info': {'governor': 'John Kasich'},
... 'counties': [{'name': 'Summit', 'population': 1234},
... {'name': 'Cuyahoga', 'population': 1337}]}]
>>> pprint(data[0]['counties'])
[{'name': 'Dade', 'population': 12345},
{'name': 'Broward', 'population': 40000},
{'name': 'Palm Beach', 'population': 60000}]
>>> pprint(data[1]['counties'])
[{'name': 'Summit', 'population': 1234},
{'name': 'Cuyahoga', 'population': 1337}]
Between them there are 5 rows of data to use in the output:在它们之间有 5 行数据用于输出:
>>> json_normalize(data, 'counties')
name population
0 Dade 12345
1 Broward 40000
2 Palm Beach 60000
3 Summit 1234
4 Cuyahoga 1337
The meta
argument then names some elements that live next to those counties
lists, and those are then merged in separately. meta
参数然后命名一些位于这些counties
列表旁边的元素,然后将它们单独合并。 The values from the first data[0]
dictionary for those meta
elements are ('Florida', 'FL', 'Rick Scott')
, respectively, and for data[1]
the values are ('Ohio', 'OH', 'John Kasich')
, so you see those values attached to the counties
rows that came from the same top-level dictionary, repeated 3 and 2 times respectively:这些meta
元素的第一个data[0]
字典中的值分别是('Florida', 'FL', 'Rick Scott')
,而data[1]
的值是('Ohio', 'OH', 'John Kasich')
,因此您会看到这些值附加到来自同一顶级字典的counties
行,分别重复了 3 次和 2 次:
>>> data[0]['state'], data[0]['shortname'], data[0]['info']['governor']
('Florida', 'FL', 'Rick Scott')
>>> data[1]['state'], data[1]['shortname'], data[1]['info']['governor']
('Ohio', 'OH', 'John Kasich')
>>> json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']])
name population state shortname info.governor
0 Dade 12345 Florida FL Rick Scott
1 Broward 40000 Florida FL Rick Scott
2 Palm Beach 60000 Florida FL Rick Scott
3 Summit 1234 Ohio OH John Kasich
4 Cuyahoga 1337 Ohio OH John Kasich
So, if you pass in a list for the meta
argument, then each element in the list is a separate path, and each of those separate paths identifies data to add to the rows in the output.因此,如果您为meta
参数传入一个列表,则列表中的每个元素都是一个单独的路径,并且每个单独的路径都标识要添加到输出行中的数据。
In your example JSON, there are only a few nested lists to elevate with the first argument, like 'counties'
did in the example.在您的示例 JSON 中,只有几个嵌套列表可以使用第一个参数提升,就像示例中的'counties'
一样。 The only example in that datastructure is the nested 'authors'
key;该数据结构中唯一的示例是嵌套的'authors'
键; you'd have to extract each ['_source', 'authors']
path, after which you can add other keys from the parent object to augment those rows.您必须提取每个['_source', 'authors']
路径,然后您可以从父对象中添加其他键以扩充这些行。
The second meta
argument then pulls in the _id
key from the outermost objects, followed by the nested ['_source', 'title']
and ['_source', 'journal']
nested paths.然后第二个meta
参数从最外面的对象中提取_id
键,然后是嵌套的['_source', 'title']
和['_source', 'journal']
嵌套路径。
The record_path
argument takes the authors
lists as the starting point, these look like: record_path
参数以authors
列表为起点,这些看起来像:
>>> d['hits']['hits'][0]['_source']['authors'] # this value is None, and is skipped
>>> d['hits']['hits'][1]['_source']['authors']
[{'affiliations': ['Punjabi University'],
'author_id': '780E3459',
'author_name': 'munish puri'},
{'affiliations': ['Punjabi University'],
'author_id': '48D92C79',
'author_name': 'rajesh dhaliwal'},
{'affiliations': ['Punjabi University'],
'author_id': '7D9BD37C',
'author_name': 'r s singh'}]
>>> d['hits']['hits'][2]['_source']['authors']
[{'author_id': '7FF872BC',
'author_name': 'barbara eileen ryan'}]
>>> # etc.
and so gives you the following rows:所以给你以下几行:
>>> json_normalize(d['hits']['hits'], ['_source', 'authors'])
affiliations author_id author_name
0 [Punjabi University] 780E3459 munish puri
1 [Punjabi University] 48D92C79 rajesh dhaliwal
2 [Punjabi University] 7D9BD37C r s singh
3 NaN 7FF872BC barbara eileen ryan
4 NaN 0299B8E9 fraser j harbutt
5 NaN 7DAB7B72 richard m freeland
and then we can use the third meta
argument to add more columns like _id
, _source.title
and _source.journal
, using ['_id', ['_source', 'journal'], ['_source', 'title']]
:然后我们可以使用第三个meta
参数添加更多列,如_id
、 _source.title
和_source.journal
,使用['_id', ['_source', 'journal'], ['_source', 'title']]
:
>>> json_normalize(
... d['hits']['hits'],
... ['_source', 'authors'],
... ['_id', ['_source', 'journal'], ['_source', 'title']]
... )
affiliations author_id author_name _id \
0 [Punjabi University] 780E3459 munish puri 7AF8EBC3
1 [Punjabi University] 48D92C79 rajesh dhaliwal 7AF8EBC3
2 [Punjabi University] 7D9BD37C r s singh 7AF8EBC3
3 NaN 7FF872BC barbara eileen ryan 7521A721
4 NaN 0299B8E9 fraser j harbutt 7DAEB9A4
5 NaN 7DAB7B72 richard m freeland 7B3236C5
_source.journal
0 Journal of Industrial Microbiology & Biotechno...
1 Journal of Industrial Microbiology & Biotechno...
2 Journal of Industrial Microbiology & Biotechno...
3 The American Historical Review
4 The American Historical Review
5 The American Historical Review
_source.title \
0 Development of a stable continuous flow immobi...
1 Development of a stable continuous flow immobi...
2 Development of a stable continuous flow immobi...
3 Feminism and the women's movement : dynamics o...
4 The iron curtain : Churchill, America, and the...
5 The Truman Doctrine and the origins of McCarth...
You can also have a look at the library flatten_json , which does not require you to write column hierarchies as in json_normalize:您还可以查看库flatten_json ,它不需要您像 json_normalize 那样编写列层次结构:
from flatten_json import flatten
data = d['hits']['hits']
dict_flattened = (flatten(record, '.') for record in data)
df = pd.DataFrame(dict_flattened)
print(df)
See https://github.com/amirziai/flatten .请参阅https://github.com/amirziai/flatten 。
Adding to Sanders' comment, more context can be found here as the creator of this function has a Medium blog: https://towardsdatascience.com/flattening-json-objects-in-python-f5343c794b10补充 Sanders 的评论,可以在此处找到更多上下文,因为此函数的创建者有一个 Medium 博客: https://towardsdatascience.com/flattening-json-objects-in-python-f5343c794b10
It is worth keeping in mind that pandas' json_normalize can handle most json objects, like arrays for example.值得记住的是,pandas 的 json_normalize 可以处理大多数 json 对象,例如数组。 The flatten_json library requires it to be a nested dict. flatten_json 库要求它是一个嵌套的字典。 However, you can work around this requirement by adding the array to a dict like so:但是,您可以通过将数组添加到字典中来绕过此要求,如下所示:
flatten({'response':data}, '.')
In this case, the flatten_json library will actually use a counter in the dot notation to distinguished against duplicates.在这种情况下,flatten_json 库实际上会使用点表示法中的计数器来区分重复项。 For example:例如:
flatten({
'response': [
{'metrics': {'clicks': '0', 'cost_micros': '0', 'impressions': '3'},
'segments': {'date': '2022-12-01'}},
{'metrics': {'clicks': '1', 'cost_micros': '609240', 'impressions': '358'},
'segments': {'date': '2022-12-01'}},
{'metrics': {'clicks': '0', 'cost_micros': '0', 'impressions': '3'},
'segments': {'date': '2022-12-02'}},
{'metrics': {'clicks': '2', 'cost_micros': '40000', 'impressions': '291'},
'segments': {'date': '2022-12-02'}},
{'metrics': {'clicks': '0', 'cost_micros': '0', 'impressions': '2'},
'segments': {'date': '2022-12-03'}},
{'metrics': {'clicks': '2', 'cost_micros': '337754', 'impressions': '241'},
'segments': {'date': '2022-12-03'}},
{'metrics': {'clicks': '0', 'cost_micros': '0', 'impressions': '4'},
'segments': {'date': '2022-12-04'}},
{'metrics': {'clicks': '2', 'cost_micros': '757299', 'impressions': '197'},
'segments': {'date': '2022-12-04'}}
]
}, '.')
Produces:产生:
{'response.0.metrics.clicks': '0',
'response.0.metrics.cost_micros': '0',
'response.0.metrics.impressions': '3',
'response.0.segments.date': '2022-12-01',
'response.1.metrics.clicks': '1',
'response.1.metrics.cost_micros': '609240',
'response.1.metrics.impressions': '358',
'response.1.segments.date': '2022-12-01',
'response.2.metrics.clicks': '0',
'response.2.metrics.cost_micros': '0',
'response.2.metrics.impressions': '3',
'response.2.segments.date': '2022-12-02',
'response.3.metrics.clicks': '2',
'response.3.metrics.cost_micros': '40000',
'response.3.metrics.impressions': '291',
'response.3.segments.date': '2022-12-02',
'response.4.metrics.clicks': '0',
'response.4.metrics.cost_micros': '0',
'response.4.metrics.impressions': '2',
'response.4.segments.date': '2022-12-03',
'response.5.metrics.clicks': '2',
'response.5.metrics.cost_micros': '337754',
'response.5.metrics.impressions': '241',
'response.5.segments.date': '2022-12-03',
'response.6.metrics.clicks': '0',
'response.6.metrics.cost_micros': '0',
'response.6.metrics.impressions': '4',
'response.6.segments.date': '2022-12-04',
'response.7.metrics.clicks': '2',
'response.7.metrics.cost_micros': '757299',
'response.7.metrics.impressions': '197',
'response.7.segments.date': '2022-12-04'}
Sorry, I do not have enough reputation to post a comment.抱歉,我没有足够的声誉来发表评论。 Apparently 50 is required and I have 23.显然需要 50 个,而我有 23 个。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.