[英]How to parse and get specific data from a huge json file to implement search in python
我有一个包含大量信息的 JSON 文件,想从中提取特定数据:凡是出现 position 的地方,我需要取得紧随其后的那个 name 的值,并且打算在 Python 中实现搜索功能。下面是该 JSON 文件(ex.json)的一部分示例数据:
`
{
"storables": [
{
"columns": [
{
"position": 0,
"header": {
"id": "",
"indexVersion": 35643,
"generationNum": 35643,
"name": "CAT",
"author": "",
"created": 1620247188226,
"modified": 1668544812673,
"modifiedBy": "",
"owner": "",
"isDeleted": false,
"isHidden": false,
"tags": [],
"isExternal": false,
"isDeprecated": false
},
"complete": true,
"incompleteDetail": [],
"isDerived": true,
"dataType": "VARCHAR",
"type": "ATTRIBUTE",
"sageOutputColumnId": "",
"defaultAggrType": "NONE",
"ownerName": "",
"ownerType": "WORKSHEET",
"entityCategory": "DEFAULT",
"spotiqPreference": "DEFAULT",
"isAdditive": false,
"indexType": "DEFAULT",
"indexPriority": 1,
"sources": [
{
"tableId": "",
"tableName": "",
"columnId": "",
"columnName": "CATASTROPHE"
}
],
"synonyms": [],
"injectedInlineValues": [],
"precision": -1,
"scale": 0,
"isPrimaryKey": false,
"isAttributionDimension": true,
"derivationExpr": {
"exprType": "LOGICAL_COLUMN_REFERENCE",
"logicalColumn": {
"header": {
"id": "",
"indexVersion": 35499,
"generationNum": 35499,
"name": "CATASTROPHE",
"author": "",
"created": 1630716505804,
"modified": 1668211006637,
"modifiedBy": "",
"owner": "",
"isDeleted": false,
"isHidden": false,
"schemaStripe": "",
"databaseStripe": "",
"tags": [],
"isExternal": false,
"isDeprecated": false
}
},
"joinPaths": [
{
"joins": [
{
"sourceTable": "",
"destinationTable": "",
"content": {
"relationships": [
{
"sourceColumn": "",
"destinationColumn": ""
}
],
"weight": 1
},
"joinType": "INNER",
"type": "USER_DEFINED",
"isOneToOneJoin": false,
"header": {
"id": "",
"indexVersion": 35499,
"generationNum": 35499,
"name": "",
"description": "",
"author": "",
"created": 1650658367043,
"modified": 1668211006686,
"modifiedBy": "",
"owner": "",
"isDeleted": false,
"isHidden": false,
"tags": [],
"type": "USER_DEFINED",
"isExternal": false,
"isDeprecated": false
},
"complete": true,
"incompleteDetail": [],
"sourceColumns": [
""
],
"targetColumns": [
""
]
}
]
}
]
}
},
{
"position": 1,
"header": {
"id": "",
"indexVersion": 35643,
"generationNum": 35643,
"name": "Peril",
"author": "",
"created": 1620247188226,
"modified": 1668544812673,
"modifiedBy": "",
"owner": "",
"isDeleted": false,
"isHidden": false,
"tags": [],
"isExternal": false,
"isDeprecated": false
},
"complete": true,
"incompleteDetail": [],
"isDerived": true,
"dataType": "VARCHAR",
"type": "ATTRIBUTE",
"sageOutputColumnId": "",
"defaultAggrType": "NONE",
"ownerName": "",
"ownerType": "WORKSHEET",
"entityCategory": "DEFAULT",
"spotiqPreference": "DEFAULT",
"isAdditive": false,
"indexType": "DEFAULT",
"indexPriority": 1,
"sources": [
{
"tableId": "",
"tableName": "",
"columnId": "",
"columnName": "TYPE_OF"
}
],
"synonyms": [],
"injectedInlineValues": [],
"precision": -1,
"scale": 0,
"isPrimaryKey": false,
"isAttributionDimension": true,
"derivationExpr": {
"exprType": "LOGICAL_COLUMN_REFERENCE",
"logicalColumn": {
"header": {
"id": "",
"indexVersion": 35499,
"generationNum": 35499,
"name": "TYPE_OF",
"author": "",
"created": 1630716505804,
"modified": 1668211006637,
"modifiedBy": "",
"owner": "",
"isDeleted": false,
"isHidden": false,
"schemaStripe": "",
"databaseStripe": "",
"tags": [],
"isExternal": false,
"isDeprecated": false
}
},
"joinPaths": [
{
"joins": [
{
"sourceTable": "",
"destinationTable": "",
"content": {
"relationships": [
{
"sourceColumn": "",
"destinationColumn": ""
}
],
"weight": 1
},
"joinType": "INNER",
"type": "USER_DEFINED",
"isOneToOneJoin": false,
"header": {
"id": "",
"indexVersion": 35499,
"generationNum": 35499,
"name": "",
"description": "Copy of user table relationship",
"author": "",
"created": 1650658367043,
"modified": 1668211006686,
"modifiedBy": "",
"owner": "",
"isDeleted": false,
"isHidden": false,
"tags": [],
"type": "USER_DEFINED",
"isExternal": false,
"isDeprecated": false
},
"complete": true,
"incompleteDetail": [],
"sourceColumns": [
""
],
"targetColumns": [
""
]
}
]
}
]
}
},
{
"position": 2,
"header": {
"id": "",
"indexVersion": 35643,
"generationNum": 35643,
"name": "Job",
"author": "",
"created": 1620247188226,
"modified": 1668544812673,
"modifiedBy": "",
"owner": "",
"isDeleted": false,
"isHidden": false,
"tags": [],
"isExternal": false,
"isDeprecated": false
},
"complete": true,
"incompleteDetail": [],
"isDerived": true,
"dataType": "VARCHAR",
"type": "ATTRIBUTE",
"sageOutputColumnId": "",
"defaultAggrType": "NONE",
"ownerName": "",
"ownerType": "WORKSHEET",
"entityCategory": "DEFAULT",
"spotiqPreference": "DEFAULT",
"isAdditive": false,
"indexType": "DEFAULT",
"indexPriority": 1,
"sources": [
{
"tableId": "",
"tableName": "",
"columnId": "",
"columnName": ""
}
],
"synonyms": [],
"injectedInlineValues": [],
"precision": -1,
"scale": 0,
"isPrimaryKey": false,
"isAttributionDimension": true,
"derivationExpr": {
"exprType": "LOGICAL_COLUMN_REFERENCE",
"logicalColumn": {
"header": {
"id": "",
"indexVersion": 35499,
"generationNum": 35499,
"name": "ROTATION_TRADE",
"author": "",
"created": 1630716505804,
"modified": 1668211006637,
"modifiedBy": "",
"owner": "",
"isDeleted": false,
"isHidden": false,
"schemaStripe": "",
"databaseStripe": "",
"tags": [],
"isExternal": false,
"isDeprecated": false
}
},
"joinPaths": [
{
"joins": [
{
"sourceTable": "",
"destinationTable": "",
"content": {
"relationships": [
{
"sourceColumn": "",
"destinationColumn": ""
}
],
"weight": 1
},
"joinType": "INNER",
"type": "USER_DEFINED",
"isOneToOneJoin": false,
"header": {
"id": "",
"indexVersion": 35499,
"generationNum": 35499,
"name": "",
"description": "Copy of user table relationship",
"author": "",
"created": 1650658367043,
"modified": 1668211006686,
"modifiedBy": "",
"owner": "",
"isDeleted": false,
"isHidden": false,
"tags": [],
"type": "USER_DEFINED",
"isExternal": false,
"isDeprecated": false
},
"complete": true,
"incompleteDetail": [],
"sourceColumns": [
""
],
"targetColumns": [
""
]
}
]
}
]
}
},
{
"position": 3,
"header": {
"id": "",
"indexVersion": 35643,
"generationNum": 35643,
"name": "Job Lenghth",
"author": "",
"created": 1620247188226,
"modified": 1668544812673,
"modifiedBy": "",
"owner": "",
"isDeleted": false,
"isHidden": false,
"tags": [],
"isExternal": false,
"isDeprecated": false
},
"complete": true,
"incompleteDetail": [],
"isDerived": true,
"dataType": "VARCHAR",
"type": "ATTRIBUTE",
`
`
# Attempt from the question; it does not print the wanted "name" lines.
with open('ex.json', 'r') as f:
for line in f:
if 'position' in line:
# This inner loop iterates the same file object `f` as the outer loop, so it
# keeps reading from the current position through to the end of the file.
for line in f:
# The test looks for the bare text ' name: ' (no quotes around the key); the
# sample data writes the key as "name": so those lines do not match.
if ' name: ' in line:
print(line)
` 我尝试了上面这段 Python 代码,但它不起作用。我不确定如何只返回紧跟在 position 之后的那个 name。文件中有多处 name,但我只需要每个 position 之后的第一个……
import json

# Parse the whole file into nested Python dicts/lists in one call.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open('ex.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# With the sample shown in the question, the name that directly follows a
# "position" is reached as:
#   data["storables"][i]["columns"][j]["header"]["name"]
现在,您可以通过变量 data 访问所有 JSON 条目,就像访问 Python 中的任何字典/列表一样。
您的思路是可行的,但需要稍微调整一下逻辑。下面是一个解决方案的简要示意:
# Line-based scan of the pretty-printed file: once a "position" key has been
# seen, print the first following line that holds a "name" key, then ignore
# further "name" lines until the next "position" appears.
prevWasPosition = False
with open('ex.json', 'r', encoding='utf-8') as f:
    for line in f:
        if '"position":' in line:
            # Arm the flag and move on; the name sits on a later line.
            prevWasPosition = True
            continue
        if prevWasPosition and '"name":' in line:
            print(line)
            # Disarm so nested "name" keys under the same column are skipped.
            prevWasPosition = False
请注意,此解决方案假设 JSON 文件的格式规范(例如像示例那样每个键值对独占一行)。如果不是,您可能会得到意想不到的结果。更健壮的做法是分块读取文件并将其解析为 JSON,但这超出了本回答的范围。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.