简体   繁体   English

使用 json.loads in python 读取文件时出现 memory 错误

[英]out of memory error when reading file with json.loads in python

hi i create a file and fill it with below code:嗨,我创建了一个文件并用以下代码填充它:

i have some lists which i fill them and write them in file.我有一些列表,我填写它们并将它们写在文件中。

each list will be written in one line.每个列表将写在一行中。

import json
import random
import numpy as np

train_dir = '../sq-new/data/0-train.json'
outputdata="./data/1-train-with-50-random-embedded-neg-sample.json"

vec = np.memmap('../sq-new/data/entity2vec.bin', dtype='float32', mode='r',offset=0)

out_dataset = open(outputdata,'w', encoding='utf-8')

data_train = open(train_dir, "r")

 

question_train=[]

subject_pos_train=[]

subject_neg_train=[]

subject_pos_m=[]

subject_neg_m=[]

answer_pos_train=[]

answer_neg_train=[]

answer_pos_m=[]

answer_neg_m=[]

for line in data_train:


    line=json.loads(line)

    true_triples=[]
    false_triples=[]

    for triple in line['triples']:

        if triple['ans']==True:

            true_triples.append(triple)
        
        elif triple['ans']==False:

            false_triples.append(triple)

    pair_count=0

    if (len(true_triples)>0) and (len(false_triples)>0):

        while pair_count<100:

            true_sample=random.choice(true_triples)

            false_sample=random.choice(false_triples)

            question_train.append(line['question'])

            subject_neg_train.append(false_sample['q_et'])

            subject_pos_train.append(true_sample['q_et'])

            answer_neg_train.append(false_sample['c_et'])

            answer_pos_train.append(true_sample['c_et'])

            i=int(false_sample['cet_id'])
            answer_neg_m.append(list(vec[(i*50):((i+1)*50)]))

            i=int(true_sample['cet_id'])
            answer_pos_m.append(list(vec[(i*50):((i+1)*50)]))

            i=int(false_sample['q_et_id'])
            subject_neg_m.append(list(vec[(i*50):((i+1)*50)]))


            i=int(true_sample['q_et_id'])
            subject_pos_m.append(list(vec[(i*50):((i+1)*50)]))

            pair_count=pair_count+1

out_dataset.write(json.dumps(question_train)+"\n")
out_dataset.write(json.dumps(subject_neg_train)+"\n")
out_dataset.write(json.dumps(subject_pos_train)+"\n")
out_dataset.write(json.dumps(answer_neg_train)+"\n")
out_dataset.write(json.dumps(answer_pos_train)+"\n")
out_dataset.write(json.dumps(str(answer_neg_m))+"\n")
out_dataset.write(json.dumps(str(answer_pos_m))+"\n")
out_dataset.write(json.dumps(str(subject_neg_m))+"\n")
out_dataset.write(json.dumps(str(subject_pos_m))+"\n")
        
out_dataset.close()
 

result file size is 16GB结果文件大小为 16GB

when i want to read the file my memory will be fulled completely and code will be killed.当我想读取文件时,我的 memory 将被完全填满,代码将被终止。 my server has 250GB RAM我的服务器有 250GB RAM

here is my code for reading.这是我的阅读代码。

so whats wrong with my codes.所以我的代码有什么问题。

import ast
import json
train_dir = '../create-data/data/1-train-with-50-random-embedded-neg-sample.json'
# train_dir = '../create-data/data/1-train-one-embeded.json'
data_train = open(train_dir, "r")
question_train=json.loads(data_train.readline())
subject_neg_train=json.loads(data_train.readline())
subject_pos_train=json.loads(data_train.readline())
answer_neg_train=json.loads(data_train.readline())
answer_pos_train=json.loads(data_train.readline())
answer_neg_m=ast.literal_eval(json.loads(data_train.readline()))
answer_pos_m=ast.literal_eval(json.loads(data_train.readline()))
subject_neg_m=ast.literal_eval(json.loads(data_train.readline()))
subject_pos_m=ast.literal_eval(json.loads(data_train.readline()))

print(type(subject_neg_train))
print(len(subject_neg_train))


print(type(subject_pos_m))
print(len(subject_pos_m))
print(type(subject_pos_m[0]))

Try pandas尝试pandas

import pandas as pd
df = pd.read_json(train_dir)

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM