![](/img/trans.png)
[英]Python: How to create a csv string (no file) from a list of dictionaries?
[英]Python: Create file from list of dictionaries
我有大约5000个.gz文件,我必须从这些文件中提取“字典列表”形式的数据。
样本源数据:
{"user" : "J101", "ip" : "192.0.0.0", "usage" : "1000", "Location" : "CA",
"time" : "12038098048"}
{"user" : "M101", "ip" : "192.0.0.1", "usage" : "5000",
"time" : "12038098048", "Device" : "iOS" , "user_type" : "Premium"}
{"user" : "T101", "usage" : "10", "Location" : "AK","time" : "12038098048"}
{"user" : "A101", "ip" : "192.0.0.3", "usage" : "2000",
"time" : "12038098048", "user_type" : "Platinum" }
{"user" : "T101", "usage" : "10", "Location" : "AK","time" : "12038098048"}
{"user" : "J101", "ip" : "192.0.0.0", "usage" : "1000", "Location" : "CA",
"time" : "12038098048" }
上方的每一行代表特定事件的数据;用户 J101 和 T101 各报告了两次数据,因此它们各自有 2 行。
我还处于编写此代码的初始阶段,因此我先从 1 个 .gz 文件中提取数据,然后尝试看看能否解析出感兴趣的数据并创建 .txt 或 .csv 文件。
我的需求是只从这些文件中提取少数几个属性,例如 user、ip、time 和 usage。
下面是我编写的从.gz文件中提取数据并以字典列表形式存储数据的代码。
import gzip
from collections import defaultdict
import json
import csv
# Maps each output CSV column name to the event attribute ('e_name') whose
# value is copied into that column by parser.read_entries.
e_dict = { 'userid' : { 'e_name' : 'user'},
           'ipaddr' : { 'e_name' : 'ip' },
           'event_time' : { 'e_name' : 'time' },
           'usage_in_mb' : { 'e_name' : 'usage' }
}
# Records appended by parser.read_entries and read by parser.create_csv.
dict_list = []
# Working record filled in while a row is processed; looking up a key that
# was never set yields None.
inputdict = defaultdict(lambda: None)
# Incremented in parser.read_entries when json.loads raises ValueError.
count_valueerror = 0
class parser(object):
    """Extract selected event attributes from a gzip file and write them to CSV.

    NOTE(review): the indentation in this class is reconstructed -- the page
    this code was taken from flattened it -- so the nesting of a few
    statements (marked below) is a best guess.
    """

    def read_entries(self):
        """Read 'testfile.gz' row by row and append records to ``dict_list``.

        Each row is parsed with ``json.loads``. For every entry of the
        module-level ``e_dict`` whose ``e_name`` is present in the parsed
        row, the value is stored in the module-level ``inputdict`` under the
        CSV column name.
        """
        # NOTE(review): `count` is assigned but never used in this method.
        count = 0
        with gzip.open('testfile.gz', 'r') as test:
            for row in test:
                try:
                    # Some rows in the source file are empty and contain only a newline character.
                    if row == "\n":
                        continue
                    else:
                        # Parse the JSON text of the row into a dictionary.
                        row_new = json.loads(row)
                        for key, val in e_dict.iteritems():
                            if val['e_name'] in row_new:
                                inputdict[key] = row_new[val['e_name']]
                except ValueError:
                    # NOTE(review): count_valueerror is a module-level name that is
                    # assigned here without a `global` declaration, so this line
                    # raises UnboundLocalError instead of updating the counter.
                    count_valueerror += 1
                # NOTE(review): nesting reconstructed (assumed once per row). The
                # same inputdict object is appended each time, so every entry of
                # dict_list refers to one shared dictionary.
                dict_list.append(inputdict)

    def create_csv(self):
        """Write the contents of ``dict_list`` to 'dict.csv'.

        Returns the file object that was opened for writing.
        """
        with open('dict.csv', 'wb') as csv_file:
            for row in dict_list:
                # NOTE(review): iterating a dict yields only its keys, so this
                # unpacks each key string into (key, val) rather than pairing keys
                # with values; row.items() was probably intended.
                for key, val in row:
                    # NOTE(review): a new writer is built and the header is written
                    # again on every pass through this inner loop.
                    csvwriter = csv.DictWriter(csv_file, fieldnames= row.keys(), extrasaction='raise', dialect='excel')
                    csvwriter.writeheader()
                    csvwriter.writerows(val)
        # NOTE(review): nesting reconstructed; placed after the `with` block.
        return csv_file
create_csv 方法无法正常工作。我不确定如何解析 dict_list 并将每个字典对象写入 csv / 文本文件中。
调用 create_csv 方法时,我收到此错误:ValueError: dict contains fields not in fieldnames: 'p
在代码中的某个时候,您已将行设置为字符串,例如,
'{"user" : "J101", "ip" : "192.0.0.0", "usage" : "1000", "Location" : "CA", "time" : "12038098048"}'
然后就可以这样取出所需的字段:
# NOTE(review): eval() executes whatever expression the row text contains.
# For JSON rows like the sample above, json.loads(row) builds the same
# dictionary without running code, and is the safer choice for file input.
[eval(row)[_] for _ in ['user', 'ip', 'time', 'usage']]
获得类似的结果
['J101', '192.0.0.0', '12038098048', '1000']
我认为问题可能出在您写 CSV 文件的方法中。您似乎在为每一行的每个键都重新写入一次标题和数据行。
您可以尝试如下操作:
def create_csv(dict_list):
    """Write a list of dictionaries to 'dict.csv' and return the file object.

    The keys of the first dictionary become the CSV header. Every row may
    use only those keys: with ``extrasaction='raise'`` the writer raises
    ``ValueError`` for a row that carries an unknown key. Keys missing from
    a row are written as empty cells.

    An empty ``dict_list`` produces an empty file instead of failing on
    ``dict_list[0]``.

    Returns the file object that was used for writing; it is already closed
    when this function returns.
    """
    with open('dict.csv', 'w') as csv_file:
        # Guard the header lookup: there is no first item in an empty list.
        if dict_list:
            # Create writer, using first item's keys as header values
            fieldnames = list(dict_list[0].keys())
            csvwriter = csv.DictWriter(csv_file, fieldnames=fieldnames, extrasaction='raise', dialect='excel')
            # Write the header
            csvwriter.writeheader()
            # Write every dictionary in the list as one CSV row
            csvwriter.writerows(dict_list)
    return csv_file
我在自己的机器上试过,可以正常工作。如果还需要帮助,请告诉我。
修改...
两个列表,一个字典生成( cveFieldName , eventFieldName , inputdict )
# Working record that read_entries fills in for each parsed row.
inputdict = dict()
# (CSV column name, event attribute name) pairs, in output column order.
e_list = (
    ('userid', 'user'),
    ('ipaddr', 'ip'),
    ('event_time', 'time'),
    ('usage_in_mb', 'usage'),
    ('test_1', 'test1'),
    ('test_2', 'test2'),
    ('test_3', 'test4'),
    ('test_5', 'test6'),
)
# Split the pairs into two parallel tuples: CSV headers and event keys.
cveFieldName = tuple(csv_name for csv_name, _ in e_list)
eventFieldName = tuple(event_name for _, event_name in e_list)
read_entries 改为遍历 eventFieldName 列表;由于每个字段都会被重新赋值(缺失时写入空字符串),因此删除了 inputdict.clear():
def read_entries(self):
    """Load events from 'test.gz' into the module-level ``dict_list``.

    Each non-blank row is parsed as JSON. For every pair of names taken
    from ``cveFieldName`` (CSV column) and ``eventFieldName`` (event
    attribute), the attribute's value is copied into ``inputdict``;
    attributes missing from the row are stored as ''. A copy of
    ``inputdict`` is then appended to ``dict_list``.

    Rows that fail to parse are printed, counted and skipped, so they no
    longer add a record that repeats the previous row's values.
    """
    # Tally of unparsable rows; kept local and not reported, as before.
    count_valueerror = 0
    with gzip.open('test.gz', 'r') as test:
        for row in test:
            # A few rows in the source file are blank. strip() also covers
            # '\r\n' line endings and rows delivered as bytes.
            if not row.strip():
                continue
            try:
                # Parse the JSON text of the row into a dictionary.
                row_new = json.loads(row)
            except ValueError as e:
                print(e)
                count_valueerror += 1
                # Skip the row: inputdict still holds the previous row's
                # values, so appending here would duplicate that record.
                continue
            for csv_name, event_name in zip(cveFieldName, eventFieldName):
                inputdict[csv_name] = row_new[event_name] if event_name in row_new else ''
            # Append a copy so later rows do not overwrite this record.
            dict_list.append(dict(inputdict))
            # No inputdict.clear() needed: every column is overwritten above.
create_csv 使用 cveFieldName 作为 CSV 的列名:
def create_csv(self):
    """Dump ``dict_list`` to 'dict.csv' with ``cveFieldName`` as the header.

    Rows are written one at a time. A row that makes the writer raise is
    printed and skipped, and the remaining rows are still written.
    Returns the file object that was used for writing.
    """
    with open('dict.csv', 'wb') as out:
        writer_options = {
            'fieldnames': cveFieldName,
            'extrasaction': 'raise',
            'dialect': 'excel',
        }
        writer = csv.DictWriter(out, **writer_options)
        writer.writeheader()
        for record in dict_list:
            try:
                writer.writerow(record)
            except Exception as err:
                print(err)
    return out
dict.csv
userid,ipaddr,event_time,usage_in_mb,test_1,test_2,test_3,test_5
J101,192.0.0.0,12038098048,1000,,,,
M101,192.0.0.1,12038098048,5000,,,,
T101,,12038098048,10,,,,
A101,192.0.0.3,12038098048,2000,,,,
T101,,12038098048,10,,,,
J101,192.0.0.0,12038098048,1000,,,,
inputdict.clear() <=必需的指令
def read_entries(self):
    """Load events from 'test.gz' into the module-level ``dict_list``.

    Each non-blank row is parsed as JSON. For every entry of ``e_dict``
    whose ``e_name`` attribute is present in the row, the value is stored
    in ``inputdict`` under the CSV column name. A copy of ``inputdict`` is
    appended to ``dict_list`` and ``inputdict`` is then cleared.

    Rows that fail to parse are printed, counted and skipped.
    """
    # Tally of unparsable rows; kept local and not reported, as before.
    count_valueerror = 0
    with gzip.open('test.gz', 'r') as test:
        for row in test:
            # A few rows in the source file are blank. The earlier test
            # against "\\n" compared with a backslash followed by 'n' and
            # never matched; strip() also covers '\r\n' and bytes rows.
            if not row.strip():
                continue
            try:
                # Parse the JSON text of the row into a dictionary.
                row_new = json.loads(row)
            except ValueError as e:
                print(e)
                count_valueerror += 1
                # Skip the row instead of appending an empty record.
                continue
            for key, val in e_dict.items():
                if val['e_name'] in row_new:
                    inputdict[key] = row_new[val['e_name']]
            dict_list.append(dict(inputdict))
            # Without this, an attribute missing from the next row would
            # keep the value left over from this one.
            inputdict.clear()  # <==== very important


def create_csv(self):
    """Write ``dict_list`` to 'dict.csv' with a fixed four-column header.

    Rows are written one at a time. A row that makes the writer raise is
    printed and skipped, and the remaining rows are still written.
    Returns the file object that was used for writing.
    """
    with open('dict.csv', 'wb') as csv_file:
        csvwriter = csv.DictWriter(
            csv_file,
            fieldnames=['userid', 'ipaddr', 'event_time', 'usage_in_mb'],
            extrasaction='raise',
            dialect='excel')
        csvwriter.writeheader()
        for row in dict_list:
            try:
                csvwriter.writerow(row)
            except Exception as e:
                print(e)
    return csv_file
没有指令:inputdict.clear()
userid,ipaddr,event_time,usage_in_mb
J101,192.0.0.0,12038098048,1000
M101,192.0.0.1,12038098048,5000
T101,192.0.0.1,12038098048,10 <=== M101 ip address
A101,192.0.0.3,12038098048,2000
T101,192.0.0.3,12038098048,10 <=== A101 ip address
J101,192.0.0.0,12038098048,1000
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.