[英]Merge multiple *.csv, *.txt, or *.ascii files based on a common field using python
我想将大约8 * .csv文件合并为一个。
示例文件:
ID, Average
34, 4.5
35, 5.6
36, 3.4
另一个文件可能是:
ID, Max
34, 6
35, 7
36, 4
我需要输出为:
ID, Average, Max
34, 4.5, 6
35, 5.6, 7
36, 3.4, 4
这段代码只有一半有效……它把所有文件的数据都追加到了相同的两列里,而不是按 ID 合并成新列。
# Asker's original (broken) attempt, quoted for reference.
# NOTE(review): several defects are visible in this snippet:
#   - `string` is imported but never used
#   - plain file objects have no writerow(); that is a csv.writer method
#   - `data` is read from each input file but never written anywhere
#   - writerow(y) would write the file NAME, not the file's contents
#   - the loop body's indentation was lost in transcription
import glob, string
outfile = open('<directory>/<fileName>.csv','a')
files = glob.glob(r"<directory>/*.csv")
for y in files:
newfile = open(y,'r+')  # loop body — should be indented under the for
data = newfile.read()
newfile.close()
outfile.writerow(y)  # BUG: file objects have no writerow()
如何将数据附加到新列,而不是重复“ID”字段?
你有三个问题。
代码
#!/usr/bin/env python
"""Merge several CSV files on their shared 'ID' column.

Each input file must contain an 'ID' column; rows with the same ID are
combined into a single output row, with each file's remaining columns
appended. Columns that duplicate an already-seen name are ignored.
"""
import argparse
import csv


def merge_csv_files(infiles, outfile):
    """Merge *infiles* (paths of CSVs sharing an 'ID' column) into *outfile*."""
    data = {}    # ID -> merged row dict
    fields = []  # output column order: first file to introduce a column wins
    for fname in infiles:
        # newline='' is required by the csv module on Python 3
        # (the original opened in 'rb', which is Python-2-only for csv)
        with open(fname, newline='') as df:
            for line in csv.DictReader(df):
                # assuming the key field is called ID
                row = data.setdefault(line['ID'], {})
                for k, v in line.items():
                    # keep the first value seen for a duplicated column name
                    row.setdefault(k, v)
                for k in line:
                    if k not in fields:
                        fields.append(k)
    with open(outfile, 'w', newline='') as out:
        writer = csv.DictWriter(out, fields, dialect='excel')
        # write the header at the top of the file
        writer.writeheader()
        # BUG fixed: the original passed the dict itself to writerows(),
        # which iterates the KEYS (plain strings); DictWriter needs the
        # merged row dicts, i.e. the values.
        writer.writerows(data.values())


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='merge csv files on field')
    # The version= constructor argument was removed in Python 3's argparse;
    # an explicit --version action is the supported replacement.
    parser.add_argument('--version', action='version', version='%(prog)s 1.0')
    parser.add_argument('infile', nargs='+', help='list of input files')
    parser.add_argument('--out', default='temp.csv', help='name of output file')
    args = parser.parse_args()
    merge_csv_files(args.infile, args.out)
请注意,这将忽略具有相同字段名称的数据。
解析器部分的替代方法是:
#!/usr/bin/env python
"""Merge every *.csv in the working directory on the shared 'ID' column.

Variant of the argparse version that discovers its inputs with glob
instead of taking them on the command line.
"""
import glob
import csv


def merge_on_id(infiles, outfile):
    """Merge the CSV files *infiles* (each with an 'ID' column) into *outfile*."""
    data = {}    # ID -> merged row dict
    fields = []  # output column order: first file to introduce a column wins
    for fname in infiles:
        # newline='' per the Python 3 csv docs; also replaces the original's
        # unclosed Python-2 'rb' handle with a context manager
        with open(fname, newline='') as df:
            for line in csv.DictReader(df):
                # assuming the key field is called ID
                row = data.setdefault(line['ID'], {})
                for k, v in line.items():
                    # keep the first value seen for a duplicated column name
                    row.setdefault(k, v)
                for k in line:
                    if k not in fields:
                        fields.append(k)
    with open(outfile, 'w', newline='') as out:
        writer = csv.DictWriter(out, fields, dialect='excel')
        # write the header at the top of the file
        writer.writeheader()
        # BUG fixed: writerows(data) iterated dict keys (strings);
        # the merged row dicts (the values) must be written instead.
        writer.writerows(data.values())


if __name__ == '__main__':
    merge_on_id(glob.glob('./*.csv'), 'temp.csv')
# Join two in-memory "CSV" line lists on their first field using plain
# string operations — no csv module needed for data this simple.
data1 = ['1,blue,red',
         '2,purple,yellow',
         '3,white,brown']
data2 = ['1,fee',
         '2,fie',
         '3,foe',
         '4,fum']
# Map key -> rest of the line, splitting only on the FIRST comma so the
# remaining columns stay together as one string.
data_table = dict(s.split(',', 1) for s in data1)
for line in data2:
    key, _ = line.split(',', 1)
    # keys absent from data1 (e.g. '4') fall back to two empty columns.
    # BUG fixed: the original used the Python 2 print STATEMENT, which is
    # a syntax error on Python 3; print() is now a function call.
    print(','.join((line, data_table.get(key, ','))))
得到:
1,fee,blue,red
2,fie,purple,yellow
3,foe,white,brown
4,fum,,
这是一个csv版本:
import csv
data1 = ['1,blue,red',
'2,purple,yellow',
'3,white,brown']
data2 = ['1,fee',
'2,fie',
'3,foe',
'4,fum']
with open('out.txt','w') as f:
combined = csv.writer(f)
data1 = ['1,blue,red',
'2,purple,yellow',
'3,white,brown']
data2 = ['1,fee',
'2,fie',
'3,foe',
'4,fum']
data_table = dict((row[0], row[1:]) for row in csv.reader(data1))
for row in csv.reader(data2):
key = row[0]
combined.writerow(row + data_table.get(key, ['','']))
也许可以用 txtselect 来完成?我还没有用过它,但它的作者下个月将在 pyArkansas 上发表演讲。
我非常喜欢atpy在桌子上阅读 - 它是多功能的,而且我主要使用它。 此外,如果您将这些表视为数组,而不仅仅是想要切片的大型表,那么它可能更容易使用。 假设每个文件的ID列表的顺序相同,则首先读入一个文件,然后附加到每一行:
# NOTE(review): atpy is a third-party package and must be installed for
# this to run; it is not imported in the visible snippet. The answer
# assumes every file lists the same IDs in the same order.
table1 = atpy.Table("path/Table1.csv", type="ascii", delimiter=",")
table2 = atpy.Table("path/Table1.txt", type="ascii", delimiter="|")
ncols = 3  # number of columns collected below
# BUG fixed: the original did `a = []` and then a[0].append(...), which
# raises IndexError — the per-column sub-lists must exist first.
a = [[] for _ in range(ncols)]
for ii in range(len(table1)):
    a[0].append(table1[ii][0])
    a[1].append(table1[ii][1])
    a[2].append(table2[ii][1])  # ...etc. it was hard to tell from your example what
                                # columns you wanted where
# BUG fixed: the original called `data.close` without parentheses (a
# no-op), so the file was never closed; a with-block closes it reliably.
with open('bigtable.txt', 'w') as data:
    data.write("%s\n" % a)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.