[英]looking for an efficient and fast way to map items from one large file to another in python
我有几个TSV文件,大小从2MB到450MB不等。 我需要彼此映射数据,最后根据这些映射创建一个文件。 文件看起来像这样:文件1:
cluster_123 seq1 seq2,seq3
cluster_456 seq4 seq5,seq6
cluster_789 seq7 seq8
文件2:
cluster_123 id1
cluster_456 id2
seq10 id3
首先,我需要打开集群,以便获得seq:id对:
seq1 id1
.
.
seq10 id3
为此,我做了一本字典:
mapped_seq_id = {'seq1': id1, 'seq10': id3}
现在我需要将此字典映射到一个看起来像这样的文件:file3:
id1 cluster123 function1
id3 seq10 function2
现在,可以使用mapd_seq_id字典中的id将序列映射到函数。 我试图为此创建一个字典,将保留seq:function对:
seq_function_dict = {'seq1': function1, 'seq2': function1, 'seq10', function2}
但是,由于file3很大,因此创建字典可能需要几个小时。 函数字符串可以是10-20个字长。 我在这里使用字典的原因是因为这是最后一步,我需要将seq映射到另一个文件,并使用它来提取另一条信息以创建看起来像这样的最终文件。 请参考代码以获取更多详细信息。
最终文件:
seq function data_from_final_file
实际上,函数create_annotated_dict需要很长时间才能完成。 我想知道的是,除了使用字典以外,是否还有更好,更快的方法在python中做到这一点?
非常感谢。
编辑:添加代码:
#!/usr/bin/env python
import itertools
from collections import defaultdict
from operator import itemgetter
import string
import os
import csv
import sys
current_dir = os.getcwd()
def create_mapping_dictionary(current_dir):
map_file = csv.reader(open(current_dir + '/file1', "rb"), delimiter = '\t')
#file 1 looks like this:
# cluster1 seq1 seq2 20%
# cluster2 seq3 seq4,seq5 55%
# cluster3 seq6 seq7,seq8,seq9 99%
#
# in this function I'd like to create the following dictionary:
# map_dict = {'seq1': cluster1, 'seq2': cluster2, ...'seq6':
# cluster3}
mapped_file = open(current_dir + '/file1_mapped.txt', 'w')
map_dict = dict()
for row in map_file:
temp = list()
list_of_seq = str(row[2])
last_item = (len(row) - 1)
if ',' in b: # in row:
temp.append(row[1])
list_of_seq = row[2].split(',')
for i in list_of_seq:
temp.append(i)
else:
temp.append(row[1])
temp.append(row[2])
for item in temp:
map_dict[item] = row[0]
for k,v in map_dict.iteritems():
mapped_file.write("%s\t%s\n" % (k, v))
map_annotation_to_sequence_headers(current_dir, map_dict)
def map_annotation_to_sequence_headers(current_dir, map_dict):
fltered = csv.reader(open(current_dir + '/file2', "rb"), delimiter = '\t')
##file2 looks like this:
# cluster1 id1 1 55 89 10
# cluster2 id2 77 88 12 876
# cluster3 id3 99 45 123 99
# seq10 id4 67 33 44 11
# seq11 id5 55 113 102 33
#
# in this function I'd like to create the following dictionary:
# map_dict = {'seq1': id1, 'seq2': id1,...'seq6': id3, 'seq10': id4}
#
ids_dict = dict()
for row in filtered:
if 'aa90_' in row[0]:
if row[0] in map_dict.values():
lkeys = [key for key, value in map_dict.iteritems() if value == row[0]]
for i in lkeys:
ids_dict[i] = row[1]
else:
ids_dict[row[0]] = row[1]
create_annotated_dict(current_dir, ids_dict, map_dict)
def create_annotated_dict(current_dir, ids_dict, map_dict):
annotated = csv.reader(open(current_dir + '/file3', "rb"), delimiter = '\t')
##file3 looks like this:
# id1 cluster1 55 89 10 string1 string2 string3
# id1 cluster1 544 8 101 string1 string5 string3
# id1 cluster1 51 83 102 string1 string2 string4
# id2 cluster2 77 88 12 string3 string4 string3
# id4 seq10 33 44 11 string10 string11 string12
# id4 seq10 44 54 31 string10 string11 string12
# id4 seq10 33 44 11 string10 string13 string14
#
#
# in this function I'd like to create the following dictionary:
# paris of seqs and list of their corresponding strings.
# string1_2_3_4_5 = [string1, string2, string3, string4, string5]
# annotated_dict = {'seq1': string1_2_3_4_5 , 'seq2':
# string1_2_3_4_5,...'seq10': string10_11_12_13_14}
#
annotated_dict = dict()
for ids, lines in itertools.groupby(annotated, itemgetter(0)):
temp_list = list()
for row in lines:
if ids in ids_dict.values():
t = row[6].split(' ')
tax = ' '.join(t[0:2])
if row[5] not in temp_list:
temp_list.append(row[5])
if tax not in temp_list:
temp_list.append(tax)
if row[7] not in temp_list:
temp_list.append(row[7])
if row[8] not in temp_list:
temp_list.append(row[8])
if 'cluster' in row[1]:
if row[1] in map_dict.values():
lkeys = [ key for key, value in map_dict.iteritems() if value == row[1]]
for i in lkeys:
annotated_dict[i] = temp_list
else:
annotated_dict[row[1]] = temp_list
temp_list = list()
create_fasta(current_dir, annotated_diact)
def create_fasta(current_dir, annotated_dict):
flat_fasta= csv.reader(open(current_dir + '/file4', "rb"), delimiter = '\t')
##file looks like this:
# >seq1 ACTGAGTAGCAGTAGCAGATGAC
# >seq2 ACATGACAAAACTATCTATCCCA
# >seq3 ACGATGAGTGACGATGAGTCAGT
#
# in this function I need to atach to each seq it corresponding
# annotation drom the annotated dict and to create a file that look
# like this:
# >seq1 string1_2_3_4_5
# ACTGAGTAGCAGTAGCAGATGAC
out_fasta = open(current_dir + 'fasta.out', 'w')
for row in flat_fasta:
seq_name = flat_fasta[0]replace('>','')
if seq_name in annotated_dict.keys():
annotation= annotated_dict[seq_name]
annotated_string= ' '.join(annotation)
new_header = '>' + seq_name + ' ' + annotated_string
#print new_header
out_fasta.write("%s\n%s\n" % (new_header, row[1]))
else:
#this seq had no annotation'
out_fasta.write("%s%s\n%s\n" % ('>', row[1]))
out_fasta.close()
create_mapping_dictionary(current_dir)
在不更改一般方法的情况下:
if seq_name in annotated_dict.keys():
替换为if seq_name in annotated_dict.keys():
if seq_name in annotated_dict:
将嵌套循环内if value in some_dict.values()
所有if value in some_dict.values()
替换为:
values = set(some_dict.values()) for .. for .. if value in values:
(如果some_dict在迭代过程中是恒定的)
temp_list
可能很大,则使用temp_set = set()
; if val not in temp_set: temp_set.add()
而不是if val not in temp_list: temp_list.append()
。 正如@jsbueno所说:考虑使用SQL,例如通过sqlite模块。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.