I have several TSV files ranging in size from 2MB to 450MB. I need to map data from one the other and finally create a file based on these mappings. the files look like this: file 1:
cluster_123 seq1 seq2,seq3
cluster_456 seq4 seq5,seq6
cluster_789 seq7 seq8
file2:
cluster_123 id1
cluster_456 id2
seq10 id3
at first I needed to open the clusters so I could get seq:id pairs:
seq1 id1
.
.
seq10 id3
for that I have made a dictionary:
mapped_seq_id = {'seq1': id1, 'seq10': id3}
now I need to map this dictionary to a file that looks like this: file3:
id1 cluster123 function1
id3 seq10 function2
using the id from the mapped_seq_id dictionary I can now map sequences to functions. I tried created a dict for that that will hold seq:function pairs:
seq_function_dict = {'seq1': function1, 'seq2': function1, 'seq10', function2}
however since file3 is very big creating the dictionary can take hours. the function strings can be 10-20 words long. the reason I am using dictionaries here is because the is a final step where I will need to map 'seq' to another file and use that to extract another piece of information in order to create a final file that looks like this. please refer to code for more details.
final file:
seq function data_from_final_file
in fact, it is function create_annotated_dict that takes very long time to complete. what I would like to know is whether there is a better and faster way to do that in python other than using a dictionary?
many thanks.
EDIT: Added code:
#!/usr/bin/env python
import itertools
from collections import defaultdict
from operator import itemgetter
import string
import os
import csv
import sys
current_dir = os.getcwd()
def create_mapping_dictionary(current_dir):
map_file = csv.reader(open(current_dir + '/file1', "rb"), delimiter = '\t')
#file 1 looks like this:
# cluster1 seq1 seq2 20%
# cluster2 seq3 seq4,seq5 55%
# cluster3 seq6 seq7,seq8,seq9 99%
#
# in this function I'd like to create the following dictionary:
# map_dict = {'seq1': cluster1, 'seq2': cluster2, ...'seq6':
# cluster3}
mapped_file = open(current_dir + '/file1_mapped.txt', 'w')
map_dict = dict()
for row in map_file:
temp = list()
list_of_seq = str(row[2])
last_item = (len(row) - 1)
if ',' in b: # in row:
temp.append(row[1])
list_of_seq = row[2].split(',')
for i in list_of_seq:
temp.append(i)
else:
temp.append(row[1])
temp.append(row[2])
for item in temp:
map_dict[item] = row[0]
for k,v in map_dict.iteritems():
mapped_file.write("%s\t%s\n" % (k, v))
map_annotation_to_sequence_headers(current_dir, map_dict)
def map_annotation_to_sequence_headers(current_dir, map_dict):
fltered = csv.reader(open(current_dir + '/file2', "rb"), delimiter = '\t')
##file2 looks like this:
# cluster1 id1 1 55 89 10
# cluster2 id2 77 88 12 876
# cluster3 id3 99 45 123 99
# seq10 id4 67 33 44 11
# seq11 id5 55 113 102 33
#
# in this function I'd like to create the following dictionary:
# map_dict = {'seq1': id1, 'seq2': id1,...'seq6': id3, 'seq10': id4}
#
ids_dict = dict()
for row in filtered:
if 'aa90_' in row[0]:
if row[0] in map_dict.values():
lkeys = [key for key, value in map_dict.iteritems() if value == row[0]]
for i in lkeys:
ids_dict[i] = row[1]
else:
ids_dict[row[0]] = row[1]
create_annotated_dict(current_dir, ids_dict, map_dict)
def create_annotated_dict(current_dir, ids_dict, map_dict):
annotated = csv.reader(open(current_dir + '/file3', "rb"), delimiter = '\t')
##file3 looks like this:
# id1 cluster1 55 89 10 string1 string2 string3
# id1 cluster1 544 8 101 string1 string5 string3
# id1 cluster1 51 83 102 string1 string2 string4
# id2 cluster2 77 88 12 string3 string4 string3
# id4 seq10 33 44 11 string10 string11 string12
# id4 seq10 44 54 31 string10 string11 string12
# id4 seq10 33 44 11 string10 string13 string14
#
#
# in this function I'd like to create the following dictionary:
# paris of seqs and list of their corresponding strings.
# string1_2_3_4_5 = [string1, string2, string3, string4, string5]
# annotated_dict = {'seq1': string1_2_3_4_5 , 'seq2':
# string1_2_3_4_5,...'seq10': string10_11_12_13_14}
#
annotated_dict = dict()
for ids, lines in itertools.groupby(annotated, itemgetter(0)):
temp_list = list()
for row in lines:
if ids in ids_dict.values():
t = row[6].split(' ')
tax = ' '.join(t[0:2])
if row[5] not in temp_list:
temp_list.append(row[5])
if tax not in temp_list:
temp_list.append(tax)
if row[7] not in temp_list:
temp_list.append(row[7])
if row[8] not in temp_list:
temp_list.append(row[8])
if 'cluster' in row[1]:
if row[1] in map_dict.values():
lkeys = [ key for key, value in map_dict.iteritems() if value == row[1]]
for i in lkeys:
annotated_dict[i] = temp_list
else:
annotated_dict[row[1]] = temp_list
temp_list = list()
create_fasta(current_dir, annotated_diact)
def create_fasta(current_dir, annotated_dict):
flat_fasta= csv.reader(open(current_dir + '/file4', "rb"), delimiter = '\t')
##file looks like this:
# >seq1 ACTGAGTAGCAGTAGCAGATGAC
# >seq2 ACATGACAAAACTATCTATCCCA
# >seq3 ACGATGAGTGACGATGAGTCAGT
#
# in this function I need to atach to each seq it corresponding
# annotation drom the annotated dict and to create a file that look
# like this:
# >seq1 string1_2_3_4_5
# ACTGAGTAGCAGTAGCAGATGAC
out_fasta = open(current_dir + 'fasta.out', 'w')
for row in flat_fasta:
seq_name = flat_fasta[0]replace('>','')
if seq_name in annotated_dict.keys():
annotation= annotated_dict[seq_name]
annotated_string= ' '.join(annotation)
new_header = '>' + seq_name + ' ' + annotated_string
#print new_header
out_fasta.write("%s\n%s\n" % (new_header, row[1]))
else:
#this seq had no annotation'
out_fasta.write("%s%s\n%s\n" % ('>', row[1]))
out_fasta.close()
create_mapping_dictionary(current_dir)
Without changing general approach:
if seq_name in annotated_dict.keys():
by if seq_name in annotated_dict:
replace all if value in some_dict.values()
inside nested loops with:
values = set(some_dict.values()) for .. for .. if value in values:
(if some_dict is constant during iteration)
temp_list
can be large then use temp_set = set()
and if val not in temp_set: temp_set.add()
instead of if val not in temp_list: temp_list.append()
. As @jsbueno said: consider using SQL eg, via sqlite module.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.