简体   繁体   中英

looking for an efficient and fast way to map items from one large file to another in python

I have several TSV files ranging in size from 2MB to 450MB. I need to map data from one to the other and finally create a file based on these mappings. The files look like this: file 1:

cluster_123    seq1    seq2,seq3
cluster_456    seq4    seq5,seq6
cluster_789    seq7    seq8

file2:

cluster_123    id1
cluster_456    id2
seq10    id3

at first I needed to open the clusters so I could get seq:id pairs:

seq1    id1
.
.
seq10    id3

for that I have made a dictionary:

mapped_seq_id = {'seq1': id1, 'seq10': id3}

now I need to map this dictionary to a file that looks like this: file3:

id1    cluster123    function1
id3    seq10         function2

using the id from the mapped_seq_id dictionary I can now map sequences to functions. I tried creating a dict for that, one that will hold seq:function pairs:

seq_function_dict = {'seq1': function1, 'seq2': function1, 'seq10': function2}

however since file3 is very big, creating the dictionary can take hours. The function strings can be 10-20 words long. The reason I am using dictionaries here is that there is a final step where I will need to map 'seq' to another file and use that to extract another piece of information in order to create a final file. Please refer to the code for more details.

final file:

seq    function    data_from_final_file

In fact, it is the function create_annotated_dict that takes a very long time to complete. What I would like to know is whether there is a better and faster way to do this in Python, other than using a dictionary.

many thanks.

EDIT: Added code:

#!/usr/bin/env python

import itertools
from collections import defaultdict
from operator import itemgetter
import string
import os
import csv
import sys

current_dir = os.getcwd()   

def create_mapping_dictionary(current_dir):
    """Build {seq: cluster} from file1, dump it, and start the pipeline.

    file1 rows are tab-separated: cluster name, representative
    sequence, comma-separated member sequences (plus trailing columns
    that are ignored), e.g.::

        cluster1    seq1    seq2,seq3    20%

    Every sequence in columns 1 and 2 is mapped to the cluster in
    column 0.  The mapping is also written to file1_mapped.txt, then
    passed to map_annotation_to_sequence_headers().
    """
    map_dict = {}
    with open(current_dir + '/file1', 'r') as fh:
        for row in csv.reader(fh, delimiter='\t'):
            # split(',') on a string with no comma yields a one-element
            # list, so no special case is needed.  (The original tested
            # `if ',' in b:` where `b` was never defined -- NameError.)
            for seq in [row[1]] + row[2].split(','):
                map_dict[seq] = row[0]

    with open(current_dir + '/file1_mapped.txt', 'w') as mapped_file:
        # .items() works on both Python 2 and 3 (iteritems() is 2-only).
        for seq, cluster in map_dict.items():
            mapped_file.write("%s\t%s\n" % (seq, cluster))

    map_annotation_to_sequence_headers(current_dir, map_dict)


def map_annotation_to_sequence_headers(current_dir, map_dict):
    """Build {seq: id} from file2 and pass it on.

    file2 rows are tab-separated with a cluster-or-sequence name in
    column 0 and an id in column 1.  Rows whose name contains 'aa90_'
    are cluster rows: their id is propagated to every sequence that
    belongs to that cluster according to map_dict ({seq: cluster}).
    All other rows name a single sequence directly.
    """
    # Invert map_dict once (cluster -> [sequences]).  The original
    # scanned map_dict.values() and rebuilt the matching-key list with
    # a comprehension for every cluster row -- two O(n) passes per row.
    cluster_to_seqs = defaultdict(list)
    for seq, cluster in map_dict.items():
        cluster_to_seqs[cluster].append(seq)

    ids_dict = {}
    # Fixed typo: the reader was bound to `fltered` but the loop
    # iterated `filtered`, which raised NameError immediately.
    with open(current_dir + '/file2', 'r') as fh:
        for row in csv.reader(fh, delimiter='\t'):
            if 'aa90_' in row[0]:
                for seq in cluster_to_seqs.get(row[0], ()):
                    ids_dict[seq] = row[1]
            else:
                ids_dict[row[0]] = row[1]

    create_annotated_dict(current_dir, ids_dict, map_dict)

def create_annotated_dict(current_dir, ids_dict, map_dict):
    """Build {seq: [annotation strings]} from file3 and hand it on.

    file3 rows are tab-separated::

        id  cluster_or_seq  num  num  num  strings...  string  string

    Rows are grouped by id in column 0 (groupby() requires file3 to be
    pre-sorted on that column).  For ids known from file2 (values of
    ids_dict) the annotation strings of the group are collected,
    deduplicated in first-seen order; the resulting list is attached to
    every sequence that maps to the row's cluster (via map_dict) or to
    the sequence named in column 1 itself.
    """
    # Hoisted out of the loop: `ids in ids_dict.values()` built and
    # scanned a list on EVERY row -- this is why the function took
    # hours on a 450MB file.  A set gives O(1) membership.
    known_ids = set(ids_dict.values())
    # Likewise, reverse map_dict once (cluster -> [sequences]) instead
    # of scanning map_dict.values() per row.
    cluster_to_seqs = defaultdict(list)
    for seq, cluster in map_dict.items():
        cluster_to_seqs[cluster].append(seq)

    annotated_dict = {}
    with open(current_dir + '/file3', 'r') as fh:
        annotated = csv.reader(fh, delimiter='\t')
        for ids, lines in itertools.groupby(annotated, itemgetter(0)):
            temp_list = []
            seen = set()  # O(1) dedup; `x not in temp_list` was O(n)
            for row in lines:
                if ids in known_ids:
                    # The first two words of column 6 form the
                    # taxonomy string -- presumably genus + species;
                    # TODO confirm against the file3 producer.
                    tax = ' '.join(row[6].split(' ')[0:2])
                    for value in (row[5], tax, row[7], row[8]):
                        if value not in seen:
                            seen.add(value)
                            temp_list.append(value)
                if 'cluster' in row[1]:
                    for seq in cluster_to_seqs.get(row[1], ()):
                        annotated_dict[seq] = temp_list
                else:
                    annotated_dict[row[1]] = temp_list

    # Fixed typo: the original passed `annotated_diact` (NameError).
    create_fasta(current_dir, annotated_dict)

def create_fasta(current_dir, annotated_dict):
    """Write an annotated FASTA file from the flat file4.

    file4 rows are tab-separated: '>seqN<TAB>SEQUENCE'.  For each
    sequence that has an entry in annotated_dict ({seq: [strings]})
    the annotation is appended to the header::

        >seq1 string1 string2
        ACTGAGTAGCAGTAGCAGATGAC

    Sequences with no annotation are written with their original
    header unchanged.  Output goes to <current_dir>/fasta.out.
    """
    # Fixed: the original concatenated `current_dir + 'fasta.out'`
    # (missing '/'), writing next to current_dir instead of inside it.
    out_path = current_dir + '/fasta.out'
    with open(current_dir + '/file4', 'r') as fasta_fh, \
         open(out_path, 'w') as out_fasta:
        for row in csv.reader(fasta_fh, delimiter='\t'):
            # row[0] is the header ('>seqN'), row[1] the sequence.
            # Fixed: original read `flat_fasta[0]replace('>','')`,
            # a syntax error (and indexed the reader, not the row).
            seq_name = row[0].replace('>', '')
            if seq_name in annotated_dict:  # `in dict` -- no .keys() scan
                new_header = '>' + seq_name + ' ' + ' '.join(annotated_dict[seq_name])
                out_fasta.write("%s\n%s\n" % (new_header, row[1]))
            else:
                # This seq had no annotation: emit the record as-is.
                # (Original used "%s%s\n%s\n" % ('>', row[1]) -- a
                # placeholder/argument mismatch raising TypeError.)
                out_fasta.write("%s\n%s\n" % (row[0], row[1]))

# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    create_mapping_dictionary(current_dir)

Without changing general approach:

  • replace if seq_name in annotated_dict.keys(): by if seq_name in annotated_dict:
  • replace every `if value in some_dict.values()` check inside nested loops with a set built once, before the loops (valid as long as some_dict does not change during the iteration):

        values = set(some_dict.values())
        for ...:
            for ...:
                if value in values:
                    ...

  • If temp_list can be large then use temp_set = set() and if val not in temp_set: temp_set.add() instead of if val not in temp_list: temp_list.append() .

As @jsbueno said: consider using SQL eg, via sqlite module.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM