How to read in two tab-delimited .txt files and map them together by one common column?
For example, from these two files create a mapping of gene to pathway:
First file, pathway.txt
Pathway Protein
Binding and Uptake of Ligands by Scavenger Receptors P69905
Erythrocytes take up carbon dioxide and release oxygen P69905
Metabolism P69905
Amyloids P02647
Metabolism P02647
Hemostasis P68871
Second file, gene.txt
Gene Protein
Fabp3 P11404
HBA1 P69905
APOA1 P02647
Hbb-b1 P02088
HBB P68871
Hba P01942
output would be like,
Gene Protein Pathway
Fabp3 P11404
HBA1 P69905 Binding and Uptake of Ligands by Scavenger Receptors, Erythrocytes take up carbon dioxide and release oxygen, Metabolism
APOA1 P02647 Amyloids, Metabolism
Hbb-b1 P02088
HBB P68871 Hemostasis
Hba P01942
Leave the pathway blank if no pathway corresponds to the gene, based on the protein ID.
UPDATE:
import pandas as pd
# Load the gene table and the pathway table.
# NOTE(review): read_csv is called without sep=, so it presumably parses the
# input as comma-separated; the files described above are tab-delimited .txt
# files -- confirm the separator and the file names.
file1= pd.read_csv("gene.csv")
file2= pd.read_csv("pathway.csv")
# Put the two frames into one and replace missing cells with a single space.
# NOTE(review): concat with default arguments appends the rows of file2 below
# those of file1; it does not match rows on the shared "Protein" column.
output = pd.concat([file1,file2]).fillna(" ")
# Select "Gene" and "Protein" first, followed by whatever columns sit between
# the first and the last column of the concatenated frame.
output= output[["Gene","Protein"]+list(output.columns[1:-1])]
# Write the result as CSV without the row index.
output.to_csv("mapping of gene to pathway.csv", index=False)
So this only gives me the merged file, which is not what I expected.
>>> from collections import defaultdict
>>> my_dict = defaultdict()  # protein ID -> list of pathway names; no default_factory is given, so membership is tested with .get() below
>>> f = open('pathway.txt')
>>> for x in f:  # the header row is not skipped, so it ends up in the dict as well
... x = x.strip().split()  # whitespace split: a pathway name is spread over several tokens
... value,key = " ".join(x[:-1]),x[-1]  # last token is the protein ID; everything before it is rejoined as the pathway name
... if my_dict.get(key,0)==0:  # protein ID not seen yet
... my_dict[key] = [value]
... else:my_dict[key].append(value)  # seen before: record one more pathway for it
...
>>> my_dict
defaultdict(None, {'P68871': ['Hemostasis'], 'Protein': ['Pathway'], 'P69905': ['Binding and Uptake of Ligands by Scavenger Receptors', 'Erythrocytes take up carbon dioxide and release oxygen', 'Metabolism'], 'P02647': ['Amyloids', 'Metabolism']})
>>> f1 = open('gene.txt')
>>> for x in f1:  # the header row is processed like any other line
... value,key = x.strip().split()  # each line must hold exactly two tokens: value is the gene name, key the protein ID
... if my_dict.get(key,0)==0:  # no pathway was recorded for this protein ID
... print("{:<15}{:<15}".format(value,key))  # gene and protein only, each left-aligned in a 15-character field
... else: print("{:<15}{:<15}{}".format(value,key,", ".join(my_dict[key])))  # same two fields followed by the comma-separated pathways
...
Gene Protein Pathway
Fabp3 P11404
HBA1 P69905 Binding and Uptake of Ligands by Scavenger Receptors, Erythrocytes take up carbon dioxide and release oxygen, Metabolism
APOA1 P02647 Amyloids, Metabolism
Hbb-b1 P02088
HBB P68871 Hemostasis
Hba P01942
class Protein:
    """A protein ID together with its gene name and its pathway names.

    Attributes:
        protein: the protein identifier.
        gene: the gene name; "" when none was given.
        pathways: list of pathway names, in the order they were added.
    """

    def __init__(self, protein, pathway = None, gene = ""):
        """Create a record for *protein*.

        Args:
            protein: the protein identifier.
            pathway: optional first pathway name; when None the record
                starts with no pathways.
            gene: optional gene name.
        """
        self.protein = protein
        self.gene = gene
        # Build a new list for every instance so records never share one.
        self.pathways = [] if pathway is None else [pathway]

    def __str__(self):
        """Return "gene<TAB>protein<TAB>pathways", pathways joined by ", "."""
        return "%s\t%s\t%s" % (
            self.gene,
            self.protein,
            ", ".join(self.pathways))
# protein ID -> Protein record (gene name plus pathway names)
proteins = {}

# Get the pathways. Each data line is "<pathway name> <protein ID>"; the
# pathway name may contain spaces, so the protein ID is taken as the last token.
# The with-block closes the file as soon as its lines have been read.
with open("pathways.txt") as f1:
    pathway_lines = f1.readlines()[1:]  # [1:] drops the header row
for line in pathway_lines:
    tokens = line.split()
    if not tokens:
        continue  # blank line: nothing to record
    pathway = " ".join(tokens[:-1])
    protein = tokens[-1]
    if protein in proteins:
        p = proteins[protein]
        p.pathways.append(pathway)
    else:
        p = Protein(protein = protein, pathway = pathway)
        proteins[protein] = p

# Get the genes. Each data line is "<gene> <protein ID>".
with open("genes.txt") as f2:
    gene_lines = f2.readlines()[1:]  # [1:] drops the header row
for line in gene_lines:
    tokens = line.split()
    if not tokens:
        continue  # blank line: nothing to record
    gene, protein = tokens
    if protein in proteins:
        # The protein already has pathways: attach the gene name to it.
        p = proteins[protein]
        p.gene = gene
    else:
        # No pathway is known for this protein: keep it with an empty list.
        p = Protein(protein = protein, gene = gene)
        proteins[protein] = p

# Print the results, one tab-separated row per protein, in the dictionary's
# iteration order. The single-argument print(...) form runs on Python 2 and 3.
print("Gene\tProtein\tPathway")
for protein in proteins.values():
    print(protein)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.