[英]How do I improve my code to make it run faster?
I am conducting a project in data science to analyse large volumes of cancer genome data, my computer is relatively inefficient and has a low cpu and low ram.我正在开展一个数据科学项目来分析大量癌症基因组数据,我的计算机效率相对较低,并且 CPU 和内存较低。 As a result to run through all the samples it take sufficiently too long.
因此,要遍历所有样本需要足够长的时间。
I have tried reducing any excess code, I have tried getting rid of for loops for list comprehensions, I have used multiprocessing to split up my tasks to run faster.我尝试减少任何多余的代码,我尝试摆脱列表推导的 for 循环,我使用多处理来拆分我的任务以更快地运行。
import re
import xlrd
import os
import time
from multiprocessing import Pool
import collections
import pandas as pd
if os.path.exists("C:\\Users\\js769\\genomemutations\\Input\\ChromosomesVersion") == True:
print("chromosomes in folder")
else:
os.makedirs("C:\\Users\\js769\\genomemutations\\Input\\ChromosomesVersion")
print(
"Chromosome Folder Created, Please transfer current version of chromosome number base data to new file."
)
if os.path.exists("C:\\Users\\js769\\genomemutations\\Input\\MutationSamples") == True:
print("Add sample data to run.")
else:
os.makedirs("C:\\Users\\js769\\genomemutations\\Input\\MutationSamples")
print("Mutation Sample Folder Created, please add mutation sample data to folder.")
if os.path.exists("C:\\Users\\js769\\genomemutations\\output") == True:
print("3")
else:
os.makedirs("C:\\Users\\js769\\genomemutations\\output")
# Require editing of this so it works both on a mac or windows system. Currently this version suited to mac because of higher processing power.
# Require ability to check to see if error occurs
def Main(Yeram):
import os
import glob
import errno
import shutil
import xlrd
import pandas as pd
import time
import re
import numpy as np
FragmentSize = 10000000 # This is fragment size which is adjustable.
# Code not needed
Position1 = Yeram.vectx
Position2 = Yeram.vecty
samplelist = Yeram.samplelist
dictA = Yeram.dictA
FragmentSize = Yeram.FragmentSize
chromosomesizes = Yeram.chromosomesizes
def chromosomex_mutation_data(
chromosomenumber, mutationlist
): # It selects the correct chromosome mutation point data, then it selects the data before the -. Mutation data in form(12-20)
chromosomexlist = ["0-1"]
for mutationposition in mutationlist:
if mutationposition[0:2] == str(chromosomenumber):
chromosomexlist.append(mutationposition[3:])
elif mutationposition[0:2] == (str(chromosomenumber) + ":"):
chromosomexlist.append(mutationposition[2:])
else:
continue
Puremutationdatapoints = [int(mutationposition.split("-")[0]) for mutationposition in chromosomexlist]
return Puremutationdatapoints
def Dictionary_Of_Fragment_mutation(FragmentSize, MutationData, ChromosomeNumber): #
chromosomes = {} # Dictionary
chromosomesize = chromosomesizes[ChromosomeNumber - 1]
# Opening up specific chromosome data and calculating amount of bases present in chromosome
Number_of_fragments = int(chromosomesize / FragmentSize)
for mutation in MutationData:
for i in range(0, (Number_of_fragments), 1):
a = (
"Chromosome"
+ str(ChromosomeNumber)
+ "Fragment"
+ str(i)
+ ",Basepairs "
+ str(i * FragmentSize + 1)
+ "-"
+ str(i * FragmentSize + FragmentSize)
)
if mutation in range(i * FragmentSize + 1, i * FragmentSize + FragmentSize + 1):
if chromosomes.get(a) == None:
chromosomes.update({a: 1})
else:
b = (chromosomes.get(a)) + 1
chromosomes.update({a: b})
else:
if chromosomes.get(a) == None:
chromosomes.update({a: 0})
else:
continue
return chromosomes # adds
# This adds mutations or no mutation to each fragment for chromosome,makes dicitonaries
def DictionaryRead(FragmentSize, Dict, ChromosomeNumber):
chromosomesize = chromosomesizes[ChromosomeNumber - 1]
Number_of_fragments = int(chromosomesize / FragmentSize)
chromosomefragmentlist = []
for i in range(0, (Number_of_fragments), 1):
a = (
"Chromosome"
+ str(ChromosomeNumber)
+ "Fragment"
+ str(i)
+ ",Basepairs "
+ str(i * FragmentSize + 1)
+ "-"
+ str(i * FragmentSize + FragmentSize)
)
chromosomefragmentlist.append(str(Dict.get((a))))
return chromosomefragmentlist
# This uses dictionary to create list
def forwardpackage2(FragmentSize, PureMutationData):
C = [] # list of data in numerical order 0 = no mutation
for i in range(1, 23, 1):
A = chromosomex_mutation_data(i, PureMutationData) # Purifies Data
B = Dictionary_Of_Fragment_mutation(FragmentSize, A, i) # Constructs Dictionary
C += DictionaryRead(
FragmentSize, B, i
) # Uses constructed Dictionary amd generates list of numbers, each number being a fragment in numerical order.
return C
def Mutationpointdata(Position1, Position2, dictA, FragmentSize): # Require dictA
vectx = Position1
vecty = Position2
Samplesandmutationpoints = []
for i in range(vectx, vecty):
print(samplelist[i])
new = [k for k, v in dictA.items() if int(v) == samplelist[i]]
mutationlist = [excelsheet.cell_value(i, 23) for i in new]
mutationlist.sort()
Samplesandmutationpoints.append(forwardpackage2(FragmentSize, mutationlist))
return Samplesandmutationpoints
# Opening sample data from excel table
return Mutationpointdata(Position1, Position2, dictA, FragmentSize) # yeram to james samples
def ChromosomeSequenceData(ChromosomeNumber): # Formats the chromosome file into readable information
with open(
r"C:\Users\js769\genomemutations\Input\ChromosomesVersion\chr" + str(ChromosomeNumber) + ".fa"
) as text_file:
text_data = text_file.read()
listA = re.sub("\n", "", text_data)
# list2=[z for z in text_data if z!= "\n"]
if ChromosomeNumber < 10:
ChromosomeSequenceData = listA[5:]
else:
ChromosomeSequenceData = listA[6:]
return ChromosomeSequenceData
def basepercentage_single(
i, FragmentSize, ChromosomeSequenceData
): # Creates a list of base percentage known for certain type of chromosome.
sentence = ChromosomeSequenceData[(i * FragmentSize + 1) : (i * FragmentSize + FragmentSize)]
a = sentence.count("N") + sentence.count("n")
c = str(((FragmentSize - a) / FragmentSize) * 100) + "%"
return c
def basepercentage_multiple(
FragmentSize, ChromosomeSequenceData
): # Creates a a list of base percentages known which correspond with the dna fragments for every chromosome.
fragmentamount = int(len(ChromosomeSequenceData) / FragmentSize)
list = [
basepercentage_single(i, FragmentSize, ChromosomeSequenceData) for i in range(0, (fragmentamount), 1)
]
return list
def FragmentEncodedPercentage(
FragmentSize
): # Packages a list of base percentages known which correspond with the dna fragments for every chromosome.
Initial_list = [basepercentage_multiple(FragmentSize, ChromosomeSequenceData(i)) for i in range(1, 23, 1)]
List_of_fragment_encoded_percentages = [item for sublist in Initial_list for item in sublist]
return List_of_fragment_encoded_percentages
def chromosomefragmentlist(
FragmentSize, ChromosomeNumber
): # Creares a list of fragment sizes for a specific chromosome.
chromosomesize = chromosomesizes[ChromosomeNumber - 1]
Number_of_fragments = int(chromosomesize / FragmentSize)
chromosomefragmentlist = []
for i in range(0, (Number_of_fragments), 1):
a = (
"Chromosome"
+ str(ChromosomeNumber)
+ "Fragment"
+ str(i)
+ ",Basepairs "
+ str(i * FragmentSize + 1)
+ "-"
+ str(i * FragmentSize + FragmentSize)
)
chromosomefragmentlist.append(str(((a))))
return chromosomefragmentlist
def GenomeFragmentGenerator(
FragmentSize
): # Creates the genome fragments for all chromosomes and adds them all to a list.
list = [chromosomefragmentlist(FragmentSize, i) for i in range(1, 23, 1)]
A = [item for sublist in list for item in sublist]
return A
def excelcreation(
mutationdata, samplelist, alpha, bravo, FragmentSize, A, B
): # Program runs sample alpha to bravo and then constructs excel table
data = {"GenomeFragments": A, "Encoded Base Percentage": B}
for i in range(alpha, bravo):
data.update({str(samplelist[i]): mutationdata[i]})
df = pd.DataFrame(data, index=A)
export_csv = df.to_csv(
r"C:/Users/js769/genomemutations/output/chromosomeAll.csv", index=None, header=True
)
start_time = time.time()
# Code determine base fragment size
FragmentSize = 1000000
chromosomesizes = [] # This calculates the base pair sizes for each chromosome.
for i in range(1, 23):
with open(r"C:\Users\js769\genomemutations\Input\ChromosomesVersion\chr" + str(i) + ".fa") as text_file:
text_data = text_file.read()
list = re.sub("\n", "", text_data)
if i < 10:
chromosomesizes.append(len(list[5:]))
else:
chromosomesizes.append(len(list[6:]))
wb = xlrd.open_workbook("C:/Users/js769/genomemutations/input/MutationSamples/Complete Sample For lungs.xlsx")
excelsheet = wb.sheet_by_index(0)
excelsheet.cell_value(0, 0)
sampleswithduplicates = [excelsheet.cell_value(i, 5) for i in range(1, excelsheet.nrows)]
samplelist = []
for sample in sampleswithduplicates:
if sample not in samplelist:
samplelist.append(int(sample)) # Constructs list of sample , each sample only comes up once
dictA = {}
counter = 1 # Creates a dictionary where it counts the
for sample in sampleswithduplicates:
dictA.update({counter: int(sample)})
counter = counter + 1
A = GenomeFragmentGenerator(FragmentSize)
B = FragmentEncodedPercentage(FragmentSize)
value = collections.namedtuple(
"value", ["vectx", "vecty", "samplelist", "dictA", "FragmentSize", "chromosomesizes"]
)
SampleValues = (
value(
vectx=0,
vecty=2,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
value(
vectx=2,
vecty=4,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
value(
vectx=4,
vecty=6,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
value(
vectx=6,
vecty=8,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
value(
vectx=8,
vecty=10,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
value(
vectx=10,
vecty=12,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
value(
vectx=12,
vecty=14,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
value(
vectx=14,
vecty=16,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
)
print("starting multiprocessing")
if __name__ == "__main__":
with Pool(4) as p:
result = p.map(Main, SampleValues)
Allmutationdata = []
for i in result:
for b in i:
Allmutationdata.append(b)
excelcreation(Allmutationdata, samplelist, 0, 16, FragmentSize, A, B)
print("My program took " + str(time.time() - start_time) + " to run")
So the program runs that isn't the issue, the issue is the time it runs,can anyone spot anywhere where my code maybe at fault.所以程序运行不是问题,问题是它运行的时间,任何人都可以发现我的代码可能有问题的任何地方。
This article How to make your pandas loop run 72,000x faster has really resonated with me and I think will help you.这篇文章如何使您的 pandas 循环运行速度提高 72,000 倍,这确实引起了我的共鸣,我认为会对您有所帮助。
It provides clear instructions on how to vectorize
your for loops to drastically speed them up它提供了有关如何
vectorize
for 循环以显着加快它们的清晰说明
Methods to speed up a For Loop
:加快
For Loop
的方法:
Utilize pandas iterrows()
利用 pandas
iterrows()
~321 times faster约快 321 倍
Example例子
for index, row in dataframe.iterrows():
print(index, row)
Pandas Vectorization Pandas 矢量化
~9280 times faster ~9280 倍快
Example例子
df.loc[((col1 == val1) & (col2 == val2)), column_name] = conditional_result
Numpy Vectorization Numpy 矢量化
~72,000 times faster快约 72,000 倍
Example例子
df.loc[((col1.values == val1) & (col2.values == val2)), column_name] = conditional_result
By adding .values
we receive a numpy array.通过添加
.values
我们收到一个 numpy 数组。
Credit for the timing results goes to this article计时结果归功于这篇文章
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.