[英]Smith-Waterman algorithm to generate matrix in Python
我正在用 Python 实现 Smith-Waterman 算法,以生成动态规划(dynamic programming)矩阵。
这是我到目前为止的内容:
def score(base1, base2):
    """Return the substitution score for aligning two DNA bases.

    Scoring scheme (case-insensitive):
      * identical bases                      ->  3
      * A<->G or C<->T                       -> -1
      * any other pair of different bases    -> -2

    Args:
        base1: a single-character string, one of A, C, G, T (any case).
        base2: a single-character string, one of A, C, G, T (any case).

    Returns:
        The integer score for the pair.

    Raises:
        SystemExit: if either argument is not exactly one DNA base. The
            message 'Not DNA base!' is printed first and the process exits
            with a non-zero status so callers/shells can detect the failure.
    """
    base1 = base1.upper()
    base2 = base2.upper()

    # Membership in a tuple of single characters, not a substring test:
    # `x in 'ACTG'` would also accept '' and multi-character strings such
    # as 'AC', which then fell through to the -2 branch.
    valid_bases = ('A', 'C', 'G', 'T')
    if base1 not in valid_bases or base2 not in valid_bases:
        # Parenthesised single-argument print works on Python 2 and 3.
        print('Not DNA base!')
        sys.exit(1)

    if base1 == base2:
        return 3
    if base1 + base2 in ('AG', 'GA', 'CT', 'TC'):
        return -1
    return -2
import sys

# Command line: <script> SEQ1 SEQ2 GAP
#   SEQ1 indexes the rows of the matrix, SEQ2 the columns; GAP is the
#   (typically negative) integer added for a gap move.
seq1 = sys.argv[1]
seq2 = sys.argv[2]
mRows = len(seq1)
nCols = len(seq2)
gap = int(sys.argv[3])

# (mRows + 1) x (nCols + 1) matrix of zeros; row 0 and column 0 stay zero.
# Each row is built separately so the rows do not alias one another.
matrix = [[0] * (nCols + 1) for _ in range(mRows + 1)]

# Fill every interior cell. All four statements must sit inside the inner
# `j` loop: if they run after it instead, j is stuck at its final value and
# only the last column of the matrix is ever written.
for i in range(1, mRows + 1):
    for j in range(1, nCols + 1):
        dscore = matrix[i-1][j-1] + score(seq1[i-1], seq2[j-1])  # diagonal: align the two bases
        vscore = matrix[i-1][j] + gap                            # vertical: gap move from the cell above
        hscore = matrix[i][j-1] + gap                            # horizontal: gap move from the cell to the left
        # The 0 floor keeps the alignment local (scores never go negative).
        matrix[i][j] = max(0, vscore, hscore, dscore)
使用输入:sw.py ATGCAT ACCT -1
我得到矩阵输出:
0 0 0 0 0
0 0 0 0 0
0 0 0 0 3
0 0 0 0 2
0 0 0 0 1
0 0 0 0 0
0 0 0 0 3
经过一些排查,我发现在嵌套的 for 循环中,只有 j 取最后一个值(对这组输入为 4)时计算出的分数被存入了矩阵,也就是说只填充了最后一列。
我的问题是:为什么会这样?该如何解决?为什么 for 循环会跳回去,而不是继续执行到计算各个分数变量(dscore、vscore、hscore)的语句?
我的一些疑难解答:
# Troubleshooting copy of the matrix-fill loop with trace output
# (Python 2 print statements).
# NOTE(review): the listing lost its indentation; the nesting below is
# reconstructed from the transcript that follows, in which the i/j/seq lines
# appear for every j but the V/H/D scores appear only once per i.
for i in range(1, mRows + 1):
    for j in range(1, nCols + 1):
        print 'this is i', i
        print 'this is j', j
        print 'seq1', seq1[i-1], 'seq2', seq2[j-1]
    # Everything from here on runs after the inner loop has finished, so j
    # holds its final value: only the last column is scored and stored.
    dscore = matrix[i-1][j-1] + score(seq1[i-1], seq2[j-1])
    vscore = matrix[i-1][j] + gap
    hscore = matrix[i][j-1] + gap
    matrix[i][j]=max(0, vscore, hscore, dscore)
    print 'Vscore = ', vscore
    print 'Hscore = ', hscore
    print 'Dscore = ', dscore
    print '\n'
得到:
this is i 1
this is j 1
seq1 A seq2 A
this is i 1
this is j 2
seq1 A seq2 C
this is i 1
this is j 3
seq1 A seq2 C
this is i 1
this is j 4
seq1 A seq2 T
Vscore = -1
Hscore = -1
Dscore = -2
this is i 2
this is j 1
seq1 T seq2 A
this is i 2
this is j 2
seq1 T seq2 C
this is i 2
this is j 3
seq1 T seq2 C
this is i 2
this is j 4
seq1 T seq2 T
Vscore = -1
Hscore = -1
Dscore = 3
this is i 3
this is j 1
seq1 G seq2 A
this is i 3
this is j 2
seq1 G seq2 C
this is i 3
this is j 3
seq1 G seq2 C
this is i 3
this is j 4
seq1 G seq2 T
Vscore = 2
Hscore = -1
Dscore = -2
this is i 4
this is j 1
seq1 C seq2 A
this is i 4
this is j 2
seq1 C seq2 C
this is i 4
this is j 3
seq1 C seq2 C
this is i 4
this is j 4
seq1 C seq2 T
Vscore = 1
Hscore = -1
Dscore = -1
this is i 5
this is j 1
seq1 A seq2 A
this is i 5
this is j 2
seq1 A seq2 C
this is i 5
this is j 3
seq1 A seq2 C
this is i 5
this is j 4
seq1 A seq2 T
Vscore = 0
Hscore = -1
Dscore = -2
this is i 6
this is j 1
seq1 T seq2 A
this is i 6
this is j 2
seq1 T seq2 C
this is i 6
this is j 3
seq1 T seq2 C
this is i 6
this is j 4
seq1 T seq2 T
Vscore = -1
Hscore = -1
Dscore = 3
谢谢!
我为 Smith-Waterman 算法写过一个类,也许对您有用。我建议用 numpy 数组来完成这项任务,因为在处理较长的序列时,它们通常比列表更高效。
class LocalAligner(object):
    """Smith-Waterman local aligner for two sequences.

    Sequence ``p`` indexes the rows and sequence ``q`` the columns of two
    tables of shape ``(len(p) + 1, len(q) + 1)``:

      * ``MatrixA`` holds the best local-alignment score ending at each cell.
      * ``MatrixB`` holds the move that produced that score, using the
        codes ``STOP`` (0), ``LEFT`` (1), ``UP`` (2) and ``DIAGONAL`` (3).

    Typical use: construct, call ``calcTables()``, then ``calcAlignemnt()``;
    the aligned strings are left in ``finalQ`` / ``finalP`` and the best
    score in ``maxScore``.

    NOTE(review): the class uses ``np``; the enclosing module must provide
    ``import numpy as np``.
    """

    # Traceback codes stored in MatrixB.
    STOP = 0      # score dropped to zero: the local alignment starts here
    LEFT = 1      # gap in string p (consume a character of q only)
    UP = 2        # gap in string q (consume a character of p only)
    DIAGONAL = 3  # align p[i] with q[j] (match or mismatch)

    def __init__(self, match=None, mismatch=None, gap=None, fileP=None, fileQ=None):
        """Set up sequences, scoring parameters and zeroed tables.

        Every argument is optional and falls back to its default on its own:
          match    -> +5
          mismatch -> -4
          gap      -> -4
          fileP    -> built-in sequence "GCAGAGCACG"   (named "Sequence P")
          fileQ    -> built-in sequence "GCTGGAAGGCAT" (named "Sequence Q")

        Args:
            match: score added for a matching pair (anything ``int()`` accepts).
            mismatch: score added for a mismatching pair.
            gap: score added for a gap move.
            fileP: path of a FASTA file holding sequence p.
            fileQ: path of a FASTA file holding sequence q.
        """
        # Scoring parameters; each one defaults independently so that a
        # partially specified call no longer leaves attributes undefined.
        self.matchScore = 5 if match is None else int(match)
        self.mismatchPen = -4 if mismatch is None else int(mismatch)
        self.gapPen = -4 if gap is None else int(gap)

        # Sequence q. The fallback name is assigned before parsing so that a
        # header line in the file can override it.
        if fileQ is None:
            self.stringQName = "Sequence Q"
            self.q = "GCTGGAAGGCAT"
        else:
            self.stringQName = "String Q"
            self.q = self.parseFile(fileQ, 1)

        # Sequence p (same scheme).
        if fileP is None:
            self.stringPName = "Sequence P"
            self.p = "GCAGAGCACG"
        else:
            self.stringPName = "String P"
            self.p = self.parseFile(fileP, 2)

        # Final sequence alignments, filled in by calcAlignemnt().
        self.finalQ = ""
        self.finalP = ""

        # Score table and direction table, genuinely initialised to zero.
        tableShape = (len(self.p) + 1, len(self.q) + 1)
        self.MatrixA = np.zeros(tableShape)
        self.MatrixB = np.zeros(tableShape)

        # Best score seen so far and where it occurred.
        self.maxScore = 0
        self.maxI = None
        self.maxJ = None

    def calcTables(self):
        """Populate the score table (A) and the direction table (B).

        Each interior cell gets ``max(0, up + gap, left + gap, diagonal)``
        where the diagonal term adds ``matchScore`` or ``mismatchPen``. The
        highest-scoring cell is recorded in ``maxScore`` / ``maxI`` / ``maxJ``.

        The method does not modify ``p`` or ``q`` and resets its own state
        first, so it can safely be called more than once.
        """
        self.MatrixA.fill(0)
        self.MatrixB.fill(self.STOP)
        self.maxScore = 0
        self.maxI = None
        self.maxJ = None

        # Table index i/j corresponds to sequence character i-1/j-1.
        for i in range(1, len(self.p) + 1):
            for j in range(1, len(self.q) + 1):
                if self.p[i - 1] == self.q[j - 1]:
                    diagonal = self.MatrixA[i - 1][j - 1] + self.matchScore
                    cellScore = self._fillCell(i, j, diagonal, True)
                else:
                    cellScore = self.findMaxScore(i, j)
                self.MatrixA[i][j] = cellScore

                # Strict '>' keeps the first cell that reaches the maximum.
                if cellScore > self.maxScore:
                    self.maxScore = cellScore
                    self.maxI = i
                    self.maxJ = j

    def findMaxScore(self, i, j):
        """Score a mismatch cell (i, j) and record its direction in table B.

        Candidates are the north and west neighbours plus the gap penalty and
        the north-west neighbour plus the mismatch penalty, floored at zero.
        On ties the gap moves win (north first, then west).

        Returns:
            The best score for the cell (0 when no candidate is positive).
        """
        diagonal = self.MatrixA[i - 1][j - 1] + self.mismatchPen
        return self._fillCell(i, j, diagonal, False)

    def _fillCell(self, i, j, diagonal, preferDiagonal):
        """Pick the best move into cell (i, j), store it in B, return the score.

        Args:
            i, j: table coordinates of the cell being scored.
            diagonal: score of arriving from (i-1, j-1), already including the
                match score or mismatch penalty.
            preferDiagonal: when True a tie is resolved in favour of the
                diagonal move; otherwise the order is up, left, diagonal.
        """
        up = self.MatrixA[i - 1][j] + self.gapPen      # gap in q
        left = self.MatrixA[i][j - 1] + self.gapPen    # gap in p
        best = max(0, up, left, diagonal)

        if best <= 0:
            # Local alignment: a non-positive prefix is dropped entirely.
            self.MatrixB[i][j] = self.STOP
            return 0

        if preferDiagonal and diagonal == best:
            self.MatrixB[i][j] = self.DIAGONAL
        elif up == best:
            self.MatrixB[i][j] = self.UP
        elif left == best:
            self.MatrixB[i][j] = self.LEFT
        else:
            self.MatrixB[i][j] = self.DIAGONAL
        return best

    def calcAlignemnt(self, i=None, j=None):
        """Trace back from cell (i, j) and append the alignment to finalQ/finalP.

        Args:
            i, j: table coordinates to start from; each defaults to the
                position of the maximum score found by calcTables().

        The traceback follows table B until it reaches a STOP cell or the
        table border. It is iterative, so long sequences cannot exhaust the
        recursion limit. If no positive-scoring cell exists (maxI/maxJ are
        None) nothing is appended.
        """
        if i is None:
            i = self.maxI
        if j is None:
            j = self.maxJ
        if i is None or j is None:
            # No match anywhere: the best local alignment is empty.
            return

        # Collected back-to-front, reversed once at the end.
        alignedQ = []
        alignedP = []
        while i > 0 and j > 0:
            move = self.MatrixB[i][j]
            if move == self.DIAGONAL:
                alignedQ.append(self.q[j - 1])
                alignedP.append(self.p[i - 1])
                i -= 1
                j -= 1
            elif move == self.UP:
                alignedQ.append('-')
                alignedP.append(self.p[i - 1])
                i -= 1
            elif move == self.LEFT:
                # Gap in p, so the character consumed comes from q.
                alignedQ.append(self.q[j - 1])
                alignedP.append('-')
                j -= 1
            else:
                break

        self.finalQ += ''.join(reversed(alignedQ))
        self.finalP += ''.join(reversed(alignedP))

    # Correctly spelled alias; the original name is kept for existing callers.
    calcAlignment = calcAlignemnt

    def parseFile(self, filePath, stringNumber):
        """Read a FASTA-style file and return its sequence as one string.

        Lines starting with '>' are headers: they are not part of the
        sequence, and the header text (without '>') becomes the name of the
        sequence selected by ``stringNumber`` (1 -> stringQName,
        2 -> stringPName, anything else -> name left untouched). All other
        lines are concatenated with their line endings removed.

        Args:
            filePath: path of the file to read.
            stringNumber: 1 for sequence q, 2 for sequence p.

        Returns:
            The concatenated sequence text.
        """
        pieces = []
        with open(filePath) as handle:
            for line in handle:
                # Drop Windows and Unix line endings.
                line = line.replace('\r', "").replace('\n', "")
                if line.startswith(">"):
                    if stringNumber == 1:
                        self.stringQName = line.replace('>', "")
                    elif stringNumber == 2:
                        self.stringPName = line.replace('>', "")
                    continue
                pieces.append(line)
        return "".join(pieces)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.