[英]How to find all occurrences of a non - contiguous substring in Python?
前提:
我目前正在解决以下问题:
我必须找到字符串中所有出现的子字符串的索引组合,其中子字符串不一定是连续的。
我的测试参数:
Main_String = ACGGTTAACGTGACGGTTAAGSSGSSTSSTSSASSA
Substring = GGTTAA
不连续意味着Main_String
出现的Substring
可能如下所示: GGSTTSAASSS
其中子字符串是GGTTAA
和 MAIN_STRING - GGSTTSAASSS
。
子字符串虽然被随机字符(在我们的例子中是 S)切割,但仍然可以在主字符串中找到。 因此,可能的答案是(format:Letter((index + 1) in Main_String)) G(1)G(2)T(4)T(5)A(7)A(8) = 1, 2, 4, 5, 7, 8.
这很容易获得第一场比赛。 我需要得到所有可能的变化,所以如果我们使用上面的测试参数,正确的答案是:
3, 4, 5, 6, 7, 8 and 3, 12, 17, 18, 19, 20 and 3, 15, 17, 18, 19, 20 and so on up to 21, 24, 27, 30, 33, 36
.
题:
我需要一种算法,它可以为我提供给定字符串中非连续子字符串的所有可能变体。
问题:
这是我到目前为止的代码,它在一定程度上起作用,但不返回所有可能的变化,只返回其中的一些。
dna = ''
counter = -1
dna_subseq = ''
dna_subseq_indexes = []
with open('Rosalind_dna.txt', 'r') as file:
data = file.read().split('\n')
for line in data:
if line == '':
continue
if 'Rosalind' in line and counter < 1:
counter += 1
continue
elif 'Rosalind' not in line and counter < 1:
dna += line
elif 'Rosalind' not in line and counter >= 1:
dna_subseq += line
result = 0
dna_subseq_minus_start = dna_subseq[1:]
def find_next(start_parameter, base):
result_func = dna.find(base, start_parameter)
if result_func + 1 in dna_subseq_indexes_subcombo:
if result_func + 1 == 0:
return
find_next(start_parameter + 1, base)
else:
dna_subseq_indexes_subcombo.append(result_func + 1)
return
for index, value in enumerate(dna):
global_start = index
result = 0
while result != -1:
dna_subseq_indexes_subcombo = []
if value == dna_subseq[0]:
dna_subseq_indexes_subcombo.append(index + 1)
Flag = True
for base in dna_subseq_minus_start:
if Flag:
start = global_start
Flag = False
result = dna.find(base, start)
if result + 1 in dna_subseq_indexes_subcombo:
find_next(start + 1, base)
else:
dna_subseq_indexes_subcombo.append(result + 1)
start += 1
dna_subseq_indexes.append(dna_subseq_indexes_subcombo)
global_start += 1
else:
break
final_result = []
for x in dna_subseq_indexes:
test = x.copy()
test.sort()
if test == x:
final_result.append(x)
else:
continue
print(final_result)
我不确定您的算法是否可以找到所有解决方案,即使它会被修复。 我尝试了这种逻辑:找到最接近 dna 左侧的初始序列,并递归搜索从右到左的所有偏角。 这样,解决方案会自动排序。
dna = 'ACGGTTAACGTGACGGTTAAGSSGSSTSSTSSASSA'
dna_len = len(dna)
dna_subseq = 'GGTTAA'
subseq_len = len(dna_subseq)
count = 0
print_mode = True # Prints the solutions, set to False to collect them instead
# Finds a single solution starting from the previous one or from a null solution
def find_one_solution(prev_solution, subseq_start, dna_start):
global dna, dna_subseq, subseq_len, count, mode
searched = dna_subseq[subseq_start]
coll = prev_solution[:subseq_start]
subseq_idx = subseq_start
for i in range(dna_start, len(dna), 1):
letter = dna[i]
if letter == searched:
coll.append(i)
subseq_idx += 1
if (subseq_idx == subseq_len): break
else: searched = dna_subseq[subseq_idx]
if len(coll) < subseq_len: return None
count += 1
if (print_mode): print(coll)
return coll
# Recursive function
def find_all_solutions(solutions, solution, subseq_start, limit):
global dna, dna_subseq, subseq_len, print_mode
for start in range(subseq_len-1, limit-1, -1):
# last element
if start == subseq_len-1:
while True:
temp = find_one_solution(solution, start, solution[-1]+1)
if temp == None: break
else: solution = temp
if (not print_mode): solutions.append(solution)
# other elements
else:
# finds the next solution
temp = find_one_solution(solution, start, solution[start]+1)
if temp == None:
continue
else:
solution = temp
if (not print_mode): solutions.append(solution)
# and restarts from end with subseq_start as the left limit
find_all_solutions(solutions, solution, subseq_len-1, start)
def main():
all_solutions = []
# Finds the initial solution
initial_solution = [0] * subseq_len
initial_solution = find_one_solution(initial_solution, 0, initial_solution[0])
if initial_solution == None:
print("No solution found")
else:
if (not print_mode): all_solutions.append(initial_solution)
# Finds all other solutions
find_all_solutions(all_solutions, initial_solution, subseq_len-1, 0)
if (not print_mode): print(all_solutions)
print("Total count:", count)
if __name__=="__main__":
main()
#289 solutions found : [[2, 3, 4, 5, 6, 7], [2, 3, 4, 5, 6, 12], [2, 3, 4, 5, 6, 18], [2, 3, 4, 5, 6, 19], [2, 3, 4, 5, 6, 32], [2, 3, 4, 5, 6, 35], [2, 3, 4, 5, 7, 12], [2, 3, 4, 5, 7, 18], [2, 3, 4, 5, 7, 19], [2, 3, 4, 5, 7, 32], [2, 3, 4, 5, 7, 35], [2, 3, 4, 5, 12, 18], [2, 3, 4, 5, 12, 19], [2, 3, 4, 5, 12, 32], [2, 3, 4, 5, 12, 35], [2, 3, 4, 5, 18, 19], [2, 3, 4, 5, 18, 32], [2, 3, 4, 5, 18, 35], [2, 3, 4, 5, 19, 32], [2, 3, 4, 5, 19, 35], [2, 3, 4, 5, 32, 35], [2, 3, 4, 10, 12, 18], [2, 3, 4, 10, 12, 19], [2, 3, 4, 10, 12, 32], [2, 3, 4, 10, 12, 35], [2, 3, 4, 10, 18, 19], [2, 3, 4, 10, 18, 32], [2, 3, 4, 10, 18, 35], [2, 3, 4, 10, 19, 32], [2, 3, 4, 10, 19, 35], [2, 3, 4, 10, 32, 35], [2, 3, 4, 16, 18, 19], [2, 3, 4, 16, 18, 32], [2, 3, 4, 16, 18, 35], [2, 3, 4, 16, 19, 32], [2, 3, 4, 16, 19, 35], [2, 3, 4, 16, 32, 35], [2, 3, 4, 17, 18, 19], [2, 3, 4, 17, 18, 32], [2, 3, 4, 17, 18, 35], [2, 3, 4, 17, 19, 32], [2, 3, 4, 17, 19, 35], [2, 3, 4, 17, 32, 35], [2, 3, 4, 26, 32, 35], [2, 3, 4, 29, 32, 35], [2, 3, 5, 10, 12, 18], [2, 3, 5, 10, 12, 19], [2, 3, 5, 10, 12, 32], [2, 3, 5, 10, 12, 35], [2, 3, 5, 10, 18, 19], [2, 3, 5, 10, 18, 32], [2, 3, 5, 10, 18, 35], [2, 3, 5, 10, 19, 32], [2, 3, 5, 10, 19, 35], [2, 3, 5, 10, 32, 35], [2, 3, 5, 16, 18, 19], [2, 3, 5, 16, 18, 32], [2, 3, 5, 16, 18, 35], [2, 3, 5, 16, 19, 32], [2, 3, 5, 16, 19, 35], [2, 3, 5, 16, 32, 35], [2, 3, 5, 17, 18, 19], [2, 3, 5, 17, 18, 32], [2, 3, 5, 17, 18, 35], [2, 3, 5, 17, 19, 32], [2, 3, 5, 17, 19, 35], [2, 3, 5, 17, 32, 35], [2, 3, 5, 26, 32, 35], [2, 3, 5, 29, 32, 35], [2, 3, 10, 16, 18, 19], [2, 3, 10, 16, 18, 32], [2, 3, 10, 16, 18, 35], [2, 3, 10, 16, 19, 32], [2, 3, 10, 16, 19, 35], [2, 3, 10, 16, 32, 35], [2, 3, 10, 17, 18, 19], [2, 3, 10, 17, 18, 32], [2, 3, 10, 17, 18, 35], [2, 3, 10, 17, 19, 32], [2, 3, 10, 17, 19, 35], [2, 3, 10, 17, 32, 35], [2, 3, 10, 26, 32, 35], [2, 3, 10, 29, 32, 35], [2, 3, 16, 17, 18, 19], [2, 3, 16, 17, 18, 32], [2, 3, 16, 17, 18, 35], [2, 3, 16, 17, 19, 32], [2, 3, 16, 17, 19, 35], [2, 3, 16, 17, 32, 35], [2, 3, 16, 26, 32, 35], [2, 3, 16, 29, 32, 35], [2, 3, 17, 26, 32, 35], [2, 3, 17, 29, 32, 35], [2, 3, 26, 29, 32, 35], [2, 9, 10, 16, 18, 19], [2, 9, 10, 16, 18, 32], [2, 9, 10, 16, 18, 35], [2, 9, 10, 16, 19, 32], [2, 9, 10, 16, 19, 35], [2, 9, 10, 16, 32, 35], [2, 9, 10, 17, 18, 19], [2, 9, 10, 17, 18, 32], [2, 9, 10, 17, 18, 35], [2, 9, 10, 17, 19, 32], [2, 9, 10, 17, 19, 35], [2, 9, 10, 17, 32, 35], [2, 9, 10, 26, 32, 35], [2, 9, 10, 29, 32, 35], [2, 9, 16, 17, 18, 19], [2, 9, 16, 17, 18, 32], [2, 9, 16, 17, 18, 35], [2, 9, 16, 17, 19, 32], [2, 9, 16, 17, 19, 35], [2, 9, 16, 17, 32, 35], [2, 9, 16, 26, 32, 35], [2, 9, 16, 29, 32, 35], [2, 9, 17, 26, 32, 35], [2, 9, 17, 29, 32, 35], [2, 9, 26, 29, 32, 35], [2, 11, 16, 17, 18, 19], [2, 11, 16, 17, 18, 32], [2, 11, 16, 17, 18, 35], [2, 11, 16, 17, 19, 32], [2, 11, 16, 17, 19, 35], [2, 11, 16, 17, 32, 35], [2, 11, 16, 26, 32, 35], [2, 11, 16, 29, 32, 35], [2, 11, 17, 26, 32, 35], [2, 11, 17, 29, 32, 35], [2, 11, 26, 29, 32, 35], [2, 14, 16, 17, 18, 19], [2, 14, 16, 17, 18, 32], [2, 14, 16, 17, 18, 35], [2, 14, 16, 17, 19, 32], [2, 14, 16, 17, 19, 35], [2, 14, 16, 17, 32, 35], [2, 14, 16, 26, 32, 35], [2, 14, 16, 29, 32, 35], [2, 14, 17, 26, 32, 35], [2, 14, 17, 29, 32, 35], [2, 14, 26, 29, 32, 35], [2, 15, 16, 17, 18, 19], [2, 15, 16, 17, 18, 32], [2, 15, 16, 17, 18, 35], [2, 15, 16, 17, 19, 32], [2, 15, 16, 17, 19, 35], [2, 15, 16, 17, 32, 35], [2, 15, 16, 26, 32, 35], [2, 15, 16, 29, 32, 35], [2, 15, 17, 26, 32, 35], [2, 15, 17, 29, 32, 35], [2, 15, 26, 29, 32, 35], [2, 20, 26, 29, 32, 35], [2, 23, 26, 29, 32, 35], [3, 9, 10, 16, 18, 19], [3, 9, 10, 16, 18, 32], [3, 9, 10, 16, 18, 35], [3, 9, 10, 16, 19, 32], [3, 9, 10, 16, 19, 35], [3, 9, 10, 16, 32, 35], [3, 9, 10, 17, 18, 19], [3, 9, 10, 17, 18, 32], [3, 9, 10, 17, 18, 35], [3, 9, 10, 17, 19, 32], [3, 9, 10, 17, 19, 35], [3, 9, 10, 17, 32, 35], [3, 9, 10, 26, 32, 35], [3, 9, 10, 29, 32, 35], [3, 9, 16, 17, 18, 19], [3, 9, 16, 17, 18, 32], [3, 9, 16, 17, 18, 35], [3, 9, 16, 17, 19, 32], [3, 9, 16, 17, 19, 35], [3, 9, 16, 17, 32, 35], [3, 9, 16, 26, 32, 35], [3, 9, 16, 29, 32, 35], [3, 9, 17, 26, 32, 35], [3, 9, 17, 29, 32, 35], [3, 9, 26, 29, 32, 35], [3, 11, 16, 17, 18, 19], [3, 11, 16, 17, 18, 32], [3, 11, 16, 17, 18, 35], [3, 11, 16, 17, 19, 32], [3, 11, 16, 17, 19, 35], [3, 11, 16, 17, 32, 35], [3, 11, 16, 26, 32, 35], [3, 11, 16, 29, 32, 35], [3, 11, 17, 26, 32, 35], [3, 11, 17, 29, 32, 35], [3, 11, 26, 29, 32, 35], [3, 14, 16, 17, 18, 19], [3, 14, 16, 17, 18, 32], [3, 14, 16, 17, 18, 35], [3, 14, 16, 17, 19, 32], [3, 14, 16, 17, 19, 35], [3, 14, 16, 17, 32, 35], [3, 14, 16, 26, 32, 35], [3, 14, 16, 29, 32, 35], [3, 14, 17, 26, 32, 35], [3, 14, 17, 29, 32, 35], [3, 14, 26, 29, 32, 35], [3, 15, 16, 17, 18, 19], [3, 15, 16, 17, 18, 32], [3, 15, 16, 17, 18, 35], [3, 15, 16, 17, 19, 32], [3, 15, 16, 17, 19, 35], [3, 15, 16, 17, 32, 35], [3, 15, 16, 26, 32, 35], [3, 15, 16, 29, 32, 35], [3, 15, 17, 26, 32, 35], [3, 15, 17, 29, 32, 35], [3, 15, 26, 29, 32, 35], [3, 20, 26, 29, 32, 35], [3, 23, 26, 29, 32, 35], [9, 11, 16, 17, 18, 19], [9, 11, 16, 17, 18, 32], [9, 11, 16, 17, 18, 35], [9, 11, 16, 17, 19, 32], [9, 11, 16, 17, 19, 35], [9, 11, 16, 17, 32, 35], [9, 11, 16, 26, 32, 35], [9, 11, 16, 29, 32, 35], [9, 11, 17, 26, 32, 35], [9, 11, 17, 29, 32, 35], [9, 11, 26, 29, 32, 35], [9, 14, 16, 17, 18, 19], [9, 14, 16, 17, 18, 32], [9, 14, 16, 17, 18, 35], [9, 14, 16, 17, 19, 32], [9, 14, 16, 17, 19, 35], [9, 14, 16, 17, 32, 35], [9, 14, 16, 26, 32, 35], [9, 14, 16, 29, 32, 35], [9, 14, 17, 26, 32, 35], [9, 14, 17, 29, 32, 35], [9, 14, 26, 29, 32, 35], [9, 15, 16, 17, 18, 19], [9, 15, 16, 17, 18, 32], [9, 15, 16, 17, 18, 35], [9, 15, 16, 17, 19, 32], [9, 15, 16, 17, 19, 35], [9, 15, 16, 17, 32, 35], [9, 15, 16, 26, 32, 35], [9, 15, 16, 29, 32, 35], [9, 15, 17, 26, 32, 35], [9, 15, 17, 29, 32, 35], [9, 15, 26, 29, 32, 35], [9, 20, 26, 29, 32, 35], [9, 23, 26, 29, 32, 35], [11, 14, 16, 17, 18, 19], [11, 14, 16, 17, 18, 32], [11, 14, 16, 17, 18, 35], [11, 14, 16, 17, 19, 32], [11, 14, 16, 17, 19, 35], [11, 14, 16, 17, 32, 35], [11, 14, 16, 26, 32, 35], [11, 14, 16, 29, 32, 35], [11, 14, 17, 26, 32, 35], [11, 14, 17, 29, 32, 35], [11, 14, 26, 29, 32, 35], [11, 15, 16, 17, 18, 19], [11, 15, 16, 17, 18, 32], [11, 15, 16, 17, 18, 35], [11, 15, 16, 17, 19, 32], [11, 15, 16, 17, 19, 35], [11, 15, 16, 17, 32, 35], [11, 15, 16, 26, 32, 35], [11, 15, 16, 29, 32, 35], [11, 15, 17, 26, 32, 35], [11, 15, 17, 29, 32, 35], [11, 15, 26, 29, 32, 35], [11, 20, 26, 29, 32, 35], [11, 23, 26, 29, 32, 35], [14, 15, 16, 17, 18, 19], [14, 15, 16, 17, 18, 32], [14, 15, 16, 17, 18, 35], [14, 15, 16, 17, 19, 32], [14, 15, 16, 17, 19, 35], [14, 15, 16, 17, 32, 35], [14, 15, 16, 26, 32, 35], [14, 15, 16, 29, 32, 35], [14, 15, 17, 26, 32, 35], [14, 15, 17, 29, 32, 35], [14, 15, 26, 29, 32, 35], [14, 20, 26, 29, 32, 35], [14, 23, 26, 29, 32, 35], [15, 20, 26, 29, 32, 35], [15, 23, 26, 29, 32, 35], [20, 23, 26, 29, 32, 35]]
为了测试算法的有效性,我使用了一个更简单的序列,从中可以很容易地手动找到所有解决方案:
dna = 'AAAAAAAA'
dna_subseq = 'AAAA'
请注意,我使用基于零的索引,因为它更简单,但如果需要,您可以轻松地将 1 添加到所有结果中。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.