[英]Pythonic way to “merge” strings, handling all possible lengths
我正在嘗試解決生物信息學中一個相當普遍的問題,而不必訴諸一堆if語句。
手頭的問題:
給定兩個重疊的字符串以及預期輸出的長度,我想生成一個合並后的字符串。以下是字符串可能重疊的所有方式(在以下示例中,- 表示該字符串在該位置沒有任何內容;consensus() 部分會在示例之后解釋):
# size=13
xxxOVERLAP---
---OVERLAPyyy
# expected output: xxx + consensus(xOVERLAP, yOVERLAP) + yyy
# size=7
---OVERLAPxxx
yyyOVERLAP---
# expected output: consensus(xOVERLAP, yOVERLAP)
# size=7
OVERLAP
OVERLAP
# expected output: consensus(xOVERLAP, yOVERLAP)
# size=10
xxxOVERLAP
---OVERLAP
# expected output: xxx + consensus(xOVERLAP, yOVERLAP)
# size=10
OVERLAP---
OVERLAPyyy
# expected output: consensus(xOVERLAP, yOVERLAP) + yyy
# size > len(x) + len(y)
# no overlap, produce error:
xxx---
---yyy
# expected output: error
生成的合並字符串需要以 x 的開頭開始,並以 y 的結尾結束。重疊的區域需要傳遞給另一個函數 consensus(),由它負責合並重疊的區域。我想編寫的函數大致如下:
def merge(x, y, size):
# do the mergeing
return part of x that doesn't overlap + consensus(overlap) + part of y that doesn't overlap.
我可以編寫一堆if語句來識別每種情況並分別進行處理,但是我一直在努力尋找一個更優雅的解決方案。 我考慮的一種方法是對字符串進行填充(x的結尾和y的開頭),以便所有情況都像第二個示例一樣,但這似乎效率不高,無法接受,因為這樣做時我會制作新的字符串而且我正在將此功能應用於數百萬個字符串。
我將從產生每個字符的生成器開始:
def merge_gen(x, y, overhang):
    """Yield, one character at a time, ``x`` and ``y`` laid over each other.

    ``x`` is shifted right by ``overhang`` blanks and ``y`` is extended on its
    right by the same number of blanks; the two padded strings are then walked
    in lockstep.  Iteration stops at the end of the shorter padded string.

    Args:
        x: String whose characters end up on the right-hand side.
        y: String whose characters end up on the left-hand side.
        overhang: Number of blanks used as padding on each string.

    Yields:
        For every position, the greater of the two aligned characters.
    """
    padding = ' ' * overhang
    for shifted_x_char, padded_y_char in zip(padding + x, y + padding):
        # A blank sorts below letters, so wherever only one string has real
        # content max() keeps that character; where both agree it is a no-op.
        yield max(shifted_x_char, padded_y_char)
其中 overhang 為 size - len(x)(請參見下文的 merge 函數),即預期輸出長度超出 x 長度的部分。
如下所示:
>>> list(merge_gen('OVERLAPXXX', 'YYYOVERLAP', 3))
['Y', 'Y', 'Y', 'O', 'V', 'E', 'R', 'L', 'A', 'P', 'X', 'X', 'X']
然后,您可以實現一個調用 consensus 函數的 merge 函數,如下所示:
def merge(x, y, size):
    """Merge ``x`` and ``y`` into one string, passing the overlap to consensus.

    The characters produced by ``merge_gen`` are consumed in three consecutive
    runs: ``overhang`` leading characters, ``overlap`` characters that are
    handed to ``consensus`` as a single string, and ``overhang`` trailing
    characters.

    NOTE(review): both counts are derived from ``len(x)`` only, so this
    presumably expects ``x`` and ``y`` to have the same length -- confirm.

    Args:
        x: First string.
        y: Second string.
        size: Expected length of the merged output.

    Returns:
        The leading run, the consensus of the overlapping run, and the
        trailing run, concatenated in that order.
    """
    length = len(x)
    overhang = size - length
    overlap = length - overhang
    chars = merge_gen(x, y, overhang)

    def take(count):
        # Pull the next `count` characters off the shared generator.
        return ''.join(next(chars) for _ in range(count))

    # The three runs must be taken in this order: they share one generator.
    leading = take(overhang)
    shared = consensus(take(overlap))
    trailing = take(overhang)
    return leading + shared + trailing
我希望這在Python3中相當有效; 大量的生成器,很少的字符串浪費掉,等等。
(*) 顯然,這是從集合(set)中取出單個元素的快速方法。在這種情況下,我們知道該集合只有一個元素,我們只是想把它取出來。
這是您正在尋找的功能嗎?
def consensus(left, right, ignore_blank_padding=True):
    """Find the longest suffix of ``left`` that is also a prefix of ``right``.

    Args:
        left: String whose end is expected to overlap ``right``.
        right: String whose start is expected to overlap ``left``.
        ignore_blank_padding: When true, leading and trailing whitespace is
            stripped from both strings before they are compared.

    Returns:
        A tuple ``(left_rest, overlap, right_rest)`` where ``left_rest`` is
        the part of ``left`` before the overlap and ``right_rest`` is the part
        of ``right`` after it, or ``None`` when the strings share no
        non-empty overlap.
    """
    if ignore_blank_padding:
        left = left.strip()
        right = right.strip()

    # An overlap cannot be longer than the shorter string.  Trying the widest
    # candidate first means the first hit is already the longest one, so no
    # list of candidates has to be collected and ranked afterwards.
    for width in range(min(len(left), len(right)), 0, -1):
        candidate = left[-width:]
        if candidate == right[:width]:
            return left[:-width], candidate, right[width:]
    return None
left = 'xxxxHEY'
right = 'HEYxx'
consensus(left, right)
> ('xxxx', 'HEY', 'xx')
left = 'xxHEYHEY'
right = 'HEYHEYxxx'
consensus(left, right)
> ('xx', 'HEYHEY', 'xxx')
left = 'xxHEY '
right = ' HEYHEYxxxx'
consensus(left, right)
> ('xx', 'HEY', 'HEYxxxx')
left = 'HEY'
right = ' HEYHEYxxxx'
consensus(left, right)
> ('', 'HEY', 'HEYxxxx')
保留上面使用滑動窗口的舊答案;下面是指定重疊長度的版本:
def consensus(left, right, size, ignore_blank_padding=True):
    """Split ``left`` and ``right`` around an overlap of a known length.

    Args:
        left: String whose last ``size`` characters should match ``right``.
        right: String whose first ``size`` characters should match ``left``.
        size: Length of the overlap to check.
        ignore_blank_padding: When true, leading and trailing whitespace is
            stripped from both strings before they are compared.

    Returns:
        A tuple ``(left_rest, overlap, right_rest)`` when the last ``size``
        characters of ``left`` equal the first ``size`` characters of
        ``right``; otherwise ``None``.
    """
    if ignore_blank_padding:
        left = left.strip()
        right = right.strip()

    if size == 0:
        # -0 == 0, so left[-0:] would select the whole string instead of
        # nothing; an empty overlap matches trivially.
        return left, '', right

    left_tail = left[-size:]
    if left_tail != right[:size]:
        return None
    return left[:-size], left_tail, right[size:]
left = 'xxxxHEY'
right = 'HEYxx'
consensus(left, right, 3)
> ('xxxx', 'HEY', 'xx')
left = 'xxHEYHEY'
right = 'HEYHEYxxx'
consensus(left, right, 6)
> ('xx', 'HEYHEY', 'xxx')
left = 'xxHEY '
right = ' HEYHEYxxxx'
consensus(left, right, 3)
> ('xx', 'HEY', 'HEYxxxx')
left = 'HEY'
right = ' HEYHEYxxxx'
consensus(left, right, 3)
> ('', 'HEY', 'HEYxxxx')
這是一個可以正常工作的示例,但它采用了“過多 if 語句”的寫法,難以閱讀、難以推理,而且非常不優雅:
def extra_left(x, y, size):
    """Return the leading part of ``x`` that is not covered by ``y``.

    Args:
        x: Left-hand string.
        y: Right-hand string.
        size: Expected length of the merged output.

    Returns:
        The first ``size - len(y)`` characters of ``x`` when that count is
        positive, otherwise the empty string.
    """
    # Clamping to zero makes the slice empty when y spans the whole output.
    return x[:max(size - len(y), 0)]
def extra_right(x, y, size):
    """Return the trailing part of ``y`` that is not covered by ``x``.

    Args:
        x: Left-hand string.
        y: Right-hand string.
        size: Expected length of the merged output.

    Returns:
        The last ``size - len(x)`` characters of ``y`` when that count is
        positive (all of ``y`` if it is shorter than that), otherwise the
        empty string.
    """
    uncovered = size - len(x)
    if uncovered <= 0:
        return ""
    # Negative start index: count the uncovered characters from the end of y.
    return y[-uncovered:]
def overlap(x, y, size):
    """Return the consensus of the overlapping regions of ``x`` and ``y``.

    The overlapping slice of each string is selected from the lengths of
    ``x`` and ``y`` relative to ``size`` and both slices are passed to
    ``consensus``.

    Args:
        x: Left-hand string.
        y: Right-hand string.
        size: Expected length of the merged output.

    Returns:
        Whatever ``consensus`` returns for the two overlapping slices.

    Raises:
        RuntimeError: If ``len(x) + len(y) < size``, i.e. the two strings are
            too short to overlap within an output of that length.
    """
    len_x, len_y = len(x), len(y)
    if len_x + len_y < size:
        raise RuntimeError("x and y do not overlap with this size")

    # x: skip the prefix that sticks out to the left of y (if any) and never
    # read past the output width.
    x_overlap = x[max(size - len_y, 0):size]

    if len_x < size:
        if len_y > size:
            y_overlap = y[len_y - size:size]
        else:
            # Negative stop: drop the tail of y that extends beyond x.
            y_overlap = y[:len_x - size]
    else:
        # x fills the output width, so only the last `size` characters of y
        # can overlap.  An explicit start index is used instead of y[-size:],
        # which would select all of y when size == 0.
        y_overlap = y[max(len_y - size, 0):]

    return consensus(x_overlap, y_overlap)
def consensus(x, y):
    """Placeholder consensus: check the lengths match and return ``x``.

    Args:
        x: Overlapping slice taken from the first string.
        y: Overlapping slice taken from the second string.

    Returns:
        ``x`` unchanged.

    Raises:
        AssertionError: If the two slices differ in length.
    """
    assert len(x) == len(y)
    return x
def merge(x, y, size):
    """Merge ``x`` and ``y`` into a single string.

    Args:
        x: Left-hand string.
        y: Right-hand string.
        size: Expected length of the merged output.

    Returns:
        The result of ``extra_left``, then ``overlap``, then ``extra_right``,
        concatenated in that order.

    Raises:
        RuntimeError: Propagated from ``overlap`` when the strings cannot
            overlap for the given ``size``.
    """
    left_part = extra_left(x, y, size)
    shared_part = overlap(x, y, size)
    right_part = extra_right(x, y, size)
    return left_part + shared_part + right_part
這是一些單元測試(使用pytest )
class Tests:
    """pytest cases for ``merge``, one per length relationship of x, y and size."""

    def test1(self):
        """
        len(x) < size and len(y) < size:
        xxxOVERLAP---
        ---OVERLAPyyy
        # expected output: xxx + consensus(xOVERLAP, yOVERLAP) + yyy
        """
        x = "AAAATTTTTTT"
        y = "TTTTTTTCCC"
        size = 14
        assert merge(x, y, size) == "AAAA" + consensus("TTTTTTT", "TTTTTTT") + "CCC"

    def test2(self):
        """
        len(x) < size and len(y) == size:
        # size=10
        OVERLAP---
        OVERLAPyyy
        # expected output: consensus(xOVERLAP, yOVERLAP) + yyy
        """
        x = "TTTTTTT"
        y = "TTTTTTTCCC"
        size = 10
        assert merge(x, y, size) == consensus("TTTTTTT", "TTTTTTT") + "CCC"

    def test3(self):
        """
        len(x) < size and len(y) > size:
        # size=10
        ---OVERLAP---
        yyyOVERLAPyyy
        # expected output: consensus(xOVERLAP, yOVERLAP) + yyy
        """
        x = "TTTTTTT"
        y = "CCCTTTTTTTCCC"
        size = 10
        assert merge(x, y, size) == consensus("TTTTTTT", "TTTTTTT") + "CCC"

    def test4(self):
        """
        len(x) == size and len(y) < size:
        # size=10
        xxxOVERLAP
        ---OVERLAP
        # expected output: xxx + consensus(xOVERLAP, yOVERLAP)
        """
        x = "AAATTTTTTT"
        y = "TTTTTTT"
        size = 10
        assert merge(x, y, size) == "AAA" + consensus("TTTTTTT", "TTTTTTT")

    def test5(self):
        """
        len(x) == size and len(y) == size:
        # size=7
        OVERLAP
        OVERLAP
        # expected output: consensus(xOVERLAP, yOVERLAP)
        """
        x = "TTTTTTT"
        y = "TTTTTTT"
        size = 7
        assert merge(x, y, size) == consensus("TTTTTTT", "TTTTTTT")

    def test6(self):
        """
        len(x) == size and len(y) > size:
        # size=10
        --xxxOVERLAP
        yyyyyOVERLAP
        # expected output: xxx + consensus(xOVERLAP, yOVERLAP)
        """
        x = "AAATTTTTTT"
        y = "CCCCCTTTTTTT"
        size = 10
        assert merge(x, y, size) == "AAA" + consensus("TTTTTTT", "TTTTTTT")

    def test7(self):
        """
        len(x) > size and len(y) < size:
        # size=10
        xxxOVERLAPxxx
        ---OVERLAP---
        # expected output: xxx + consensus(xOVERLAP, yOVERLAP)
        """
        x = "AAATTTTTTTAAA"
        y = "TTTTTTT"
        size = 10
        assert merge(x, y, size) == "AAA" + consensus("TTTTTTT", "TTTTTTT")

    def test8(self):
        """
        len(x) > size and len(y) == size:
        # size=7
        OVERLAPxxx
        OVERLAP---
        # expected output: consensus(xOVERLAP, yOVERLAP)
        """
        x = "TTTTTTTAAA"
        y = "TTTTTTT"
        size = 7
        assert merge(x, y, size) == consensus("TTTTTTT", "TTTTTTT")

    def test9(self):
        """
        len(x) > size and len(y) > size:
        # size=7
        ---OVERLAPxxx
        yyyOVERLAP---
        # expected output: consensus(xOVERLAP, yOVERLAP)
        """
        x = "TTTTTTTAAA"
        y = "CCCTTTTTTT"
        size = 7
        assert merge(x, y, size) == consensus("TTTTTTT", "TTTTTTT")

    def test_error(self):
        """
        len(x) + len(y) < size -- no overlap, produce error:
        xxx---
        ---yyy
        # expected output: error
        """
        x = "AAA"
        y = "TTT"
        size = 7
        with pytest.raises(RuntimeError):
            merge(x, y, size)
他們都通過了:
test_merge.py::Tests::test1 PASSED
test_merge.py::Tests::test2 PASSED
test_merge.py::Tests::test3 PASSED
test_merge.py::Tests::test4 PASSED
test_merge.py::Tests::test5 PASSED
test_merge.py::Tests::test6 PASSED
test_merge.py::Tests::test7 PASSED
test_merge.py::Tests::test8 PASSED
test_merge.py::Tests::test9 PASSED
test_merge.py::Tests::test_error PASSED
====================================================================== 10 passed in 0.02 seconds =======================================================================
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.