I want to write a Python code that computes the longest common substring between two strings from the input.
Example:
word1 = input('Give 1. word: xlaqseabcitt')
word2 = input('Give 2. word: peoritabcpeor')
Wanted output:
abc
I have code like this so far:
word1 = input("Give 1. word: ")
word2 = input("Give 2. word: ")
longestSegment = ""
tempSegment = ""
for i in range(len(word1)):
if word1[i] == word2[i]:
tempSegment += word1[i]
else:
tempSegment = ""
if len(tempSegment) > len(longestSegment):
longestSegment = tempSegment
print(longestSegment)
I end up with IndexError when word2 is shorter than word1, and it does not give me the common substring.
EDIT: I found this solution:
string1 = input('Give 1. word: ')
string2 = input('Give 2. word: ')
answer = ""
len1, len2 = len(string1), len(string2)
for i in range(len1):
for j in range(len2):
lcs_temp=0
match=''
while ((i+lcs_temp < len1) and (j+lcs_temp<len2) and string1[i+lcs_temp] == string2[j+lcs_temp]):
match += string2[j+lcs_temp]
lcs_temp+=1
if (len(match) > len(answer)):
answer = match
print(answer)
However, I would like to see a library function call that could be used to compute the longest common substring between two strings.
Alternatively, please suggest a more concise code to achieve the same.
You can build a dictionary from the first string containing the positions of each character, keyed on the characters. Then go through the second string and compare the substring of each character with the rest of the second string at that position:
# extract common prefix
def common(A,B) :
firstDiff = (i for i,(a,b) in enumerate(zip(A,B)) if a!=b) # 1st difference
commonLen = next(firstDiff,min(len(A),len(B))) # common length
return A[:commonLen]
word1 = "xlaqseabcitt"
word2 = "peoritabcpeor"
# position(s) of each character in word1
sub1 = dict()
for i,c in enumerate(word1): sub1.setdefault(c,[]).append(i)
# maximum (by length) of common prefixes from matching first characters
maxSub = max((common(word2[i:],word1[j:])
for i,c in enumerate(word2)
for j in sub1.get(c,[])),key=len)
print(maxSub) # abc
For me, looks like the solution that works is using the suffix_trees
package :
from suffix_trees import STree
a = ["xxx ABC xxx", "adsa abc"]
st = STree.STree(a)
print(st.lcs()) # "abc"
difflib
from difflib import SequenceMatcher
string1 = "apple pie available"
string2 = "come have some apple pies"
match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
print(string1[match.a:match.a + match.size])
print(string2[match.b:match.b + match.size])
>>>
apple pie
apple pie
" pylcs
is a super fast C++ library which adopts dynamic programming (DP) algorithm to solve two classic LCS problems."
Usage:
import pylcs
A = 'xlaqseabcitt'
B = 'peoritabcpeor'
res = pylcs.lcs_string_idx(A, B)
substring = ''.join([B[i] for i in res if i != -1])
print(substring) # 'abc'
Unfortunately, it's not a single function call. But still it's pretty short.
res
is a mapping of characters in A
to the indices of characters in B
of the longest common substring (-1 indicates that this character in A
is NOT part of the longest common substring ):
[-1, -1, -1, -1, -1, -1, 6, 7, 8, -1, -1, -1]
Here is an answer if you later want to compute any number of strings. It should return the longest common substring. It work with the different test i gave it. (as long as you don't use the '§' character) It is not a library but you can still import the functions in your code just like a library. You can use the same logic with your own code (only for two strings.) Do so as follows (put both files in the same directory for the sake of simplicity). I am supposing you will call the file findmatch.py.
import findmatch
longest_substring = findmatch.prep(['list', 'of', 'strings'])
Here is the code that should be in 'findmatch.py'.
def main(words,first):
nextreference = first
reference = first
for word in words:
foundsub = False
print('reference : ',reference)
print('word : ', word)
num_of_substring = 0
length_longest_substring = 0
for i in range(len(word)):
print('nextreference : ', nextreference)
letter = word[i]
print('letter : ', letter)
if word[i] in reference:
foundsub = True
num_of_substring += 1
locals()['substring'+str(num_of_substring)] = word[i]
print('substring : ', locals()['substring'+str(num_of_substring)])
for j in range(len(reference)-i):
if word[i:i+j+1] in reference:
locals()['substring'+str(num_of_substring) ]= word[i:i+j+1]
print('long_sub : ',locals()['substring'+str(num_of_substring)])
print('new : ',len(locals()['substring'+str(num_of_substring)]))
print('next : ',len(nextreference))
print('ref : ', len(reference))
longer = (len(reference)<len(locals()['substring'+str(num_of_substring)]))
longer2 = (len(nextreference)<len(locals()['substring'+str(num_of_substring)]))
if (num_of_substring==1) or longer or longer2:
nextreference = locals()['substring'+str(num_of_substring)]
if not foundsub:
for i in range(len(words)):
words[i] = words[i].replace(reference, '§')
#§ should not be used in any of the strings, put a character you don't use here
print(words)
try:
nextreference = main(words, first)
except Exception as e:
return None
reference = nextreference
return reference
def prep(words):
first = words[0]
words.remove(first)
answer = main(words, first)
return answer
if __name__ == '__main__':
words = ['azerty','azertydqse','fghertqdfqf','ert','sazjjjjjjjjjjjert']
#just some absurd examples any word in here
substring = prep(words)
print('answer : ',substring)
It is basically creating your own library.
I hope this aswers helps someone.
Here is a recursive solution:
def lcs(X, Y, m, n):
if m == 0 or n == 0:
return 0
elif X[m - 1] == Y[n - 1]:
return 1 + lcs(X, Y, m - 1, n - 1);
else:
return max(lcs(X, Y, m, n - 1), lcs(X, Y, m - 1, n));
Since someone asked for a multiple-word solution, here's one:
def multi_lcs(words):
words.sort(key=lambda x:len(x))
search = words.pop(0)
s_len = len(search)
for ln in range(s_len, 0, -1):
for start in range(0, s_len-ln+1):
cand = search[start:start+ln]
for word in words:
if cand not in word:
break
else:
return cand
return False
>>> multi_lcs(['xlaqseabcitt', 'peoritabcpeor'])
'abc'
>>> multi_lcs(['xlaqseabcitt', 'peoritabcpeor', 'visontatlasrab'])
'ab'
for small strings, copy this into a file in your project, let's say string_utils.py
def find_longest_common_substring(string1, string2):
s1 = string1
s2 = string2
longest_substring = ""
longest_substring_i1 = None
longest_substring_i2 = None
# iterate through every index (i1) of s1
for i1, c1 in enumerate(s1):
# for each index (i2) of s2 that matches s1[i1]
for i2, c2 in enumerate(s2):
# if start of substring
if c1 == c2:
delta = 1
# make sure we aren't running past the end of either string
while i1 + delta < len(s1) and i2 + delta < len(s2):
# if end of substring
if s2[i2 + delta] != s1[i1 + delta]:
break
# still matching characters move to the next character in both strings
delta += 1
substring = s1[i1:(i1 + delta)]
# print(f'substring candidate: {substring}')
# replace longest_substring if newly found substring is longer
if len(substring) > len(longest_substring):
longest_substring = substring
longest_substring_i1 = i1
longest_substring_i2 = i2
return (longest_substring, longest_substring_i1, longest_substring_i2)
Then it can be used as follows:
import string_utils
print(f"""(longest substring, index of string1, index of string2):
{ string_utils.find_longest_common_substring("stackoverflow.com", "tackerflow")}""")
For any that are curious the print statement when uncommented prints:
substring candidate: tack
substring candidate: ack
substring candidate: ck
substring candidate: o
substring candidate: erflow
substring candidate: rflow
substring candidate: flow
substring candidate: low
substring candidate: ow
substring candidate: w
substring candidate: c
substring candidate: o
(longest substring, index of string1, index of string2):
('erflow', 7, 4)
Here is a naive solution in terms of time complexity but simple enough to understand:
def longest_common_substring(a, b):
"""Find longest common substring between two strings A and B."""
if len(a) > len(b):
a, b = b, a
for i in range(len(a), 0, -1):
for j in range(len(a) - i + 1):
if a[j:j + i] in b:
return a[j:j + i]
return ''
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.