I have some strings representing the order of events:
# Sample event strings: events are listed in order and separated by '->'.
s1 = 'A->B->E->D->A->C->B->D'
s2 = 'A->B->C->A->B'
s3 = 'A->B->A'
In each string I want to find all repeating patterns of max length N.
import itertools
def find_all_comb(event_list, max_events):
    """Return every distinct '->'-joined combination of 1..max_events events.

    Combinations are taken with ``itertools.combinations``, so each one keeps
    the relative order the events have in ``event_list``.

    Args:
        event_list: sequence of event-name strings.
        max_events: largest number of events in a combination; a value below
            1 yields an empty list, and sizes larger than ``event_list``
            simply contribute nothing.

    Returns:
        A list of strings such as ``'A->B'``, grouped by combination size
        (smallest first).  Within one size, duplicates are removed and the
        remaining entries keep first-seen order.
    """
    all_combs = []
    for size in range(1, max_events + 1):
        joined = (
            '->'.join(combo)
            for combo in itertools.combinations(event_list, size)
        )
        # dict.fromkeys removes duplicates like set() would, but keeps a
        # reproducible order instead of one that depends on string hashing.
        all_combs.extend(dict.fromkeys(joined))
    return all_combs
def find_repeating_patterns(x):
    """Return the longest repeating event patterns found in ``x``.

    ``x`` is a string of events separated by '->'.  A candidate pattern
    counts as repeating when, after dropping every event that does not occur
    in the candidate, the candidate appears more than once in the remaining
    '->'-joined string.

    Known limitation: a pattern is only reported if it is strictly longer
    than another repeating pattern it covers, so an input with a single
    repeating pattern (for example ``'A->B->A'``) returns an empty list.

    NOTE: membership is tested on substrings (``p in comb``) and on character
    sets (``set(s)``), so event names longer than one character may be
    conflated with each other.

    Args:
        x: the '->'-separated event string.

    Returns:
        A list of '->'-joined pattern strings, possibly empty.
    """
    split_events = x.split("->")
    # The size bound is half the *character* length of x; sizes beyond the
    # number of events produce no combinations.
    all_combs = find_all_comb(split_events, int(len(x) / 2))

    repeating_patterns = []
    for comb in all_combs:
        # Keep only the events that take part in this candidate.
        c_split_event = [p for p in split_events if p in comb]
        # str.count counts non-overlapping occurrences.
        if '->'.join(c_split_event).count(comb) > 1:
            repeating_patterns.append(comb)

    output_list = []
    longest_repeating_patterns = [
        s for s in repeating_patterns
        if any(
            set(s).issuperset(set(i)) and len(s) > len(i)
            for i in repeating_patterns
        )
    ]
    # Repeatedly drop patterns that are not longer than some pattern they
    # cover; the last non-empty generation is the answer.
    while output_list != longest_repeating_patterns:
        if longest_repeating_patterns == []:
            break
        output_list = longest_repeating_patterns.copy()
        longest_repeating_patterns = [
            s for s in longest_repeating_patterns
            if any(
                set(s).issuperset(set(i)) and len(s) > len(i)
                for i in longest_repeating_patterns
            )
        ]
    return output_list
For s1, this returns the correct pattern [A,B,D], and for s2, it returns [A,B]. For s3, it should return [A], but it returns an empty list. This is because of the line:
[s for s in repeating_patterns if any(set(s).issuperset(set(i)) and len(s) > len(i) for i in repeating_patterns)]
which requires len(s) > len(i) and so can never be satisfied when there is only a single repeating pattern.
How would I be able to capture both cases here?
Here is a simpler and more efficient solution:
def longest_subsequence(events, limit, sep='->'):
    """Return the longest repeated subsequences of at most ``limit`` events.

    A subsequence counts as repeated when it can be matched twice in the
    event list using two sets of positions that share no index.  Lengths are
    tried from longest to shortest and the search stops at the first length
    that produces at least one result.

    Args:
        events: string of events separated by ``sep``.
        limit: maximum number of events in a reported subsequence; values
            below 1 give an empty result.
        sep: separator between events (default ``'->'``).

    Returns:
        A list of ``sep``-joined subsequences, all of the same length, in
        the order they were first confirmed as repeated; empty if nothing
        repeats.
    """
    indexed = list(enumerate(events.split(sep)))
    # Two index-disjoint matches of length n need 2 * n events, so longer
    # candidates can never succeed and are skipped.
    longest = min(limit, len(indexed) // 2)
    output = {}
    for n in range(longest, 0, -1):
        # Keys of different lengths never collide, so start fresh per length.
        seen = {}
        for combination in itertools.combinations(indexed, n):
            indexes, key = zip(*combination)
            if key in output:
                continue
            current = frozenset(indexes)
            earlier = seen.setdefault(key, [])
            # Compare against every earlier match, not just the first one:
            # two matches can be disjoint from each other while both overlap
            # the first match that was seen.
            if any(current.isdisjoint(match) for match in earlier):
                output[key] = sep.join(key)
                del seen[key]  # no longer needed once the key is confirmed
            else:
                earlier.append(current)
        if output:
            break
    return list(output.values())
This looks at the longest matches first and terminates early if one is found. It eliminates self-overlapping repeated subsequences by saving the indexes of previously seen matches and comparing them with the current candidate.
Demo:
# Demo inputs: typical cases plus single-event and empty strings.
samples = (
    'A->B->E->D->A->C->B->D',
    'A->B->C->A->B',
    'A->B->A',
    'A->B->E->D->A->C->B->E->D->A',
    'B->B->B->C->C',
    'A->B->A->B->C->C',
    'A',
    '',
)

# Print each numbered sample followed by its longest repeated subsequences.
for index, sample in enumerate(samples, 1):
    result = longest_subsequence(sample, 4)
    print('(%s) %r\n%s\n' % (index, sample, result))
Output:
(1) 'A->B->E->D->A->C->B->D'
['A->B->D']
(2) 'A->B->C->A->B'
['A->B']
(3) 'A->B->A'
['A']
(4) 'A->B->E->D->A->C->B->E->D->A'
['A->B->E->D', 'B->E->D->A']
(5) 'B->B->B->C->C'
['B->C']
(6) 'A->B->A->B->C->C'
['A->B->C']
(7) 'A'
[]
(8) ''
[]
One solution is to add one extra line that appends every repeating pattern that is not a subsequence of any of the sequences already in output_list.
import itertools
def find_all_comb(event_list, max_events):
    """Return every distinct '->'-joined combination of 1..max_events events.

    Combinations are taken with ``itertools.combinations``, so each one keeps
    the relative order the events have in ``event_list``.

    Args:
        event_list: sequence of event-name strings.
        max_events: largest number of events in a combination; a value below
            1 yields an empty list, and sizes larger than ``event_list``
            simply contribute nothing.

    Returns:
        A list of strings such as ``'A->B'``, grouped by combination size
        (smallest first).  Within one size, duplicates are removed and the
        remaining entries keep first-seen order.
    """
    all_combs = []
    for size in range(1, max_events + 1):
        joined = (
            '->'.join(combo)
            for combo in itertools.combinations(event_list, size)
        )
        # dict.fromkeys removes duplicates like set() would, but keeps a
        # reproducible order instead of one that depends on string hashing.
        all_combs.extend(dict.fromkeys(joined))
    return all_combs
def find_repeating_patterns(x):
    """Return the longest repeating event patterns found in ``x``.

    ``x`` is a string of events separated by '->'.  A candidate pattern
    counts as repeating when, after dropping every event that does not occur
    in the candidate, the candidate appears more than once in the remaining
    '->'-joined string.  Patterns covered by a longer repeating pattern are
    discarded; repeating patterns not covered by any reported pattern are
    appended at the end, so a lone repeating pattern is still returned.

    NOTE: membership is tested on substrings (``p in comb``) and on character
    sets (``set(s)``), so event names longer than one character may be
    conflated with each other.

    Args:
        x: the '->'-separated event string.

    Returns:
        A list of '->'-joined pattern strings, possibly empty.
    """
    split_events = x.split("->")
    # The size bound is half the *character* length of x; sizes beyond the
    # number of events produce no combinations.
    all_combs = find_all_comb(split_events, int(len(x) / 2))

    repeating_patterns = []
    for comb in all_combs:
        # Keep only the events that take part in this candidate.
        c_split_event = [p for p in split_events if p in comb]
        # str.count counts non-overlapping occurrences.
        if '->'.join(c_split_event).count(comb) > 1:
            repeating_patterns.append(comb)

    output_list = []
    longest_repeating_patterns = [
        s for s in repeating_patterns
        if any(
            set(s).issuperset(set(i)) and len(s) > len(i)
            for i in repeating_patterns
        )
    ]
    # Repeatedly drop patterns that are not longer than some pattern they
    # cover; the last non-empty generation is kept in output_list.
    while output_list != longest_repeating_patterns:
        if longest_repeating_patterns == []:
            break
        output_list = longest_repeating_patterns.copy()
        longest_repeating_patterns = [
            s for s in longest_repeating_patterns
            if any(
                set(s).issuperset(set(i)) and len(s) > len(i)
                for i in longest_repeating_patterns
            )
        ]
    # ADDED LINE FOR SOLUTION: also keep every repeating pattern that is not
    # covered by a pattern already in output_list.
    output_list.extend([s for s in repeating_patterns if not any(set(i).issuperset(set(s)) for i in output_list)])
    return output_list
s1 = 'A->B->E->D->A->C->B->D'
s2 = 'A->B->C->A->B'
s3 = 'A->B->A'
print(find_repeating_patterns(s1))
# output: ['A->B->D']
print(find_repeating_patterns(s2))
# output: ['A->B']
print(find_repeating_patterns(s3))
# output: ['A']
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.