![](/img/trans.png)
[英]Is there a way to find minimum element in a set in less than O(n) time?
[英]Return string that is not a substring of other strings - is it possible in time less than O(n^2)?
您将获得一个字符串数组。 你必须只返回那些不是数组中其他字符串的子字符串的字符串。 输入 - ['abc','abcd','ab','def','efgd']
。 输出应该是 - 'abcd'
和'efgd'
我在python中提出了一个时间复杂度为O(n ^ 2)的解决方案。 是否有可能提供较少时间复杂度的解决方案? 我的解决方案
def sub(l, s):
    """Return True if ``s`` occurs as a substring of any string in ``l``.

    The strings in ``l`` are compared as-is: their characters are not
    reordered, and ``l`` itself is never modified.

    :param l: iterable of candidate strings to search in
    :param s: the string to look for
    :return: True when at least one element of ``l`` contains ``s``,
        False otherwise (including when ``l`` is empty)
    """
    return any(s in candidate for candidate in l)
def main(l):
    """Print every string in ``l`` that is not a substring of another element.

    :param l: list of strings to examine
    """
    for i, candidate in enumerate(l):
        # Everything except position i: l[:i] stops just before i and
        # l[i + 1:] resumes just after it, so the candidate is never
        # compared against itself and no neighbour is skipped.
        others = l[:i] + l[i + 1:]
        if not sub(others, candidate):
            print(candidate)


main(['abc', 'abcd', 'ab', 'def', 'efgd'])
内存是个问题吗? 如果不是,你可以求助于久经考验的方法……trie(字典树)!
建一个后缀树!
鉴于你的输入['abc','abcd','ab','def','efgd']
我们会有一棵树
_
/ | \
a e d
/ | \
b* f e
/ | \
c* g f*
/ |
d* d*
利用所述树的DFS(深度优先搜索)搜索,您将找到最深的叶子abcd
, efgd
和def
树遍历非常简单,您的时间复杂度为O(n*m).
比之前的O(n^2)
时间好得多。
使用这种方法,添加新密钥变得简单,并且仍然可以轻松找到唯一密钥。
考虑添加密钥deg
你的新树大约是
_
/ | \
a e d
/ | \
b* f e
/ | / \
c* g g* f*
/ |
d* d*
使用这个新树,执行DFS搜索以获得不是其他密钥的唯一密钥仍然是一件简单的事情。
from typing import List
class Trie(object):
    """Character trie storing a set of string keys.

    Every node holds one character; the path from the root to a node spells
    a prefix, and nodes whose path spells a stored key are flagged with
    ``is_key``.
    """

    class Leaf(object):
        """A single trie node."""

        def __init__(self, data, is_key):
            self.data = data  # character held by this node ('' for the root)
            self.is_key = is_key  # True when the path to this node is a stored key
            self.children = []  # child Leaf nodes

        def __str__(self):
            return "{}{}".format(self.data, "*" if self.is_key else "")

    def __init__(self, keys):
        """Build a trie containing every string in ``keys``."""
        self.root = Trie.Leaf('', False)
        for key in keys:
            self.add_key(key)

    def add_key(self, key):
        """Insert ``key`` into the trie; an empty key is ignored."""
        self._add(key, self.root.children)

    def has_suffix(self, suffix):
        """Return True if ``suffix`` is a proper prefix path that is not a key.

        That is, the characters of ``suffix`` lead from the root to a node
        that has children and is not itself flagged as a key.

        :raises KeyError: if ``suffix`` is empty
        """
        leaf = self._find(suffix, self.root.children)
        if leaf is None:
            return False
        return not leaf.is_key and bool(leaf.children)

    def includes_key(self, key):
        """Return True if ``key`` was stored as a complete key.

        :raises KeyError: if ``key`` is empty
        """
        leaf = self._find(key, self.root.children)
        return leaf is not None and leaf.is_key

    def delete(self, key):
        """Remove ``key`` from the trie.

        * If the key's node still has children (the key is a prefix of a
          longer key), the node is only unmarked.
        * Otherwise the node is removed, along with every ancestor that is
          left with no children and is not a key itself.
        * If ``key`` is not stored as a key, the trie is left untouched.

        :param key: the key to remove
        :raises KeyError: if ``key`` is empty
        """
        if not key:
            raise KeyError
        self._delete(key, self.root.children, None)

    def _delete(self, key, children: List[Leaf], parents, key_idx=0, parent_key=False):
        """Walk ``key[key_idx:]`` down from ``children`` and remove the key.

        :param children: child list to start matching ``key[key_idx]`` in
        :param parents: ancestors of ``children`` (root first); None or empty
            means the walk starts at the root
        :param key_idx: index of the first character of ``key`` to match
        :param parent_key: unused; retained so existing call sites keep working
        """
        if key_idx >= len(key):
            return
        # Work on a copy so the caller's list is not modified.
        path = list(parents) if parents else [self.root]
        for char in key[key_idx:]:
            node = self._child(children, char)
            if node is None:
                return  # the key is not in the trie
            path.append(node)
            children = node.children
        target = path.pop()  # path now holds only the target's ancestors
        if not target.is_key:
            return  # a mere prefix of other keys; nothing to delete
        target.is_key = False
        if not target.children:
            self._clean_parents(key, len(key) - 1, path)

    def _clean_parents(self, key, key_idx, parents):
        """Detach the node for ``key[key_idx]`` and prune now-useless ancestors.

        ``parents`` lists the ancestors of that node, root first. Pruning
        stops at the first ancestor that is the root, is a key, or still has
        other children.
        """
        while parents:
            parent = parents.pop()
            for i, child in enumerate(parent.children):
                if child.data == key[key_idx]:
                    parent.children.pop(i)
                    break
            if parent is self.root or parent.is_key or parent.children:
                break
            key_idx -= 1

    def _find(self, key, children: List[Leaf]):
        """Return the node reached by following ``key`` from ``children``.

        :return: the matching node, or None when the path does not exist
        :raises KeyError: if ``key`` is empty
        """
        if not key:
            raise KeyError
        node = None
        for char in key:
            node = self._child(children, char)
            if node is None:
                return None
            children = node.children
        return node

    def _add(self, key, children: List[Leaf]):
        """Insert ``key`` below ``children``, creating nodes as needed."""
        node = None
        for char in key:
            node = self._child(children, char)
            if node is None:
                node = Trie.Leaf(char, False)
                children.append(node)
            children = node.children
        # Flag the final node even when it already existed, so a key that is
        # a prefix of an earlier, longer key is still recorded.
        if node is not None:
            node.is_key = True

    @staticmethod
    def _child(children, char):
        """Return the node in ``children`` holding ``char``, or None."""
        for leaf in children:
            if leaf.data == char:
                return leaf
        return None

    @staticmethod
    def _has_children(leaf):
        return bool(leaf.children)
def main():
    """Exercise the Trie with a small key set and print the results."""
    trie = Trie(['ba', 'bag', 'a', 'abc', 'abcd', 'abd', 'xyz'])

    # Expected output: True, False, False
    for candidate in ('ba', 'b', 'dog'):
        print(trie.includes_key(candidate))

    # Expected output: True, True, False
    for prefix in ('b', 'ab', 'abd'):
        print(trie.has_suffix(prefix))

    # Intended effect of each deletion, in order:
    #   'abd' -> only the trailing d goes away
    #   'a'   -> a is unmarked as a key
    #   'ba'  -> the ba key is removed
    #   'xyz' -> the entire branch is removed
    #   'bag' -> only the trailing g goes away
    for doomed in ('abd', 'a', 'ba', 'xyz', 'bag'):
        trie.delete(doomed)

    print(trie)


if __name__ == "__main__":
    main()
请注意,上面的trie实现并没有实现DFS搜索;不过它已经为您完成了大量基础工作,可以作为一个不错的起点。
使用Aho-Corasick应该允许你获得O(n)
渐近运行时间,代价是增加额外的内存使用,以及更高的成本固定乘数(被big-O表示法忽略,但仍然有意义)。 算法的复杂性是几个组件的总和,但它们都没有相乘,因此它应该是所有度量的线性(字符串数,字符串长度,最长字符串等)。
使用pyahocorasick
,你会做一个初始传递来制作一个可以一次扫描所有字符串的自动机:
import ahocorasick
# Duplicates in mystrings are assumed absent: two equal strings would be
# substrings of each other. Handling them is easy but omitted for brevity.
mystrings = ['abc', 'abcd', 'ab', 'def', 'efgd']

# One automaton holding every string lets each scan below look for all of
# them at once. Each word is stored with itself as the payload, so a match
# reports directly which string was found.
aut = ahocorasick.Automaton()
for s in mystrings:
    aut.add_word(s, s)
aut.make_automaton()

# Start from "everything is a non-substring" and strike out whatever turns
# up inside some other string.
nonsubstrings = set(mystrings)
for s in mystrings:
    found = {match for _end, match in aut.iter(s)}
    found.discard(s)  # a string always matches itself; that does not count
    nonsubstrings -= found

for nonsub in nonsubstrings:
    print(nonsub)
注意:不巧的是,我现在用的这台机器上没有编译器,所以无法安装pyahocorasick
来测试这段代码;不过我以前用过它,我相信这段代码应该可行,除非其中有低级的拼写错误。
弹出第一个元素。 遍历每个剩余的元素,看看较短的字符串是否是较长字符串的子字符串。 重复。 那应该是O(n log n)
编辑:粗略的实施草案
def not_substrings(l):
    """Print each string in ``l`` that is not a substring of another element.

    Strings that occur more than once count as substrings of each other, so
    none of the duplicates is printed. Results are printed in input order.

    :param l: sequence of strings to examine
    """
    keep = [True] * len(l)  # keep[k] is False once l[k] is known to be a substring
    for i, current in enumerate(l):
        # An entry already ruled out is skipped: anything contained in it is
        # also contained in the string that ruled it out.
        if not keep[i]:
            continue
        for j in range(i + 1, len(l)):
            other = l[j]
            if len(current) > len(other):
                if other in current:
                    keep[j] = False
            elif other == current:
                keep[j] = False
                keep[i] = False
            elif current in other:
                keep[i] = False
        if keep[i]:
            print(current)
我没有运行此代码,但它应该大致正确。 我不知道是否有一种没有掩码的方法,或者[True]*len(l)
语句的时间复杂度。 我没有做过任何严谨的分析,但在我看来这大约是n log n
,因为每次迭代只遍历列表中剩余的部分,而不是整个列表。
使用set
对象保留所有子字符串。 这个速度更快但是使用了大量内存,如果每个字符串很短,你可以尝试这个。
import string
import random
from itertools import combinations
def get_substrings(w):
    """Return a generator over every non-empty contiguous slice of ``w``.

    Slices are produced by ascending start index, then ascending end index;
    repeated slices are not de-duplicated.
    """
    stop = len(w) + 1
    return (
        w[start:end]
        for start in range(stop - 1)
        for end in range(start + 1, stop)
    )
def get_not_substrings(words):
    """Yield the distinct words that are not substrings of another word.

    Words are de-duplicated and visited from longest to shortest, so by the
    time a word is reached every word that could contain it has already
    contributed its substrings to the lookup set.

    :param words: iterable of strings
    """
    longest_first = list(set(words))
    longest_first.sort(key=len, reverse=True)
    covered = set()  # every substring of the words yielded so far
    for word in longest_first:
        if word in covered:
            # Already inside a yielded word, and so are all its substrings.
            continue
        yield word
        covered.update(get_substrings(word))
# Demo input: 10000 random lowercase words, each 1 to 12 characters long.
words = [
    "".join(
        random.choice(string.ascii_lowercase)
        for _ in range(random.randint(1, 12))
    )
    for _ in range(10000)
]
# Materialise the generator so the whole computation actually runs.
res = [word for word in get_not_substrings(words)]
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.