[英]How to implement the remove function of a trie in python?
我已經閱讀了 python 中的 trie 的以下實現: https://stackoverflow.com/a/11016430/2225221
並嘗試為其設置刪除功能。 基本上,即使一開始我也遇到了問題:如果你想從樹中刪除一個詞,它可以有子“詞”,或者它可以是另一個詞的“子詞”。
如果你用“del dict[key]”刪除,你也刪除了上面提到的這兩種詞。 誰能幫助我,如何正確刪除所選單詞(讓我們假設它在特里)
基本上,要從trie中刪除一個單詞(在鏈接的答案中已實現),您只需刪除其_end
標記,例如:
def remove_word(trie, word):
current_dict = trie
for letter in word:
current_dict = current_dict.get(letter, None)
if current_dict is None:
# the trie doesn't contain this word.
break
else:
del current_dict[_end]
但是請注意,這不能確保trie的最小大小。 刪除單詞后,左側的樹狀結構中可能有不再被任何單詞使用的分支。 這不會影響數據結構的正確性,僅表示該Trie可能消耗比絕對必要更多的內存。 您可以通過從葉節點向后迭代並刪除分支,直到找到具有多個子節點的分支,來改善這一點。
編輯:這是一個想法,您可以如何實現刪除功能,該功能還剔除所有不必要的分支。 可能有一種更有效的方法,但這可能會讓您入門:
def remove_word2(trie, word):
current_dict = trie
path = [current_dict]
for letter in word:
current_dict = current_dict.get(letter, None)
path.append(current_dict)
if current_dict is None:
# the trie doesn't contain this word.
break
else:
if not path[-1].get(_end, None):
# the trie doesn't contain this word (but a prefix of it).
return
deleted_branches = []
for current_dict, letter in zip(reversed(path[:-1]), reversed(word)):
if len(current_dict[letter]) <= 1:
deleted_branches.append((current_dict, letter))
else:
break
if len(deleted_branches) > 0:
del deleted_branches[-1][0][deleted_branches[-1][1]]
del path[-1][_end]
本質上,它首先找到將要刪除的單詞的“路徑”,然后向后迭代以找到可以刪除的節點。 然后,它刪除可以刪除的路徑的根(這也會隱式刪除_end
節點)。
我認為最好以遞歸方式進行,代碼如下:
def remove(self, word):
self.delete(self.tries, word, 0)
def delete(self, dicts, word, i):
if i == len(word):
if 'end' in dicts:
del dicts['end']
if len(dicts) == 0:
return True
else:
return False
else:
return False
else:
if word[i] in dicts and self.delete(dicts[word[i]], word, i + 1):
if len(dicts[word[i]]) == 0:
del dicts[word[i]]
return True
else:
return False
else:
return False
處理此類結構的一種方法是通過遞歸 。 在這種情況下,關於遞歸的好處在於,它壓縮到了Trie的底部,然后將返回的值通過分支傳遞回去。
下面的功能就是這樣做的。 萬一輸入的單詞是另一個單詞的前綴,它將轉到葉子並刪除_end
值。 然后,它傳遞一個布爾值( boo
),該值指示current_dict
仍在外圍分支中。 一旦我們達到了當前dict有多個孩子的地步,我們將刪除相應的分支並將boo設置為False
以便每個剩余的遞歸都將無效。
def trie_trim(term, trie=SYNONYMS, prev=0):
# checks that we haven't hit the end of the word
if term:
first, rest = term[0], term[1:]
current_length = len(trie)
next_length, boo = trie_trim(rest, trie=trie[first], prev=current_length)
# this statement avoids trimming excessively if the input is a prefix because
# if the word is a prefix, the first returned value will be greater than 1
if boo and next_length > 1:
boo = False
# this statement checks for the first occurrence of the current dict having more than one child
# or it checks that we've hit the bottom without trimming anything
elif boo and (current_length > 1 or not prev):
del trie[first]
boo = False
return current_length, boo
# when we do hit the end of the word, delete _end
else:
del trie[_end]
return len(trie) + 1, True
def remove_a_word_util(self, word, idx, node):
if len(word) == idx:
node.is_end_of_word = False
return bool(node.children)
ch = word[idx]
if ch not in node.children:
return True
flag = self.remove_a_word_util(word, idx+1, node.children[ch])
if flag:
return True
node.children.pop(ch)
return bool(node.children) or node.is_end_of_word
有點長,但我希望這有助於回答您的問題:
class Trie:
WORD_END = "$"
def __init__(self):
self.trie = {}
def insert(self, word):
cur = self.trie
for char in word:
if char not in cur:
cur[char] = {}
cur = cur[char]
cur[Trie.WORD_END] = word
def delete(self, word):
def _delete(word, cur_trie, i=0):
if i == len(word):
if Trie.WORD_END not in cur_trie:
raise ValueError("'%s' is not registered in the trie..." %word)
cur_trie.pop(Trie.WORD_END)
if len(cur_trie) > 0:
return False
return True
if word[i] not in cur_trie:
raise ValueError("'%s' is not registered in the trie..." %word)
cont = _delete(word, cur_trie[word[i]], i+1)
if cont:
cur_trie.pop(word[i])
if Trie.WORD_END in cur_trie:
return False
return True
return False
_delete(word, self.trie)
t = Trie()
t.insert("bar")
t.insert("baraka")
t.insert("barakalar")
t.delete("barak") # raises error as 'barak' is not a valid WORD_END although it is a valid path.
t.delete("bareka") # raises error as 'e' does not exist in the path.
t.delete("baraka") # deletes the WORD_END of 'baraka' without deleting any letter as there is 'barakalar' afterwards.
t.delete("barakalar") # deletes until the previous word (until the first Trie.WORD_END; "$" - by going backwards with recursion) in the same path (until 'baraka').
如果您需要整個 DS:
class TrieNode:
def __init__(self):
self.children = {}
self.wordCounter = 0
self.prefixCounter = 0
class Trie:
def __init__(self):
self.root = TrieNode()
def insert(self, word: str) -> None:
node = self.root
for char in word:
if char not in node.children:
node.children[char] = TrieNode()
node.prefixCounter += 1
node = node.children[char]
node.wordCounter += 1
def countWordsEqualTo(self, word: str) -> int:
node = self.root
if node.children:
for char in word:
node = node.children[char]
else:
return 0
return node.wordCounter
def countWordsStartingWith(self, prefix: str) -> int:
node = self.root
if node.children:
for char in prefix:
node = node.children[char]
else:
return 0
return node.prefixCounter
def erase(self, word: str) -> None:
node = self.root
for char in word:
if node.children:
node.prefixCounter -= 1
node = node.children[char]
else:
return None
node.wordCounter -= 1
if node.wordCounter == 0:
self.dfsRemove(self.root, word, 0)
def dfsRemove(self, node: TrieNode, word: str, idx: int) -> None:
if len(word) == idx:
node.wordCounter = 0
return
char = word[idx]
if char not in node.children:
return
self.dfsRemove(node.children[char], word, idx+1)
node.children.pop(char)
trie = Trie();
trie.insert("apple"); #// Inserts "apple".
trie.insert("apple"); #// Inserts another "apple".
print(trie.countWordsEqualTo("apple")) #// There are two instances of "apple" so return 2.
print(trie.countWordsStartingWith("app")) #// "app" is a prefix of "apple" so return 2.
trie.erase("apple") #// Erases one "apple".
print(trie.countWordsEqualTo("apple")) #// Now there is only one instance of "apple" so return 1.
print(trie.countWordsStartingWith("app")) #// return 1
trie.erase("apple"); #// Erases "apple". Now the trie is empty.
print(trie.countWordsEqualTo("apple")) #// return 0
print(trie.countWordsStartingWith("app")) #// return 0
我認為這個實現是最簡潔和最容易理解的。
def removeWord(word, node=None):
if not node:
node = self.root
if word == "":
node.isEnd = False
return
newnode = node.children[word[0]]
removeWord(word[1:], newnode)
if not newnode.isEnd and len(newnode.children) == 0:
del node.children[word[0]]
雖然一開始使用默認參數node=None
有點難以理解,但這是處理標記單詞node.isEnd = False
同時修剪無關節點的 Trie 刪除的最簡潔實現。
Trie.removeWord("ToBeDeletedWord")
。ToBeDeletedWord
的結束節點,最后一次遞歸調用removeWord("", <node d>)
node.isEnd = False
。 稍后,該節點不再被標記為isEnd
並且它沒有子節點,因此我們可以調用刪除運算符。TobeDeletedWor
、 TobeDeletedWo
、 TobeDeletedW
等)將觀察到它也不是結束節點並且沒有更多子節點。 這些節點也將被刪除。您將不得不閱讀此文幾次,但此實現簡潔、易讀且正確。 困難在於遞歸發生在函數中間,而不是在開始或結束時。
TL;DR
class TrieNode:
children: dict[str, "TrieNode"]
def __init__(self) -> None:
self.children = {}
self.end = False
def __contains__(self, char: str) -> bool:
return char in self.children
def __getitem__(self, __name: str) -> "TrieNode":
return self.children[__name]
def __setitem__(self, __name: str, __value: "TrieNode") -> None:
self.children[__name] = __value
def __len__(self):
return len(self.children)
def __delitem__(self, __name: str):
del self.children[__name]
class Trie:
def __init__(self, words: list[str]) -> None:
self.root = TrieNode()
for w in words:
self.insert(w)
def insert(self, word: str):
curr = self.root
for c in word:
curr = curr.children.setdefault(c, TrieNode())
curr.end = True
def remove(self, word: str):
def _remove(node: TrieNode, index: int):
if index >= len(word):
node.end = False
if not node.children:
return True
elif word[index] in node:
if _remove(node[word[index]], index + 1):
del node[word[index]]
_remove(self.root, 0)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.