Python 实现Trie

Trie

Trie 即字典树,无需废话,直接看 维基百科 上的解释。

还有leetcode上的这个问题: https://leetcode.com/problems/implement-trie-prefix-tree/description/

当数据量不是很大的时候,可以直接使用字典的嵌套方式实现的,但是对于大数据量的trie,嵌套的字典可能变得就很笨拙,或者也可以说不是那么的高效,但是如果刚开始学习,可以使用嵌套字典的方式实现,也是最简单的方式,可以用简短的几行代码实现:

class Trie():
    def __init__(self):
        self._end = '*'
        self.trie = dict()
    def __repr__(self):
        return repr(self.trie)
    def make_trie(self, *words):
        trie = dict()
        for word in words:
            temp_dict = trie
            for letter in word:
                temp_dict = temp_dict.setdefault(letter, {})
            temp_dict[self._end] = self._end
        return trie
    def find_word(self, word):
        sub_trie = self.trie
        for letter in word:
            if letter in sub_trie:
                sub_trie = sub_trie[letter]
            else:
                return False
        else:
            if self._end in sub_trie:
                return True
            else:
                return False
    def add_word(self, word):
        if self.find_word(word):
            print("Word Already Exists")
            return self.trie
        temp_trie = self.trie
        for letter in word:
            if letter in temp_trie:
                temp_trie = temp_trie[letter]
            else:
                temp_trie = temp_trie.setdefault(letter, {})
        temp_trie[self._end] = self._end
        return temp_trie

如果对于setdefault不是太熟悉,可以直接在字典中查找一个键,此处即为字母或者_end。如果字母存在,就返回相应的值;如果不存在,就设定默认的值为一个空字典或者_end。

这样的答案并不让人满意,再来看看leetcode的答案:

from collections import defaultdict
class Trie:
    def __init__(self):
        Initialize your data structure here.
        self.root = defaultdict()
    def insert(self, word):
        Inserts a word into the trie.
        :type word: str
        :rtype: void
        current = self.root
        for letter in word:
            current = current.setdefault(letter, {})
        current.setdefault("_end")
    def search(self, word):
        Returns if the word is in the trie.
        :type word: str
        :rtype: bool
        current = self.root
        for letter in word:
            if letter not in current:
                return False
            current = current[letter]
        if "_end" in current:
            return True
        return False
    def startsWith(self, prefix):
        Returns if there is any word in the trie that starts with the given prefix.
        :type prefix: str
        :rtype: bool
        current = self.root
        for letter in prefix:
            if letter not in current:
                return False
            current = current[letter]
        return True

再来一个更简单的,不用defaultdict:

class Trie:
    def __init__(self):
        Initialize your data structure here.
        self.Trie = {}
    def insert(self, word):
        Inserts a word into the trie.
        :type word: str
        :rtype: void
        curr = self.Trie
        for w in word:
            if w not in curr:
                curr[w] = {}
            curr = curr[w]
        curr['#'] = 1
    def search(self, word):
        Returns if the word is in the trie.
        :type word: str
        :rtype: bool
        curr = self.Trie
        for w in word:
            if w not in curr:
                return False
            curr = curr[w]
        return "#" in curr
    def startsWith(self, prefix):
        Returns if there is any word in the trie that starts with the given prefix.
        :type prefix: str
        :rtype: bool
        curr = self.Trie
        for w in prefix:
            if w not in curr:
                return False
            curr = curr[w]
        return True

再来一个功能完整一点的版本:

class Trie:
    def __init__(self):
        self.__final = False
        self.__nodes = {}
    def __repr__(self):
        return 'Trie<len={}, final={}>'.format(len(self), self.__final)
    def __getstate__(self):
        return self.__final, self.__nodes
    def __setstate__(self, state):
        self.__final, self.__nodes = state
    def __len__(self):
        return len(self.__nodes)
    def __bool__(self):
        return self.__final
    def __contains__(self, array):
            return self[array]
        except KeyError:
            return False
    def __iter__(self):
        yield self
        for node in self.__nodes.values():
            yield from node
    def __getitem__(self, array):
        return self.__get(array, False)
    def create(self, array):
        self.__get(array, True).__final = True
    def read(self):
        yield from self.__read([])
    def update(self, array):
        self[array].__final = True
    def delete(self, array):
        self[array].__final = False
    def prune(self):
        for key, value in tuple(self.__nodes.items()):
            if not value.prune():
                del self.__nodes[key]
        if not len(self):
            self.delete([])
        return self
    def __get(self, array, create):
        if array:
            head, *tail = array
            if create and head not in self.__nodes:
                self.__nodes[head] = Trie()
            return self.__nodes[head].__get(tail, create)
        return self
    def __read(self, name):
        if self.__final:
            yield name
        for key, value in self.__nodes.items():
            yield from value.__read(name + [key])

对于更大型的项目,这样的方式就不适合了,可以考虑这个python模块:https://github.com/kmike/marisa-trie,可以节约很多内存空间。还有

python-trie - a simple pure python implementation.
PyTrie - a more advanced pure python implementation.
google/pygtrie - developed google