Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding suffix tree #323

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions pydatastructs/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,10 @@
)

__all__.extend(trie.__all__)

from . import suffix_tree
from .suffix_tree import(
SuffixTree
)

__all__.extend(suffix_tree.__all__)
273 changes: 273 additions & 0 deletions pydatastructs/strings/suffix_tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,273 @@
from pydatastructs.utils.misc_util import SuffixNode

__all__ = [
'SuffixTree'
]

class SuffixTree():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
class SuffixTree():
class SuffixTree(object):

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops left that point

Copy link
Member Author

@Arvind-raj06 Arvind-raj06 Feb 12, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You have come early ☺

"""
Represents Suffix Tree.

Examples
========

>>> from pydatastructs.strings import SuffixTree as suffix
>>> s = suffix('hello')
>>> s.find('he')
0
>>> s.find_all('l')
{2, 3}
>>> s.find('f')
-1
>>> lt=["abeceda", "abecednik", "abeabecedabeabeced", "abecedaaaa", "aaabbbeeecceeeddaaaaabeceda"]
>>> s1 = suffix(lt)
>>> s1.lcs()
'abeced'

References
==========

.. [1] https://en.wikipedia.org/wiki/Suffix_tree
.. [2] https://en.wikipedia.org/wiki/Generalized_suffix_tree
"""

def __new__(cls, input=''):
    """
    Creates an empty suffix tree and, when ``input`` is given, builds
    the tree for it.

    Parameters
    ==========

    input: str or list of str
        The string, or the list of strings, to be indexed.
        Optional, by default '' which leaves the tree empty.

    Raises
    ======

    ValueError
        Raised by ``build`` when ``input`` is neither a string nor a
        list of strings.
    """
    obj = object.__new__(cls)
    root = SuffixNode()
    root.depth = 0
    root.idx = 0
    # The root is its own parent and its own suffix link; the lookup
    # code reads ``node.parent`` for every node, the root included.
    root.parent = root
    root._add_suffix_link(root)
    obj.root = root
    # Start with an empty text so that queries on a tree that has not
    # been built yet report a miss instead of failing on a missing
    # attribute.
    obj.word = ''
    if input != '':
        obj.build(input)
    return obj

@classmethod
def methods(cls):
    """
    Returns the names of the methods that make up the public
    interface of the class.
    """
    public_names = [
        '__new__',
        'lcs',
        'find',
        'find_all',
    ]
    return public_names

def _check_input(self, input):
if isinstance(input, str):
return 'str'
elif isinstance(input, list):
if all(isinstance(item, str) for item in input):
return 'list'
raise ValueError("String argument should be of type String or a list of strings")

def build(self, x):
    """
    Builds the suffix tree for the given input.

    A single string gets one terminal symbol appended before it is
    indexed; a list of strings is handed to the generalized builder.

    Parameters
    ==========

    x: str or list of str

    Returns
    =======

    None

    Raises
    ======

    ValueError
        Propagated from the input check when ``x`` is neither a string
        nor a list of strings.
    """
    kind = self._check_input(x)
    if kind == 'str':
        terminal = next(self._terminalSymbolsGenerator())
        self._build(x + terminal)
    elif kind == 'list':
        self._build_generalized(x)

def _build(self, x):
    """
    Stores ``x`` as the indexed text and builds the tree for it.

    Parameters
    ==========

    x: str
        The text to index; ``build`` appends a terminal symbol to it
        before calling this method.

    Returns
    =======

    None
    """
    # The lookup methods slice edge labels out of self.word.
    self.word = x
    self._build_McCreight(x)

def _build_McCreight(self, x):
    """
    Inserts every suffix of ``x`` into the tree rooted at ``self.root``,
    using suffix links to carry matched work over from one suffix to
    the next.

    Parameters
    ==========

    x: str
        The text whose suffixes are inserted, one leaf per start
        position.

    Returns
    =======

    None
    """
    # u is the node reached so far; d is the number of characters of the
    # current suffix that have already been matched.
    u = self.root
    d = 0
    for i in range(len(x)):
        # Descend while the match ends exactly at node u and an outgoing
        # edge starts with the next character of suffix i.
        while u.depth == d and u._has_transition(x[d + i]):
            u = u._get_transition_link(x[d + i])
            d = d + 1
            # Keep matching character by character along the edge into u.
            while d < u.depth and x[u.idx + d] == x[i + d]:
                d = d + 1
        # The match stopped part-way along the edge into u: split that
        # edge with a new node of depth d.
        if d < u.depth:
            u = self._create_node(x, u, d)
        self._create_leaf(x, i, u, d)
        # _get_suffix_link returns False while no link has been stored.
        if not u._get_suffix_link():
            self._compute_slink(x, u)
        # Resume the next suffix from the suffix-link target; the matched
        # length shrinks by one and is clamped so it never goes negative.
        u = u._get_suffix_link()
        d = d - 1
        if d < 0:
            d = 0

def _create_node(self, x, u, d):
    """
    Splits the edge leading into ``u`` by inserting a new node of
    depth ``d`` between ``u`` and its parent.

    Parameters
    ==========

    x: str
        The text the tree is being built on.
    u: SuffixNode
        The node whose incoming edge is split.
    d: int
        Depth of the node to insert.

    Returns
    =======

    The newly created node.
    """
    start = u.idx
    old_parent = u.parent
    middle = SuffixNode(idx=start, parentNode=old_parent, depth=d)
    # Hang u below the new node ...
    middle._add_transition_link(u, x[start + d])
    u.parent = middle
    # ... and put the new node where u used to hang.
    old_parent._add_transition_link(middle, x[start + old_parent.depth])
    return middle

def _create_leaf(self, x, i, u, d):
    """
    Attaches a leaf for the suffix of ``x`` starting at ``i`` below
    the node ``u``.

    Parameters
    ==========

    x: str
        The text the tree is being built on.
    i: int
        Start position of the suffix.
    u: SuffixNode
        The node the leaf is attached to.
    d: int
        Depth of ``u``; ``x[i + d]`` labels the new transition.

    Returns
    =======

    The newly created leaf.
    """
    # A leaf spells the whole suffix, hence its depth is the suffix length.
    leaf = SuffixNode(idx=i, parentNode=u, depth=len(x) - i)
    u._add_transition_link(leaf, x[i + d])
    return leaf

def _compute_slink(self, x, u):
    """
    Finds, creating it if necessary, the node that the suffix link of
    ``u`` points to and stores the link on ``u``.

    Parameters
    ==========

    x: str
        The text the tree is being built on.
    u: SuffixNode
        The node whose suffix link is missing.

    Returns
    =======

    None
    """
    d = u.depth
    # Start from the suffix link of u's parent and walk down from there.
    v = u.parent._get_suffix_link()
    # Descend until v is at least d - 1 deep, choosing each child by a
    # character of x taken relative to u.idx.
    while v.depth < d - 1:
        v = v._get_transition_link(x[u.idx + v.depth + 1])
    # Overshot: the wanted position lies inside the edge into v, so split
    # that edge to obtain a node of depth exactly d - 1.
    if v.depth > d - 1:
        v = self._create_node(x, v, d - 1)
    u._add_suffix_link(v)

def _build_generalized(self, xs):
terminal_gen = self._terminalSymbolsGenerator()
_xs = ''.join([x + next(terminal_gen) for x in xs])
self.word = _xs
self._generalized_word_starts(xs)
self._build(_xs)
self.root._traverse(self._label_generalized)

def _label_generalized(self, node):
if node.is_leaf():
x = {self._get_word_start_index(node.idx)}
else:
x = {n for ns in node.transition_links.values() for n in ns.generalized_idxs}
node.generalized_idxs = x

def _get_word_start_index(self, idx):
i = 0
for _idx in self.word_starts[1:]:
if idx < _idx:
return i
else:
i += 1
return i

def lcs(self, stringIdxs = -1):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please avoid using short forms. Use the full name, largest_common_substring.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We had some methods added to algorithms under this module related with longest common substring I believe. With that backtracking thing. How is this method different from that one?

Copy link
Member Author

@Arvind-raj06 Arvind-raj06 Feb 7, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The backtracking approach searches all the strings in the given list by walking in reverse from a child to the root, and picks the longest match among them. With this method, users instead have the freedom to choose, by index, which strings take part in the search. This function also reduces the comparison time by discarding every branch that is not shared by all of the selected strings.

"""
Finds the Largest Common Substring of Strings provided in stringIdxs.
If stringIdxs is not provided, the LCS of all strings is returned.

Parameters
==========

stringIdxs: int or list of int

Returns
=======

Longest Common Substring
"""
if stringIdxs == -1 or not isinstance(stringIdxs, list):
stringIdxs = set(range(len(self.word_starts)))
else:
stringIdxs = set(stringIdxs)
deepestNode = self._find_lcs(self.root, stringIdxs)
start = deepestNode.idx
end = deepestNode.idx + deepestNode.depth
return self.word[start:end]

def _find_lcs(self, node, stringIdxs):
nodes = [self._find_lcs(n, stringIdxs)
for n in node.transition_links.values()
if n.generalized_idxs.issuperset(stringIdxs)]
if nodes == []:
return node
deepestNode = max(nodes, key=lambda n: n.depth)
return deepestNode

def _generalized_word_starts(self, xs):
self.word_starts = []
i = 0
for n in range(len(xs)):
self.word_starts.append(i)
i += len(xs[n]) + 1

def find(self, y):
    """
    Finds a starting position of the substring y in the text the
    suffix tree was built on.

    Parameters
    ==========

    y: str
        The substring to look for.

    Returns
    =======

    The ``idx`` of the node at which the whole of y has been matched,
    i.e. a position at which y starts in the indexed text.
    -1 if y is not a substring.
    """
    node = self.root
    remaining = y
    while True:
        label = self._edgeLabel(node, node.parent)
        # What is left of the pattern ends on this edge: found.
        if label.startswith(remaining):
            return node.idx

        # Consume the pattern along the edge for as long as it agrees.
        consumed = 0
        while consumed < len(label) and label[consumed] == remaining[0]:
            remaining = remaining[1:]
            consumed += 1

        # A partially matched edge means the pattern diverges from the text.
        if consumed != 0 and (consumed != len(label) or remaining == ''):
            return -1

        node = node._get_transition_link(remaining[0])
        if not node:
            return -1

def find_all(self, y):
    """
    Finds every starting position of the substring y in the text the
    suffix tree was built on.

    Parameters
    ==========

    y: str
        The substring to look for.

    Returns
    =======

    Set of the starting positions of y in the indexed text, collected
    from the leaves below the node at which y has been matched.
    {} (an empty dict literal) if y is not a substring.
    """
    node = self.root
    remaining = y
    while True:
        label = self._edgeLabel(node, node.parent)
        # What is left of the pattern ends on this edge: every leaf
        # below this node is an occurrence.
        if label.startswith(remaining):
            return {leaf.idx for leaf in node._get_leaves()}

        # Consume the pattern along the edge for as long as it agrees.
        consumed = 0
        while consumed < len(label) and label[consumed] == remaining[0]:
            remaining = remaining[1:]
            consumed += 1

        # A partially matched edge means the pattern diverges from the text.
        if consumed != 0 and (consumed != len(label) or remaining == ''):
            return {}

        node = node._get_transition_link(remaining[0])
        if not node:
            return {}

def _edgeLabel(self, node, parent):
return self.word[node.idx + parent.depth: node.idx + node.depth]

def _terminalSymbolsGenerator(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the logic of this method? How is it doing its job?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It walks the code points from 0xE000 to 0x10FFFD (57344–1114109 in decimal) and yields the character for each one, so that every string added to the suffix tree gets its own terminal symbol. Using `yield` helps optimize the code because the UPPAs list doesn't have to be initialized again for each symbol.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These ranges are used just to make sure that the symbols aren't part of the suffix tree inputs

UPPAs = list(list(range(0xE000, 0xF8FF+1)) + list(range(0xF0000, 0xFFFFD+1)) + list(range(0x100000, 0x10FFFD+1)))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use a more descriptive variable name. It's a bit difficult to understand the purpose from the name, UPPAs.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep changed

for i in UPPAs:
yield (chr(i))
raise ValueError("To many input strings.")
24 changes: 24 additions & 0 deletions pydatastructs/strings/tests/test_suffixtree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from pydatastructs import SuffixTree
from pydatastructs.utils.raises_util import raises
import random, string

def test_suffixtree():
    """
    References
    ==========
    .. https://www.cise.ufl.edu/~sahni/dsaaj/enrich/c16/suffix.htm

    """
    # Tree over a single string: hits and misses of both lookups.
    single = SuffixTree("HelloworldHe")
    assert single.find("Hel") == 0
    assert single.find_all("He") == {0, 10}
    assert single.find("Win") == -1
    assert single.find_all("go") == {}

    # Generalized tree: substring shared by all the words.
    words = [
        'integer',
        'inteinteger',
        'integralerint',
        'iaingerntier',
        'regetnerireg',
        'reger',
    ]
    generalized = SuffixTree(words)
    assert generalized.lcs() == 'er'

    # Anything but a string or a list of strings is rejected.
    assert raises(ValueError, lambda: SuffixTree(123))
    assert raises(ValueError, lambda: SuffixTree((100, 1, 0)))
3 changes: 2 additions & 1 deletion pydatastructs/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
Set,
CartesianTreeNode,
RedBlackTreeNode,
TrieNode
TrieNode,
SuffixNode
)
__all__.extend(misc_util.__all__)
60 changes: 59 additions & 1 deletion pydatastructs/utils/misc_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
'Set',
'CartesianTreeNode',
'RedBlackTreeNode',
'TrieNode'
'TrieNode',
'SuffixNode'
]

_check_type = lambda a, t: isinstance(a, t)
Expand Down Expand Up @@ -446,3 +447,60 @@ def _comp(u, v, tcomp):
return False
else:
return tcomp(u, v)

class SuffixNode(Node):
    """
    Represents nodes in the suffix tree data structure.

    Parameters
    ==========

    idx: int
        Start position, in the indexed text, of a suffix that passes
        through this node.
        Optional, by default -1.
    parentNode: SuffixNode
        The parent of this node.
        Optional, by default None.
    depth: int
        Number of characters spelled out on the path from the root
        to this node.
        Optional, by default -1.
    """

    __slots__ = ['_suffix_link', 'transition_links', 'idx', 'depth', 'parent', 'generalized_idxs']

    def __new__(cls, idx=-1, parentNode=None, depth=-1):
        obj = object.__new__(cls)
        obj._suffix_link = None
        # Maps the first character of an outgoing edge to the child node.
        obj.transition_links = {}
        obj.idx = idx
        obj.depth = depth
        obj.parent = parentNode
        # A set, not a dict: this attribute is later overwritten with sets
        # and queried with set operations such as ``issuperset``.
        obj.generalized_idxs = set()
        return obj

    def _add_suffix_link(self, snode):
        """Stores ``snode`` as the suffix link of this node."""
        self._suffix_link = snode

    def _get_suffix_link(self):
        """Returns the stored suffix link, or False when none is set."""
        if self._suffix_link is None:
            return False
        return self._suffix_link

    def _get_transition_link(self, suffix):
        """
        Returns the child reached through ``suffix``, or False when
        there is no such transition.
        """
        return self.transition_links.get(suffix, False)

    def _add_transition_link(self, snode, suffix):
        """Registers ``snode`` as the child reached through ``suffix``."""
        self.transition_links[suffix] = snode

    def _has_transition(self, suffix):
        """Tells whether a transition through ``suffix`` exists."""
        return suffix in self.transition_links

    def is_leaf(self):
        """Tells whether this node has no outgoing transitions."""
        return not self.transition_links

    def _traverse(self, f):
        """
        Calls ``f`` on every node of the subtree rooted here, children
        before their parent.
        """
        for node in self.transition_links.values():
            node._traverse(f)
        f(self)

    def _get_leaves(self):
        """Returns the set of leaf nodes of the subtree rooted here."""
        if self.is_leaf():
            return {self}
        return {leaf
                for child in self.transition_links.values()
                for leaf in child._get_leaves()}