-
Notifications
You must be signed in to change notification settings - Fork 273
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adding suffix tree #323
Open
Arvind-raj06
wants to merge
9
commits into
codezonediitj:main
Choose a base branch
from
Arvind-raj06:Allgood
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Adding suffix tree #323
Changes from all commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
4432547
Adding suffix
Arvind-raj06 598330b
Let's see
Arvind-raj06 391302d
Adding ref
Arvind-raj06 7fd9da7
Update test_suffixtree.py
Arvind-raj06 68ef229
Adding docs
Arvind-raj06 57fd9f9
Fixed lcs
Arvind-raj06 cac7126
Fixing code
Arvind-raj06 b315c5e
Fixing code and docs
Arvind-raj06 e3586fa
Merge branch 'master' into Allgood
Arvind-raj06 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,316 @@ | ||
from pydatastructs.utils.misc_util import SuffixNode | ||
|
||
__all__ = [ | ||
'SuffixTree' | ||
] | ||
|
||
class SuffixTree(object):
    """
    Represents Suffix Tree.

    A tree built from a single string answers substring queries through
    ``find`` and ``find_all``. A tree built from a list of strings is a
    generalized suffix tree over the concatenation of those strings (each
    followed by its own terminal symbol) and additionally supports
    ``longest_common_substring``.

    Parameters
    ==========

    input: str or list of str
        Text(s) to index. The default ``''`` creates an empty tree that
        can be filled later with ``build``.

    Examples
    ========

    >>> from pydatastructs.strings import SuffixTree as suffix
    >>> s = suffix('hello')
    >>> s.find('he')
    0
    >>> s.find_all('l')
    {2, 3}
    >>> s.find('f')
    -1
    >>> lt=["abeceda", "abecednik", "abeabecedabeabeced", "abecedaaaa", "aaabbbeeecceeeddaaaaabeceda"]
    >>> s1 = suffix(lt)
    >>> s1.longest_common_substring()
    'abeced'

    References
    ==========

    .. [1] https://en.wikipedia.org/wiki/Suffix_tree
    .. [2] https://en.wikipedia.org/wiki/Generalized_suffix_tree
    """

    def __new__(cls, input=''):
        obj = object.__new__(cls)
        obj.root = SuffixNode()
        obj.root.depth = 0
        obj.root.idx = 0
        # The root is its own parent and its own suffix link.
        obj.root.parent = obj.root
        obj.root._add_suffix_link(obj.root)
        # Start with an empty text so that queries on an unbuilt tree
        # report "not found" instead of failing on a missing attribute.
        obj.word = ''
        if not input == '':
            obj.build(input)
        return obj

    @classmethod
    def methods(cls):
        return ['__new__', 'longest_common_substring', 'find', 'find_all']

    def _check_input(self, input):
        """
        Check if the input is str or list of str.

        Returns ``'str'`` or ``'list'`` accordingly and raises
        ``ValueError`` for anything else.
        """
        if isinstance(input, str):
            return 'str'
        elif isinstance(input, list):
            if all(isinstance(item, str) for item in input):
                return 'list'
        raise ValueError("String argument should be of type String or a list of strings")

    def build(self, x):
        """
        Builds the Suffix tree on the given input.

        Parameters
        ==========

        x: str or list of str

        Returns
        =======

        None

        Raises
        ======

        ValueError
            If ``x`` is neither a str nor a list of str.
        """
        kind = self._check_input(x)
        if kind == 'str':
            # A single string gets one terminal symbol appended.
            self._build(x + next(self._terminal_symbols_generator()))
        elif kind == 'list':
            self._build_generalized(x)

    def _build(self, x):
        """
        Builds suffix tree with string.
        """
        self.word = x
        self._build_McCreight(x)

    def _build_McCreight(self, x):
        """
        Inserts the suffixes of ``x`` one starting position at a time,
        moving between consecutive insertions through suffix links.
        """
        u = self.root  # node the current suffix is being matched from
        d = 0          # number of characters of the current suffix matched
        for i in range(len(x)):
            # Descend while standing exactly on a node that has an
            # outgoing edge for the next character of the suffix.
            while u.depth == d and u._has_transition(x[d + i]):
                u = u._get_transition_link(x[d + i])
                d = d + 1
                # Follow the edge into u while its label agrees with x.
                while d < u.depth and x[u.idx + d] == x[i + d]:
                    d = d + 1
            # The match ended inside an edge: split it at depth d.
            if d < u.depth:
                u = self._create_node(x, u, d)
            self._create_leaf(x, i, u, d)
            if not u._get_suffix_link():
                self._compute_slink(x, u)
            u = u._get_suffix_link()
            # The next suffix is one character shorter.
            d = d - 1
            if d < 0:
                d = 0

    def _create_node(self, x, u, d):
        """
        Creates node for the suffix tree
        with transition links.

        The new node is placed at depth ``d`` between ``u`` and the
        current parent of ``u``, and is returned.
        """
        i = u.idx
        p = u.parent
        v = SuffixNode(idx=i, depth=d)
        v._add_transition_link(u, x[i + d])
        u.parent = v
        p._add_transition_link(v, x[i + p.depth])
        v.parent = p
        return v

    def _create_leaf(self, x, i, u, d):
        """
        Creates the leaf node for the
        suffix tree.

        The leaf stands for the suffix starting at ``i`` and is attached
        below ``u``; its depth is the length of that suffix.
        """
        w = SuffixNode()
        w.idx = i
        w.depth = len(x) - i
        u._add_transition_link(w, x[i + d])
        w.parent = u
        return w

    def _compute_slink(self, x, u):
        """
        Sets the suffix link of ``u`` to the node at depth
        ``u.depth - 1`` reached from the suffix link of the parent of
        ``u``, creating that node when the position lies inside an edge.
        """
        d = u.depth
        v = u.parent._get_suffix_link()
        while v.depth < d - 1:
            v = v._get_transition_link(x[u.idx + v.depth + 1])
        if v.depth > d - 1:
            v = self._create_node(x, v, d - 1)
        u._add_suffix_link(v)

    def _build_generalized(self, xs):
        """
        Builds the generalized suffix tree with list
        of string.
        """
        terminal_gen = self._terminal_symbols_generator()
        # Every string is followed by its own terminal symbol.
        _xs = ''.join([x + next(terminal_gen) for x in xs])
        self.word = _xs
        self._generalized_word_starts(xs)
        self._build(_xs)
        self.root._traverse(self._label_generalized)

    def _label_generalized(self, node):
        """
        Helper method that labels the nodes of GST with
        indexes of strings found in their descendants.
        """
        if node.is_leaf():
            x = {self._get_word_start_index(node.idx)}
        else:
            x = {n for ns in node.transition_links.values() for n in ns.generalized_idxs}
        node.generalized_idxs = x

    def _get_word_start_index(self, idx):
        """
        Helper method that returns the index of the
        string based on node's starting index.
        """
        i = 0
        for _idx in self.word_starts[1:]:
            if idx < _idx:
                return i
            else:
                i += 1
        return i

    def longest_common_substring(self, stringIdxs = -1):
        """
        Finds the Largest Common Substring of Strings provided in stringIdxs.
        If stringIdxs is not provided, the LCS of all strings is returned.

        Parameters
        ==========

        stringIdxs: list of int
            Indexes (positions in the list used for building the tree) of
            the strings to compare. Any value that is not a list,
            including the default ``-1``, selects all strings.

        Returns
        =======

        Longest Common Substring

        Raises
        ======

        AttributeError
            If the tree was not built from a list of strings.
        """
        if not isinstance(stringIdxs, list):
            stringIdxs = set(range(len(self.word_starts)))
        else:
            stringIdxs = set(stringIdxs)
        deepestNode = self._find_lcs(self.root, stringIdxs)
        start = deepestNode.idx
        end = start + deepestNode.depth
        # A leaf's label runs to the end of the concatenated text, so the
        # result is cut before the terminal symbol of the string in which
        # the match starts. Nodes shared by several strings never reach a
        # terminal symbol and are unaffected.
        following = [s for s in self.word_starts if s > start]
        limit = (following[0] if following else len(self.word)) - 1
        return self.word[start:max(start, min(end, limit))]

    def _find_lcs(self, node, stringIdxs):
        """
        Helper method for longest common substring
        of the labelled Generalized suffix tree.

        Returns the deepest node below ``node`` whose label set contains
        all of ``stringIdxs``, or ``node`` itself when no child qualifies.
        """
        nodes = [self._find_lcs(n, stringIdxs)
                 for n in node.transition_links.values()
                 if n.generalized_idxs.issuperset(stringIdxs)]
        if not nodes:
            return node
        return max(nodes, key=lambda n: n.depth)

    def _generalized_word_starts(self, xs):
        """
        Helper method finding the starting indexes
        of strings in Generalized suffix tree.
        """
        self.word_starts = []
        offset = 0
        for x in xs:
            self.word_starts.append(offset)
            # One extra position for the terminal symbol after each string.
            offset += len(x) + 1

    def _locate(self, y):
        """
        Helper method that walks down from the root along ``y``.

        Returns the node at or below which ``y`` ends, or ``None`` when
        ``y`` does not occur in the text used for building the tree.
        """
        node = self.root
        matched = 0  # number of leading characters of y consumed so far
        while True:
            edge = self._edge_label(node, node.parent)
            rest = y[matched:]
            if edge.startswith(rest):
                return node
            # The remaining pattern has to cover the whole edge before
            # the walk can continue below this node.
            if not rest.startswith(edge):
                return None
            matched += len(edge)
            node = node._get_transition_link(y[matched])
            if not node:
                return None

    def find(self, y):
        """
        Finds the starting position of the substring y in the string used for
        building the Suffix tree.

        Parameters
        ==========

        y: str

        Returns
        =======

        Index of the starting position of string y in the string used for building the Suffix tree
        -1 if y is not a substring.
        """
        node = self._locate(y)
        if node is None:
            return -1
        return node.idx

    def find_all(self, y):
        """
        Finds all the starting positions of the substring y in the string
        used for building the Suffix tree.

        Parameters
        ==========

        y: str

        Returns
        =======

        Set of Index of the starting positions of string y in the string used for building the Suffix tree
        {} (an empty dict) if y is not a substring.
        """
        node = self._locate(y)
        if node is None:
            return {}
        return {leaf.idx for leaf in node._get_leaves()}

    def _edge_label(self, node, parent):
        """
        Helper method returns the edge label
        between a node and its parent.
        """
        return self.word[node.idx + parent.depth: node.idx + node.depth]

    def _terminal_symbols_generator(self):
        """
        Generator of unique terminal symbols used for building the Generalized Suffix Tree.
        Unicode Private Use Area is used to ensure that terminal symbols are not part
        of the input string.

        Raises ``ValueError`` once every symbol has been handed out.
        """
        private_use_ranges = (
            (0xE000, 0xF8FF),
            (0xF0000, 0xFFFFD),
            (0x100000, 0x10FFFD),
        )
        # Walk the ranges lazily instead of materialising every code point.
        for first, last in private_use_ranges:
            for code_point in range(first, last + 1):
                yield chr(code_point)
        raise ValueError("To many input strings.")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
from pydatastructs import SuffixTree | ||
from pydatastructs.utils.raises_util import raises | ||
import random, string | ||
|
||
def test_suffixtree():
    """
    Exercises substring lookup on a single-string tree, longest common
    substring on generalized trees, and rejection of invalid input.

    References
    ==========
    .. https://www.cise.ufl.edu/~sahni/dsaaj/enrich/c16/suffix.htm

    """
    # Tree over one string: substring queries only.
    single = SuffixTree("HelloworldHe")
    assert s_find(single, "Hel") == 0 if False else single.find("Hel") == 0
    assert single.find_all("He") == {0, 10}
    assert single.find("Win") == -1
    assert single.find_all("go") == {}
    assert raises(AttributeError, lambda: single.longest_common_substring())

    # Generalized tree: longest substring shared by every word.
    words = ['integer', 'inteinteger', 'integralerint', 'iaingerntier', 'regetnerireg', 'reger']
    generalized = SuffixTree(words)
    assert generalized.longest_common_substring() == 'er'

    # Same query with a non-list argument on a slightly different word list.
    other_words = ['integer', 'inteinteger', 'integralerint', 'iainegerntier', 'regetnerireg', 'reger']
    other_tree = SuffixTree(other_words)
    assert other_tree.longest_common_substring(7) == 'eg'

    # Input that is neither a str nor a list of str is rejected.
    assert raises(ValueError, lambda: SuffixTree(123))
    not_strings = (100, 1, 0)
    assert raises(ValueError, lambda: SuffixTree(not_strings))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,7 @@ | |
CartesianTreeNode, | ||
RedBlackTreeNode, | ||
TrieNode, | ||
SuffixNode, | ||
SkipNode | ||
) | ||
__all__.extend(misc_util.__all__) |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What if I want to pass a string as a query input?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LCS finds the longest common substring among the list of strings the tree was built from, so any new string to be checked has to be included when the tree instance is created. Or do you mean passing the character from which the LCS search should start?