Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update metaclusterers.py #42

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions modisco/metaclusterers.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,8 @@ def fit(self, seqlets):
self.get_vector_from_seqlet(x)
for x in seqlets]))
self._fit(attribute_vectors)
self.fit_called = True
self.fit_called = True
return self

def _fit(self, attribute_vectors):
raise NotImplementedError()
Expand Down Expand Up @@ -159,6 +160,11 @@ def weak_vector_to_pattern(self, vector):
assert False
return to_return

def get_all_possible_compatible_patterns(self, pattern):
    """Enumerate the patterns obtained by zeroing out entries of `pattern`.

    Each position of `pattern` is either kept as-is or replaced by 0, and
    every combination of those per-position choices is returned exactly
    once.

    Args:
        pattern: iterable of per-position values (0 marks a position that
            is already zeroed out).

    Returns:
        A list of tuples, each the same length as `pattern`. The first
        tuple is `pattern` itself and the last is all zeros.
    """
    # A position that is already 0 has only one possible value. Offering
    # (0, 0) there would make itertools.product emit every resulting
    # pattern twice per zero position, and callers that do per-pattern
    # bookkeeping would then count the same vector multiple times.
    per_position_choices = [(x, 0) if x != 0 else (0,) for x in pattern]
    return list(itertools.product(*per_position_choices))

def check_pattern_compatibility(self, pattern_to_check, reference_pattern):
return all([(pattern_elem==reference_elem or reference_elem==0)
for pattern_elem, reference_elem
Expand Down Expand Up @@ -287,17 +293,15 @@ def save_hdf5(self, grp):

def _fit(self, attribute_vectors):

all_possible_activity_patterns =\
list(itertools.product(*[(1,-1,0) for x
in range(attribute_vectors.shape[1])]))
all_possible_activity_patterns = set()

activity_pattern_to_attribute_vectors = defaultdict(list)
for vector in attribute_vectors:
vector_activity_pattern = self.vector_to_pattern(vector)
compatible_activity_patterns =\
self.get_compatible_patterns(
vector_activity_pattern, all_possible_activity_patterns)
self.get_all_possible_compatible_patterns(vector_activity_pattern)
for compatible_activity_pattern in compatible_activity_patterns:
all_possible_activity_patterns.add(compatible_activity_pattern)
activity_pattern_to_attribute_vectors[
self.pattern_to_str(
compatible_activity_pattern)].append(vector)
Expand Down
21 changes: 15 additions & 6 deletions modisco/tfmodisco_workflow/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,8 @@ def __init__(self,
max_passing_windows_frac=0.2,
separate_pos_neg_thresholds=False,
verbose=True,
min_seqlets_per_task=None):
min_seqlets_per_task=None,
max_seqlets_during_metacluster_fit=np.inf):

if (min_seqlets_per_task is not None):
raise DeprecationWarning(
Expand All @@ -180,6 +181,7 @@ def __init__(self,
self.max_passing_windows_frac = max_passing_windows_frac
self.separate_pos_neg_thresholds = separate_pos_neg_thresholds
self.verbose = verbose
self.max_seqlets_during_metacluster_fit = max_seqlets_during_metacluster_fit

self.build()

Expand Down Expand Up @@ -248,11 +250,12 @@ def __call__(self, task_names, contrib_scores,
+" Consider dropping target_seqlet_fdr")


if int(self.min_metacluster_size_frac * len(seqlets)) > self.min_metacluster_size:
print("min_metacluster_size_frac * len(seqlets) = {0} is more than min_metacluster_size={1}.".\
format(int(self.min_metacluster_size_frac * len(seqlets)), self.min_metacluster_size))
if int(self.min_metacluster_size_frac
* min(len(seqlets),self.max_seqlets_during_metacluster_fit)) > self.min_metacluster_size:
print("min_metacluster_size_frac * min(len(seqlets),self.max_seqlets_during_metacluster_fit) = {0} is more than min_metacluster_size={1}.".\
format(int(self.min_metacluster_size_frac * min(len(seqlets),self.max_seqlets_during_metacluster_fit)), self.min_metacluster_size))
print("Using it as a new min_metacluster_size")
self.min_metacluster_size = int(self.min_metacluster_size_frac * len(seqlets))
self.min_metacluster_size = int(self.min_metacluster_size_frac * min(len(seqlets),self.max_seqlets_during_metacluster_fit))


if (self.weak_threshold_for_counting_sign is None):
Expand Down Expand Up @@ -288,7 +291,13 @@ def __call__(self, task_names, contrib_scores,
weak_threshold_for_counting_sign=
weak_threshold_for_counting_sign)

metaclustering_results = metaclusterer.fit_transform(seqlets)
if (len(seqlets) > self.max_seqlets_during_metacluster_fit):
indices = np.random.RandomState(1234).choice(
a=len(seqlets), size=self.max_seqlets_during_metacluster_fit, replace=False)
seqlets_to_metacluster = [seqlets[x] for x in indices]
else:
seqlets_to_metacluster = seqlets
metaclustering_results = metaclusterer.fit(seqlets_to_metacluster).transform(seqlets)
metacluster_indices = np.array(
metaclustering_results.metacluster_indices)
metacluster_idx_to_activity_pattern =\
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
description='TF MOtif Discovery from Importance SCOres',
long_description="""Algorithm for discovering consolidated patterns from base-pair-level importance scores""",
url='https://github.com/kundajelab/tfmodisco',
version='0.5.1.2',
version='0.5.1.3',
packages=find_packages(),
package_data={
'': ['cluster/phenograph/louvain/*convert*', 'cluster/phenograph/louvain/*community*', 'cluster/phenograph/louvain/*hierarchy*']
Expand Down