Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Intake lists #52

Merged
merged 27 commits into from
May 6, 2021
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
cc6331e
Merge pull request #1 from cytomining/master
michaelbornholdt Apr 26, 2021
dda345e
Add list input to enrichment.py
michaelbornholdt Apr 26, 2021
470ef91
delete prints
michaelbornholdt Apr 26, 2021
a0f7030
Apply Black
michaelbornholdt Apr 26, 2021
9d0eb6f
modify test two new input
michaelbornholdt Apr 26, 2021
4b123b6
Change the input of precision_recall.py
michaelbornholdt Apr 26, 2021
1584d13
Black changes
michaelbornholdt Apr 26, 2021
d732dce
fix tests
michaelbornholdt Apr 27, 2021
86e5314
rerun the demo
michaelbornholdt Apr 27, 2021
a908730
also accept ints
michaelbornholdt Apr 30, 2021
03cb138
ints and floats also allowed
michaelbornholdt Apr 30, 2021
2b38adb
add further tests
michaelbornholdt Apr 30, 2021
d6a24f4
Black
michaelbornholdt Apr 30, 2021
60d8868
fix test
michaelbornholdt Apr 30, 2021
2f631da
Add Demo
michaelbornholdt Apr 30, 2021
25f58bc
Fix test
michaelbornholdt Apr 30, 2021
6d42cd2
change input to floats
michaelbornholdt May 4, 2021
bbdfe74
correct doc
michaelbornholdt May 4, 2021
06d46aa
Merge remote-tracking branch 'origin/intake_lists' into intake_lists
michaelbornholdt May 4, 2021
fc15e74
change input to floats
michaelbornholdt May 4, 2021
ad3754d
Merge remote-tracking branch 'origin/intake_lists' into intake_lists
michaelbornholdt May 4, 2021
c353179
More tests
michaelbornholdt May 4, 2021
9123f51
named percentile in enrichment.py
michaelbornholdt May 4, 2021
5fb4d24
update docstring
michaelbornholdt May 6, 2021
bc679fe
Update cytominer_eval/tests/test_operations/test_enrichment.py
michaelbornholdt May 6, 2021
990e14e
add comment for test
michaelbornholdt May 6, 2021
04be210
finalize test enrichment
michaelbornholdt May 6, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions cytominer_eval/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
enrichment,
)


gwaybio marked this conversation as resolved.
Show resolved Hide resolved
def evaluate(
profiles: pd.DataFrame,
features: List[str],
Expand All @@ -26,11 +27,11 @@ def evaluate(
similarity_metric: str = "pearson",
replicate_reproducibility_quantile: np.float = 0.95,
replicate_reproducibility_return_median_cor: bool = False,
precision_recall_k: int = 10,
precision_recall_k: Union[int, List[int]] = 10,
grit_control_perts: List[str] = ["None"],
grit_replicate_summary_method: str = "mean",
mp_value_params: dict = {},
enrichment_percentile: float = 0.5,
enrichment_percentile: Union[float, List[float]] = 0.99,
):
r"""Evaluate profile quality and strength.

Expand Down Expand Up @@ -85,7 +86,7 @@ def evaluate(
Only used when `operation='replicate_reproducibility'`. If True, then also
return pairwise correlations as defined by replicate_groups and
similarity metric
precision_recall_k : {10, ...}, optional
precision_recall_k : int or list of ints {10, ...}, optional
Only used when `operation='precision_recall'`. Used to calculate precision and
recall considering the top k profiles according to pairwise similarity.
grit_control_perts : {None, ...}, optional
Expand All @@ -100,7 +101,7 @@ def evaluate(
Only used when `operation='mp_value'`. A key, item pair of optional parameters
for calculating mp value. See also
:py:func:`cytominer_eval.operations.util.default_mp_value_parameters`
percentile : float, optional
percentile : float or list of floats, optional
michaelbornholdt marked this conversation as resolved.
Show resolved Hide resolved
Only used when `operation='enrichment'`. Determines the percentage of top connections
used for the enrichment calculation.
"""
Expand Down
76 changes: 43 additions & 33 deletions cytominer_eval/operations/enrichment.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""
import numpy as np
import pandas as pd
from typing import List
from typing import List, Union
import scipy

from .util import assign_replicates, calculate_grit, check_grit_replicate_summary_method
Expand All @@ -14,8 +14,10 @@


def enrichment(
similarity_melted_df: pd.DataFrame, replicate_groups: List[str], percentile: 0.9,
) -> dict:
similarity_melted_df: pd.DataFrame,
replicate_groups: List[str],
percentile: Union[float, List[float]],
) -> pd.DataFrame:
"""Calculate the enrichment score. This score is based on the fisher exact odds score. Similar to the other functions, the closest connections are determined and checked with the replicates.
This score effectively calculates how much better the distribution of correct connections is compared to random.

Expand All @@ -28,48 +30,56 @@ def enrichment(
replicate_groups : List
a list of metadata column names in the original profile dataframe to use as
replicate columns.
percentile : float
percentile : List of floats
Determines what percentage of top connections used for the enrichment calculation.

Returns
-------
dict
percentile, threshold, odds ratio and p value
"""
# threshold based on percentile of top connections
threshold = similarity_melted_df.similarity_metric.quantile(percentile)

result = []
replicate_truth_df = assign_replicates(
similarity_melted_df=similarity_melted_df, replicate_groups=replicate_groups
)
# calculate the individual components of the contingency tables
v11 = len(
replicate_truth_df.query(
"group_replicate==True and similarity_metric>@threshold"
# loop over all percentiles
if type(percentile) == float:
percentile = [percentile]
for p in percentile:
# threshold based on percentile of top connections
threshold = similarity_melted_df.similarity_metric.quantile(p)

# calculate the individual components of the contingency tables
v11 = len(
replicate_truth_df.query(
"group_replicate==True and similarity_metric>@threshold"
)
)
)
v12 = len(
replicate_truth_df.query(
"group_replicate==False and similarity_metric>@threshold"
v12 = len(
replicate_truth_df.query(
"group_replicate==False and similarity_metric>@threshold"
)
)
)
v21 = len(
replicate_truth_df.query(
"group_replicate==True and similarity_metric<=@threshold"
v21 = len(
replicate_truth_df.query(
"group_replicate==True and similarity_metric<=@threshold"
)
)
)
v22 = len(
replicate_truth_df.query(
"group_replicate==False and similarity_metric<=@threshold"
v22 = len(
replicate_truth_df.query(
"group_replicate==False and similarity_metric<=@threshold"
)
gwaybio marked this conversation as resolved.
Show resolved Hide resolved
)
)

v = np.asarray([[v11, v12], [v21, v22]])
r = scipy.stats.fisher_exact(v, alternative="greater")
result = {
"percentile": percentile,
"threshold": threshold,
"ods_ratio": r[0],
"p-value": r[1],
}
return result
v = np.asarray([[v11, v12], [v21, v22]])
r = scipy.stats.fisher_exact(v, alternative="greater")
result.append(
{
"enrichment_percentile": p,
"threshold": threshold,
"ods_ratio": r[0],
"p-value": r[1],
}
)
result_df = pd.DataFrame(result)
return result_df
21 changes: 13 additions & 8 deletions cytominer_eval/operations/precision_recall.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import numpy as np
import pandas as pd
from typing import List
from typing import List, Union

from .util import assign_replicates, calculate_precision_recall
from cytominer_eval.transform.util import set_pair_ids, assert_melt
Expand All @@ -13,7 +13,7 @@
def precision_recall(
similarity_melted_df: pd.DataFrame,
replicate_groups: List[str],
k: int,
k: Union[int, List[int]],
) -> pd.DataFrame:
"""Determine the precision and recall at k for all unique replicate groups
based on a predefined similarity metric (see cytominer_eval.transform.metric_melt)
Expand All @@ -27,7 +27,7 @@ def precision_recall(
replicate_groups : List
a list of metadata column names in the original profile dataframe to use as
replicate columns.
k : int
k : List of ints or int
an integer indicating how many pairwise comparisons to threshold.

Returns
Expand All @@ -49,11 +49,16 @@ def precision_recall(
"{x}{suf}".format(x=x, suf=pair_ids[list(pair_ids)[0]]["suffix"])
for x in replicate_groups
]

# Calculate precision and recall for all groups
precision_recall_df = similarity_melted_df.groupby(replicate_group_cols).apply(
lambda x: calculate_precision_recall(x, k=k)
)
# iterate over all k
precision_recall_df = pd.DataFrame()
if type(k) == int:
k = [k]
for k_ in k:
# Calculate precision and recall for all groups
precision_recall_df_at_k = similarity_melted_df.groupby(
replicate_group_cols
).apply(lambda x: calculate_precision_recall(x, k=k_))
precision_recall_df = precision_recall_df.append(precision_recall_df_at_k)

# Rename the columns back to the replicate groups provided
rename_cols = dict(zip(replicate_group_cols, replicate_groups))
Expand Down
16 changes: 4 additions & 12 deletions cytominer_eval/tests/test_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,7 @@ def test_evaluate_replicate_reprod_return_cor_true():

assert np.round(med_cor_df.similarity_metric.max(), 3) == 0.949
assert sorted(med_cor_df.columns.tolist()) == sorted(
[
"Metadata_gene_name",
"Metadata_pert_name",
"similarity_metric",
]
["Metadata_gene_name", "Metadata_pert_name", "similarity_metric",]
)


Expand Down Expand Up @@ -160,7 +156,7 @@ def test_evaluate_precision_recall():
replicate_groups=["Metadata_broad_sample"],
operation="precision_recall",
similarity_metric="pearson",
precision_recall_k=k,
precision_recall_k=[k],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you try one of these without a list? (I mean either line 144 or line 163.)

It should work either way

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a quick comment on line 151 noting the difference between evaluate on line 152 and evaluate on line 137?

For future changes to this code, it will be important to note that we're also testing the list/int capability of this argument

)

assert (
Expand Down Expand Up @@ -205,9 +201,7 @@ def test_evaluate_grit():
top_result = (
grit_results_df.sort_values(by="grit", ascending=False)
.reset_index(drop=True)
.iloc[
0,
]
.iloc[0,]
)
assert np.round(top_result.grit, 4) == 2.3352
assert top_result.group == "PTK2"
Expand All @@ -233,9 +227,7 @@ def test_evaluate_grit():
top_result = (
grit_results_df.sort_values(by="grit", ascending=False)
.reset_index(drop=True)
.iloc[
0,
]
.iloc[0,]
)

assert np.round(top_result.grit, 4) == 0.9990
Expand Down
30 changes: 14 additions & 16 deletions cytominer_eval/tests/test_operations/test_enrichment.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,37 +42,35 @@


def test_enrichment():
result = []
for p in np.arange(1, 0.97, -0.005):
r = enrichment(
similarity_melted_df=similarity_melted_df,
replicate_groups=replicate_groups,
percentile=p,
)
result.append(r)
result_df = pd.DataFrame(result)
percent_list = np.arange(1, 0.97, -0.005)
result = enrichment(
similarity_melted_df=similarity_melted_df,
replicate_groups=replicate_groups,
percentile=percent_list,
)

# check for correct shape and starts with 1.0
assert result_df.shape == (7, 4)
assert result_df.percentile[0] == 1.0
assert result.shape == (7, 4)
assert result.enrichment_percentile[0] == 1.0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a check for the second element in this Series?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure what you mean

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assert that result.enrichment_percentile[1] equals what you'd calculate by hand.

assert result.enrichment_percentile[1] == 0.995
# check if the higher percentiles are larger than the small one
assert result_df.percentile[1] > result_df.percentile.iloc[-1]
assert result.enrichment_percentile[1] > result.enrichment_percentile.iloc[-1]
michaelbornholdt marked this conversation as resolved.
Show resolved Hide resolved


def test_compare_functions():
percentile = 0.9
percent_list = [0.95, 0.9]
eval_res = evaluate(
profiles=df,
features=features,
meta_features=meta_features,
replicate_groups=replicate_groups,
operation="enrichment",
similarity_metric="pearson",
enrichment_percentile=percentile,
enrichment_percentile=percent_list,
)
enr_res = enrichment(
similarity_melted_df=similarity_melted_df,
replicate_groups=replicate_groups,
percentile=percentile,
percentile=percent_list,
)
assert enr_res == eval_res
assert enr_res.equals(eval_res)
gwaybio marked this conversation as resolved.
Show resolved Hide resolved
20 changes: 14 additions & 6 deletions cytominer_eval/tests/test_operations/test_precision_recall.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,22 +39,30 @@


def test_precision_recall():
result = precision_recall(
result_list = precision_recall(
similarity_melted_df=similarity_melted_df,
replicate_groups=replicate_groups,
k=10,
k=[5, 10],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you also keep a test for k as an int?

This test is good, but maybe add a test asserting that the outputs are the same for k=10 and k=[10]

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

)

assert len(result.k.unique()) == 1
assert result.k.unique()[0] == 10
result_int = precision_recall(
similarity_melted_df=similarity_melted_df,
replicate_groups=replicate_groups,
k=5,
)

assert len(result_list.k.unique()) == 2
assert result_list.k.unique()[0] == 5

# ITGAV has a really strong profile
assert (
result.sort_values(by="recall", ascending=False)
result_list.sort_values(by="recall", ascending=False)
.reset_index(drop=True)
.iloc[0, :]
.Metadata_gene_name
== "ITGAV"
)

assert all(x in result.columns for x in replicate_groups)
assert all(x in result_list.columns for x in replicate_groups)

assert result_int.equals(result_list.query("k == 5"))
8 changes: 7 additions & 1 deletion cytominer_eval/transform/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,13 @@

def get_available_eval_metrics():
r"""Output the available eval metrics in the cytominer_eval library"""
return ["replicate_reproducibility", "precision_recall", "grit", "mp_value", "enrichment"]
return [
"replicate_reproducibility",
"precision_recall",
"grit",
"mp_value",
"enrichment",
]


def get_available_similarity_metrics():
Expand Down
Loading