Intake lists (#52)

* Add list input to enrichment.py
* delete prints
* Apply Black
* modify test two new input
* Change the input of precision_recall.py
* Black changes
* fix tests
* rerun the demo
* also accept ints

Co-authored-by: Greg Way <[email protected]>

* ints and floats also allowed
* add further tests
* Black
* fix test
* Add Demo
* Fix test
* change input to floats

Co-authored-by: Greg Way <[email protected]>

* correct doc
* change input to floats

Co-authored-by: Greg Way <[email protected]>

* More tests
* named percentile in enrichment.py
* update docstring

Co-authored-by: Greg Way <[email protected]>

* Update cytominer_eval/tests/test_operations/test_enrichment.py

Co-authored-by: Greg Way <[email protected]>

* add comment for test
* finalize test enrichment

Co-authored-by: Greg Way <[email protected]>
cytomining · May 6, 2021 · 7f94b11 · 7f94b11
1 parent 220b296
commit 7f94b11
Show file tree

Hide file tree

Showing 8 changed files with 241 additions and 178 deletions.
diff --git a/cytominer_eval/evaluate.py b/cytominer_eval/evaluate.py
@@ -17,6 +17,7 @@
     enrichment,
 )
 
+
 def evaluate(
     profiles: pd.DataFrame,
     features: List[str],
@@ -26,11 +27,11 @@ def evaluate(
     similarity_metric: str = "pearson",
     replicate_reproducibility_quantile: np.float = 0.95,
     replicate_reproducibility_return_median_cor: bool = False,
-    precision_recall_k: int = 10,
+    precision_recall_k: Union[int, List[int]] = 10,
     grit_control_perts: List[str] = ["None"],
     grit_replicate_summary_method: str = "mean",
     mp_value_params: dict = {},
-    enrichment_percentile: float = 0.5,
+    enrichment_percentile: Union[float, List[float]] = 0.99,
 ):
     r"""Evaluate profile quality and strength.
 
@@ -85,7 +86,7 @@ def evaluate(
         Only used when `operation='replicate_reproducibility'`. If True, then also
         return pairwise correlations as defined by replicate_groups and
         similarity metric
-    precision_recall_k : {10, ...}, optional
+    precision_recall_k : int or list of ints {10, ...}, optional
         Only used when `operation='precision_recall'`. Used to calculate precision and
         recall considering the top k profiles according to pairwise similarity.
     grit_control_perts : {None, ...}, optional
@@ -100,7 +101,7 @@ def evaluate(
         Only used when `operation='mp_value'`. A key, item pair of optional parameters
         for calculating mp value. See also
         :py:func:`cytominer_eval.operations.util.default_mp_value_parameters`
-    percentile : float, optional
+    enrichment_percentile : float or list of floats, optional
         Only used when `operation='enrichment'`. Determines the percentage of top connections
         used for the enrichment calculation.
     """

diff --git a/cytominer_eval/operations/enrichment.py b/cytominer_eval/operations/enrichment.py
@@ -2,7 +2,7 @@
 """
 import numpy as np
 import pandas as pd
-from typing import List
+from typing import List, Union
 import scipy
 
 from .util import assign_replicates, calculate_grit, check_grit_replicate_summary_method
@@ -14,8 +14,10 @@
 
 
 def enrichment(
-    similarity_melted_df: pd.DataFrame, replicate_groups: List[str], percentile: 0.9,
-) -> dict:
+    similarity_melted_df: pd.DataFrame,
+    replicate_groups: List[str],
+    percentile: Union[float, List[float]],
+) -> pd.DataFrame:
     """Calculate the enrichment score. This score is based on the fisher exact odds score. Similar to the other functions, the closest connections are determined and checked with the replicates.
     This score effectively calculates how much better the distribution of correct connections is compared to random.
 
@@ -28,48 +30,56 @@ def enrichment(
     replicate_groups : List
         a list of metadata column names in the original profile dataframe to use as
         replicate columns.
-    percentile :  float
+    percentile : float or list of floats
         Determines what percentage of top connections used for the enrichment calculation.
 
     Returns
     -------
     dict
         percentile, threshold, odds ratio and p value
     """
-    # threshold based on percentile of top connections
-    threshold = similarity_melted_df.similarity_metric.quantile(percentile)
-
+    result = []
     replicate_truth_df = assign_replicates(
         similarity_melted_df=similarity_melted_df, replicate_groups=replicate_groups
     )
-    # calculate the individual components of the contingency tables
-    v11 = len(
-        replicate_truth_df.query(
-            "group_replicate==True and similarity_metric>@threshold"
+    # loop over all percentiles
+    if type(percentile) == float:
+        percentile = [percentile]
+    for p in percentile:
+        # threshold based on percentile of top connections
+        threshold = similarity_melted_df.similarity_metric.quantile(p)
+
+        # calculate the individual components of the contingency tables
+        v11 = len(
+            replicate_truth_df.query(
+                "group_replicate==True and similarity_metric>@threshold"
+            )
         )
-    )
-    v12 = len(
-        replicate_truth_df.query(
-            "group_replicate==False and similarity_metric>@threshold"
+        v12 = len(
+            replicate_truth_df.query(
+                "group_replicate==False and similarity_metric>@threshold"
+            )
         )
-    )
-    v21 = len(
-        replicate_truth_df.query(
-            "group_replicate==True and similarity_metric<=@threshold"
+        v21 = len(
+            replicate_truth_df.query(
+                "group_replicate==True and similarity_metric<=@threshold"
+            )
         )
-    )
-    v22 = len(
-        replicate_truth_df.query(
-            "group_replicate==False and similarity_metric<=@threshold"
+        v22 = len(
+            replicate_truth_df.query(
+                "group_replicate==False and similarity_metric<=@threshold"
+            )
         )
-    )
 
-    v = np.asarray([[v11, v12], [v21, v22]])
-    r = scipy.stats.fisher_exact(v, alternative="greater")
-    result = {
-        "percentile": percentile,
-        "threshold": threshold,
-        "ods_ratio": r[0],
-        "p-value": r[1],
-    }
-    return result
+        v = np.asarray([[v11, v12], [v21, v22]])
+        r = scipy.stats.fisher_exact(v, alternative="greater")
+        result.append(
+            {
+                "enrichment_percentile": p,
+                "threshold": threshold,
+                "ods_ratio": r[0],
+                "p-value": r[1],
+            }
+        )
+    result_df = pd.DataFrame(result)
+    return result_df
diff --git a/cytominer_eval/operations/precision_recall.py b/cytominer_eval/operations/precision_recall.py
@@ -4,7 +4,7 @@
 
 import numpy as np
 import pandas as pd
-from typing import List
+from typing import List, Union
 
 from .util import assign_replicates, calculate_precision_recall
 from cytominer_eval.transform.util import set_pair_ids, assert_melt
@@ -13,7 +13,7 @@
 def precision_recall(
     similarity_melted_df: pd.DataFrame,
     replicate_groups: List[str],
-    k: int,
+    k: Union[int, List[int]],
 ) -> pd.DataFrame:
     """Determine the precision and recall at k for all unique replicate groups
     based on a predefined similarity metric (see cytominer_eval.transform.metric_melt)
@@ -27,7 +27,7 @@ def precision_recall(
     replicate_groups : List
         a list of metadata column names in the original profile dataframe to use as
         replicate columns.
-    k : int
+    k : int or list of ints
         an integer indicating how many pairwise comparisons to threshold.
 
     Returns
@@ -49,11 +49,16 @@ def precision_recall(
         "{x}{suf}".format(x=x, suf=pair_ids[list(pair_ids)[0]]["suffix"])
         for x in replicate_groups
     ]
-
-    # Calculate precision and recall for all groups
-    precision_recall_df = similarity_melted_df.groupby(replicate_group_cols).apply(
-        lambda x: calculate_precision_recall(x, k=k)
-    )
+    # iterate over all k
+    precision_recall_df = pd.DataFrame()
+    if type(k) == int:
+        k = [k]
+    for k_ in k:
+        # Calculate precision and recall for all groups
+        precision_recall_df_at_k = similarity_melted_df.groupby(
+            replicate_group_cols
+        ).apply(lambda x: calculate_precision_recall(x, k=k_))
+        precision_recall_df = precision_recall_df.append(precision_recall_df_at_k)
 
     # Rename the columns back to the replicate groups provided
     rename_cols = dict(zip(replicate_group_cols, replicate_groups))

diff --git a/cytominer_eval/tests/test_evaluate.py b/cytominer_eval/tests/test_evaluate.py
@@ -111,11 +111,7 @@ def test_evaluate_replicate_reprod_return_cor_true():
 
     assert np.round(med_cor_df.similarity_metric.max(), 3) == 0.949
     assert sorted(med_cor_df.columns.tolist()) == sorted(
-        [
-            "Metadata_gene_name",
-            "Metadata_pert_name",
-            "similarity_metric",
-        ]
+        ["Metadata_gene_name", "Metadata_pert_name", "similarity_metric",]
     )
 
 
@@ -134,6 +130,7 @@ def test_evaluate_precision_recall():
 
     for k in ks:
 
+        # first test the function with k = int, later we test with k = list of ints
         result = evaluate(
             profiles=gene_profiles,
             features=gene_features,
@@ -152,15 +149,15 @@ def test_evaluate_precision_recall():
             result.query("recall == 1").shape[0]
             == expected_result["gene"]["recall"][str(k)]
         )
-
+        # test the function with k passed as a single-element list of ints; this should give the same result as passing the bare int
         result = evaluate(
             profiles=compound_profiles,
             features=compound_features,
             meta_features=compound_meta_features,
             replicate_groups=["Metadata_broad_sample"],
             operation="precision_recall",
             similarity_metric="pearson",
-            precision_recall_k=k,
+            precision_recall_k=[k],
         )
 
         assert (
@@ -205,9 +202,7 @@ def test_evaluate_grit():
     top_result = (
         grit_results_df.sort_values(by="grit", ascending=False)
         .reset_index(drop=True)
-        .iloc[
-            0,
-        ]
+        .iloc[0,]
     )
     assert np.round(top_result.grit, 4) == 2.3352
     assert top_result.group == "PTK2"
@@ -233,9 +228,7 @@ def test_evaluate_grit():
     top_result = (
         grit_results_df.sort_values(by="grit", ascending=False)
         .reset_index(drop=True)
-        .iloc[
-            0,
-        ]
+        .iloc[0,]
     )
 
     assert np.round(top_result.grit, 4) == 0.9990

diff --git a/cytominer_eval/tests/test_operations/test_enrichment.py b/cytominer_eval/tests/test_operations/test_enrichment.py
@@ -42,37 +42,43 @@
 
 
 def test_enrichment():
-    result = []
-    for p in np.arange(1, 0.97, -0.005):
-        r = enrichment(
-            similarity_melted_df=similarity_melted_df,
-            replicate_groups=replicate_groups,
-            percentile=p,
-        )
-        result.append(r)
-    result_df = pd.DataFrame(result)
+    percent_list = np.arange(1, 0.97, -0.005)
+    result = enrichment(
+        similarity_melted_df=similarity_melted_df,
+        replicate_groups=replicate_groups,
+        percentile=percent_list,
+    )
 
     # check for correct shape and starts with 1.0
-    assert result_df.shape == (7, 4)
-    assert result_df.percentile[0] == 1.0
+    assert result.shape == (7, 4)
+    assert result.enrichment_percentile[0] == 1.0
+    assert result.enrichment_percentile[1] == 0.995
     # check if the higher percentiles are larger than the small one
-    assert result_df.percentile[1] > result_df.percentile.iloc[-1]
+    assert result.enrichment_percentile[1] > result.enrichment_percentile.iloc[-1]
+
+    result_int = enrichment(
+        similarity_melted_df=similarity_melted_df,
+        replicate_groups=replicate_groups,
+        percentile=0.97,
+    )
+
+    assert result_int.enrichment_percentile[0] == result.enrichment_percentile.iloc[-1]
 
 
 def test_compare_functions():
-    percentile = 0.9
+    percent_list = [0.95, 0.9]
     eval_res = evaluate(
         profiles=df,
         features=features,
         meta_features=meta_features,
         replicate_groups=replicate_groups,
         operation="enrichment",
         similarity_metric="pearson",
-        enrichment_percentile=percentile,
+        enrichment_percentile=percent_list,
     )
     enr_res = enrichment(
         similarity_melted_df=similarity_melted_df,
         replicate_groups=replicate_groups,
-        percentile=percentile,
+        percentile=percent_list,
     )
-    assert enr_res == eval_res
+    assert enr_res.equals(eval_res)
diff --git a/cytominer_eval/tests/test_operations/test_precision_recall.py b/cytominer_eval/tests/test_operations/test_precision_recall.py
@@ -39,22 +39,30 @@
 
 
 def test_precision_recall():
-    result = precision_recall(
+    result_list = precision_recall(
         similarity_melted_df=similarity_melted_df,
         replicate_groups=replicate_groups,
-        k=10,
+        k=[5, 10],
     )
 
-    assert len(result.k.unique()) == 1
-    assert result.k.unique()[0] == 10
+    result_int = precision_recall(
+        similarity_melted_df=similarity_melted_df,
+        replicate_groups=replicate_groups,
+        k=5,
+    )
+
+    assert len(result_list.k.unique()) == 2
+    assert result_list.k.unique()[0] == 5
 
     # ITGAV has a really strong profile
     assert (
-        result.sort_values(by="recall", ascending=False)
+        result_list.sort_values(by="recall", ascending=False)
         .reset_index(drop=True)
         .iloc[0, :]
         .Metadata_gene_name
         == "ITGAV"
     )
 
-    assert all(x in result.columns for x in replicate_groups)
+    assert all(x in result_list.columns for x in replicate_groups)
+
+    assert result_int.equals(result_list.query("k == 5"))
diff --git a/cytominer_eval/transform/util.py b/cytominer_eval/transform/util.py
@@ -7,7 +7,13 @@
 
 def get_available_eval_metrics():
     r"""Output the available eval metrics in the cytominer_eval library"""
-    return ["replicate_reproducibility", "precision_recall", "grit", "mp_value", "enrichment"]
+    return [
+        "replicate_reproducibility",
+        "precision_recall",
+        "grit",
+        "mp_value",
+        "enrichment",
+    ]
 
 
 def get_available_similarity_metrics():