added return smoothed anndatas

IlyaLab · Jan 3, 2024 · 772664c · 772664c
1 parent 9605ed3
commit 772664c
Show file tree

Hide file tree

Showing 3 changed files with 90 additions and 1 deletion.
diff --git a/gssnng/score_cells.py b/gssnng/score_cells.py
@@ -229,7 +229,8 @@ def _proc_data(
         samp_neighbors: int,
         noise_trials: int,
         ranked: bool,
-        cores: int
+        cores: int,
+        return_data: int
                      ):
     """
     In many cases, the neighbors should be defined.  If you have mixed clinical endpoints,
@@ -247,6 +248,7 @@ def _proc_data(
     :param noise_trials: number of noisy samples to create, integer
     :param ranked: whether the gene expression counts should be rank ordered
     :param cores: number of parallel processes to work through groupby groups
+    :param return_data: should the smoothed data list be returned?
 
     :returns: scores in a dict for each cell in a list.
     """
@@ -284,6 +286,9 @@ def _proc_data(
     data_list = _build_data_list(adata, groupby, cats, recompute_neighbors, samp_neighbors, smooth_mode)
     # then we can start scoring cells #
 
+    if return_data == 1:
+        return(data_list)
+
     # building up the argument list for the parallel call of _score_all_cells_all_sets
     arglist = []
     for smoothed_adata, groupname in data_list:

diff --git a/gssnng/smooth_anndatas.py b/gssnng/smooth_anndatas.py
@@ -0,0 +1,55 @@
+import anndata
+from gssnng.score_cells import _proc_data
+#from gssnng.util import error_checking
+from typing import Union
+
+def smooth_anndata(
+        adata: anndata.AnnData,
+        groupby: Union[str, list, dict],
+        smooth_mode: str,
+        recompute_neighbors: int,
+        method_params: dict,
+        cores: int
+    ) -> list:
+
+    """
+    nearest neighbor smoothing of the expression matrix, returning the smoothed data (no scoring)
+
+    Thin wrapper around `_proc_data`: it passes `return_data = 1`, which makes
+    `_proc_data` return its smoothed data list right after building it, before
+    the argument list for the parallel scoring call is assembled.
+
+    :param adata
+        anndata.AnnData containing the cells to be smoothed
+    :param groupby
+        either a column label in adata.obs, and all categories taken, or a dict specifies one group.
+    :param smooth_mode
+        `adjacency` or `connectivity`, which representation of the neighborhood graph to use.
+        `adjacency` weights all neighbors equally, `connectivity` weights close neighbors more
+    :param recompute_neighbors
+        should neighbors be recomputed within each group, 0 for no, >0 for yes and specifies N
+    :param method_params
+        passed through to `_proc_data` unchanged; None is replaced by an empty dict.
+    :param cores
+        passed through to `_proc_data`, which documents it as the number of parallel processes
+
+    :returns: the data list produced by `_proc_data`; its scoring loop unpacks each
+        entry as a (smoothed_adata, groupname) pair.
+    """
+
+    return_data = 1  # ask _proc_data for the smoothed data list instead of scores
+    noise_trials = 0  ### not used currently
+    samp_neighbors = None
+
+    #error_checking2(adata, recompute_neighbors, method_params)  # UPDATE
+
+    if method_params is None:
+        method_params = dict()
+
+    # scoring-only arguments stay None (presumably unused before the early return -- confirm)
+    data_list = _proc_data(adata, None, groupby, smooth_mode, recompute_neighbors,
+                                  None, method_params, samp_neighbors,
+                                  noise_trials, None, cores, return_data)
+
+    print("**done**")
+    return data_list
diff --git a/gssnng/test/test_return_smoothed.py b/gssnng/test/test_return_smoothed.py
@@ -0,0 +1,29 @@
+if __name__ == '__main__':
+    # Manual smoke run: imports and helpers live under the guard, so importing this module is a no-op.
+    import time
+    import scanpy as sc
+    from gssnng.smooth_anndatas import smooth_anndata
+
+    def test_return_smoothed(adata):
+        """Call smooth_anndata on `adata` grouped by 'louvain' and hand back its result."""
+        smoothed = smooth_anndata(
+            adata=adata, groupby='louvain', smooth_mode='adjacency',
+            recompute_neighbors=32, method_params={}, cores=4,
+        )
+        return smoothed
+
+
+    def test_score_all_sets():
+        """Time one smoothing pass over the pbmc3k_processed dataset and print the result length."""
+        pbmc = sc.datasets.pbmc3k_processed()
+        started = time.time()
+        print(f'start time: {started}')
+        data_list = test_return_smoothed(pbmc)
+        print('******DONE*******')
+        finished = time.time()
+        print(f'end time: {finished}')
+        print(f'TOTAL TIME: {finished - started}')
+        print(len(data_list))
+
+    test_score_all_sets()
+    print('test done')