From 94691fb3b8d013896cdadabc48e773e451752a80 Mon Sep 17 00:00:00 2001
From: Clare72 <cp390@cam.ac.uk>
Date: Fri, 20 Sep 2024 18:07:00 +0100
Subject: [PATCH 1/2] add get_cell_types_by_genes function

---
 src/vfb_connect/cross_server_tools.py | 98 ++++++++++++++++++++++++++-
 1 file changed, 97 insertions(+), 1 deletion(-)

diff --git a/src/vfb_connect/cross_server_tools.py b/src/vfb_connect/cross_server_tools.py
index c76c7190..6c52d404 100644
--- a/src/vfb_connect/cross_server_tools.py
+++ b/src/vfb_connect/cross_server_tools.py
@@ -715,7 +715,7 @@ def get_transcriptomic_profile(self, cell_type, gene_type=False, no_subtypes=Fal
                  "sd.link_base[0] + dbxd.accession[0] + sd.postfix[0] AS download_linkout, "
                  "g.label AS gene, g.short_form AS gene_id, "
                  "apoc.coll.subtract(labels(g), ['Class', 'Entity', 'hasScRNAseq', 'Feature', 'Gene']) AS function, "
-                 "e.expression_extent[0] as extent, toFloat(e.expression_level[0]) as level "
+                 "toFloat(e.expression_level[0]) as level, e.expression_extent[0] as extent "
                  "ORDER BY cell_type, g.label" % (gene_label, cell_type_short_form, equal_condition))
         r = self.nc.commit_list([query])
         dc = dict_cursor(r)
@@ -724,6 +724,102 @@ def get_transcriptomic_profile(self, cell_type, gene_type=False, no_subtypes=Fal
         else:
             return dc
 
+    def get_cell_types_by_genes(self, genes=None, gene_type=False, cell_type=None, query_by_label=True, return_dataframe=True):
+        """Get cell types that express a given gene, list of genes and/or type of gene based on transcriptomics data.
+
+        Returns a DataFrame of gene expression data for clusters of cells that express the specified gene(s).
+        Optionally query by gene_type, which can be retrieved using `get_gene_function_filters`.
+        At least one of genes or gene_type must be specified. If both are given, these have an additive effect.
+        With `return_dataframe=True` a DataFrame is always returned, even when no data is found.
+        Can optionally restrict the output to children of cell_type e.g. 'optic lobe intrinsic neuron'.
+
+        :param genes: Optional. A FlyBase gene label or FBgn ID (see `query_by_label`), or a list of these.
+        :param gene_type: Optional. A gene function label retrieved using `get_gene_function_filters` (can be a list).
+        :param cell_type: Optional. The ID, name, or symbol of a class in the Drosophila Anatomy Ontology (FBbt).
+        :param query_by_label: Optional. Query using cell type or gene labels if `True`, or IDs if `False`. Default `True`.
+        :param return_dataframe: Optional. Returns pandas DataFrame if `True`, otherwise returns list of dicts. Default `True`.
+        :return: A DataFrame of cell types and scRNAseq expression data associated with clusters that express the given gene(s).
+        :rtype: pandas.DataFrame or list of dicts
+        :raises ValueError: If neither genes nor gene_type is specified.
+        :raises KeyError: If the genes, gene_type or cell_type are invalid.
+        """
+
+        if not (genes or gene_type):
+            raise ValueError("At least one gene or gene_type must be specified.")
+
+        if isinstance(genes, str):
+            genes = [genes]
+        if isinstance(gene_type, str):
+            gene_type = [gene_type]
+
+        id_filter = []
+        if genes:
+            # self.lookup does not contain FBgns
+            FBgn_lookup = self.nc.get_lookup(limit_type_by_prefix='FBgn', include_individuals=False)
+            if query_by_label:
+                # this will be a bit broken until synonym unpacking is fixed
+                gene_short_forms = [FBgn_lookup.get(g, g) for g in genes]  # keep input if unmapped in case it is an ID
+            elif all(g in FBgn_lookup.values() for g in genes):
+                gene_short_forms = genes
+            else:
+                raise KeyError("genes must be a list of valid IDs from FlyBase.")
+            # The IDs are spliced into the Cypher text below, so require the exact FBgn<digits> form
+            # rather than just the prefix: unmapped input must not be able to carry quotes or brackets.
+            if not all(g.startswith('FBgn') and g[4:].isdigit() for g in gene_short_forms):
+                raise KeyError("genes must be a list of valid IDs, labels or symbols from FlyBase.")
+            id_filter = ["g.short_form IN ['%s']" % "','".join(gene_short_forms)]
+
+        type_filter = []
+        if gene_type:
+            # Fetch the valid labels once instead of once per requested gene_type.
+            valid_gene_types = self.get_gene_function_filters()
+            if any(t not in valid_gene_types for t in gene_type):
+                raise KeyError("gene_type must be a valid gene function label, try running get_gene_function_filters()")
+            type_filter = [f"g:{t}" for t in gene_type]
+
+        # OR-ing the conditions gives the additive effect of combining genes and gene_type.
+        gene_filter = ' OR '.join(type_filter + id_filter)
+
+        cell_type_filter = ''
+        if cell_type:
+            if query_by_label:
+                cell_type_short_form = self.lookup_id(cell_type)
+            elif cell_type in self.lookup.values():
+                cell_type_short_form = cell_type
+            else:
+                raise KeyError("cell_type must be a valid ID from the Drosophila Anatomy Ontology")
+
+            if not cell_type_short_form.startswith('FBbt'):
+                raise KeyError("cell_type must be a valid ID, label or symbol from the Drosophila Anatomy Ontology")
+
+            # Keep only clusters whose anatomy class is the given class or reaches it via SUBCLASSOF.
+            cell_type_filter = "MATCH (anat)-[:SUBCLASSOF*0..]->(:Class {short_form:'%s'}) " % cell_type_short_form
+
+        query = ("MATCH (g:Gene:Class) "
+                 "WHERE %s "
+                 "MATCH (g)<-[e:expresses]-(clus:Cluster:Individual)-[:composed_primarily_of]->(anat:Class) "
+                 "%s"
+                 "MATCH (clus)-[:part_of]->(:Individual)-[:has_part]->(sa:Sample:Individual) "
+                 "OPTIONAL MATCH (sa)-[:part_of]->(sex:Class) "
+                 "WHERE sex.short_form IN ['FBbt_00007011', 'FBbt_00007004'] "
+                 "OPTIONAL MATCH (sa)-[:overlaps]->(tis:Class:Anatomy) "
+                 "MATCH (clus)-[:has_source]->(ds:DataSet:Individual) "
+                 "OPTIONAL MATCH (ds)-[:has_reference]->(p:pub:Individual) "
+                 "RETURN anat.label AS cell_type, anat.short_form AS cell_type_id, "
+                 "g.label AS gene, g.short_form AS gene_id, "
+                 "apoc.coll.subtract(labels(g), ['Class', 'Entity', 'hasScRNAseq', 'Feature', 'Gene']) AS function, "
+                 "ds.short_form AS dataset_id, p.miniref[0] as ref, "
+                 "sex.label AS sample_sex, COLLECT(tis.label) AS sample_tissue, "
+                 "toFloat(e.expression_level[0]) as level, e.expression_extent[0] as extent "
+                 "ORDER BY cell_type, gene" % (gene_filter, cell_type_filter))
+
+        r = self.nc.commit_list([query])
+        dc = dict_cursor(r)
+        if return_dataframe:
+            return pd.DataFrame.from_records(dc)
+        return dc
+
+
     def get_neuron_pubs(self, neuron, include_subclasses=True, include_nlp=False,
                         query_by_label=True, verbose=False):
 

From 282e7731dc197727f3ece926f3ecd6a303de4807 Mon Sep 17 00:00:00 2001
From: Clare Pilgrim <38460997+Clare72@users.noreply.github.com>
Date: Fri, 20 Sep 2024 18:10:11 +0100
Subject: [PATCH 2/2] Update test_notebooks.yml

---
 .github/workflows/test_notebooks.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test_notebooks.yml b/.github/workflows/test_notebooks.yml
index 8231ac3b..93f18801 100644
--- a/.github/workflows/test_notebooks.yml
+++ b/.github/workflows/test_notebooks.yml
@@ -28,7 +28,7 @@ jobs:
           jupyter nbconvert --to notebook --execute --ExecutePreprocessor.timeout=None snippets/*.ipynb
 
 
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v4
         with:
           name: notebooks-for-${{ github.sha }}
           path: docs/tutorials