Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add get_cell_types_by_genes function #225

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 97 additions & 1 deletion src/vfb_connect/cross_server_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -715,7 +715,7 @@ def get_transcriptomic_profile(self, cell_type, gene_type=False, no_subtypes=Fal
"sd.link_base[0] + dbxd.accession[0] + sd.postfix[0] AS download_linkout, "
"g.label AS gene, g.short_form AS gene_id, "
"apoc.coll.subtract(labels(g), ['Class', 'Entity', 'hasScRNAseq', 'Feature', 'Gene']) AS function, "
"e.expression_extent[0] as extent, toFloat(e.expression_level[0]) as level "
"toFloat(e.expression_level[0]) as level, e.expression_extent[0] as extent "
"ORDER BY cell_type, g.label" % (gene_label, cell_type_short_form, equal_condition))
r = self.nc.commit_list([query])
dc = dict_cursor(r)
Expand All @@ -724,6 +724,102 @@ def get_transcriptomic_profile(self, cell_type, gene_type=False, no_subtypes=Fal
else:
return dc

def get_cell_types_by_genes(self, genes=None, gene_type=False, cell_type=None, query_by_label=True, return_dataframe=True):
"""Get cell types that express a given gene, list of genes and/or type of gene based on transcriptomics data.

Returns a DataFrame of gene expression data for clusters of cells that express the specified gene(s).
Optionally query by gene_type, which can be retrieved using `get_gene_function_filters`.
At least one of genes or gene_type must be specified. If both are given, these have an additive effect.
If no data is found, returns False.
Can optionally restrict the output to children of cell_type e.g. 'optic lobe intrinsic neuron'.

:param genes: Optional. A list of FlyBase gene (FBgn) IDs.
:param gene_type: Optional. A gene function label retrieved using `get_gene_function_filters` (can be a list).
:param cell_type: The ID, name, or symbol of a class in the Drosophila Anatomy Ontology (FBbt).
:param query_by_label: Optional. Query using cell type or gene labels if `True`, or IDs if `False`. Default `True`.
:param return_dataframe: Optional. Returns pandas DataFrame if `True`, otherwise returns list of dicts. Default `True`.
:return: A DataFrame of cell types and scRNAseq expression data associated with clusters that express the given gene(s).
:rtype: pandas.DataFrame or list of dicts
:raises KeyError: If the genes, gene_type or cell_type are invalid.
"""

if not (genes or gene_type):
raise ValueError("At least one gene or gene_type must be specified.")

if isinstance(genes, str):
genes = [genes]
if isinstance(gene_type, str):
gene_type = [gene_type]

if genes:
# self.lookup does not contain FBgns
FBgn_lookup = self.nc.get_lookup(limit_type_by_prefix='FBgn', include_individuals=False)
if query_by_label:
# this will be a bit broken until synonym unpacking is fixed
gene_short_forms = [FBgn_lookup.get(g, g) for g in genes] # keep input if unmapped in case it is an ID
else:
if all(g in FBgn_lookup.values() for g in genes):
gene_short_forms = genes
else:
raise KeyError("genes must be a list of valid IDs from FlyBase.")
if not all(g.startswith('FBgn') for g in gene_short_forms):
raise KeyError("genes must be a list of valid IDs, labels or symbols from FlyBase.")
gene_filter = ["g.short_form IN ['%s']" % "','".join(gene_short_forms)]
else:
gene_filter = []

if gene_type:
if any(g not in self.get_gene_function_filters() for g in gene_type):
raise KeyError("gene_type must be a valid gene function label, try running get_gene_function_filters()")
else:
type_filter = [f"g:{t}" for t in gene_type]
else:
type_filter = []

gene_filter = ' OR '.join(type_filter + gene_filter)

if cell_type:
if query_by_label:
cell_type_short_form = self.lookup_id(cell_type)
else:
if cell_type in self.lookup.values():
cell_type_short_form = cell_type
else:
raise KeyError("cell_type must be a valid ID from the Drosophila Anatomy Ontology")

if not cell_type_short_form.startswith('FBbt'):
raise KeyError("cell_type must be a valid ID, label or symbol from the Drosophila Anatomy Ontology")

cell_type_filter = "MATCH (anat)-[:SUBCLASSOF*0..]->(:Class {short_form:'%s'}) " % cell_type_short_form
else:
cell_type_filter = ''

query = ("MATCH (g:Gene:Class) "
"WHERE %s "
"MATCH (g)<-[e:expresses]-(clus:Cluster:Individual)-[:composed_primarily_of]->(anat:Class) "
"%s"
"MATCH (clus)-[:part_of]->(:Individual)-[:has_part]->(sa:Sample:Individual) "
"OPTIONAL MATCH (sa)-[:part_of]->(sex:Class) "
"WHERE sex.short_form IN ['FBbt_00007011', 'FBbt_00007004'] "
"OPTIONAL MATCH (sa)-[:overlaps]->(tis:Class:Anatomy) "
"MATCH (clus)-[:has_source]->(ds:DataSet:Individual) "
"OPTIONAL MATCH (ds)-[:has_reference]->(p:pub:Individual) "
"RETURN anat.label AS cell_type, anat.short_form AS cell_type_id, "
"g.label AS gene, g.short_form AS gene_id, "
"apoc.coll.subtract(labels(g), ['Class', 'Entity', 'hasScRNAseq', 'Feature', 'Gene']) AS function, "
"ds.short_form AS dataset_id, p.miniref[0] as ref, "
"sex.label AS sample_sex, COLLECT(tis.label) AS sample_tissue, "
"toFloat(e.expression_level[0]) as level, e.expression_extent[0] as extent "
"ORDER BY cell_type, gene" % (gene_filter, cell_type_filter))

r = self.nc.commit_list([query])
dc = dict_cursor(r)
if return_dataframe:
return pd.DataFrame.from_records(dc)
else:
return dc


def get_neuron_pubs(self, neuron, include_subclasses=True, include_nlp=False,
query_by_label=True, verbose=False):

Expand Down
Loading