Merge branch 'master' into pubs_query_fix
dosumis authored Sep 5, 2024
2 parents 39af777 + d962e47 commit db7112e
Showing 4 changed files with 64 additions and 10 deletions.
2 changes: 2 additions & 0 deletions requirements.txt
@@ -18,3 +18,5 @@ jsonpath_rw
bottleneck<=1.3.6
matplotlib>3.9
seaborn>0.13
fonttools>=4.43.0 # not directly required, pinned by Snyk to avoid a vulnerability
pillow>=10.3.0 # not directly required, pinned by Snyk to avoid a vulnerability
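
A quick, hedged way to confirm the new pins are honoured in an installed environment (not part of the commit; assumes the standard-library importlib.metadata plus the separately installed packaging distribution):

# Illustrative check only, not part of this commit: confirm the Snyk-pinned
# transitive dependencies resolve to compliant versions in the current environment.
from importlib.metadata import version
from packaging.version import Version  # assumes `packaging` is installed

assert Version(version("fonttools")) >= Version("4.43.0")
assert Version(version("pillow")) >= Version("10.3.0")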
2 changes: 1 addition & 1 deletion src/vfb_connect.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: vfb_connect
Version: 2.2.5.dev6+8aacf1a
Version: 2.2.7.dev7+f46d531.dirty
Summary: Wrapper for querying VirtualFlyBrain servers.
Home-page: https://github.com/VirtualFlyBrain/VFB_connect
Author: David Osumi-Sutherland
68 changes: 60 additions & 8 deletions src/vfb_connect/cross_server_tools.py
@@ -95,6 +95,7 @@ def __init__(self, neo_endpoint=get_default_servers()['neo_endpoint'],
self.nc = Neo4jConnect(**connections['neo'])
self.neo_query_wrapper = QueryWrapper(**connections['neo'])
self.cache_file = self.get_cache_file_path()
self._dbs_cache = {}
self.lookup = self.nc.get_lookup(cache=self.cache_file)
self.normalized_lookup = self.preprocess_lookup()
self.reverse_lookup = {v: k for k, v in self.lookup.items()}
@@ -699,10 +700,16 @@ def get_transcriptomic_profile(self, cell_type, gene_type=False, no_subtypes=Fal
"OPTIONAL MATCH (sa)-[:overlaps]->(tis:Class:Anatomy) "
"OPTIONAL MATCH (clus)-[:has_source]->(ds:DataSet:Individual) "
"OPTIONAL MATCH (ds)-[:has_reference]->(p:pub:Individual) "
"OPTIONAL MATCH (ds)-[dbx:database_cross_reference]->(s:Site:Individual) "
"OPTIONAL MATCH (ds)-[dbxw:database_cross_reference]->(sw:Site:Individual "
"{short_form:'scExpressionAtlas'}) "
"OPTIONAL MATCH (ds)-[dbxd:database_cross_reference]->(sd:Site:Individual "
"{short_form:'scExpressionAtlasFTP'}) WHERE dbxd.accession[0] = dbxw.accession[0] "
"RETURN DISTINCT c2.label AS cell_type, c2.short_form AS cell_type_id, "
"sex.label AS sample_sex, COLLECT(tis.label) AS sample_tissue, "
"p.miniref[0] as ref, g.label AS gene, g.short_form AS gene_id, "
"ds.short_form AS dataset_id, p.miniref[0] as ref, "
"sw.link_base[0] + dbxw.accession[0] AS website_linkout, "
"sd.link_base[0] + dbxd.accession[0] + sd.postfix[0] AS download_linkout, "
"g.label AS gene, g.short_form AS gene_id, "
"apoc.coll.subtract(labels(g), ['Class', 'Entity', 'hasScRNAseq', 'Feature', 'Gene']) AS function, "
"e.expression_extent[0] as extent, toFloat(e.expression_level[0]) as level "
"ORDER BY cell_type, g.label" % (gene_label, cell_type_short_form, equal_condition))
@@ -907,15 +914,56 @@ def vfb_id_2_xrefs(self, vfb_id: iter, db='', id_type='', reverse_return=False):
"""
return self.neo_query_wrapper.vfb_id_2_xrefs(vfb_id=vfb_id, db=db, id_type=id_type, reverse_return=reverse_return)

def get_dbs(self, include_symbols=True):
"""Get all external databases in the database.
def get_dbs(self, include_symbols=True, data_sources_only=True, verbose=False):
"""Get all external databases in the database, optionally filtering by data sources and including symbols.
:return: List of external databases in the database.
:param include_symbols: If True, include the symbols of the databases.
:type include_symbols: bool
:param data_sources_only: If True, only include databases where is_data_source=True.
:type data_sources_only: bool
:return: List of external databases and optionally their symbols.
:rtype: list
"""
if not self._dbs:
self._dbs = self.neo_query_wrapper.get_dbs(include_symbols=include_symbols)
return self._dbs
# Create a cache key based on the options to ensure unique cache for each option set
cache_key = (include_symbols, data_sources_only)

# Check if the result is already cached
if cache_key in self._dbs_cache and self._dbs_cache[cache_key]:
print("Returning cached results") if verbose else None
return self._dbs_cache[cache_key]

print("Querying for external database ids") if verbose else None
# Base query to get all databases, filtering for data sources if needed
query = "MATCH (i:Individual) "
if data_sources_only:
query += "WHERE i.is_data_source=[True] AND (i:Site OR i:API) "
else:
query += "WHERE i:Site OR i:API "
query += "RETURN i.short_form as id"

# Execute the query
print("Querying for external database ids:", query) if verbose else None
results = self.cypher_query(query, return_dataframe=False, verbose=verbose)
dbs = [d['id'] for d in results]

# Optionally include symbols
if include_symbols:
print("Querying for external database symbols") if verbose else None
symbol_query = "MATCH (i:Individual) "
if data_sources_only:
symbol_query += "WHERE i.is_data_source=[True] AND (i:Site OR i:API) "
else:
symbol_query += "WHERE i:Site OR i:API "
symbol_query += "AND exists(i.symbol) AND not i.symbol[0] = '' RETURN i.symbol[0] as symbol"

print("Querying for external database symbols:",symbol_query) if verbose else None
symbol_results = self.cypher_query(symbol_query, return_dataframe=False, verbose=verbose)
dbs.extend([d['symbol'] for d in symbol_results])

# Cache the results for this combination of parameters
self._dbs_cache[cache_key] = dbs

return dbs
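
A hedged usage sketch of the reworked method; the class name VfbConnect and its default constructor are assumptions here, not shown in this diff:

# Usage sketch under stated assumptions: the class in cross_server_tools.py is
# importable as VfbConnect and connects to the default VFB servers.
from vfb_connect.cross_server_tools import VfbConnect

vfb = VfbConnect()
dbs = vfb.get_dbs(include_symbols=True, data_sources_only=True, verbose=True)
# A repeat call with the same arguments is answered from self._dbs_cache under
# the (include_symbols, data_sources_only) key instead of re-querying Neo4j.
assert vfb.get_dbs(include_symbols=True, data_sources_only=True) == dbs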

def get_scRNAseq_expression(self, id, query_by_label=True, return_id_only=False, return_dataframe=True, verbose=False):
"""
@@ -1087,9 +1135,13 @@ def cypher_query(self, query, return_dataframe=True, verbose=False):
:return: A DataFrame or list of results.
:rtype: pandas.DataFrame or list of dicts
"""
print(f"Running query: {query}") if verbose else None
r = self.nc.commit_list([query])
print(r) if verbose else None
dc = dict_cursor(r)
print(dc) if verbose else None
if return_dataframe:
print("Returning DataFrame") if verbose else None
return pd.DataFrame.from_records(dc)
return dc

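A hedged sketch of calling the instrumented cypher_query, reusing the hypothetical vfb instance from the example above; with verbose=True the added prints surface the query, the raw commit response, the dict cursor, and the return path:

# Sketch only, reusing the hypothetical `vfb` instance from the earlier example.
df = vfb.cypher_query(
    "MATCH (i:Individual) WHERE i:Site OR i:API RETURN i.short_form AS id LIMIT 5",
    return_dataframe=True,
    verbose=True,  # prints the query, raw response, dict cursor, and return path
)
rows = vfb.cypher_query(
    "MATCH (i:Individual) WHERE i:Site OR i:API RETURN i.short_form AS id LIMIT 5",
    return_dataframe=False,  # list of dicts rather than a pandas DataFrame
)
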
2 changes: 1 addition & 1 deletion src/vfb_connect/schema/vfb_term.py
@@ -3016,7 +3016,7 @@ def __init__(self, terms: Union[List[VFBTerm], List[str], pandas.core.frame.Data
print(f"Changing {len(terms)} term names to ids") if verbose else None
terms = [self.vfb.lookup_id(term) for term in terms if term]
if self.vfb._load_limit and len(terms) > self.vfb._load_limit:
print(f"More thann the load limit of {self.vfb._load_limit} requested. Loading first {self.vfb._load_limit} terms out of {len(terms)}")
print(f"More than the load limit of {self.vfb._load_limit} requested. Loading first {self.vfb._load_limit} terms out of {len(terms)}")
terms = terms[:self.vfb._load_limit]
print(f"Pulling {len(terms)} terms from VFB...")
json_list = self.vfb.get_TermInfo(terms, summary=False, verbose=verbose, query_by_label=query_by_label)
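A hedged, self-contained illustration of the truncation guard behind the corrected message, using hypothetical ids and limit:

# Illustration with hypothetical values: `load_limit` stands in for self.vfb._load_limit.
load_limit = 10
terms = [f"FBbt_{i:08d}" for i in range(25)]  # hypothetical term ids
if load_limit and len(terms) > load_limit:
    print(f"More than the load limit of {load_limit} requested. "
          f"Loading first {load_limit} terms out of {len(terms)}")
    terms = terms[:load_limit]
assert len(terms) == load_limit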
