Skip to content

Commit

Permalink
Enhanced implementations for remote endpoints. (#682)
Browse files Browse the repository at this point in the history
* Enhanced implementations for remote endpoints.

Added ability to query gene associations from NCBI
Added ability to query amigo for terms and relationships

* lint

* add missing

* add missing
  • Loading branch information
cmungall authored Dec 20, 2023
1 parent 68abc1c commit b404cd2
Show file tree
Hide file tree
Showing 16 changed files with 638,356 additions and 20 deletions.
17 changes: 11 additions & 6 deletions src/oaklib/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1006,8 +1006,9 @@ def main(
@main.command()
@click.argument("terms", nargs=-1)
@ontological_output_type_option
@autolabel_option
@output_option
def search(terms, output_type: str, output: TextIO):
def search(terms, output_type: str, autolabel, output: TextIO):
"""
Searches ontology for entities that have a label, alias, or other property matching a search term.
Expand Down Expand Up @@ -1056,11 +1057,15 @@ def search(terms, output_type: str, output: TextIO):
if isinstance(impl, SearchInterface):
writer = _get_writer(output_type, impl, StreamingInfoWriter)
writer.output = output
for curie_it in chunk(query_terms_iterator(terms, impl)):
logging.info("** Next chunk:")
# TODO: move chunking logic to writer
for curie, label in impl.labels(curie_it):
writer.emit(dict(id=curie, label=label))
if autolabel:
for curie_it in chunk(query_terms_iterator(terms, impl)):
logging.info("** Next chunk:")
# TODO: move chunking logic to writer
for curie, label in impl.labels(curie_it):
writer.emit(dict(id=curie, label=label))
else:
for curie in query_terms_iterator(terms, impl):
writer.emit(dict(id=curie), label_fields=[])
writer.finish()
else:
raise NotImplementedError(f"Cannot execute this using {impl} of type {type(impl)}")
Expand Down
52 changes: 50 additions & 2 deletions src/oaklib/datamodels/association.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Auto generated from association.yaml by pythongen.py version: 0.9.0
# Generation date: 2023-08-14T15:53:04
# Auto generated from association.yaml by pythongen.py version: 0.0.1
# Generation date: 2023-11-17T17:56:03
# Schema: association
#
# id: https://w3id.org/oak/association
Expand Down Expand Up @@ -69,6 +69,7 @@
OA = CurieNamespace("oa", "http://www.w3.org/ns/oa#")
ONTOASSOC = CurieNamespace("ontoassoc", "https://w3id.org/oak/association/")
RDF = CurieNamespace("rdf", "http://example.org/UNKNOWN/rdf/")
RDFS = CurieNamespace("rdfs", "http://example.org/UNKNOWN/rdfs/")
SSSOM = CurieNamespace("sssom", "https://w3id.org/sssom/")
DEFAULT_ = ONTOASSOC

Expand Down Expand Up @@ -100,6 +101,10 @@ class PositiveOrNegativeAssociation(YAMLRoot):
publications: Optional[
Union[Union[str, URIorCURIE], List[Union[str, URIorCURIE]]]
] = empty_list()
evidence_type: Optional[Union[str, URIorCURIE]] = None
supporting_objects: Optional[
Union[Union[str, URIorCURIE], List[Union[str, URIorCURIE]]]
] = empty_list()
primary_knowledge_source: Optional[Union[str, URIorCURIE]] = None
aggregator_knowledge_source: Optional[Union[str, URIorCURIE]] = None
subject_closure: Optional[
Expand All @@ -110,6 +115,7 @@ class PositiveOrNegativeAssociation(YAMLRoot):
Union[Union[str, URIorCURIE], List[Union[str, URIorCURIE]]]
] = empty_list()
object_closure_label: Optional[Union[str, List[str]]] = empty_list()
comments: Optional[Union[str, List[str]]] = empty_list()

def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
if self.subject is not None and not isinstance(self.subject, URIorCURIE):
Expand Down Expand Up @@ -148,6 +154,17 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
v if isinstance(v, URIorCURIE) else URIorCURIE(v) for v in self.publications
]

if self.evidence_type is not None and not isinstance(self.evidence_type, URIorCURIE):
self.evidence_type = URIorCURIE(self.evidence_type)

if not isinstance(self.supporting_objects, list):
self.supporting_objects = (
[self.supporting_objects] if self.supporting_objects is not None else []
)
self.supporting_objects = [
v if isinstance(v, URIorCURIE) else URIorCURIE(v) for v in self.supporting_objects
]

if self.primary_knowledge_source is not None and not isinstance(
self.primary_knowledge_source, URIorCURIE
):
Expand Down Expand Up @@ -188,6 +205,10 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
v if isinstance(v, str) else str(v) for v in self.object_closure_label
]

if not isinstance(self.comments, list):
self.comments = [self.comments] if self.comments is not None else []
self.comments = [v if isinstance(v, str) else str(v) for v in self.comments]

super().__post_init__(**kwargs)


Expand Down Expand Up @@ -711,6 +732,33 @@ class slots:
range=Optional[Union[str, URIorCURIE]],
)

# Auto-generated slot definitions (produced by pythongen.py from
# association.yaml — see the file header); do not edit by hand.
# Each Slot records the URI, CURIE and LinkML range for one association field.

# Evidence code for the association (single optional URI/CURIE).
slots.evidence_type = Slot(
    uri=ONTOASSOC.evidence_type,
    name="evidence_type",
    curie=ONTOASSOC.curie("evidence_type"),
    model_uri=ONTOASSOC.evidence_type,
    domain=None,
    range=Optional[Union[str, URIorCURIE]],
)

# Entities cited in support of the association (multivalued URI/CURIE).
slots.supporting_objects = Slot(
    uri=ONTOASSOC.supporting_objects,
    name="supporting_objects",
    curie=ONTOASSOC.curie("supporting_objects"),
    model_uri=ONTOASSOC.supporting_objects,
    domain=None,
    range=Optional[Union[Union[str, URIorCURIE], List[Union[str, URIorCURIE]]]],
)

# Free-text comments, mapped onto rdfs:comment (multivalued string).
slots.comments = Slot(
    uri=RDFS.comment,
    name="comments",
    curie=RDFS.curie("comment"),
    model_uri=ONTOASSOC.comments,
    domain=None,
    range=Optional[Union[str, List[str]]],
)

slots.denormalized_slot = Slot(
uri=ONTOASSOC.denormalized_slot,
name="denormalized_slot",
Expand Down
16 changes: 16 additions & 0 deletions src/oaklib/datamodels/association.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,15 @@ classes:
- object_label
- negated
- publications
- evidence_type
- supporting_objects
- primary_knowledge_source
- aggregator_knowledge_source
- subject_closure
- subject_closure_label
- object_closure
- object_closure_label
- comments

Association:
is_a: PositiveOrNegativeAssociation
Expand Down Expand Up @@ -242,6 +245,18 @@ slots:
description: The knowledge source that aggregated the association
slot_uri: biolink:aggregator_knowledge_source
range: uriorcurie
evidence_type:
description: The type of evidence supporting the association
range: uriorcurie
supporting_objects:
description: The objects that support the association
range: uriorcurie
multivalued: true
comments:
description: Comments about the association
slot_uri: rdfs:comment
range: string
multivalued: true
denormalized_slot:
mixin: true
description: |-
Expand Down Expand Up @@ -387,3 +402,4 @@ slots:
associations_for_subjects_in_common:
multivalued: true
range: Association

2 changes: 2 additions & 0 deletions src/oaklib/implementations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from oaklib.implementations.kgx.kgx_implementation import KGXImplementation
from oaklib.implementations.llm_implementation import LLMImplementation
from oaklib.implementations.monarch.monarch_implementation import MonarchImplementation
from oaklib.implementations.ncbi.ncbi_gene_implementation import NCBIGeneImplementation
from oaklib.implementations.ols import (
BaseOlsImplementation,
OlsImplementation,
Expand Down Expand Up @@ -77,6 +78,7 @@
"OlsImplementation",
"TIBOlsImplementation",
"MonarchImplementation",
"NCBIGeneImplementation",
"OntobeeImplementation",
"ProntoImplementation",
"SimpleOboImplementation",
Expand Down
118 changes: 112 additions & 6 deletions src/oaklib/implementations/amigo/amigo_implementation.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
"""Adapter for AmiGO solr index."""
import json
import logging
from dataclasses import dataclass
from time import sleep
from typing import Any, Dict, Iterable, Iterator, List, Optional
from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple

import pysolr

from oaklib.datamodels.association import Association
from oaklib.datamodels.search import SearchConfiguration
from oaklib.datamodels.vocabulary import RDFS_LABEL
from oaklib.interfaces import SearchInterface
from oaklib.interfaces.association_provider_interface import (
AssociationProviderInterface,
)
Expand All @@ -15,15 +19,18 @@
"AmiGOImplementation",
]

from oaklib.interfaces.basic_ontology_interface import LANGUAGE_TAG
from oaklib.interfaces.basic_ontology_interface import LANGUAGE_TAG, RELATIONSHIP
from oaklib.types import CURIE, PRED_CURIE
from oaklib.utilities.iterator_utils import chunk

AMIGO_ENDPOINT = "http://golr.geneontology.org/solr/"

logger = logging.getLogger(__name__)

LIMIT = 10000

ONTOLOGY_CLASS_CATEGORY = "ontology_class"

# TODO: derive from schema
DOCUMENT_CATEGORY = "document_category"
BIOENTITY = "bioentity"
Expand All @@ -36,6 +43,10 @@
ASSIGNED_BY = "assigned_by"
REFERENCE = "reference"

NEIGHBORHOOD_GRAPH_JSON = "neighborhood_graph_json"
TOPOLOGY_GRAPH_JSON = "topology_graph_json"
REGULATES_TRANSITIVITY_GRAPH_JSON = "regulates_transitivity_graph_json"

# general
ENTITY = "entity"
ENTITY_LABEL = "entity_label"
Expand All @@ -57,15 +68,20 @@ def _fq_element(k, vs):
return f"{k}:({v})"


def _query(solr, fq, fields, start: int = None, limit: int = None) -> Iterator[Dict]:
def _query(
solr, fq, fields, q=None, start: int = None, limit: int = None, **kwargs
) -> Iterator[Dict]:
if start is None:
start = 0
if limit is None:
limit = LIMIT
fq_list = [_fq_element(k, vs) for k, vs in fq.items()]
params = {"fq": fq_list, "fl": ",".join(fields)}
params = {"fq": fq_list, "fl": ",".join(fields), **kwargs}
if not q:
q = "*:*"
logging.info(f"QUERY: {q} PARAMS: {params}")
while True:
results = solr.search("*:*", rows=limit, start=start, **params)
results = solr.search(q, rows=limit, start=start, **params)
yield from results
logging.debug(f"CHECKING: {start} + {len(results)} >= {results.hits}")
if start + len(results) >= results.hits:
Expand All @@ -90,6 +106,7 @@ def _normalize(curie: CURIE) -> CURIE:
@dataclass
class AmiGOImplementation(
AssociationProviderInterface,
SearchInterface,
):
"""
Wraps AmiGO endpoint.
Expand Down Expand Up @@ -121,14 +138,48 @@ def __post_init__(self):
self._source = self.resource.slug
self._solr = pysolr.Solr(AMIGO_ENDPOINT)

def _cache_nodes(self, nodes: List[Dict], curies: Iterable[CURIE]):
    """Record rdfs:label values for selected graph-json nodes in the property cache.

    :param nodes: node objects (``{"id": ..., "lbl": ...}``) from a solr
        graph-json payload
    :param curies: only nodes whose ``id`` is among these are cached
    """
    for node in nodes:
        node_id = node["id"]
        if node_id not in curies:
            continue
        label = node.get("lbl")
        # nodes without a label (or with an empty one) are not cached
        if label:
            self.property_cache.add(node_id, RDFS_LABEL, label)

def label(self, curie: CURIE, lang: Optional[LANGUAGE_TAG] = None) -> Optional[str]:
if lang:
raise NotImplementedError
if self.property_cache.contains(curie, RDFS_LABEL):
return self.property_cache.get(curie, RDFS_LABEL)
fq = {"document_category": ["general"], ENTITY: [curie]}
solr = self._solr
results = _query(solr, fq, [ENTITY_LABEL])
self.property_cache.add(curie, RDFS_LABEL, None)
for doc in results:
return doc[ENTITY_LABEL]
lbl = doc[ENTITY_LABEL]
self.property_cache.add(curie, RDFS_LABEL, lbl)
return lbl

def labels(
    self, curies: Iterable[CURIE], allow_none=True, lang: LANGUAGE_TAG = None
) -> Iterable[Tuple[CURIE, str]]:
    """Yield ``(curie, label)`` pairs for the given entities.

    Labels already present in the property cache are yielded directly;
    the remainder are fetched from the AmiGO "general" solr documents in
    small batches.

    :param curies: entity identifiers to label
    :param allow_none: accepted for interface compatibility (unused here)
    :param lang: language tags are not supported by this backend
    :raises NotImplementedError: if a language tag is supplied
    """
    if lang:
        raise NotImplementedError
    # Note: some issues with post, use a low chunk size to ensure GET
    for curie_it in chunk(curies, 10):
        # Partition into cache hits (yielded immediately) and misses.
        # Do NOT remove items from a list while iterating it — the
        # previous implementation did, which skipped the element after
        # every cache hit and could drop cached-only labels.
        uncached = []
        for curie in curie_it:
            lbl = self.property_cache.get(curie, RDFS_LABEL)
            if lbl is not None:
                yield curie, lbl
            else:
                uncached.append(curie)
        if not uncached:
            continue
        fq = {"document_category": ["general"], ENTITY: uncached}
        results = _query(self._solr, fq, [ENTITY, ENTITY_LABEL])
        for doc in results:
            yield doc[ENTITY], doc[ENTITY_LABEL]

def associations(
self,
Expand Down Expand Up @@ -179,3 +230,58 @@ def associations(
assoc.subject_closure = doc[ISA_PARTOF_CLOSURE]
assoc.subject_closure_label = doc[ISA_PARTOF_CLOSURE_LABEL]
yield assoc

def relationships(
    self,
    subjects: Iterable[CURIE] = None,
    predicates: Iterable[PRED_CURIE] = None,
    objects: Iterable[CURIE] = None,
    include_tbox: bool = True,
    include_abox: bool = True,
    include_entailed: bool = False,
    exclude_blank: bool = True,
) -> Iterator[RELATIONSHIP]:
    """Yield ``(subject, predicate, object)`` edges from AmiGO ontology-class documents.

    Edges come from each document's neighborhood graph (plus the
    regulates-transitivity closure graph when ``include_entailed`` is set),
    post-filtered by the supplied subjects/predicates/objects. Node labels
    seen along the way are added to the property cache.

    :param subjects: restrict to edges with these subjects
    :param predicates: restrict to edges with these predicates
    :param objects: restrict to edges with these objects
    :param include_tbox: accepted for interface compatibility (unused here)
    :param include_abox: accepted for interface compatibility (unused here)
    :param include_entailed: also include closure-graph edges
    :param exclude_blank: accepted for interface compatibility (unused here)
    """
    solr = self._solr
    # Materialize the filter arguments once: callers may pass generators,
    # which are always truthy and would be consumed by the first
    # membership test below (previously this silently broke predicate
    # filtering after the first document).
    subjects = list(subjects) if subjects is not None else None
    predicates = list(predicates) if predicates is not None else None
    objects = list(objects) if objects is not None else None
    fq = {DOCUMENT_CATEGORY: [ONTOLOGY_CLASS_CATEGORY]}
    # neighborhood graph is indexed for both subject and object
    if subjects:
        fq[ANNOTATION_CLASS] = subjects
    elif objects:
        fq[ANNOTATION_CLASS] = objects
    select_fields = [ANNOTATION_CLASS, NEIGHBORHOOD_GRAPH_JSON]
    if include_entailed:
        select_fields.append(REGULATES_TRANSITIVITY_GRAPH_JSON)
    # sets give O(1) membership for the per-edge filters below
    subject_set = set(subjects) if subjects else None
    predicate_set = set(predicates) if predicates else None
    object_set = set(objects) if objects else None
    results = _query(solr, fq, select_fields)

    for doc in results:
        neighborhood_graph = json.loads(doc[NEIGHBORHOOD_GRAPH_JSON])
        edges = neighborhood_graph["edges"]
        nodes = neighborhood_graph["nodes"]
        if include_entailed:
            closure_graph = json.loads(doc[REGULATES_TRANSITIVITY_GRAPH_JSON])
            edges.extend(closure_graph["edges"])
        for edge in edges:
            s, p, o = edge["sub"], edge["pred"], edge["obj"]
            if subject_set is not None and s not in subject_set:
                continue
            if object_set is not None and o not in object_set:
                continue
            if predicate_set is not None and p not in predicate_set:
                continue
            self._cache_nodes(nodes, [s, p, o])
            yield s, p, o

def basic_search(
    self, search_term: str, config: Optional[SearchConfiguration] = None
) -> Iterable[CURIE]:
    """Yield entity CURIEs whose "general" solr document matches *search_term*.

    The query runs through solr's edismax parser against the
    ``general_blob_searchable`` field.

    :param search_term: the text to search for
    :param config: accepted for interface compatibility (unused here)
    """
    filters = {DOCUMENT_CATEGORY: ["general"]}
    docs = _query(
        self._solr,
        filters,
        ["entity"],
        q=search_term,
        qf="general_blob_searchable",
        defType="edismax",
    )
    for doc in docs:
        yield doc["entity"]
Empty file.
Loading

0 comments on commit b404cd2

Please sign in to comment.