Skip to content

Commit

Permalink
Enhanced implementations for remote endpoints. (#682)
Browse files Browse the repository at this point in the history
* Enhanced implementations for remote endpoints.

Added ability to query gene associations from NCBI
Added ability to query amigo for terms and relationships

* lint

* add missing

* add missing
  • Loading branch information
cmungall authored Dec 20, 2023
1 parent 68abc1c commit b404cd2
Show file tree
Hide file tree
Showing 16 changed files with 638,356 additions and 20 deletions.
17 changes: 11 additions & 6 deletions src/oaklib/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1006,8 +1006,9 @@ def main(
@main.command()
@click.argument("terms", nargs=-1)
@ontological_output_type_option
@autolabel_option
@output_option
def search(terms, output_type: str, output: TextIO):
def search(terms, output_type: str, autolabel, output: TextIO):
"""
Searches ontology for entities that have a label, alias, or other property matching a search term.
Expand Down Expand Up @@ -1056,11 +1057,15 @@ def search(terms, output_type: str, output: TextIO):
if isinstance(impl, SearchInterface):
writer = _get_writer(output_type, impl, StreamingInfoWriter)
writer.output = output
for curie_it in chunk(query_terms_iterator(terms, impl)):
logging.info("** Next chunk:")
# TODO: move chunking logic to writer
for curie, label in impl.labels(curie_it):
writer.emit(dict(id=curie, label=label))
if autolabel:
for curie_it in chunk(query_terms_iterator(terms, impl)):
logging.info("** Next chunk:")
# TODO: move chunking logic to writer
for curie, label in impl.labels(curie_it):
writer.emit(dict(id=curie, label=label))
else:
for curie in query_terms_iterator(terms, impl):
writer.emit(dict(id=curie), label_fields=[])
writer.finish()
else:
raise NotImplementedError(f"Cannot execute this using {impl} of type {type(impl)}")
Expand Down
52 changes: 50 additions & 2 deletions src/oaklib/datamodels/association.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Auto generated from association.yaml by pythongen.py version: 0.9.0
# Generation date: 2023-08-14T15:53:04
# Auto generated from association.yaml by pythongen.py version: 0.0.1
# Generation date: 2023-11-17T17:56:03
# Schema: association
#
# id: https://w3id.org/oak/association
Expand Down Expand Up @@ -69,6 +69,7 @@
OA = CurieNamespace("oa", "http://www.w3.org/ns/oa#")
ONTOASSOC = CurieNamespace("ontoassoc", "https://w3id.org/oak/association/")
RDF = CurieNamespace("rdf", "http://example.org/UNKNOWN/rdf/")
RDFS = CurieNamespace("rdfs", "http://example.org/UNKNOWN/rdfs/")
SSSOM = CurieNamespace("sssom", "https://w3id.org/sssom/")
DEFAULT_ = ONTOASSOC

Expand Down Expand Up @@ -100,6 +101,10 @@ class PositiveOrNegativeAssociation(YAMLRoot):
publications: Optional[
Union[Union[str, URIorCURIE], List[Union[str, URIorCURIE]]]
] = empty_list()
evidence_type: Optional[Union[str, URIorCURIE]] = None
supporting_objects: Optional[
Union[Union[str, URIorCURIE], List[Union[str, URIorCURIE]]]
] = empty_list()
primary_knowledge_source: Optional[Union[str, URIorCURIE]] = None
aggregator_knowledge_source: Optional[Union[str, URIorCURIE]] = None
subject_closure: Optional[
Expand All @@ -110,6 +115,7 @@ class PositiveOrNegativeAssociation(YAMLRoot):
Union[Union[str, URIorCURIE], List[Union[str, URIorCURIE]]]
] = empty_list()
object_closure_label: Optional[Union[str, List[str]]] = empty_list()
comments: Optional[Union[str, List[str]]] = empty_list()

def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
if self.subject is not None and not isinstance(self.subject, URIorCURIE):
Expand Down Expand Up @@ -148,6 +154,17 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
v if isinstance(v, URIorCURIE) else URIorCURIE(v) for v in self.publications
]

if self.evidence_type is not None and not isinstance(self.evidence_type, URIorCURIE):
self.evidence_type = URIorCURIE(self.evidence_type)

if not isinstance(self.supporting_objects, list):
self.supporting_objects = (
[self.supporting_objects] if self.supporting_objects is not None else []
)
self.supporting_objects = [
v if isinstance(v, URIorCURIE) else URIorCURIE(v) for v in self.supporting_objects
]

if self.primary_knowledge_source is not None and not isinstance(
self.primary_knowledge_source, URIorCURIE
):
Expand Down Expand Up @@ -188,6 +205,10 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
v if isinstance(v, str) else str(v) for v in self.object_closure_label
]

if not isinstance(self.comments, list):
self.comments = [self.comments] if self.comments is not None else []
self.comments = [v if isinstance(v, str) else str(v) for v in self.comments]

super().__post_init__(**kwargs)


Expand Down Expand Up @@ -711,6 +732,33 @@ class slots:
range=Optional[Union[str, URIorCURIE]],
)

# Auto-generated slot definitions (produced by pythongen.py from
# association.yaml — see the file header); do not edit by hand.
# Each Slot records the URI, CURIE and LinkML range for one association field.

# Evidence code for the association (single optional URI/CURIE).
slots.evidence_type = Slot(
    uri=ONTOASSOC.evidence_type,
    name="evidence_type",
    curie=ONTOASSOC.curie("evidence_type"),
    model_uri=ONTOASSOC.evidence_type,
    domain=None,
    range=Optional[Union[str, URIorCURIE]],
)

# Entities cited in support of the association (multivalued URI/CURIE).
slots.supporting_objects = Slot(
    uri=ONTOASSOC.supporting_objects,
    name="supporting_objects",
    curie=ONTOASSOC.curie("supporting_objects"),
    model_uri=ONTOASSOC.supporting_objects,
    domain=None,
    range=Optional[Union[Union[str, URIorCURIE], List[Union[str, URIorCURIE]]]],
)

# Free-text comments, mapped onto rdfs:comment (multivalued string).
slots.comments = Slot(
    uri=RDFS.comment,
    name="comments",
    curie=RDFS.curie("comment"),
    model_uri=ONTOASSOC.comments,
    domain=None,
    range=Optional[Union[str, List[str]]],
)

slots.denormalized_slot = Slot(
uri=ONTOASSOC.denormalized_slot,
name="denormalized_slot",
Expand Down
16 changes: 16 additions & 0 deletions src/oaklib/datamodels/association.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,15 @@ classes:
- object_label
- negated
- publications
- evidence_type
- supporting_objects
- primary_knowledge_source
- aggregator_knowledge_source
- subject_closure
- subject_closure_label
- object_closure
- object_closure_label
- comments

Association:
is_a: PositiveOrNegativeAssociation
Expand Down Expand Up @@ -242,6 +245,18 @@ slots:
description: The knowledge source that aggregated the association
slot_uri: biolink:aggregator_knowledge_source
range: uriorcurie
evidence_type:
description: The type of evidence supporting the association
range: uriorcurie
supporting_objects:
description: The objects that support the association
range: uriorcurie
multivalued: true
comments:
description: Comments about the association
slot_uri: rdfs:comment
range: string
multivalued: true
denormalized_slot:
mixin: true
description: |-
Expand Down Expand Up @@ -387,3 +402,4 @@ slots:
associations_for_subjects_in_common:
multivalued: true
range: Association

2 changes: 2 additions & 0 deletions src/oaklib/implementations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from oaklib.implementations.kgx.kgx_implementation import KGXImplementation
from oaklib.implementations.llm_implementation import LLMImplementation
from oaklib.implementations.monarch.monarch_implementation import MonarchImplementation
from oaklib.implementations.ncbi.ncbi_gene_implementation import NCBIGeneImplementation
from oaklib.implementations.ols import (
BaseOlsImplementation,
OlsImplementation,
Expand Down Expand Up @@ -77,6 +78,7 @@
"OlsImplementation",
"TIBOlsImplementation",
"MonarchImplementation",
"NCBIGeneImplementation",
"OntobeeImplementation",
"ProntoImplementation",
"SimpleOboImplementation",
Expand Down
118 changes: 112 additions & 6 deletions src/oaklib/implementations/amigo/amigo_implementation.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
"""Adapter for AmiGO solr index."""
import json
import logging
from dataclasses import dataclass
from time import sleep
from typing import Any, Dict, Iterable, Iterator, List, Optional
from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple

import pysolr

from oaklib.datamodels.association import Association
from oaklib.datamodels.search import SearchConfiguration
from oaklib.datamodels.vocabulary import RDFS_LABEL
from oaklib.interfaces import SearchInterface
from oaklib.interfaces.association_provider_interface import (
AssociationProviderInterface,
)
Expand All @@ -15,15 +19,18 @@
"AmiGOImplementation",
]

from oaklib.interfaces.basic_ontology_interface import LANGUAGE_TAG
from oaklib.interfaces.basic_ontology_interface import LANGUAGE_TAG, RELATIONSHIP
from oaklib.types import CURIE, PRED_CURIE
from oaklib.utilities.iterator_utils import chunk

AMIGO_ENDPOINT = "http://golr.geneontology.org/solr/"

logger = logging.getLogger(__name__)

LIMIT = 10000

ONTOLOGY_CLASS_CATEGORY = "ontology_class"

# TODO: derive from schema
DOCUMENT_CATEGORY = "document_category"
BIOENTITY = "bioentity"
Expand All @@ -36,6 +43,10 @@
ASSIGNED_BY = "assigned_by"
REFERENCE = "reference"

NEIGHBORHOOD_GRAPH_JSON = "neighborhood_graph_json"
TOPOLOGY_GRAPH_JSON = "topology_graph_json"
REGULATES_TRANSITIVITY_GRAPH_JSON = "regulates_transitivity_graph_json"

# general
ENTITY = "entity"
ENTITY_LABEL = "entity_label"
Expand All @@ -57,15 +68,20 @@ def _fq_element(k, vs):
return f"{k}:({v})"


def _query(solr, fq, fields, start: int = None, limit: int = None) -> Iterator[Dict]:
def _query(
solr, fq, fields, q=None, start: int = None, limit: int = None, **kwargs
) -> Iterator[Dict]:
if start is None:
start = 0
if limit is None:
limit = LIMIT
fq_list = [_fq_element(k, vs) for k, vs in fq.items()]
params = {"fq": fq_list, "fl": ",".join(fields)}
params = {"fq": fq_list, "fl": ",".join(fields), **kwargs}
if not q:
q = "*:*"
logging.info(f"QUERY: {q} PARAMS: {params}")
while True:
results = solr.search("*:*", rows=limit, start=start, **params)
results = solr.search(q, rows=limit, start=start, **params)
yield from results
logging.debug(f"CHECKING: {start} + {len(results)} >= {results.hits}")
if start + len(results) >= results.hits:
Expand All @@ -90,6 +106,7 @@ def _normalize(curie: CURIE) -> CURIE:
@dataclass
class AmiGOImplementation(
AssociationProviderInterface,
SearchInterface,
):
"""
Wraps AmiGO endpoint.
Expand Down Expand Up @@ -121,14 +138,48 @@ def __post_init__(self):
self._source = self.resource.slug
self._solr = pysolr.Solr(AMIGO_ENDPOINT)

def _cache_nodes(self, nodes: List[Dict], curies: Iterable[CURIE]):
    """Record rdfs:label values for selected graph-json nodes in the property cache.

    :param nodes: node objects (``{"id": ..., "lbl": ...}``) from a solr
        graph-json payload
    :param curies: only nodes whose ``id`` is among these are cached
    """
    for node in nodes:
        node_id = node["id"]
        if node_id not in curies:
            continue
        label = node.get("lbl")
        # nodes without a label (or with an empty one) are not cached
        if label:
            self.property_cache.add(node_id, RDFS_LABEL, label)

def label(self, curie: CURIE, lang: Optional[LANGUAGE_TAG] = None) -> Optional[str]:
if lang:
raise NotImplementedError
if self.property_cache.contains(curie, RDFS_LABEL):
return self.property_cache.get(curie, RDFS_LABEL)
fq = {"document_category": ["general"], ENTITY: [curie]}
solr = self._solr
results = _query(solr, fq, [ENTITY_LABEL])
self.property_cache.add(curie, RDFS_LABEL, None)
for doc in results:
return doc[ENTITY_LABEL]
lbl = doc[ENTITY_LABEL]
self.property_cache.add(curie, RDFS_LABEL, lbl)
return lbl

def labels(
    self, curies: Iterable[CURIE], allow_none=True, lang: LANGUAGE_TAG = None
) -> Iterable[Tuple[CURIE, str]]:
    """Yield ``(curie, label)`` pairs for the given entities.

    Labels already present in the property cache are yielded directly;
    the remainder are fetched from the AmiGO "general" solr documents in
    small batches.

    :param curies: entity identifiers to label
    :param allow_none: accepted for interface compatibility (unused here)
    :param lang: language tags are not supported by this backend
    :raises NotImplementedError: if a language tag is supplied
    """
    if lang:
        raise NotImplementedError
    # Note: some issues with post, use a low chunk size to ensure GET
    for curie_it in chunk(curies, 10):
        # Partition into cache hits (yielded immediately) and misses.
        # Do NOT remove items from a list while iterating it — the
        # previous implementation did, which skipped the element after
        # every cache hit and could drop cached-only labels.
        uncached = []
        for curie in curie_it:
            lbl = self.property_cache.get(curie, RDFS_LABEL)
            if lbl is not None:
                yield curie, lbl
            else:
                uncached.append(curie)
        if not uncached:
            continue
        fq = {"document_category": ["general"], ENTITY: uncached}
        results = _query(self._solr, fq, [ENTITY, ENTITY_LABEL])
        for doc in results:
            yield doc[ENTITY], doc[ENTITY_LABEL]

def associations(
self,
Expand Down Expand Up @@ -179,3 +230,58 @@ def associations(
assoc.subject_closure = doc[ISA_PARTOF_CLOSURE]
assoc.subject_closure_label = doc[ISA_PARTOF_CLOSURE_LABEL]
yield assoc

def relationships(
    self,
    subjects: Iterable[CURIE] = None,
    predicates: Iterable[PRED_CURIE] = None,
    objects: Iterable[CURIE] = None,
    include_tbox: bool = True,
    include_abox: bool = True,
    include_entailed: bool = False,
    exclude_blank: bool = True,
) -> Iterator[RELATIONSHIP]:
    """Yield ``(subject, predicate, object)`` edges from AmiGO ontology-class documents.

    Edges come from each document's neighborhood graph (plus the
    regulates-transitivity closure graph when ``include_entailed`` is set),
    post-filtered by the supplied subjects/predicates/objects. Node labels
    seen along the way are added to the property cache.

    :param subjects: restrict to edges with these subjects
    :param predicates: restrict to edges with these predicates
    :param objects: restrict to edges with these objects
    :param include_tbox: accepted for interface compatibility (unused here)
    :param include_abox: accepted for interface compatibility (unused here)
    :param include_entailed: also include closure-graph edges
    :param exclude_blank: accepted for interface compatibility (unused here)
    """
    solr = self._solr
    # Materialize the filter arguments once: callers may pass generators,
    # which are always truthy and would be consumed by the first
    # membership test below (previously this silently broke predicate
    # filtering after the first document).
    subjects = list(subjects) if subjects is not None else None
    predicates = list(predicates) if predicates is not None else None
    objects = list(objects) if objects is not None else None
    fq = {DOCUMENT_CATEGORY: [ONTOLOGY_CLASS_CATEGORY]}
    # neighborhood graph is indexed for both subject and object
    if subjects:
        fq[ANNOTATION_CLASS] = subjects
    elif objects:
        fq[ANNOTATION_CLASS] = objects
    select_fields = [ANNOTATION_CLASS, NEIGHBORHOOD_GRAPH_JSON]
    if include_entailed:
        select_fields.append(REGULATES_TRANSITIVITY_GRAPH_JSON)
    # sets give O(1) membership for the per-edge filters below
    subject_set = set(subjects) if subjects else None
    predicate_set = set(predicates) if predicates else None
    object_set = set(objects) if objects else None
    results = _query(solr, fq, select_fields)

    for doc in results:
        neighborhood_graph = json.loads(doc[NEIGHBORHOOD_GRAPH_JSON])
        edges = neighborhood_graph["edges"]
        nodes = neighborhood_graph["nodes"]
        if include_entailed:
            closure_graph = json.loads(doc[REGULATES_TRANSITIVITY_GRAPH_JSON])
            edges.extend(closure_graph["edges"])
        for edge in edges:
            s, p, o = edge["sub"], edge["pred"], edge["obj"]
            if subject_set is not None and s not in subject_set:
                continue
            if object_set is not None and o not in object_set:
                continue
            if predicate_set is not None and p not in predicate_set:
                continue
            self._cache_nodes(nodes, [s, p, o])
            yield s, p, o

def basic_search(
    self, search_term: str, config: Optional[SearchConfiguration] = None
) -> Iterable[CURIE]:
    """Yield entity CURIEs whose "general" solr document matches *search_term*.

    The query runs through solr's edismax parser against the
    ``general_blob_searchable`` field.

    :param search_term: the text to search for
    :param config: accepted for interface compatibility (unused here)
    """
    filters = {DOCUMENT_CATEGORY: ["general"]}
    docs = _query(
        self._solr,
        filters,
        ["entity"],
        q=search_term,
        qf="general_blob_searchable",
        defType="edismax",
    )
    for doc in docs:
        yield doc["entity"]
Empty file.
Loading

0 comments on commit b404cd2

Please sign in to comment.