Skip to content

Commit

Permalink
ingest empty datums with grace
Browse files Browse the repository at this point in the history
  • Loading branch information
aaxelb committed Oct 2, 2024
1 parent cf83fc8 commit 86b1a9b
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 17 deletions.
39 changes: 39 additions & 0 deletions tests/trove/digestive_tract/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,42 @@ def test_extract_supplementary(self):
},
})
self.assertEqual(_indexcard.latest_rdf.modified, _orig_timestamp)

def test_extract_empty_with_prior(self):
    '''an empty datum arriving after a non-empty one deletes the prior indexcard

    the card returned for the empty datum must be the same card created by
    the first extraction, now marked deleted and without a latest rdf
    '''
    # first extraction: the fixture datum yields exactly one live indexcard
    [_first_card] = digestive_tract.extract(self.raw)
    self.assertFalse(self.raw.no_output)
    self.assertIsNone(_first_card.deleted)
    # second extraction: a whitespace-only datum for the same suid
    _blank_raw = factories.RawDatumFactory(
        mediatype='text/turtle',
        datum=' ',
        suid=self.raw.suid,
    )
    [_second_card] = digestive_tract.extract(_blank_raw)
    self.assertTrue(_blank_raw.no_output)
    # same card as before, but now deleted...
    self.assertEqual(_second_card.id, _first_card.id)
    self.assertIsNotNone(_second_card.deleted)
    # ...and accessing its latest rdf raises DoesNotExist
    with self.assertRaises(trove_db.LatestIndexcardRdf.DoesNotExist):
        _second_card.latest_rdf

def test_extract_empty_without_prior(self):
    '''an empty datum with no earlier extraction yields no indexcards

    the raw datum is still expected to be flagged `no_output`
    '''
    _blank_raw = factories.RawDatumFactory(
        mediatype='text/turtle',
        datum=' ',
    )
    _resulting_cards = digestive_tract.extract(_blank_raw)
    # nothing extracted, nothing to delete: an empty list comes back
    self.assertEqual(_resulting_cards, [])
    self.assertTrue(_blank_raw.no_output)

def test_extract_empty_supplementary(self):
    '''an empty supplementary datum clears previously extracted supplementary rdf

    the indexcard returned for the empty supplement must be the original
    card, and its `supplementary_rdf_set` must end up empty
    '''
    # set up: one indexcard with supplementary rdf attached
    [_base_card] = digestive_tract.extract(self.raw)
    digestive_tract.extract(self.supplementary_raw)
    self.assertTrue(_base_card.supplementary_rdf_set.exists())
    # an empty-string datum for the same supplementary suid
    _blank_supplement = factories.RawDatumFactory(
        mediatype='text/turtle',
        datum='',
        suid=self.supplementary_raw.suid,
    )
    [_returned_card] = digestive_tract.extract(_blank_supplement)
    self.assertEqual(_returned_card.id, _base_card.id)
    # the supplementary rdf added during set-up should be gone
    self.assertFalse(_base_card.supplementary_rdf_set.exists())
36 changes: 19 additions & 17 deletions trove/digestive_tract.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,30 +120,32 @@ def extract(raw: share_db.RawDatum, *, undelete_indexcards=False) -> list[trove_
LatestIndexcardRdf (previously extracted from the record, but no longer present)
'''
assert raw.mediatype is not None, 'raw datum has no mediatype -- did you mean to call extract_legacy?'
_tripledicts_by_focus_iri = {}
_extractor = get_rdf_extractor_class(raw.mediatype)(raw.suid.source_config)
# TODO normalize (or just validate) tripledict:
# - synonymous iris should be grouped (only one as subject-key, others under owl:sameAs)
# - focus should have rdf:type
# - no subject-key iris which collide by trove_db.ResourceIdentifier equivalence
# - connected graph (all subject-key iris reachable from focus, or reverse for vocab terms?)
_extracted_tripledict: primitive_rdf.RdfTripleDictionary = _extractor.extract_rdf(raw.datum)
try:
_focus_iri = raw.suid.focus_identifier.find_equivalent_iri(_extracted_tripledict)
except ValueError:
raise DigestiveError(f'could not find {raw.suid.focus_identifier} in {raw}')
_tripledicts_by_focus_iri = {_focus_iri: _extracted_tripledict}
# special case: if the record defines an ontology, create a
# card for each subject iri that starts with the focus iri
# (TODO: consider a separate index card for *every* subject iri?)
if OWL.Ontology in _extracted_tripledict[_focus_iri].get(RDF.type, ()):
for _iri, _twopledict in _extracted_tripledict.items():
if (_iri != _focus_iri) and _iri.startswith(_focus_iri):
_term_tripledict = {_iri: copy.deepcopy(_twopledict)}
# ensure a link to the ontology (in case there's not already)
primitive_rdf.RdfGraph(_term_tripledict).add(
(_iri, RDFS.isDefinedBy, _focus_iri),
)
_tripledicts_by_focus_iri[_iri] = _term_tripledict
if _extracted_tripledict:
try:
_focus_iri = raw.suid.focus_identifier.find_equivalent_iri(_extracted_tripledict)
except ValueError:
raise DigestiveError(f'could not find {raw.suid.focus_identifier} in {raw}')
_tripledicts_by_focus_iri[_focus_iri] = _extracted_tripledict
# special case: if the record defines an ontology, create a
# card for each subject iri that starts with the focus iri
# (TODO: consider a separate index card for *every* subject iri?)
if OWL.Ontology in _extracted_tripledict.get(_focus_iri, {}).get(RDF.type, ()):
for _iri, _twopledict in _extracted_tripledict.items():
if (_iri != _focus_iri) and _iri.startswith(_focus_iri):
_term_tripledict = {_iri: copy.deepcopy(_twopledict)}
# ensure a link to the ontology (in case there's not already)
primitive_rdf.RdfGraph(_term_tripledict).add(
(_iri, RDFS.isDefinedBy, _focus_iri),
)
_tripledicts_by_focus_iri[_iri] = _term_tripledict
if raw.suid.is_supplementary:
return trove_db.Indexcard.objects.supplement_indexcards_from_tripledicts(
from_raw_datum=raw,
Expand Down
1 change: 1 addition & 0 deletions trove/models/indexcard.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def save_indexcards_from_tripledicts(
.exclude(id__in=[_card.id for _card in _indexcards])
):
_indexcard_to_delete.pls_delete()
_indexcards.append(_indexcard_to_delete)
return _indexcards

@transaction.atomic
Expand Down

0 comments on commit 86b1a9b

Please sign in to comment.