Skip to content

Commit

Permalink
supplementary metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
aaxelb committed Sep 19, 2024
1 parent b15ff25 commit 5a7dd86
Show file tree
Hide file tree
Showing 8 changed files with 241 additions and 49 deletions.
18 changes: 18 additions & 0 deletions share/migrations/0074_sourceuniqueidentifier_is_supplementary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 3.2.25 on 2024-09-19 20:33

from django.db import migrations, models


class Migration(migrations.Migration):
    # Schema migration for the `share` app: adds a nullable boolean column
    # `is_supplementary` to the `sourceuniqueidentifier` model.

    # must run after the previous migration in the `share` app
    dependencies = [
        ('share', '0073_remove_indexbackfill_backfill_phase_index'),
    ]

    operations = [
        migrations.AddField(
            model_name='sourceuniqueidentifier',
            name='is_supplementary',
            # nullable with no default, so the value may be NULL (unknown)
            # as well as True or False
            field=models.BooleanField(null=True),
        ),
    ]
1 change: 1 addition & 0 deletions share/models/source_unique_identifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class SourceUniqueIdentifier(models.Model):
identifier = models.TextField() # no restrictions on identifier format
source_config = models.ForeignKey('SourceConfig', on_delete=models.CASCADE)
focus_identifier = models.ForeignKey('trove.ResourceIdentifier', null=True, on_delete=models.PROTECT, related_name='suid_set')
is_supplementary = models.BooleanField(null=True)

class JSONAPIMeta(BaseJSONAPIMeta):
pass
Expand Down
9 changes: 8 additions & 1 deletion trove/derive/_base.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import abc
from collections.abc import Iterable

from primitive_metadata import primitive_rdf

Expand All @@ -10,10 +11,16 @@ class IndexcardDeriver(abc.ABC):
focus_iri: str
data: primitive_rdf.RdfGraph

def __init__(self, upriver_rdf: IndexcardRdf):
def __init__(
    self,
    upriver_rdf: IndexcardRdf,
    supplementary_rdf_set: Iterable[IndexcardRdf] = (),
):
    '''set up a deriver for one index card's rdf

    upriver_rdf: the primary rdf record; provides `focus_iri` and the
        initial contents of the `data` graph
    supplementary_rdf_set: zero or more additional rdf records whose
        triples are added into the same `data` graph
    '''
    self.upriver_rdf = upriver_rdf
    self.focus_iri = upriver_rdf.focus_iri
    # start from the primary record's triples...
    _combined_graph = primitive_rdf.RdfGraph(upriver_rdf.as_rdf_tripledict())
    # ...then fold in each supplement, in the order given
    for _supplement in supplementary_rdf_set:
        _combined_graph.add_tripledict(_supplement.as_rdf_tripledict())
    self.data = _combined_graph

def q(self, pathset):
# convenience for querying self.data on self.focus_iri
Expand Down
26 changes: 22 additions & 4 deletions trove/digestive_tract.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def swallow(
focus_iri: str,
datestamp=None, # default "now"
urgent=False,
is_supplementary=False,
):
'''swallow: store a given record by checksum; queue for extraction
Expand All @@ -55,7 +56,12 @@ def swallow(
_suid, _suid_created = share_db.SourceUniqueIdentifier.objects.get_or_create(
source_config=_source_config,
identifier=record_identifier,
defaults={
'is_supplementary': is_supplementary,
},
)
if bool(_suid.is_supplementary) != is_supplementary:
raise DigestiveError(f'suid is_supplementary should not change! suid={_suid}, is_supplementary changed from {bool(_suid.is_supplementary)} to {is_supplementary}')
_focus_identifier = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(focus_iri)
if _suid.focus_identifier is None:
_suid.focus_identifier = _focus_identifier
Expand Down Expand Up @@ -104,11 +110,12 @@ def swallow__sharev2_legacy(
def extract(raw: share_db.RawDatum, *, undelete_indexcards=False) -> list[trove_db.Indexcard]:
'''extract: gather rdf graph from a record; store as index card(s)
will create (or update):
may create (or update):
ResourceIdentifier (for each described resource and its types)
Indexcard (with identifiers and type-identifiers for each described resource)
ArchivedIndexcardRdf (all extracted metadata)
LatestIndexcardRdf (all extracted metadata, if latest raw)
ArchivedIndexcardRdf (all extracted metadata, if non-supplementary)
LatestIndexcardRdf (all extracted metadata, if latest raw and non-supplementary)
SupplementaryIndexcardRdf (all extracted metadata, if supplementary)
may delete:
LatestIndexcardRdf (previously extracted from the record, but no longer present)
'''
Expand Down Expand Up @@ -153,7 +160,10 @@ def derive(indexcard: trove_db.Indexcard, deriver_iris=None):
if indexcard.deleted or not indexcard.latest_rdf:
return
for _deriver_class in get_deriver_classes(deriver_iris):
_deriver = _deriver_class(upriver_rdf=indexcard.latest_rdf)
_deriver = _deriver_class(
upriver_rdf=indexcard.latest_rdf,
supplementary_rdf_set=indexcard.supplementary_rdf_set.all(),
)
_deriver_identifier = trove_db.ResourceIdentifier.objects.get_or_create_for_iri(_deriver.deriver_iri())
if _deriver.should_skip():
trove_db.DerivedIndexcard.objects.filter(
Expand All @@ -178,6 +188,11 @@ def expel(from_user: share_db.ShareUser, record_identifier: str):
source_config__source__user=from_user,
identifier=record_identifier,
)
(
trove_db.SupplementaryIndexcardRdf.objects
.filter(supplementary_suid__in=_suid_qs)
.delete()
)
for _indexcard in trove_db.Indexcard.objects.filter(source_record_suid__in=_suid_qs):
_indexcard.pls_delete()

Expand Down Expand Up @@ -247,6 +262,9 @@ def task__schedule_all_for_deriver(deriver_iri: str, notify_index=False):
def _sharev2_legacy_ingest(raw, urgent: bool):
assert raw.mediatype is None, 'raw datum has a mediatype -- did you mean to call non-legacy extract?'
_extractor = get_rdf_extractor_class(None)(raw.suid.source_config)
if typing.TYPE_CHECKING:
from trove.extract.legacy_sharev2 import LegacySharev2Extractor
assert isinstance(_extractor, LegacySharev2Extractor)
_sharev2graph = _extractor.extract_sharev2_graph(raw.datum)
_centralnode = _sharev2graph.get_central_node(guess=True)
_normd = share_db.NormalizedData.objects.create(
Expand Down
33 changes: 33 additions & 0 deletions trove/migrations/0006_supplementary_indexcard_rdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Generated by Django 3.2.25 on 2024-09-19 20:33

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    # Schema migration for the `trove` app: creates the
    # `SupplementaryIndexcardRdf` model, then adds a unique constraint on it.

    dependencies = [
        # the new model has a foreign key to `share.sourceuniqueidentifier`
        ('share', '0074_sourceuniqueidentifier_is_supplementary'),
        ('trove', '0005_indexes_for_oaipmh'),
    ]

    operations = [
        migrations.CreateModel(
            name='SupplementaryIndexcardRdf',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                # timestamps maintained by django (set on insert / on every save)
                ('created', models.DateTimeField(auto_now_add=True)),
                ('modified', models.DateTimeField(auto_now=True)),
                ('turtle_checksum_iri', models.TextField(db_index=True)),
                ('focus_iri', models.TextField()),
                ('rdf_as_turtle', models.TextField()),
                # all three foreign keys cascade: deleting the referenced row
                # also deletes this row
                # related_name='+' means no reverse accessor is added on RawDatum
                ('from_raw_datum', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='+', to='share.rawdatum')),
                ('indexcard', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='trove_supplementaryindexcardrdf_set', to='trove.indexcard')),
                ('supplementary_suid', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='supplementary_rdf_set', to='share.sourceuniqueidentifier')),
            ],
        ),
        # at most one row per (indexcard, supplementary_suid) pair
        migrations.AddConstraint(
            model_name='supplementaryindexcardrdf',
            constraint=models.UniqueConstraint(fields=('indexcard', 'supplementary_suid'), name='trove_supplementaryindexcardrdf_uniq_supplement'),
        ),
    ]
10 changes: 9 additions & 1 deletion trove/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,15 @@
'IndexcardRdf',
'LatestIndexcardRdf',
'ArchivedIndexcardRdf',
'SupplementaryIndexcardRdf',
'DerivedIndexcard',
)
from .indexcard import Indexcard, IndexcardRdf, LatestIndexcardRdf, ArchivedIndexcardRdf, DerivedIndexcard
from .indexcard import (
ArchivedIndexcardRdf,
DerivedIndexcard,
Indexcard,
IndexcardRdf,
LatestIndexcardRdf,
SupplementaryIndexcardRdf,
)
from .resource_identifier import ResourceIdentifier
Loading

0 comments on commit 5a7dd86

Please sign in to comment.