Skip to content

Commit

Permalink
legacy/migrator: add "subjects" support
Browse files Browse the repository at this point in the history
  • Loading branch information
slint committed Sep 19, 2023
1 parent 24aef8a commit d76eee3
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 47 deletions.
48 changes: 48 additions & 0 deletions migrator/tests/transform/test_record_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,18 @@ def zenodo_record_data():
],
"references": [{"raw_reference": "Test reference"}],
"keywords": ["migration", "test", "Zenodo", "RDM"],
"subjects": [
{
"term": "Astronomy",
"identifier": "http://id.loc.gov/authorities/subjects/sh85009003",
"scheme": "url",
},
{
"term": "Klassenrat",
"identifier": "gnd:4180044-8",
"scheme": "gnd",
},
],
"_internal": {
"source": {
"agents": [
Expand Down Expand Up @@ -455,6 +467,18 @@ def expected_rdm_record_entry():
"title": "A book title",
},
"thesis:university": "Test University",
"legacy:subjects": [
{
"term": "Astronomy",
"identifier": "http://id.loc.gov/authorities/subjects/sh85009003",
"scheme": "url",
},
{
"term": "Klassenrat",
"identifier": "gnd:4180044-8",
"scheme": "gnd",
},
],
# dwc
"dwc:basisOfRecord": ["foo", "bar"],
"dwc:catalogNumber": ["foo", "bar"],
Expand Down Expand Up @@ -656,6 +680,18 @@ def zenodo_draft_data():
},
],
"keywords": ["migration", "test", "Zenodo", "RDM"],
"subjects": [
{
"term": "Astronomy",
"identifier": "http://id.loc.gov/authorities/subjects/sh85009003",
"scheme": "url",
},
{
"term": "Klassenrat",
"identifier": "gnd:4180044-8",
"scheme": "gnd",
},
],
"_internal": {
"source": {
"agents": [
Expand Down Expand Up @@ -919,6 +955,18 @@ def expected_rdm_draft_entry():
"title": "A book title",
},
"thesis:university": "Test University",
"legacy:subjects": [
{
"term": "Astronomy",
"identifier": "http://id.loc.gov/authorities/subjects/sh85009003",
"scheme": "url",
},
{
"term": "Klassenrat",
"identifier": "gnd:4180044-8",
"scheme": "gnd",
},
],
# dwc
"dwc:basisOfRecord": ["foo", "bar"],
"dwc:catalogNumber": ["foo", "bar"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,20 @@
class ZenodoCustomFieldsEntry(Entry):
"""Custom fields entry transform."""

@classmethod
def _subjects(cls, subjects):
    """Reduce legacy subjects to their term/identifier/scheme keys.

    Each input mapping is copied into a new dict holding only the ``term``,
    ``identifier`` and ``scheme`` keys; a key absent from the input is set
    to ``None``. A falsy ``subjects`` value (``None`` or an empty list)
    yields ``None`` rather than an empty list.
    """
    wanted_keys = ("term", "identifier", "scheme")
    parsed = [
        {key: subject.get(key) for key in wanted_keys}
        for subject in subjects or []
    ]
    return parsed if parsed else None

@classmethod
def _journal(cls, journal):
"""Parse journal fields."""
Expand Down Expand Up @@ -131,6 +145,7 @@ def transform(cls, entry):
entry.get("imprint", {}), entry.get("part_of", {})
),
"thesis:university": entry.get("thesis", {}).get("university"),
"legacy:subjects": cls._subjects(entry.get("subjects", [])),
"openbiodiv:TaxonomicConceptLabel": entry.get("custom", {}).get(
"openbiodiv:TaxonomicConceptLabel"
),
Expand Down
9 changes: 4 additions & 5 deletions site/tests/legacy/deposits/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,11 +149,10 @@ def expected_record_metadata():
journal_title="Some journal name",
journal_volume="Some volume",
keywords=["Keyword 1", "keyword 2"],
# TODO uncomment when subjects are implemented
# subjects=[
# dict(scheme="gnd", identifier="gnd:1234567899", term="Astronaut"),
# dict(scheme="gnd", identifier="gnd:1234567898", term="Amish"),
# ],
subjects=[
dict(scheme="gnd", identifier="gnd:1234567899", term="Astronaut"),
dict(scheme="gnd", identifier="gnd:1234567898", term="Amish"),
],
license="cc-zero",
notes="Some notes",
partof_pages="SOme part of",
Expand Down
31 changes: 31 additions & 0 deletions site/zenodo_rdm/custom_fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,39 @@
from marshmallow import fields
from marshmallow_utils.fields import SanitizedUnicode


class SubjectListCF(BaseCF):
    """Custom field for a list of subject entries.

    Each entry is described by three string sub-fields: ``term``,
    ``identifier`` and ``scheme``.
    """

    @property
    def mapping(self):
        """Search mapping: an object with one keyword property per sub-field."""
        sub_fields = {
            name: {"type": "keyword"} for name in ("term", "identifier", "scheme")
        }
        return {"type": "object", "properties": sub_fields}

    @property
    def field(self):
        """Marshmallow field: a list of nested entries of sanitized strings."""
        entry_schema = {
            name: SanitizedUnicode() for name in ("term", "identifier", "scheme")
        }
        return fields.List(fields.Nested(entry_schema))


LEGACY_CUSTOM_FIELDS = [
KeywordCF(name="legacy:communities", multiple=True),
SubjectListCF(name="legacy:subjects"),
]
"""Legacy compatibility custom fields."""

Expand Down
35 changes: 6 additions & 29 deletions site/zenodo_rdm/legacy/deserializers/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def split_identifiers(self, data, **kwargs):
contributors = fields.List(fields.Dict())
additional_descriptions = fields.List(fields.Dict())
locations = fields.Method(deserialize="load_locations")
subjects = fields.List(fields.Dict())
subjects = fields.Method(deserialize="load_subjects", data_key="keywords")
version = SanitizedUnicode()
dates = fields.Method(deserialize="load_dates")
references = fields.Method(deserialize="load_references")
Expand Down Expand Up @@ -301,37 +301,14 @@ def load_locations(self, obj):

return {"features": features}

@post_load(pass_original=True)
def _subjects(self, result, original, **kwargs):
def load_subjects(self, obj):
"""Transform subjects of a legacy record.
RDM subjects translate to either legacy keywords or subjects.
RDM subjects translate to legacy keywords.
"""

def _from_keywords(keywords):
"""Legacy keywords are free text strings.
They map to custom subjects.
"""
return [{"subject": kw} for kw in keywords]

def _from_subjects(data):
"""Maps RDM subjects to legacy subjects.
Legacy subjects are custom vocabularies.
"""
# TODO we still did not define a strategy to map legacy subjects to rdm.
return []

keywords = original.get("keywords", [])
subjects = original.get("subjects", [])

if keywords or subjects:
rdm_subjects = _from_keywords(keywords) + _from_subjects(subjects)

result["subjects"] = rdm_subjects

return result
if not obj:
return missing
return [{"subject": kw} for kw in obj]

def load_dates(self, obj):
"""Transform dates of a legacy record."""
Expand Down
16 changes: 16 additions & 0 deletions site/zenodo_rdm/legacy/deserializers/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,3 +213,19 @@ def load_communities(self, result, original, **kwargs):
result.setdefault("custom_fields", {})
result["custom_fields"].update({"legacy:communities": community_ids})
return result

@post_load(pass_original=True)
def load_subjects(self, result, original, **kwargs):
    """Copy legacy subjects from the original payload into a custom field.

    Reads ``metadata.subjects`` from the original input. When it is
    non-empty, each entry is reduced to its ``term``, ``identifier`` and
    ``scheme`` keys (missing keys become ``None``) and the list is stored
    under ``custom_fields["legacy:subjects"]`` of the loaded result.
    The result is returned unchanged when there are no subjects.
    """
    legacy_subjects = original.get("metadata", {}).get("subjects", [])
    if not legacy_subjects:
        return result

    # setdefault hands back the existing dict, so other custom fields survive.
    custom_fields = result.setdefault("custom_fields", {})
    custom_fields["legacy:subjects"] = [
        {key: subject.get(key) for key in ("term", "identifier", "scheme")}
        for subject in legacy_subjects
    ]
    return result
20 changes: 7 additions & 13 deletions site/zenodo_rdm/legacy/serializers/schemas/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,9 @@ class MetadataSchema(Schema):
creators = fields.List(fields.Nested(CreatorSchema), dump_only=True)
contributors = fields.List(fields.Nested(ContributorSchema), dump_only=True)

keywords = fields.Method("dump_keywords")
subjects = fields.Raw(attribute="custom_fields.legacy:subjects")

related_identifiers = fields.List(fields.Nested(RelatedIdentifierSchema))

locations = fields.Method("dump_locations")
Expand Down Expand Up @@ -212,31 +215,22 @@ def resolve_license(self, data, **kwargs):
data["license"] = rdm_to_legacy(license["id"])
return data

@post_dump(pass_original=True)
def dump_subjects(self, result, original, **kwargs):
"""Dumps subjects."""
subjects = original.get("subjects", [])
serialized_subjects = []
def dump_keywords(self, obj):
"""Dumps keywords from RDM subjects."""
subjects = obj.get("subjects", [])
serialized_keywords = []
if subjects:
for _sbj in subjects:
_id = _sbj.get("id")
_subject = _sbj.get("subject")
# If subject has an id, it's a controlled vocabulary
if _id:
# TODO we still did not define a strategy to map legacy subjects to rdm.
pass
# Otherwise it's a free text string (keyword)
elif _subject:
serialized_keywords.append(_subject)

if serialized_keywords:
result["keywords"] = serialized_keywords

if serialized_subjects:
result["subjects"] = serialized_subjects

return result
return serialized_keywords or missing

def dump_reference(self, obj):
"""Dumps reference."""
Expand Down

0 comments on commit d76eee3

Please sign in to comment.