diff --git a/migrator/tests/transform/test_record_transform.py b/migrator/tests/transform/test_record_transform.py index 9075b2d8..5e39590e 100644 --- a/migrator/tests/transform/test_record_transform.py +++ b/migrator/tests/transform/test_record_transform.py @@ -156,6 +156,18 @@ def zenodo_record_data(): ], "references": [{"raw_reference": "Test reference"}], "keywords": ["migration", "test", "Zenodo", "RDM"], + "subjects": [ + { + "term": "Astronomy", + "identifier": "http://id.loc.gov/authorities/subjects/sh85009003", + "scheme": "url", + }, + { + "term": "Klassenrat", + "identifier": "gnd:4180044-8", + "scheme": "gnd", + }, + ], "_internal": { "source": { "agents": [ @@ -455,6 +467,18 @@ def expected_rdm_record_entry(): "title": "A book title", }, "thesis:university": "Test University", + "legacy:subjects": [ + { + "term": "Astronomy", + "identifier": "http://id.loc.gov/authorities/subjects/sh85009003", + "scheme": "url", + }, + { + "term": "Klassenrat", + "identifier": "gnd:4180044-8", + "scheme": "gnd", + }, + ], # dwc "dwc:basisOfRecord": ["foo", "bar"], "dwc:catalogNumber": ["foo", "bar"], @@ -656,6 +680,18 @@ def zenodo_draft_data(): }, ], "keywords": ["migration", "test", "Zenodo", "RDM"], + "subjects": [ + { + "term": "Astronomy", + "identifier": "http://id.loc.gov/authorities/subjects/sh85009003", + "scheme": "url", + }, + { + "term": "Klassenrat", + "identifier": "gnd:4180044-8", + "scheme": "gnd", + }, + ], "_internal": { "source": { "agents": [ @@ -919,6 +955,18 @@ def expected_rdm_draft_entry(): "title": "A book title", }, "thesis:university": "Test University", + "legacy:subjects": [ + { + "term": "Astronomy", + "identifier": "http://id.loc.gov/authorities/subjects/sh85009003", + "scheme": "url", + }, + { + "term": "Klassenrat", + "identifier": "gnd:4180044-8", + "scheme": "gnd", + }, + ], # dwc "dwc:basisOfRecord": ["foo", "bar"], "dwc:catalogNumber": ["foo", "bar"], diff --git 
a/migrator/zenodo_rdm_migrator/transform/entries/records/custom_fields.py b/migrator/zenodo_rdm_migrator/transform/entries/records/custom_fields.py index 7297114b..a948c5a0 100644 --- a/migrator/zenodo_rdm_migrator/transform/entries/records/custom_fields.py +++ b/migrator/zenodo_rdm_migrator/transform/entries/records/custom_fields.py @@ -13,6 +13,20 @@ class ZenodoCustomFieldsEntry(Entry): """Custom fields entry transform.""" + @classmethod + def _subjects(cls, subjects): + """Parse subjects.""" + res = [] + for s in subjects or []: + res.append( + { + "term": s.get("term"), + "identifier": s.get("identifier"), + "scheme": s.get("scheme"), + } + ) + return res or None + @classmethod def _journal(cls, journal): """Parse journal fields.""" @@ -131,6 +145,7 @@ def transform(cls, entry): entry.get("imprint", {}), entry.get("part_of", {}) ), "thesis:university": entry.get("thesis", {}).get("university"), + "legacy:subjects": cls._subjects(entry.get("subjects", [])), "openbiodiv:TaxonomicConceptLabel": entry.get("custom", {}).get( "openbiodiv:TaxonomicConceptLabel" ), diff --git a/site/tests/legacy/deposits/conftest.py b/site/tests/legacy/deposits/conftest.py index 7d402b6b..9f664ea8 100644 --- a/site/tests/legacy/deposits/conftest.py +++ b/site/tests/legacy/deposits/conftest.py @@ -149,11 +149,10 @@ def expected_record_metadata(): journal_title="Some journal name", journal_volume="Some volume", keywords=["Keyword 1", "keyword 2"], - # TODO uncomment when subjects are implemented - # subjects=[ - # dict(scheme="gnd", identifier="gnd:1234567899", term="Astronaut"), - # dict(scheme="gnd", identifier="gnd:1234567898", term="Amish"), - # ], + subjects=[ + dict(scheme="gnd", identifier="gnd:1234567899", term="Astronaut"), + dict(scheme="gnd", identifier="gnd:1234567898", term="Amish"), + ], license="cc-zero", notes="Some notes", partof_pages="SOme part of", diff --git a/site/zenodo_rdm/custom_fields.py b/site/zenodo_rdm/custom_fields.py index 148a81ec..2a4c6616 100644 --- 
class SubjectListCF(BaseCF):
    """Subject list custom field.

    Each item of the list is an object made of three string values:
    ``term``, ``identifier`` and ``scheme``.
    """

    @property
    def mapping(self):
        """Search mapping: an object whose three sub-fields are keywords."""
        subject_keys = ("term", "identifier", "scheme")
        return {
            "type": "object",
            # A fresh sub-mapping per key, so no dict is shared between keys.
            "properties": {key: {"type": "keyword"} for key in subject_keys},
        }

    @property
    def field(self):
        """Marshmallow field: a list of nested term/identifier/scheme dicts."""
        subject_keys = ("term", "identifier", "scheme")
        subject_schema = {key: SanitizedUnicode() for key in subject_keys}
        return fields.List(fields.Nested(subject_schema))
@post_load(pass_original=True)
def load_subjects(self, result, original, **kwargs):
    """Store legacy subjects as a custom field.

    Reads ``metadata.subjects`` from the original (pre-load) payload and
    copies the ``term``, ``identifier`` and ``scheme`` of every entry into
    ``result["custom_fields"]["legacy:subjects"]``. When the payload carries
    no subjects the loaded result is returned untouched.
    """
    legacy_subjects = original.get("metadata", {}).get("subjects", [])
    if not legacy_subjects:
        return result

    # setdefault hands back the existing dict when one is already present,
    # so custom fields set elsewhere are preserved.
    custom_fields = result.setdefault("custom_fields", {})
    custom_fields["legacy:subjects"] = [
        {key: subject.get(key) for key in ("term", "identifier", "scheme")}
        for subject in legacy_subjects
    ]
    return result
fields.List(fields.Nested(ContributorSchema), dump_only=True) + keywords = fields.Method("dump_keywords") + subjects = fields.Raw(attribute="custom_fields.legacy:subjects") + related_identifiers = fields.List(fields.Nested(RelatedIdentifierSchema)) locations = fields.Method("dump_locations") @@ -212,11 +215,9 @@ def resolve_license(self, data, **kwargs): data["license"] = rdm_to_legacy(license["id"]) return data - @post_dump(pass_original=True) - def dump_subjects(self, result, original, **kwargs): - """Dumps subjects.""" - subjects = original.get("subjects", []) - serialized_subjects = [] + def dump_keywords(self, obj): + """Dumps keywords from RDM subjects.""" + subjects = obj.get("subjects", []) serialized_keywords = [] if subjects: for _sbj in subjects: @@ -224,19 +225,12 @@ def dump_subjects(self, result, original, **kwargs): _subject = _sbj.get("subject") # If subject has an id, it's a controlled vocabulary if _id: - # TODO we still did not define a strategy to map legacy subjects to rdm. pass # Otherwise it's a free text string (keyword) elif _subject: serialized_keywords.append(_subject) - if serialized_keywords: - result["keywords"] = serialized_keywords - - if serialized_subjects: - result["subjects"] = serialized_subjects - - return result + return serialized_keywords or missing def dump_reference(self, obj): """Dumps reference."""