legacy/migrator: add "subjects" support

zenodo · Sep 19, 2023 · d76eee3 · d76eee3
1 parent 24aef8a
commit d76eee3
Show file tree

Hide file tree

Showing 7 changed files with 127 additions and 47 deletions.
diff --git a/migrator/tests/transform/test_record_transform.py b/migrator/tests/transform/test_record_transform.py
@@ -156,6 +156,18 @@ def zenodo_record_data():
             ],
             "references": [{"raw_reference": "Test reference"}],
             "keywords": ["migration", "test", "Zenodo", "RDM"],
+            "subjects": [
+                {
+                    "term": "Astronomy",
+                    "identifier": "http://id.loc.gov/authorities/subjects/sh85009003",
+                    "scheme": "url",
+                },
+                {
+                    "term": "Klassenrat",
+                    "identifier": "gnd:4180044-8",
+                    "scheme": "gnd",
+                },
+            ],
             "_internal": {
                 "source": {
                     "agents": [
@@ -455,6 +467,18 @@ def expected_rdm_record_entry():
                     "title": "A book title",
                 },
                 "thesis:university": "Test University",
+                "legacy:subjects": [
+                    {
+                        "term": "Astronomy",
+                        "identifier": "http://id.loc.gov/authorities/subjects/sh85009003",
+                        "scheme": "url",
+                    },
+                    {
+                        "term": "Klassenrat",
+                        "identifier": "gnd:4180044-8",
+                        "scheme": "gnd",
+                    },
+                ],
                 # dwc
                 "dwc:basisOfRecord": ["foo", "bar"],
                 "dwc:catalogNumber": ["foo", "bar"],
@@ -656,6 +680,18 @@ def zenodo_draft_data():
                 },
             ],
             "keywords": ["migration", "test", "Zenodo", "RDM"],
+            "subjects": [
+                {
+                    "term": "Astronomy",
+                    "identifier": "http://id.loc.gov/authorities/subjects/sh85009003",
+                    "scheme": "url",
+                },
+                {
+                    "term": "Klassenrat",
+                    "identifier": "gnd:4180044-8",
+                    "scheme": "gnd",
+                },
+            ],
             "_internal": {
                 "source": {
                     "agents": [
@@ -919,6 +955,18 @@ def expected_rdm_draft_entry():
                     "title": "A book title",
                 },
                 "thesis:university": "Test University",
+                "legacy:subjects": [
+                    {
+                        "term": "Astronomy",
+                        "identifier": "http://id.loc.gov/authorities/subjects/sh85009003",
+                        "scheme": "url",
+                    },
+                    {
+                        "term": "Klassenrat",
+                        "identifier": "gnd:4180044-8",
+                        "scheme": "gnd",
+                    },
+                ],
                 # dwc
                 "dwc:basisOfRecord": ["foo", "bar"],
                 "dwc:catalogNumber": ["foo", "bar"],

diff --git a/migrator/zenodo_rdm_migrator/transform/entries/records/custom_fields.py b/migrator/zenodo_rdm_migrator/transform/entries/records/custom_fields.py
@@ -13,6 +13,20 @@
 class ZenodoCustomFieldsEntry(Entry):
     """Custom fields entry transform."""
 
+    @classmethod
+    def _subjects(cls, subjects):
+        """Parse legacy subjects into the ``legacy:subjects`` custom field value.
+
+        :param subjects: iterable of mapping-like legacy subject entries; ``None``
+            or an empty value is treated as "no subjects".
+        :returns: a list of dicts holding only the ``term``, ``identifier`` and
+            ``scheme`` keys of each entry, or ``None`` when that list is empty.
+        """
+        res = []
+        # ``or []`` lets a ``None``/empty input skip the loop entirely.
+        for s in subjects or []:
+            res.append(
+                {
+                    # ``.get`` yields ``None`` for any key an entry does not carry.
+                    "term": s.get("term"),
+                    "identifier": s.get("identifier"),
+                    "scheme": s.get("scheme"),
+                }
+            )
+        # An empty result is reported as ``None`` rather than ``[]``.
+        return res or None
+
     @classmethod
     def _journal(cls, journal):
         """Parse journal fields."""
@@ -131,6 +145,7 @@ def transform(cls, entry):
                 entry.get("imprint", {}), entry.get("part_of", {})
             ),
             "thesis:university": entry.get("thesis", {}).get("university"),
+            "legacy:subjects": cls._subjects(entry.get("subjects", [])),
             "openbiodiv:TaxonomicConceptLabel": entry.get("custom", {}).get(
                 "openbiodiv:TaxonomicConceptLabel"
             ),

diff --git a/site/tests/legacy/deposits/conftest.py b/site/tests/legacy/deposits/conftest.py
@@ -149,11 +149,10 @@ def expected_record_metadata():
         journal_title="Some journal name",
         journal_volume="Some volume",
         keywords=["Keyword 1", "keyword 2"],
-        # TODO uncomment when subjects are implemented
-        # subjects=[
-        #     dict(scheme="gnd", identifier="gnd:1234567899", term="Astronaut"),
-        #     dict(scheme="gnd", identifier="gnd:1234567898", term="Amish"),
-        # ],
+        subjects=[
+            dict(scheme="gnd", identifier="gnd:1234567899", term="Astronaut"),
+            dict(scheme="gnd", identifier="gnd:1234567898", term="Amish"),
+        ],
         license="cc-zero",
         notes="Some notes",
         partof_pages="SOme part of",

diff --git a/site/zenodo_rdm/custom_fields.py b/site/zenodo_rdm/custom_fields.py
@@ -43,8 +43,39 @@
 from marshmallow import fields
 from marshmallow_utils.fields import SanitizedUnicode
 
+
+class SubjectListCF(BaseCF):
+    """Custom field holding a list of subjects (term, identifier, scheme).
+
+    Registered in ``LEGACY_CUSTOM_FIELDS`` under the name ``legacy:subjects``.
+    """
+
+    @property
+    def mapping(self):
+        """Return the search mapping: an object with three keyword properties."""
+        # NOTE(review): the mapping type is "object", not "nested". Search
+        # engines such as OpenSearch/Elasticsearch flatten arrays of objects,
+        # so the term/identifier/scheme of one subject are not kept associated
+        # at query time -- confirm this is acceptable for how the field is used.
+        return {
+            "type": "object",
+            "properties": {
+                "term": {"type": "keyword"},
+                "identifier": {"type": "keyword"},
+                "scheme": {"type": "keyword"},
+            },
+        }
+
+    @property
+    def field(self):
+        """Return the Marshmallow field used to load and dump the value.
+
+        The value is a list of nested objects whose ``term``, ``identifier``
+        and ``scheme`` are ``SanitizedUnicode`` strings; none of the three is
+        marked as required.
+        """
+        return fields.List(
+            fields.Nested(
+                {
+                    "term": SanitizedUnicode(),
+                    "identifier": SanitizedUnicode(),
+                    "scheme": SanitizedUnicode(),
+                }
+            )
+        )
+
+
 LEGACY_CUSTOM_FIELDS = [
     KeywordCF(name="legacy:communities", multiple=True),
+    SubjectListCF(name="legacy:subjects"),
 ]
 """Legacy compatibility custom fields."""
 

diff --git a/site/zenodo_rdm/legacy/deserializers/metadata.py b/site/zenodo_rdm/legacy/deserializers/metadata.py
@@ -158,7 +158,7 @@ def split_identifiers(self, data, **kwargs):
     contributors = fields.List(fields.Dict())
     additional_descriptions = fields.List(fields.Dict())
     locations = fields.Method(deserialize="load_locations")
-    subjects = fields.List(fields.Dict())
+    subjects = fields.Method(deserialize="load_subjects", data_key="keywords")
     version = SanitizedUnicode()
     dates = fields.Method(deserialize="load_dates")
     references = fields.Method(deserialize="load_references")
@@ -301,37 +301,14 @@ def load_locations(self, obj):
 
         return {"features": features}
 
-    @post_load(pass_original=True)
-    def _subjects(self, result, original, **kwargs):
+    def load_subjects(self, obj):
         """Transform subjects of a legacy record.
 
-        RDM subjects translate to either legacy keywords or subjects.
+        RDM subjects translate to legacy keywords.
+
+        Each legacy keyword becomes an RDM subject of the form
+        ``{"subject": <keyword>}``.
+
+        :param obj: the value to deserialize; the ``subjects`` field that uses
+            this method declares ``data_key="keywords"``.
+        :returns: a list with one ``{"subject": kw}`` dict per item of ``obj``,
+            or ``missing`` when ``obj`` is empty.
         """
-
-        def _from_keywords(keywords):
-            """Legacy keywords are free text strings.
-
-            They map to custom subjects.
-            """
-            return [{"subject": kw} for kw in keywords]
-
-        def _from_subjects(data):
-            """Maps RDM subjects to legacy subjects.
-
-            Legacy subjects are custom vocabularies.
-            """
-            # TODO we still did not define a strategy to map legacy subjects to rdm.
-            return []
-
-        keywords = original.get("keywords", [])
-        subjects = original.get("subjects", [])
-
-        if keywords or subjects:
-            rdm_subjects = _from_keywords(keywords) + _from_subjects(subjects)
-
-            result["subjects"] = rdm_subjects
-
-        return result
+        # ``missing`` is presumably marshmallow's sentinel (its import is not
+        # visible in this hunk) -- TODO confirm.
+        if not obj:
+            return missing
+        return [{"subject": kw} for kw in obj]
 
     def load_dates(self, obj):
         """Transform dates of a legacy record."""

diff --git a/site/zenodo_rdm/legacy/deserializers/schemas.py b/site/zenodo_rdm/legacy/deserializers/schemas.py
@@ -213,3 +213,19 @@ def load_communities(self, result, original, **kwargs):
             result.setdefault("custom_fields", {})
             result["custom_fields"].update({"legacy:communities": community_ids})
         return result
+
+    @post_load(pass_original=True)
+    def load_subjects(self, result, original, **kwargs):
+        """Store legacy subjects as a custom field.
+
+        Reads ``metadata.subjects`` from the original input and, when it is
+        non-empty, writes the entries to ``custom_fields["legacy:subjects"]``
+        of the loaded result, keeping only ``term``, ``identifier`` and
+        ``scheme`` of each entry.
+
+        :param result: the loaded data; updated in place.
+        :param original: the original input handed to the schema.
+        :returns: ``result``.
+        """
+        # NOTE(review): assumes ``metadata`` is a mapping whenever the key is
+        # present; an explicit null value would make the second ``.get`` fail.
+        subjects = original.get("metadata", {}).get("subjects", [])
+        if subjects:
+            # Create ``custom_fields`` only if no other hook has done so yet.
+            result.setdefault("custom_fields", {})
+            result["custom_fields"]["legacy:subjects"] = [
+                {
+                    # ``.get`` yields ``None`` for keys a subject does not carry.
+                    "term": s.get("term"),
+                    "identifier": s.get("identifier"),
+                    "scheme": s.get("scheme"),
+                }
+                for s in subjects
+            ]
+        return result
diff --git a/site/zenodo_rdm/legacy/serializers/schemas/common.py b/site/zenodo_rdm/legacy/serializers/schemas/common.py
@@ -166,6 +166,9 @@ class MetadataSchema(Schema):
     creators = fields.List(fields.Nested(CreatorSchema), dump_only=True)
     contributors = fields.List(fields.Nested(ContributorSchema), dump_only=True)
 
+    keywords = fields.Method("dump_keywords")
+    subjects = fields.Raw(attribute="custom_fields.legacy:subjects")
+
     related_identifiers = fields.List(fields.Nested(RelatedIdentifierSchema))
 
     locations = fields.Method("dump_locations")
@@ -212,31 +215,22 @@ def resolve_license(self, data, **kwargs):
             data["license"] = rdm_to_legacy(license["id"])
         return data
 
-    @post_dump(pass_original=True)
-    def dump_subjects(self, result, original, **kwargs):
-        """Dumps subjects."""
-        subjects = original.get("subjects", [])
-        serialized_subjects = []
+    def dump_keywords(self, obj):
+        """Dumps keywords from RDM subjects.
+
+        :param obj: mapping being serialized; its ``subjects`` list is read.
+        :returns: the ``subject`` text of every entry that has no ``id``, in
+            order, or ``missing`` when there is none (``missing`` is presumably
+            marshmallow's sentinel; its import is not visible in this hunk).
+        """
+        subjects = obj.get("subjects", [])
         serialized_keywords = []
         if subjects:
             for _sbj in subjects:
                 _id = _sbj.get("id")
                 _subject = _sbj.get("subject")
                 # If subject has an id, it's a controlled vocabulary
                 if _id:
-                    # TODO we still did not define a strategy to map legacy subjects to rdm.
+                    # Subjects with an id are not emitted as keywords.
                     pass
                 # Otherwise it's a free text string (keyword)
                 elif _subject:
                     serialized_keywords.append(_subject)
 
-        if serialized_keywords:
-            result["keywords"] = serialized_keywords
-
-        if serialized_subjects:
-            result["subjects"] = serialized_subjects
-
-        return result
+        return serialized_keywords or missing
 
     def dump_reference(self, obj):
         """Dumps reference."""