Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Legacy and migration fixes #520

Merged
merged 7 commits into from
Sep 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion legacy/zenodo_legacy/funders.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
"10.13039/501100000038": "01h531d29",
"10.13039/100000001": "021nxhr62",
"10.13039/501100003246": "04jsz6e67",
# NOTE: RCUK (10.13039/100014013) was succeeded by UKRI (10.13039/501100000690).
# NOTE: RCUK (10.13039/501100000690) was succeeded by UKRI (10.13039/100014013).
# All awards/grants were transferred, so we're also remapping the funder IDs to
# point to the UKRI ROR ID (001aqnf71).
"10.13039/501100000690": "001aqnf71",
Expand Down
13 changes: 12 additions & 1 deletion migrator/scripts/backup_indices.sql
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
-- From https://www.postgresql.org/message-id/flat/877em2racj.fsf%40gmail.com#36a9eba4b16b8172e379b2a19f403939
DROP TABLE rdm_index_backup;
CREATE TABLE rdm_index_backup AS
SELECT *
FROM pg_indexes
Expand All @@ -14,6 +15,9 @@ WHERE
'accounts_user_login_information',
'accounts_user_session_activity',
'oauth2server_token',
'oauth2server_client',
'oauthclient_remoteaccount',
'oauthclient_remotetoken',
'accounts_useridentity',
-- communities
'communities_metadata',
Expand All @@ -27,12 +31,19 @@ WHERE
'rdm_drafts_metadata',
'rdm_versions_state',
'rdm_records_files',
'rdm_records_media_files',
'rdm_drafts_files',
'rdm_drafts_media_files',
'pidstore_pid',
-- requests
'request_metadata',
-- github
'webhooks_events',
'github_repositories',
'github_releases'
'github_releases',
-- vocabularies
'name_metadata',
'affiliation_metadata',
'award_metadata',
'funder_metadata'
);
84 changes: 84 additions & 0 deletions migrator/scripts/dump_affiliations_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""Parse affiliations from ROR dumps into CSV format, importable via COPY.

To use call ``load_file(DATA_PATH, "affiliations.csv")``.
"""

import csv
import orjson as json
import uuid

from idutils import normalize_ror
from invenio_rdm_migrator.utils import ts

# Path of the ROR data dump to convert (see the linked Zenodo record).
# NOTE(review): this points at a .zip, but ``load_file`` reads the file
# verbatim as JSON — presumably the extracted dump is passed in practice.
DATA_PATH = "v1.32-2023-09-14-ror-data.zip"  # https://zenodo.org/record/8346986


# Identifier schemes kept for affiliations; any other ``external_ids``
# scheme present in the ROR dump is ignored by the transform.
VOCABULARIES_AFFILIATION_SCHEMES = {
    "grid",
    "gnd",
    "isni",
    "ror",
}


def transform_affiliation(data):
    """Transform one ROR dump entry into an affiliation vocabulary record.

    Returns ``None`` when the entry has no normalizable ROR id or no name,
    signalling the caller to skip it.
    """
    ror_id = normalize_ror(data.get("id"))
    if not ror_id:
        return

    name = data.get("name")
    if not name:
        return

    result = {
        "$schema": "local://affiliations/affiliation-v1.0.0.json",
        "id": ror_id,
        "name": name,
    }

    acronyms = data.get("acronyms") or []
    if acronyms:
        result["acronym"] = acronyms[0]

    # English title from the primary name, plus any translated labels.
    titles = {"en": name}
    for label in data.get("labels", []):
        titles[label["iso639"]] = label["label"]
    result["title"] = titles

    # The ROR is always listed in identifiers, expected by serialization
    identifiers = [{"identifier": ror_id, "scheme": "ror"}]
    for raw_scheme, identifier in data.get("external_ids", {}).items():
        scheme = raw_scheme.lower()
        if scheme in VOCABULARIES_AFFILIATION_SCHEMES:
            value = identifier.get("preferred") or identifier.get("all")[0]
            identifiers.append({"identifier": value, "scheme": scheme})
    result["identifiers"] = identifiers

    return result


def load_file(datafile, outpath):
    """Load the ROR data file and dump it as CSV, importable via ``COPY``.

    Each written row follows the vocabulary table layout used by the other
    dump scripts: ``(id, pid, json, created, updated, version_id)``.
    Entries that fail to transform are logged and skipped so a single bad
    record cannot abort the whole dump.
    """
    with open(outpath, "w") as fout, open(datafile, "rb") as fp:
        print(f"[{ts()}] loading {datafile}")
        writer = csv.writer(fout)
        entries = json.loads(fp.read())
        for idx, data in enumerate(entries):
            if idx % 1000 == 0:
                # Progress heartbeat every 1000 entries.
                print(f"[{ts()}] {idx}")
            try:
                affiliation = transform_affiliation(data)
                if not affiliation:
                    print(f"[{ts()}] Failed to transform #{idx}:\n{data}\n")
                    continue
                affiliation_id = affiliation.pop("id")
                creation_ts = ts()
                writer.writerow(
                    (
                        str(uuid.uuid4()),  # id
                        affiliation_id,  # pid
                        # orjson.dumps returns bytes; decode so the CSV cell
                        # holds the JSON text, not a ``b'...'`` repr.
                        json.dumps(affiliation).decode("utf-8"),  # json
                        creation_ts,  # created
                        creation_ts,  # updated (same as created)
                        1,  # version_id
                    )
                )
            except Exception as ex:
                print(f"[{ts()}] Exception for line {idx}:\n{data}\n\n{ex}\n")
12 changes: 6 additions & 6 deletions migrator/scripts/dump_awards_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@
from invenio_rdm_migrator.utils import ts

DATA_PATHS = [
"awards-2023-08.jsonl.gz" # https://zenodo.org/record/8224080
"awards-2023-04.jsonl.gz" # https://zenodo.org/record/7870151
"awards-2023-03.jsonl.gz" # https://zenodo.org/record/7803150
"awards-2023-02.jsonl.gz" # https://zenodo.org/record/7683844
"awards-2023-01.jsonl.gz" # https://zenodo.org/record/7561801
"awards-2022-12.jsonl.gz" # https://zenodo.org/record/7745773
"awards-2023-08.jsonl.gz", # https://zenodo.org/record/8224080
"awards-2023-04.jsonl.gz", # https://zenodo.org/record/7870151
"awards-2023-03.jsonl.gz", # https://zenodo.org/record/7803150
"awards-2023-02.jsonl.gz", # https://zenodo.org/record/7683844
"awards-2023-01.jsonl.gz", # https://zenodo.org/record/7561801
"awards-2022-12.jsonl.gz", # https://zenodo.org/record/7745773
]

VOCABULARIES_AWARDS_OPENAIRE_FUNDERS = {
Expand Down
7 changes: 4 additions & 3 deletions migrator/scripts/dump_funders_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,14 @@
"""

import csv
import json
import orjson as json
import uuid

from idutils import normalize_ror
from invenio_rdm_migrator.utils import ts

DATA_PATH = "v1.25-2023-05-11-ror-data.zip" # https://zenodo.org/record/7926988
DATA_PATH = "v1.32-2023-09-14-ror-data.zip" # https://zenodo.org/record/8346986


VOCABULARIES_FUNDER_SCHEMES = {
"grid",
Expand Down Expand Up @@ -72,7 +73,7 @@ def load_file(datafile, outpath):
with open(outpath, "w") as fout, open(datafile, "rb") as fp:
print(f"[{ts()}] loading {datafile}")
writer = csv.writer(fout)
entries = json.load(fp)
entries = json.loads(fp.read())
for idx, data in enumerate(entries):
if idx % 1000 == 0:
print(f"[{ts()}] {idx}")
Expand Down
2 changes: 1 addition & 1 deletion migrator/tests/actions/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def state(tmp_dir):
state_db = StateDB(
db_dir=tmp_dir.name, validators={"parents": ParentModelValidator}
)
STATE.initialized_state(state_db, cache=False)
STATE.initialized_state(state_db, cache=False, search_cache=False)

return STATE

Expand Down
1 change: 1 addition & 0 deletions migrator/tests/transform/test_community_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def expected_rdm_community():
},
},
"bucket_id": None,
"deletion_status": "P",
}


Expand Down
48 changes: 48 additions & 0 deletions migrator/tests/transform/test_record_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,18 @@ def zenodo_record_data():
],
"references": [{"raw_reference": "Test reference"}],
"keywords": ["migration", "test", "Zenodo", "RDM"],
"subjects": [
{
"term": "Astronomy",
"identifier": "http://id.loc.gov/authorities/subjects/sh85009003",
"scheme": "url",
},
{
"term": "Klassenrat",
"identifier": "gnd:4180044-8",
"scheme": "gnd",
},
],
"_internal": {
"source": {
"agents": [
Expand Down Expand Up @@ -455,6 +467,18 @@ def expected_rdm_record_entry():
"title": "A book title",
},
"thesis:university": "Test University",
"legacy:subjects": [
{
"term": "Astronomy",
"identifier": "http://id.loc.gov/authorities/subjects/sh85009003",
"scheme": "url",
},
{
"term": "Klassenrat",
"identifier": "gnd:4180044-8",
"scheme": "gnd",
},
],
# dwc
"dwc:basisOfRecord": ["foo", "bar"],
"dwc:catalogNumber": ["foo", "bar"],
Expand Down Expand Up @@ -656,6 +680,18 @@ def zenodo_draft_data():
},
],
"keywords": ["migration", "test", "Zenodo", "RDM"],
"subjects": [
{
"term": "Astronomy",
"identifier": "http://id.loc.gov/authorities/subjects/sh85009003",
"scheme": "url",
},
{
"term": "Klassenrat",
"identifier": "gnd:4180044-8",
"scheme": "gnd",
},
],
"_internal": {
"source": {
"agents": [
Expand Down Expand Up @@ -919,6 +955,18 @@ def expected_rdm_draft_entry():
"title": "A book title",
},
"thesis:university": "Test University",
"legacy:subjects": [
{
"term": "Astronomy",
"identifier": "http://id.loc.gov/authorities/subjects/sh85009003",
"scheme": "url",
},
{
"term": "Klassenrat",
"identifier": "gnd:4180044-8",
"scheme": "gnd",
},
],
# dwc
"dwc:basisOfRecord": ["foo", "bar"],
"dwc:catalogNumber": ["foo", "bar"],
Expand Down
4 changes: 4 additions & 0 deletions migrator/zenodo_rdm_migrator/transform/entries/communities.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ def _bucket_id(self, entry):
"""Returns the community bucket id."""
return None

def _deletion_status(self, entry):
"""Returns the community's deletion status."""
return "P"

def _metadata(self, entry):
"""Returns community metadata."""
# Clean-up description
Expand Down
16 changes: 15 additions & 1 deletion migrator/zenodo_rdm_migrator/transform/entries/parents.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"""Zenodo migrator parent record transformer entries."""


from invenio_rdm_migrator.state import STATE
from invenio_rdm_migrator.transform import Entry

from ...errors import NoConceptRecidForDraft
Expand Down Expand Up @@ -76,15 +77,28 @@ def transform(self, entry):
# we raise so the error logger writes these cases in the log file
raise NoConceptRecidForDraft(draft=entry)

communities = self._communities(entry)
transformed["json"] = {
# loader is responsible for creating/updating if the PID exists.
"id": parent_pid,
"communities": self._communities(entry),
"communities": communities,
"pids": self._pids(entry),
}
owner = next(iter(entry["json"].get("owners", [])), None)
if owner is not None:
transformed["json"]["access"] = {"owned_by": {"user": owner}}

permission_flags = {}
owner_comm_slugs = {
comm["slug"]
for comm in (
STATE.COMMUNITIES.search("owner_id", owner) if owner else []
)
}
comm_slugs = set(communities.get("ids", []))
has_only_managed_communities = comm_slugs < owner_comm_slugs
if not has_only_managed_communities:
permission_flags["can_community_manage_record"] = False
elif not self.partial:
raise KeyError("json")
# else, pass
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,20 @@
class ZenodoCustomFieldsEntry(Entry):
"""Custom fields entry transform."""

@classmethod
def _subjects(cls, subjects):
    """Parse legacy subjects.

    Keeps only the ``term``, ``identifier`` and ``scheme`` keys of each
    subject (missing keys default to ``None``). Returns ``None`` instead
    of an empty list so the custom field is omitted when there are no
    subjects.
    """
    # Comprehension instead of a manual append loop (same output).
    res = [
        {
            "term": s.get("term"),
            "identifier": s.get("identifier"),
            "scheme": s.get("scheme"),
        }
        for s in subjects or []
    ]
    return res or None

@classmethod
def _journal(cls, journal):
"""Parse journal fields."""
Expand Down Expand Up @@ -131,6 +145,7 @@ def transform(cls, entry):
entry.get("imprint", {}), entry.get("part_of", {})
),
"thesis:university": entry.get("thesis", {}).get("university"),
"legacy:subjects": cls._subjects(entry.get("subjects", [])),
"openbiodiv:TaxonomicConceptLabel": entry.get("custom", {}).get(
"openbiodiv:TaxonomicConceptLabel"
),
Expand Down
2 changes: 1 addition & 1 deletion scripts/admin/community_manage_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def create_community_manage_record_request(record_id):

# add a permission flag to db (make record a legacy one)
db_record = RDMRecord.get_record(record_id)
db_record.parent.permission_flags = {"can_community_manage_record": True}
db_record.parent.permission_flags = {"can_community_manage_record": False}
db_record.parent.commit()
db.session.commit()

Expand Down
9 changes: 4 additions & 5 deletions site/tests/legacy/deposits/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,11 +149,10 @@ def expected_record_metadata():
journal_title="Some journal name",
journal_volume="Some volume",
keywords=["Keyword 1", "keyword 2"],
# TODO uncomment when subjects are implemented
# subjects=[
# dict(scheme="gnd", identifier="gnd:1234567899", term="Astronaut"),
# dict(scheme="gnd", identifier="gnd:1234567898", term="Amish"),
# ],
subjects=[
dict(scheme="gnd", identifier="gnd:1234567899", term="Astronaut"),
dict(scheme="gnd", identifier="gnd:1234567898", term="Amish"),
],
license="cc-zero",
notes="Some notes",
partof_pages="SOme part of",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def test_submit_a_request(uploader):
assert db_request["title"] == "Communities manage legacy records"
assert db_request["expires_at"] is not None
assert db_request["description"].startswith(
"<h4>Some of your records, that are going through migration"
"<h4>Some of your records, that are going through the migration"
)


Expand Down
Loading
Loading