From 0a0c0062224edfb876866c9744e5ac14f11a253b Mon Sep 17 00:00:00 2001
From: Alex Ioannidis
Date: Mon, 18 Sep 2023 19:54:04 +0200
Subject: [PATCH 1/7] migrator: fix index backup script

---
 migrator/scripts/backup_indices.sql | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/migrator/scripts/backup_indices.sql b/migrator/scripts/backup_indices.sql
index 729fc025..07757deb 100644
--- a/migrator/scripts/backup_indices.sql
+++ b/migrator/scripts/backup_indices.sql
@@ -1,4 +1,5 @@
 -- From https://www.postgresql.org/message-id/flat/877em2racj.fsf%40gmail.com#36a9eba4b16b8172e379b2a19f403939
+DROP TABLE rdm_index_backup;
 CREATE TABLE rdm_index_backup AS
 SELECT *
 FROM pg_indexes
@@ -14,6 +15,9 @@ WHERE
         'accounts_user_login_information',
         'accounts_user_session_activity',
         'oauth2server_token',
+        'oauth2server_client',
+        'oauthclient_remoteaccount',
+        'oauthclient_remotetoken',
         'accounts_useridentity',
         -- communities
         'communities_metadata',
@@ -27,12 +31,19 @@ WHERE
         'rdm_drafts_metadata',
         'rdm_versions_state',
         'rdm_records_files',
+        'rdm_records_media_files',
         'rdm_drafts_files',
+        'rdm_drafts_media_files',
         'pidstore_pid',
         -- requests
         'request_metadata',
         -- github
         'webhooks_events',
         'github_repositories',
-        'github_releases'
+        'github_releases',
+        -- vocabularies
+        'name_metadata',
+        'affiliation_metadata',
+        'award_metadata',
+        'funder_metadata'
     );

From 24ecd5f1944d4441cdede0d49e6db5518dc4eded Mon Sep 17 00:00:00 2001
From: Alex Ioannidis
Date: Tue, 19 Sep 2023 10:41:26 +0200
Subject: [PATCH 2/7] migrator: fix awards dump script

---
 legacy/zenodo_legacy/funders.py    |  2 +-
 migrator/scripts/dump_awards_db.py | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/legacy/zenodo_legacy/funders.py b/legacy/zenodo_legacy/funders.py
index a640db2a..3a237d75 100644
--- a/legacy/zenodo_legacy/funders.py
+++ b/legacy/zenodo_legacy/funders.py
@@ -27,7 +27,7 @@
     "10.13039/501100000038": "01h531d29",
     "10.13039/100000001": "021nxhr62",
     "10.13039/501100003246": "04jsz6e67",
-    # NOTE: RCUK (10.13039/100014013) was succeeded by UKRI (10.13039/501100000690).
+    # NOTE: RCUK (10.13039/501100000690) was succeeded by UKRI (10.13039/100014013).
     # All awards/grants were transferred, so we're also remapping the funder IDs to
     # point to the UKRI ROR ID (001aqnf71).
     "10.13039/501100000690": "001aqnf71",

diff --git a/migrator/scripts/dump_awards_db.py b/migrator/scripts/dump_awards_db.py
index 47362fcb..1d616eb8 100644
--- a/migrator/scripts/dump_awards_db.py
+++ b/migrator/scripts/dump_awards_db.py
@@ -13,12 +13,12 @@
 from invenio_rdm_migrator.utils import ts
 
 DATA_PATHS = [
-    "awards-2023-08.jsonl.gz"  # https://zenodo.org/record/8224080
-    "awards-2023-04.jsonl.gz"  # https://zenodo.org/record/7870151
-    "awards-2023-03.jsonl.gz"  # https://zenodo.org/record/7803150
-    "awards-2023-02.jsonl.gz"  # https://zenodo.org/record/7683844
-    "awards-2023-01.jsonl.gz"  # https://zenodo.org/record/7561801
-    "awards-2022-12.jsonl.gz"  # https://zenodo.org/record/7745773
+    "awards-2023-08.jsonl.gz",  # https://zenodo.org/record/8224080
+    "awards-2023-04.jsonl.gz",  # https://zenodo.org/record/7870151
+    "awards-2023-03.jsonl.gz",  # https://zenodo.org/record/7803150
+    "awards-2023-02.jsonl.gz",  # https://zenodo.org/record/7683844
+    "awards-2023-01.jsonl.gz",  # https://zenodo.org/record/7561801
+    "awards-2022-12.jsonl.gz",  # https://zenodo.org/record/7745773
 ]
 
 VOCABULARIES_AWARDS_OPENAIRE_FUNDERS = {

From fa8591fc686cd286ee11a2bdc4fcf07e2832bfd4 Mon Sep 17 00:00:00 2001
From: Alex Ioannidis
Date: Tue, 19 Sep 2023 10:43:52 +0200
Subject: [PATCH 3/7] migrator: add "deletion_status" to community

---
 migrator/zenodo_rdm_migrator/transform/entries/communities.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/migrator/zenodo_rdm_migrator/transform/entries/communities.py b/migrator/zenodo_rdm_migrator/transform/entries/communities.py
index c7e4ade3..dc19d346 100644
--- a/migrator/zenodo_rdm_migrator/transform/entries/communities.py
+++ b/migrator/zenodo_rdm_migrator/transform/entries/communities.py
@@ -68,6 +68,10 @@ def _bucket_id(self, entry):
         """Returns the community bucket id."""
         return None
 
+    def _deletion_status(self, entry):
+        """Returns the community's deletion status."""
+        return "P"
+
     def _metadata(self, entry):
         """Returns community metadata."""
         # Clean-up description

From 6629c721d137bee99feff66be1f8385f93c3dedc Mon Sep 17 00:00:00 2001
From: Alex Ioannidis
Date: Tue, 19 Sep 2023 14:57:37 +0200
Subject: [PATCH 4/7] legacy/migrator: communities fixes

---
 migrator/tests/actions/conftest.py               |  2 +-
 .../tests/transform/test_community_transform.py  |  1 +
 .../transform/entries/parents.py                 | 16 +++++++++++++++-
 scripts/admin/community_manage_requests.py       |  2 +-
 .../test_community_manage_record_request.py      |  2 +-
 .../legacy/requests/community_manage_record.py   | 10 +++++-----
 6 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/migrator/tests/actions/conftest.py b/migrator/tests/actions/conftest.py
index e9f6a1e6..400d0589 100644
--- a/migrator/tests/actions/conftest.py
+++ b/migrator/tests/actions/conftest.py
@@ -56,7 +56,7 @@ def state(tmp_dir):
     state_db = StateDB(
         db_dir=tmp_dir.name, validators={"parents": ParentModelValidator}
     )
-    STATE.initialized_state(state_db, cache=False)
+    STATE.initialized_state(state_db, cache=False, search_cache=False)
     return STATE
 
 
diff --git a/migrator/tests/transform/test_community_transform.py b/migrator/tests/transform/test_community_transform.py
index 33192856..5cc12ee1 100644
--- a/migrator/tests/transform/test_community_transform.py
+++ b/migrator/tests/transform/test_community_transform.py
@@ -58,6 +58,7 @@ def expected_rdm_community():
             },
         },
         "bucket_id": None,
+        "deletion_status": "P",
     }
 
 
diff --git a/migrator/zenodo_rdm_migrator/transform/entries/parents.py b/migrator/zenodo_rdm_migrator/transform/entries/parents.py
index 2d1be518..6c4cf41f 100644
--- a/migrator/zenodo_rdm_migrator/transform/entries/parents.py
+++ b/migrator/zenodo_rdm_migrator/transform/entries/parents.py
@@ -8,6 +8,7 @@
 
 """Zenodo migrator parent record transformer entries."""
 
+from invenio_rdm_migrator.state import STATE
 from invenio_rdm_migrator.transform import Entry
 
 from ...errors import NoConceptRecidForDraft
@@ -76,15 +77,28 @@ def transform(self, entry):
                 # we raise so the error logger writes these cases in the log file
                 raise NoConceptRecidForDraft(draft=entry)
 
+            communities = self._communities(entry)
             transformed["json"] = {
                 # loader is responsible for creating/updating if the PID exists.
                 "id": parent_pid,
-                "communities": self._communities(entry),
+                "communities": communities,
                 "pids": self._pids(entry),
             }
             owner = next(iter(entry["json"].get("owners", [])), None)
             if owner is not None:
                 transformed["json"]["access"] = {"owned_by": {"user": owner}}
+
+            permission_flags = {}
+            owner_comm_slugs = {
+                comm["slug"]
+                for comm in (
+                    STATE.COMMUNITIES.search("owner_id", owner) if owner else []
+                )
+            }
+            comm_slugs = set(communities.get("ids", []))
+            has_only_managed_communities = comm_slugs < owner_comm_slugs
+            if not has_only_managed_communities:
+                permission_flags["can_community_manage_record"] = False
         elif not self.partial:
             raise KeyError("json")
         # else, pass
diff --git a/scripts/admin/community_manage_requests.py b/scripts/admin/community_manage_requests.py
index 9609b005..39d62ac5 100644
--- a/scripts/admin/community_manage_requests.py
+++ b/scripts/admin/community_manage_requests.py
@@ -17,7 +17,7 @@ def create_community_manage_record_request(record_id):
 
     # add a permission flag to db (make record a legacy one)
    db_record = RDMRecord.get_record(record_id)
-    db_record.parent.permission_flags = {"can_community_manage_record": True}
+    db_record.parent.permission_flags = {"can_community_manage_record": False}
     db_record.parent.commit()
     db.session.commit()
 
diff --git a/site/tests/requests/test_community_manage_record_request.py b/site/tests/requests/test_community_manage_record_request.py
index 83a44e39..1001f8cd 100644
--- a/site/tests/requests/test_community_manage_record_request.py
+++ b/site/tests/requests/test_community_manage_record_request.py
@@ -87,7 +87,7 @@ def test_submit_a_request(uploader):
     assert db_request["title"] == "Communities manage legacy records"
     assert db_request["expires_at"] is not None
     assert db_request["description"].startswith(
-        "<br /><br />Some of your records, that are going through migration"
+        "<br /><br />Some of your records, that are going through the migration"
     )
 
 
diff --git a/site/zenodo_rdm/legacy/requests/community_manage_record.py b/site/zenodo_rdm/legacy/requests/community_manage_record.py
index a1cacce3..fbc780e8 100644
--- a/site/zenodo_rdm/legacy/requests/community_manage_record.py
+++ b/site/zenodo_rdm/legacy/requests/community_manage_record.py
@@ -96,13 +96,13 @@ def execute(self, identity, uow):
         # example: "May 11, 2024"
         expires_at = self.request.expires_at.strftime("%B %d, %Y")
         self.request["description"] = (
-            "<br /><br />Some of your records, that are going through migration process are part "
-            "of the communities that don't belong to you.<br />Accept this request to keep the old "
-            "behaviour and allow community curators to manage (edit, create new version, add to "
-            "another community, etc.) corresponding record.<br />In case of declining this "
+            "<br /><br />Some of your records, that are going through the migration process are part "
+            "of communities that don't belong to you.<br />Accept this request to adopt the new "
+            "behaviour and allow community curators to manage (edit, create new versions, add to "
+            "another community, etc.) your corresponding records.<br />In case of declining this "
             "request all your legacy records will be removed from all communities "
             "that you are not an owner of.<br /><br />If you do not perform any action by "
-            f"{expires_at}, the permission for community curators to manage the record "
+            f"{expires_at}, the permission for community curators to manage your records "
             "will automatically be fully granted.<br /><br />"
         )
 

From 40a50289325d234c370fe68a9d81c793a5ce447d Mon Sep 17 00:00:00 2001
From: Alex Ioannidis
Date: Tue, 19 Sep 2023 15:00:28 +0200
Subject: [PATCH 5/7] legacy/migrator: add "subjects" support

---
 .../tests/transform/test_record_transform.py   | 48 +++++++++++++++++++
 .../entries/records/custom_fields.py           | 15 ++++++
 site/tests/legacy/deposits/conftest.py         |  9 ++--
 site/zenodo_rdm/custom_fields.py               | 31 ++++++++++++
 .../legacy/deserializers/metadata.py           | 35 +++----------
 .../legacy/deserializers/schemas.py            | 16 +++++++
 .../legacy/serializers/schemas/common.py       | 20 +++-----
 7 files changed, 127 insertions(+), 47 deletions(-)

diff --git a/migrator/tests/transform/test_record_transform.py b/migrator/tests/transform/test_record_transform.py
index 9075b2d8..5e39590e 100644
--- a/migrator/tests/transform/test_record_transform.py
+++ b/migrator/tests/transform/test_record_transform.py
@@ -156,6 +156,18 @@ def zenodo_record_data():
         ],
         "references": [{"raw_reference": "Test reference"}],
         "keywords": ["migration", "test", "Zenodo", "RDM"],
+        "subjects": [
+            {
+                "term": "Astronomy",
+                "identifier": "http://id.loc.gov/authorities/subjects/sh85009003",
+                "scheme": "url",
+            },
+            {
+                "term": "Klassenrat",
+                "identifier": "gnd:4180044-8",
+                "scheme": "gnd",
+            },
+        ],
         "_internal": {
             "source": {
                 "agents": [
@@ -455,6 +467,18 @@ def expected_rdm_record_entry():
                     "title": "A book title",
                 },
                 "thesis:university": "Test University",
+                "legacy:subjects": [
+                    {
+                        "term": "Astronomy",
+                        "identifier": "http://id.loc.gov/authorities/subjects/sh85009003",
+                        "scheme": "url",
+                    },
+                    {
+                        "term": "Klassenrat",
+                        "identifier": "gnd:4180044-8",
+                        "scheme": "gnd",
+                    },
+                ],
                 # dwc
                 "dwc:basisOfRecord": ["foo", "bar"],
                 "dwc:catalogNumber": ["foo", "bar"],
@@ -656,6 +680,18 @@ def zenodo_draft_data():
             },
         ],
         "keywords": ["migration", "test", "Zenodo", "RDM"],
+        "subjects": [
+            {
+                "term": "Astronomy",
+                "identifier": "http://id.loc.gov/authorities/subjects/sh85009003",
+                "scheme": "url",
+            },
+            {
+                "term": "Klassenrat",
+                "identifier": "gnd:4180044-8",
+                "scheme": "gnd",
+            },
+        ],
         "_internal": {
             "source": {
                 "agents": [
@@ -919,6 +955,18 @@ def expected_rdm_draft_entry():
                     "title": "A book title",
                 },
                 "thesis:university": "Test University",
+                "legacy:subjects": [
+                    {
+                        "term": "Astronomy",
+                        "identifier": "http://id.loc.gov/authorities/subjects/sh85009003",
+                        "scheme": "url",
+                    },
+                    {
+                        "term": "Klassenrat",
+                        "identifier": "gnd:4180044-8",
+                        "scheme": "gnd",
+                    },
+                ],
                 # dwc
                 "dwc:basisOfRecord": ["foo", "bar"],
                 "dwc:catalogNumber": ["foo", "bar"],
diff --git a/migrator/zenodo_rdm_migrator/transform/entries/records/custom_fields.py b/migrator/zenodo_rdm_migrator/transform/entries/records/custom_fields.py
index 7297114b..a948c5a0 100644
--- a/migrator/zenodo_rdm_migrator/transform/entries/records/custom_fields.py
+++ b/migrator/zenodo_rdm_migrator/transform/entries/records/custom_fields.py
@@ -13,6 +13,20 @@
 class ZenodoCustomFieldsEntry(Entry):
     """Custom fields entry transform."""
 
+    @classmethod
+    def _subjects(cls, subjects):
+        """Parse subjects."""
+        res = []
+        for s in subjects or []:
+            res.append(
+                {
+                    "term": s.get("term"),
+                    "identifier": s.get("identifier"),
+                    "scheme": s.get("scheme"),
+                }
+            )
+        return res or None
+
     @classmethod
     def _journal(cls, journal):
         """Parse journal fields."""
@@ -131,6 +145,7 @@ def transform(cls, entry):
                 entry.get("imprint", {}), entry.get("part_of", {})
             ),
             "thesis:university": entry.get("thesis", {}).get("university"),
+            "legacy:subjects": cls._subjects(entry.get("subjects", [])),
             "openbiodiv:TaxonomicConceptLabel": entry.get("custom", {}).get(
                 "openbiodiv:TaxonomicConceptLabel"
             ),
diff --git a/site/tests/legacy/deposits/conftest.py b/site/tests/legacy/deposits/conftest.py
index 7d402b6b..9f664ea8 100644
--- a/site/tests/legacy/deposits/conftest.py
+++ b/site/tests/legacy/deposits/conftest.py
@@ -149,11 +149,10 @@ def expected_record_metadata():
         journal_title="Some journal name",
         journal_volume="Some volume",
         keywords=["Keyword 1", "keyword 2"],
-        # TODO uncomment when subjects are implemented
-        # subjects=[
-        #     dict(scheme="gnd", identifier="gnd:1234567899", term="Astronaut"),
-        #     dict(scheme="gnd", identifier="gnd:1234567898", term="Amish"),
-        # ],
+        subjects=[
+            dict(scheme="gnd", identifier="gnd:1234567899", term="Astronaut"),
+            dict(scheme="gnd", identifier="gnd:1234567898", term="Amish"),
+        ],
         license="cc-zero",
         notes="Some notes",
         partof_pages="SOme part of",
diff --git a/site/zenodo_rdm/custom_fields.py b/site/zenodo_rdm/custom_fields.py
index 148a81ec..2a4c6616 100644
--- a/site/zenodo_rdm/custom_fields.py
+++ b/site/zenodo_rdm/custom_fields.py
@@ -43,8 +43,39 @@
 from marshmallow import fields
 from marshmallow_utils.fields import SanitizedUnicode
 
+
+class SubjectListCF(BaseCF):
+    """Subject list custom field."""
+
+    @property
+    def mapping(self):
+        """Search mapping."""
+        return {
+            "type": "object",
+            "properties": {
+                "term": {"type": "keyword"},
+                "identifier": {"type": "keyword"},
+                "scheme": {"type": "keyword"},
+            },
+        }
+
+    @property
+    def field(self):
+        """Marshmallow field."""
+        return fields.List(
+            fields.Nested(
+                {
+                    "term": SanitizedUnicode(),
+                    "identifier": SanitizedUnicode(),
+                    "scheme": SanitizedUnicode(),
+                }
+            )
+        )
+
+
 LEGACY_CUSTOM_FIELDS = [
     KeywordCF(name="legacy:communities", multiple=True),
+    SubjectListCF(name="legacy:subjects"),
 ]
 """Legacy compatibility custom fields."""
 
diff --git a/site/zenodo_rdm/legacy/deserializers/metadata.py b/site/zenodo_rdm/legacy/deserializers/metadata.py
index cfbb4299..443360a8 100644
--- a/site/zenodo_rdm/legacy/deserializers/metadata.py
+++ b/site/zenodo_rdm/legacy/deserializers/metadata.py
@@ -158,7 +158,7 @@ def split_identifiers(self, data, **kwargs):
     contributors = fields.List(fields.Dict())
     additional_descriptions = fields.List(fields.Dict())
     locations = fields.Method(deserialize="load_locations")
-    subjects = fields.List(fields.Dict())
+    subjects = fields.Method(deserialize="load_subjects", data_key="keywords")
     version = SanitizedUnicode()
     dates = fields.Method(deserialize="load_dates")
     references = fields.Method(deserialize="load_references")
@@ -301,37 +301,14 @@ def load_locations(self, obj):
 
         return {"features": features}
 
-    @post_load(pass_original=True)
-    def _subjects(self, result, original, **kwargs):
+    def load_subjects(self, obj):
         """Transform subjects of a legacy record.
 
-        RDM subjects translate to either legacy keywords or subjects.
+        RDM subjects translate to legacy keywords.
         """
-
-        def _from_keywords(keywords):
-            """Legacy keywords are free text strings.
-
-            They map to custom subjects.
-            """
-            return [{"subject": kw} for kw in keywords]
-
-        def _from_subjects(data):
-            """Maps RDM subjects to legacy subjects.
-
-            Legacy subjects are custom vocabularies.
-            """
-            # TODO we still did not define a strategy to map legacy subjects to rdm.
-            return []
-
-        keywords = original.get("keywords", [])
-        subjects = original.get("subjects", [])
-
-        if keywords or subjects:
-            rdm_subjects = _from_keywords(keywords) + _from_subjects(subjects)
-
-            result["subjects"] = rdm_subjects
-
-        return result
+        if not obj:
+            return missing
+        return [{"subject": kw} for kw in obj]
 
     def load_dates(self, obj):
         """Transform dates of a legacy record."""
diff --git a/site/zenodo_rdm/legacy/deserializers/schemas.py b/site/zenodo_rdm/legacy/deserializers/schemas.py
index bf4adf44..86ee573a 100644
--- a/site/zenodo_rdm/legacy/deserializers/schemas.py
+++ b/site/zenodo_rdm/legacy/deserializers/schemas.py
@@ -213,3 +213,19 @@ def load_communities(self, result, original, **kwargs):
         result.setdefault("custom_fields", {})
         result["custom_fields"].update({"legacy:communities": community_ids})
         return result
+
+    @post_load(pass_original=True)
+    def load_subjects(self, result, original, **kwargs):
+        """Store legacy subjects as a custom field."""
+        subjects = original.get("metadata", {}).get("subjects", [])
+        if subjects:
+            result.setdefault("custom_fields", {})
+            result["custom_fields"]["legacy:subjects"] = [
+                {
+                    "term": s.get("term"),
+                    "identifier": s.get("identifier"),
+                    "scheme": s.get("scheme"),
+                }
+                for s in subjects
+            ]
+        return result
diff --git a/site/zenodo_rdm/legacy/serializers/schemas/common.py b/site/zenodo_rdm/legacy/serializers/schemas/common.py
index db060910..ab8cae57 100644
--- a/site/zenodo_rdm/legacy/serializers/schemas/common.py
+++ b/site/zenodo_rdm/legacy/serializers/schemas/common.py
@@ -166,6 +166,9 @@ class MetadataSchema(Schema):
     creators = fields.List(fields.Nested(CreatorSchema), dump_only=True)
     contributors = fields.List(fields.Nested(ContributorSchema), dump_only=True)
 
+    keywords = fields.Method("dump_keywords")
+    subjects = fields.Raw(attribute="custom_fields.legacy:subjects")
+
     related_identifiers = fields.List(fields.Nested(RelatedIdentifierSchema))
 
     locations = fields.Method("dump_locations")
@@ -212,11 +215,9 @@ def resolve_license(self, data, **kwargs):
             data["license"] = rdm_to_legacy(license["id"])
         return data
 
-    @post_dump(pass_original=True)
-    def dump_subjects(self, result, original, **kwargs):
-        """Dumps subjects."""
-        subjects = original.get("subjects", [])
-        serialized_subjects = []
+    def dump_keywords(self, obj):
+        """Dumps keywords from RDM subjects."""
+        subjects = obj.get("subjects", [])
         serialized_keywords = []
         if subjects:
             for _sbj in subjects:
@@ -224,19 +225,12 @@ def dump_subjects(self, result, original, **kwargs):
             _subject = _sbj.get("subject")
             # If subject has an id, it's a controlled vocabulary
             if _id:
-                # TODO we still did not define a strategy to map legacy subjects to rdm.
                 pass
             # Otherwise it's a free text string (keyword)
             elif _subject:
                 serialized_keywords.append(_subject)
 
-        if serialized_keywords:
-            result["keywords"] = serialized_keywords
-
-        if serialized_subjects:
-            result["subjects"] = serialized_subjects
-
-        return result
+        return serialized_keywords or missing
 
     def dump_reference(self, obj):
         """Dumps reference."""

From 948e1e50a68d233b5b462bebedf442ca59087cd0 Mon Sep 17 00:00:00 2001
From: Alex Ioannidis
Date: Tue, 19 Sep 2023 15:04:24 +0200
Subject: [PATCH 6/7] migrator: update funders dump script

---
 migrator/scripts/dump_funders_db.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/migrator/scripts/dump_funders_db.py b/migrator/scripts/dump_funders_db.py
index 39383380..f0f71527 100644
--- a/migrator/scripts/dump_funders_db.py
+++ b/migrator/scripts/dump_funders_db.py
@@ -7,13 +7,14 @@
 """
 
 import csv
-import json
+import orjson as json
 import uuid
 
 from idutils import normalize_ror
 from invenio_rdm_migrator.utils import ts
 
-DATA_PATH = "v1.25-2023-05-11-ror-data.zip"  # https://zenodo.org/record/7926988
+DATA_PATH = "v1.32-2023-09-14-ror-data.zip"  # https://zenodo.org/record/8346986
+
 
 VOCABULARIES_FUNDER_SCHEMES = {
     "grid",
@@ -72,7 +73,7 @@ def load_file(datafile, outpath):
     with open(outpath, "w") as fout, open(datafile, "rb") as fp:
         print(f"[{ts()}] loading {datafile}")
         writer = csv.writer(fout)
-        entries = json.load(fp)
+        entries = json.loads(fp.read())
         for idx, data in enumerate(entries):
             if idx % 1000 == 0:
                 print(f"[{ts()}] {idx}")

From cd44371f72b4e17b923c36144f76ce2ec0924e9e Mon Sep 17 00:00:00 2001
From: Alex Ioannidis
Date: Tue, 19 Sep 2023 15:10:44 +0200
Subject: [PATCH 7/7] migrator: add affiliations ROR dump script

---
 migrator/scripts/dump_affiliations_db.py | 84 ++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 migrator/scripts/dump_affiliations_db.py

diff --git a/migrator/scripts/dump_affiliations_db.py b/migrator/scripts/dump_affiliations_db.py
new file mode 100644
index 00000000..d7205c54
--- /dev/null
+++ b/migrator/scripts/dump_affiliations_db.py
@@ -0,0 +1,84 @@
+"""Parse affiliations from ROR dumps into CSV format, importable via COPY.
+
+To use call ``load_file(DATA_PATH, "affiliations.csv")``.
+"""
+
+import csv
+import orjson as json
+import uuid
+
+from idutils import normalize_ror
+from invenio_rdm_migrator.utils import ts
+
+DATA_PATH = "v1.32-2023-09-14-ror-data.zip"  # https://zenodo.org/record/8346986
+
+
+VOCABULARIES_AFFILIATION_SCHEMES = {
+    "grid",
+    "gnd",
+    "isni",
+    "ror",
+}
+
+
+def transform_affiliation(data):
+    """Applies the transformation to the stream entry."""
+    affiliation = {
+        "$schema": "local://affiliations/affiliation-v1.0.0.json",
+    }
+
+    affiliation["id"] = normalize_ror(data.get("id"))
+    if not affiliation["id"]:
+        return
+
+    affiliation["name"] = data.get("name")
+    if not affiliation["name"]:
+        return
+
+    acronyms = data.get("acronyms") or []
+    if acronyms:
+        affiliation["acronym"] = acronyms[0]
+
+    affiliation["title"] = {"en": affiliation["name"]}
+    for label in data.get("labels", []):
+        affiliation["title"][label["iso639"]] = label["label"]
+
+    # The ROR is always listed in identifiers, expected by serialization
+    affiliation["identifiers"] = [{"identifier": affiliation["id"], "scheme": "ror"}]
+    for scheme, identifier in data.get("external_ids", {}).items():
+        scheme = scheme.lower()
+        if scheme in VOCABULARIES_AFFILIATION_SCHEMES:
+            value = identifier.get("preferred") or identifier.get("all")[0]
+            affiliation["identifiers"].append({"identifier": value, "scheme": scheme})
+
+    return affiliation
+
+
+def load_file(datafile, outpath):
+    """Load the data file and dump as CSV."""
+    with open(outpath, "w") as fout, open(datafile, "rb") as fp:
+        print(f"[{ts()}] loading {datafile}")
+        writer = csv.writer(fout)
+        entries = json.loads(fp.read())
+        for idx, data in enumerate(entries):
+            if idx % 1000 == 0:
+                print(f"[{ts()}] {idx}")
+            try:
+                affiliation = transform_affiliation(data)
+                if not affiliation:
+                    print(f"[{ts()}] Failed to transform #{idx}:\n{data}\n")
+                    continue
+                affiliation_id = affiliation.pop("id")
+                creation_ts = ts()
+                writer.writerow(
+                    (
+                        str(uuid.uuid4()),  # id
+                        affiliation_id,  # pid
+                        json.dumps(affiliation),  # json
+                        creation_ts,  # created
+                        creation_ts,  # updated (same as created)
+                        1,  # version_id
+                    )
+                )
+            except Exception as ex:
+                print(f"[{ts()}] Exception for line {idx}:\n{data}\n\n{ex}\n")