diff --git a/legacy/zenodo_legacy/funders.py b/legacy/zenodo_legacy/funders.py index a640db2a..3a237d75 100644 --- a/legacy/zenodo_legacy/funders.py +++ b/legacy/zenodo_legacy/funders.py @@ -27,7 +27,7 @@ "10.13039/501100000038": "01h531d29", "10.13039/100000001": "021nxhr62", "10.13039/501100003246": "04jsz6e67", - # NOTE: RCUK (10.13039/100014013) was succeeded by UKRI (10.13039/501100000690). + # NOTE: RCUK (10.13039/501100000690) was succeeded by UKRI (10.13039/100014013). # All awards/grants were transferred, so we're also remapping the funder IDs to # point to the UKRI ROR ID (001aqnf71). "10.13039/501100000690": "001aqnf71", diff --git a/migrator/scripts/backup_indices.sql b/migrator/scripts/backup_indices.sql index 729fc025..07757deb 100644 --- a/migrator/scripts/backup_indices.sql +++ b/migrator/scripts/backup_indices.sql @@ -1,4 +1,5 @@ -- From https://www.postgresql.org/message-id/flat/877em2racj.fsf%40gmail.com#36a9eba4b16b8172e379b2a19f403939 +DROP TABLE IF EXISTS rdm_index_backup; CREATE TABLE rdm_index_backup AS SELECT * FROM pg_indexes @@ -14,6 +15,9 @@ WHERE 'accounts_user_login_information', 'accounts_user_session_activity', 'oauth2server_token', + 'oauth2server_client', + 'oauthclient_remoteaccount', + 'oauthclient_remotetoken', 'accounts_useridentity', -- communities 'communities_metadata', @@ -27,12 +31,19 @@ WHERE 'rdm_drafts_metadata', 'rdm_versions_state', 'rdm_records_files', + 'rdm_records_media_files', 'rdm_drafts_files', + 'rdm_drafts_media_files', 'pidstore_pid', -- requests 'request_metadata', -- github 'webhooks_events', 'github_repositories', - 'github_releases' + 'github_releases', + -- vocabularies + 'name_metadata', + 'affiliation_metadata', + 'award_metadata', + 'funder_metadata' ); diff --git a/migrator/scripts/dump_affiliations_db.py b/migrator/scripts/dump_affiliations_db.py new file mode 100644 index 00000000..d7205c54 --- /dev/null +++ b/migrator/scripts/dump_affiliations_db.py @@ -0,0 +1,84 @@ +"""Parse affiliations from ROR dumps 
into CSV format, importable via COPY. + +To use call ``load_file(DATA_PATH, "affiliations.csv")``. +""" + +import csv +import orjson as json +import uuid + +from idutils import normalize_ror +from invenio_rdm_migrator.utils import ts + +DATA_PATH = "v1.32-2023-09-14-ror-data.zip" # https://zenodo.org/record/8346986 + + +VOCABULARIES_AFFILIATION_SCHEMES = { + "grid", + "gnd", + "isni", + "ror", +} + + +def transform_affiliation(data): + """Applies the transformation to the stream entry.""" + affiliation = { + "$schema": "local://affiliations/affiliation-v1.0.0.json", + } + + affiliation["id"] = normalize_ror(data.get("id")) + if not affiliation["id"]: + return + + affiliation["name"] = data.get("name") + if not affiliation["name"]: + return + + acronyms = data.get("acronyms") or [] + if acronyms: + affiliation["acronym"] = acronyms[0] + + affiliation["title"] = {"en": affiliation["name"]} + for label in data.get("labels", []): + affiliation["title"][label["iso639"]] = label["label"] + + # The ROR is always listed in identifiers, expected by serialization + affiliation["identifiers"] = [{"identifier": affiliation["id"], "scheme": "ror"}] + for scheme, identifier in data.get("external_ids", {}).items(): + scheme = scheme.lower() + if scheme in VOCABULARIES_AFFILIATION_SCHEMES: + value = identifier.get("preferred") or identifier.get("all")[0] + affiliation["identifiers"].append({"identifier": value, "scheme": scheme}) + + return affiliation + + +def load_file(datafile, outpath): + """Load the data file and dump as CSV.""" + with open(outpath, "w") as fout, open(datafile, "rb") as fp: + print(f"[{ts()}] loading {datafile}") + writer = csv.writer(fout) + entries = json.loads(fp.read()) + for idx, data in enumerate(entries): + if idx % 1000 == 0: + print(f"[{ts()}] {idx}") + try: + affiliation = transform_affiliation(data) + if not affiliation: + print(f"[{ts()}] Failed to transform #{idx}:\n{data}\n") + continue + affiliation_id = affiliation.pop("id") + creation_ts = 
ts() + writer.writerow( + ( + str(uuid.uuid4()), # id + affiliation_id, # pid + json.dumps(affiliation).decode("utf-8"), # json (orjson.dumps returns bytes) + creation_ts, # created + creation_ts, # updated (same as created) + 1, # version_id + ) + ) + except Exception as ex: + print(f"[{ts()}] Exception for line {idx}:\n{data}\n\n{ex}\n") diff --git a/migrator/scripts/dump_awards_db.py b/migrator/scripts/dump_awards_db.py index 47362fcb..1d616eb8 100644 --- a/migrator/scripts/dump_awards_db.py +++ b/migrator/scripts/dump_awards_db.py @@ -13,12 +13,12 @@ from invenio_rdm_migrator.utils import ts DATA_PATHS = [ - "awards-2023-08.jsonl.gz" # https://zenodo.org/record/8224080 - "awards-2023-04.jsonl.gz" # https://zenodo.org/record/7870151 - "awards-2023-03.jsonl.gz" # https://zenodo.org/record/7803150 - "awards-2023-02.jsonl.gz" # https://zenodo.org/record/7683844 - "awards-2023-01.jsonl.gz" # https://zenodo.org/record/7561801 - "awards-2022-12.jsonl.gz" # https://zenodo.org/record/7745773 + "awards-2023-08.jsonl.gz", # https://zenodo.org/record/8224080 + "awards-2023-04.jsonl.gz", # https://zenodo.org/record/7870151 + "awards-2023-03.jsonl.gz", # https://zenodo.org/record/7803150 + "awards-2023-02.jsonl.gz", # https://zenodo.org/record/7683844 + "awards-2023-01.jsonl.gz", # https://zenodo.org/record/7561801 + "awards-2022-12.jsonl.gz", # https://zenodo.org/record/7745773 ] VOCABULARIES_AWARDS_OPENAIRE_FUNDERS = { diff --git a/migrator/scripts/dump_funders_db.py b/migrator/scripts/dump_funders_db.py index 39383380..f0f71527 100644 --- a/migrator/scripts/dump_funders_db.py +++ b/migrator/scripts/dump_funders_db.py @@ -7,13 +7,14 @@ """ import csv -import json +import orjson as json import uuid from idutils import normalize_ror from invenio_rdm_migrator.utils import ts -DATA_PATH = "v1.25-2023-05-11-ror-data.zip" # https://zenodo.org/record/7926988 +DATA_PATH = "v1.32-2023-09-14-ror-data.zip" # https://zenodo.org/record/8346986 + VOCABULARIES_FUNDER_SCHEMES = { "grid", @@ -72,7 +73,7 @@ def load_file(datafile, 
outpath): with open(outpath, "w") as fout, open(datafile, "rb") as fp: print(f"[{ts()}] loading {datafile}") writer = csv.writer(fout) - entries = json.load(fp) + entries = json.loads(fp.read()) for idx, data in enumerate(entries): if idx % 1000 == 0: print(f"[{ts()}] {idx}") diff --git a/migrator/tests/actions/conftest.py b/migrator/tests/actions/conftest.py index e9f6a1e6..400d0589 100644 --- a/migrator/tests/actions/conftest.py +++ b/migrator/tests/actions/conftest.py @@ -56,7 +56,7 @@ def state(tmp_dir): state_db = StateDB( db_dir=tmp_dir.name, validators={"parents": ParentModelValidator} ) - STATE.initialized_state(state_db, cache=False) + STATE.initialized_state(state_db, cache=False, search_cache=False) return STATE diff --git a/migrator/tests/transform/test_community_transform.py b/migrator/tests/transform/test_community_transform.py index 33192856..5cc12ee1 100644 --- a/migrator/tests/transform/test_community_transform.py +++ b/migrator/tests/transform/test_community_transform.py @@ -58,6 +58,7 @@ def expected_rdm_community(): }, }, "bucket_id": None, + "deletion_status": "P", } diff --git a/migrator/tests/transform/test_record_transform.py b/migrator/tests/transform/test_record_transform.py index 9075b2d8..5e39590e 100644 --- a/migrator/tests/transform/test_record_transform.py +++ b/migrator/tests/transform/test_record_transform.py @@ -156,6 +156,18 @@ def zenodo_record_data(): ], "references": [{"raw_reference": "Test reference"}], "keywords": ["migration", "test", "Zenodo", "RDM"], + "subjects": [ + { + "term": "Astronomy", + "identifier": "http://id.loc.gov/authorities/subjects/sh85009003", + "scheme": "url", + }, + { + "term": "Klassenrat", + "identifier": "gnd:4180044-8", + "scheme": "gnd", + }, + ], "_internal": { "source": { "agents": [ @@ -455,6 +467,18 @@ def expected_rdm_record_entry(): "title": "A book title", }, "thesis:university": "Test University", + "legacy:subjects": [ + { + "term": "Astronomy", + "identifier": 
"http://id.loc.gov/authorities/subjects/sh85009003", + "scheme": "url", + }, + { + "term": "Klassenrat", + "identifier": "gnd:4180044-8", + "scheme": "gnd", + }, + ], # dwc "dwc:basisOfRecord": ["foo", "bar"], "dwc:catalogNumber": ["foo", "bar"], @@ -656,6 +680,18 @@ def zenodo_draft_data(): }, ], "keywords": ["migration", "test", "Zenodo", "RDM"], + "subjects": [ + { + "term": "Astronomy", + "identifier": "http://id.loc.gov/authorities/subjects/sh85009003", + "scheme": "url", + }, + { + "term": "Klassenrat", + "identifier": "gnd:4180044-8", + "scheme": "gnd", + }, + ], "_internal": { "source": { "agents": [ @@ -919,6 +955,18 @@ def expected_rdm_draft_entry(): "title": "A book title", }, "thesis:university": "Test University", + "legacy:subjects": [ + { + "term": "Astronomy", + "identifier": "http://id.loc.gov/authorities/subjects/sh85009003", + "scheme": "url", + }, + { + "term": "Klassenrat", + "identifier": "gnd:4180044-8", + "scheme": "gnd", + }, + ], # dwc "dwc:basisOfRecord": ["foo", "bar"], "dwc:catalogNumber": ["foo", "bar"], diff --git a/migrator/zenodo_rdm_migrator/transform/entries/communities.py b/migrator/zenodo_rdm_migrator/transform/entries/communities.py index c7e4ade3..dc19d346 100644 --- a/migrator/zenodo_rdm_migrator/transform/entries/communities.py +++ b/migrator/zenodo_rdm_migrator/transform/entries/communities.py @@ -68,6 +68,10 @@ def _bucket_id(self, entry): """Returns the community bucket id.""" return None + def _deletion_status(self, entry): + """Returns the community's deletion status.""" + return "P" + def _metadata(self, entry): """Returns community metadata.""" # Clean-up description diff --git a/migrator/zenodo_rdm_migrator/transform/entries/parents.py b/migrator/zenodo_rdm_migrator/transform/entries/parents.py index 2d1be518..6c4cf41f 100644 --- a/migrator/zenodo_rdm_migrator/transform/entries/parents.py +++ b/migrator/zenodo_rdm_migrator/transform/entries/parents.py @@ -8,6 +8,7 @@ """Zenodo migrator parent record transformer 
entries.""" +from invenio_rdm_migrator.state import STATE from invenio_rdm_migrator.transform import Entry from ...errors import NoConceptRecidForDraft @@ -76,15 +77,28 @@ def transform(self, entry): # we raise so the error logger writes these cases in the log file raise NoConceptRecidForDraft(draft=entry) + communities = self._communities(entry) transformed["json"] = { # loader is responsible for creating/updating if the PID exists. "id": parent_pid, - "communities": self._communities(entry), + "communities": communities, "pids": self._pids(entry), } owner = next(iter(entry["json"].get("owners", [])), None) if owner is not None: transformed["json"]["access"] = {"owned_by": {"user": owner}} + + permission_flags = transformed["json"].setdefault("permission_flags", {}) + owner_comm_slugs = { + comm["slug"] + for comm in ( + STATE.COMMUNITIES.search("owner_id", owner) if owner else [] + ) + } + comm_slugs = set(communities.get("ids", [])) + has_only_managed_communities = comm_slugs <= owner_comm_slugs + if not has_only_managed_communities: + permission_flags["can_community_manage_record"] = False elif not self.partial: raise KeyError("json") # else, pass diff --git a/migrator/zenodo_rdm_migrator/transform/entries/records/custom_fields.py b/migrator/zenodo_rdm_migrator/transform/entries/records/custom_fields.py index 7297114b..a948c5a0 100644 --- a/migrator/zenodo_rdm_migrator/transform/entries/records/custom_fields.py +++ b/migrator/zenodo_rdm_migrator/transform/entries/records/custom_fields.py @@ -13,6 +13,20 @@ class ZenodoCustomFieldsEntry(Entry): """Custom fields entry transform.""" + @classmethod + def _subjects(cls, subjects): + """Parse subjects.""" + res = [] + for s in subjects or []: + res.append( + { + "term": s.get("term"), + "identifier": s.get("identifier"), + "scheme": s.get("scheme"), + } + ) + return res or None + @classmethod def _journal(cls, journal): """Parse journal fields.""" @@ -131,6 +145,7 @@ def transform(cls, entry): entry.get("imprint", {}), entry.get("part_of", {}) ), "thesis:university": 
entry.get("thesis", {}).get("university"), + "legacy:subjects": cls._subjects(entry.get("subjects", [])), "openbiodiv:TaxonomicConceptLabel": entry.get("custom", {}).get( "openbiodiv:TaxonomicConceptLabel" ), diff --git a/scripts/admin/community_manage_requests.py b/scripts/admin/community_manage_requests.py index 9609b005..39d62ac5 100644 --- a/scripts/admin/community_manage_requests.py +++ b/scripts/admin/community_manage_requests.py @@ -17,7 +17,7 @@ def create_community_manage_record_request(record_id): # add a permission flag to db (make record a legacy one) db_record = RDMRecord.get_record(record_id) - db_record.parent.permission_flags = {"can_community_manage_record": True} + db_record.parent.permission_flags = {"can_community_manage_record": False} db_record.parent.commit() db.session.commit() diff --git a/site/tests/legacy/deposits/conftest.py b/site/tests/legacy/deposits/conftest.py index 7d402b6b..9f664ea8 100644 --- a/site/tests/legacy/deposits/conftest.py +++ b/site/tests/legacy/deposits/conftest.py @@ -149,11 +149,10 @@ def expected_record_metadata(): journal_title="Some journal name", journal_volume="Some volume", keywords=["Keyword 1", "keyword 2"], - # TODO uncomment when subjects are implemented - # subjects=[ - # dict(scheme="gnd", identifier="gnd:1234567899", term="Astronaut"), - # dict(scheme="gnd", identifier="gnd:1234567898", term="Amish"), - # ], + subjects=[ + dict(scheme="gnd", identifier="gnd:1234567899", term="Astronaut"), + dict(scheme="gnd", identifier="gnd:1234567898", term="Amish"), + ], license="cc-zero", notes="Some notes", partof_pages="SOme part of", diff --git a/site/tests/requests/test_community_manage_record_request.py b/site/tests/requests/test_community_manage_record_request.py index 83a44e39..1001f8cd 100644 --- a/site/tests/requests/test_community_manage_record_request.py +++ b/site/tests/requests/test_community_manage_record_request.py @@ -87,7 +87,7 @@ def test_submit_a_request(uploader): assert db_request["title"] == 
"Communities manage legacy records" assert db_request["expires_at"] is not None assert db_request["description"].startswith( - "