diff --git a/legacy/zenodo_legacy/funders.py b/legacy/zenodo_legacy/funders.py index a640db2a..3a237d75 100644 --- a/legacy/zenodo_legacy/funders.py +++ b/legacy/zenodo_legacy/funders.py @@ -27,7 +27,7 @@ "10.13039/501100000038": "01h531d29", "10.13039/100000001": "021nxhr62", "10.13039/501100003246": "04jsz6e67", - # NOTE: RCUK (10.13039/100014013) was succeeded by UKRI (10.13039/501100000690). + # NOTE: RCUK (10.13039/501100000690) was succeeded by UKRI (10.13039/100014013). # All awards/grants were transferred, so we're also remapping the funder IDs to # point to the UKRI ROR ID (001aqnf71). "10.13039/501100000690": "001aqnf71", diff --git a/migrator/scripts/backup_indices.sql b/migrator/scripts/backup_indices.sql index 729fc025..07757deb 100644 --- a/migrator/scripts/backup_indices.sql +++ b/migrator/scripts/backup_indices.sql @@ -1,4 +1,5 @@ -- From https://www.postgresql.org/message-id/flat/877em2racj.fsf%40gmail.com#36a9eba4b16b8172e379b2a19f403939 +DROP TABLE rdm_index_backup; CREATE TABLE rdm_index_backup AS SELECT * FROM pg_indexes @@ -14,6 +15,9 @@ WHERE 'accounts_user_login_information', 'accounts_user_session_activity', 'oauth2server_token', + 'oauth2server_client', + 'oauthclient_remoteaccount', + 'oauthclient_remotetoken', 'accounts_useridentity', -- communities 'communities_metadata', @@ -27,12 +31,19 @@ WHERE 'rdm_drafts_metadata', 'rdm_versions_state', 'rdm_records_files', + 'rdm_records_media_files', 'rdm_drafts_files', + 'rdm_drafts_media_files', 'pidstore_pid', -- requests 'request_metadata', -- github 'webhooks_events', 'github_repositories', - 'github_releases' + 'github_releases', + -- vocabularies + 'name_metadata', + 'affiliation_metadata', + 'award_metadata', + 'funder_metadata' ); diff --git a/migrator/scripts/dump_affiliations_db.py b/migrator/scripts/dump_affiliations_db.py new file mode 100644 index 00000000..d7205c54 --- /dev/null +++ b/migrator/scripts/dump_affiliations_db.py @@ -0,0 +1,84 @@ +"""Parse affiliations from ROR dumps into CSV format, importable via COPY. + +To use call ``load_file(DATA_PATH, "affiliations.csv")``. 
+""" + +import csv +import orjson as json +import uuid + +from idutils import normalize_ror +from invenio_rdm_migrator.utils import ts + +DATA_PATH = "v1.32-2023-09-14-ror-data.zip" # https://zenodo.org/record/8346986 + + +VOCABULARIES_AFFILIATION_SCHEMES = { + "grid", + "gnd", + "isni", + "ror", +} + + +def transform_affiliation(data): + """Applies the transformation to the stream entry.""" + affiliation = { + "$schema": "local://affiliations/affiliation-v1.0.0.json", + } + + affiliation["id"] = normalize_ror(data.get("id")) + if not affiliation["id"]: + return + + affiliation["name"] = data.get("name") + if not affiliation["name"]: + return + + acronyms = data.get("acronyms") or [] + if acronyms: + affiliation["acronym"] = acronyms[0] + + affiliation["title"] = {"en": affiliation["name"]} + for label in data.get("labels", []): + affiliation["title"][label["iso639"]] = label["label"] + + # The ROR is always listed in identifiers, expected by serialization + affiliation["identifiers"] = [{"identifier": affiliation["id"], "scheme": "ror"}] + for scheme, identifier in data.get("external_ids", {}).items(): + scheme = scheme.lower() + if scheme in VOCABULARIES_AFFILIATION_SCHEMES: + value = identifier.get("preferred") or identifier.get("all")[0] + affiliation["identifiers"].append({"identifier": value, "scheme": scheme}) + + return affiliation + + +def load_file(datafile, outpath): + """Load the data file and dump as CSV.""" + with open(outpath, "w") as fout, open(datafile, "rb") as fp: + print(f"[{ts()}] loading {datafile}") + writer = csv.writer(fout) + entries = json.loads(fp.read()) + for idx, data in enumerate(entries): + if idx % 1000 == 0: + print(f"[{ts()}] {idx}") + try: + affiliation = transform_affiliation(data) + if not affiliation: + print(f"[{ts()}] Failed to transform #{idx}:\n{data}\n") + continue + affiliation_id = affiliation.pop("id") + creation_ts = ts() + writer.writerow( + ( + str(uuid.uuid4()), # id + affiliation_id, # pid + json.dumps(affiliation), # json + creation_ts, # created + creation_ts, # updated (same as created) + 1, # version_id + ) + ) + except Exception as ex: + print(f"[{ts()}] Exception for line {idx}:\n{data}\n\n{ex}\n") diff --git a/migrator/scripts/dump_awards_db.py b/migrator/scripts/dump_awards_db.py index 47362fcb..1d616eb8 100644 --- a/migrator/scripts/dump_awards_db.py +++ b/migrator/scripts/dump_awards_db.py @@ -13,12 +13,12 @@ from invenio_rdm_migrator.utils import ts DATA_PATHS = [ - "awards-2023-08.jsonl.gz" # https://zenodo.org/record/8224080 - "awards-2023-04.jsonl.gz" # https://zenodo.org/record/7870151 - "awards-2023-03.jsonl.gz" # https://zenodo.org/record/7803150 - "awards-2023-02.jsonl.gz" # https://zenodo.org/record/7683844 - "awards-2023-01.jsonl.gz" # https://zenodo.org/record/7561801 - "awards-2022-12.jsonl.gz" # https://zenodo.org/record/7745773 + "awards-2023-08.jsonl.gz", # https://zenodo.org/record/8224080 + "awards-2023-04.jsonl.gz", # https://zenodo.org/record/7870151 + "awards-2023-03.jsonl.gz", # https://zenodo.org/record/7803150 + "awards-2023-02.jsonl.gz", # https://zenodo.org/record/7683844 + "awards-2023-01.jsonl.gz", # https://zenodo.org/record/7561801 + "awards-2022-12.jsonl.gz", # https://zenodo.org/record/7745773 ] VOCABULARIES_AWARDS_OPENAIRE_FUNDERS = { diff --git a/migrator/scripts/dump_funders_db.py b/migrator/scripts/dump_funders_db.py index 39383380..f0f71527 100644 --- a/migrator/scripts/dump_funders_db.py +++ b/migrator/scripts/dump_funders_db.py @@ -7,13 +7,14 @@ """ import csv -import json +import orjson as 
json import uuid from idutils import normalize_ror from invenio_rdm_migrator.utils import ts -DATA_PATH = "v1.25-2023-05-11-ror-data.zip" # https://zenodo.org/record/7926988 +DATA_PATH = "v1.32-2023-09-14-ror-data.zip" # https://zenodo.org/record/8346986 + VOCABULARIES_FUNDER_SCHEMES = { "grid", @@ -72,7 +73,7 @@ def load_file(datafile, outpath): with open(outpath, "w") as fout, open(datafile, "rb") as fp: print(f"[{ts()}] loading {datafile}") writer = csv.writer(fout) - entries = json.load(fp) + entries = json.loads(fp.read()) for idx, data in enumerate(entries): if idx % 1000 == 0: print(f"[{ts()}] {idx}") diff --git a/migrator/tests/actions/conftest.py b/migrator/tests/actions/conftest.py index e9f6a1e6..400d0589 100644 --- a/migrator/tests/actions/conftest.py +++ b/migrator/tests/actions/conftest.py @@ -56,7 +56,7 @@ def state(tmp_dir): state_db = StateDB( db_dir=tmp_dir.name, validators={"parents": ParentModelValidator} ) - STATE.initialized_state(state_db, cache=False) + STATE.initialized_state(state_db, cache=False, search_cache=False) return STATE diff --git a/migrator/tests/transform/test_community_transform.py b/migrator/tests/transform/test_community_transform.py index 33192856..5cc12ee1 100644 --- a/migrator/tests/transform/test_community_transform.py +++ b/migrator/tests/transform/test_community_transform.py @@ -58,6 +58,7 @@ def expected_rdm_community(): }, }, "bucket_id": None, + "deletion_status": "P", } diff --git a/migrator/tests/transform/test_record_transform.py b/migrator/tests/transform/test_record_transform.py index 9075b2d8..5e39590e 100644 --- a/migrator/tests/transform/test_record_transform.py +++ b/migrator/tests/transform/test_record_transform.py @@ -156,6 +156,18 @@ def zenodo_record_data(): ], "references": [{"raw_reference": "Test reference"}], "keywords": ["migration", "test", "Zenodo", "RDM"], + "subjects": [ + { + "term": "Astronomy", + "identifier": "http://id.loc.gov/authorities/subjects/sh85009003", + "scheme": "url", + }, + { + "term": "Klassenrat", + "identifier": "gnd:4180044-8", + "scheme": "gnd", + }, + ], "_internal": { "source": { "agents": [ @@ -455,6 +467,18 @@ def expected_rdm_record_entry(): "title": "A book title", }, "thesis:university": "Test University", + "legacy:subjects": [ + { + "term": "Astronomy", + "identifier": "http://id.loc.gov/authorities/subjects/sh85009003", + "scheme": "url", + }, + { + "term": "Klassenrat", + "identifier": "gnd:4180044-8", + "scheme": "gnd", + }, + ], # dwc "dwc:basisOfRecord": ["foo", "bar"], "dwc:catalogNumber": ["foo", "bar"], @@ -656,6 +680,18 @@ def zenodo_draft_data(): }, ], "keywords": ["migration", "test", "Zenodo", "RDM"], + "subjects": [ + { + "term": "Astronomy", + "identifier": "http://id.loc.gov/authorities/subjects/sh85009003", + "scheme": "url", + }, + { + "term": "Klassenrat", + "identifier": "gnd:4180044-8", + "scheme": "gnd", + }, + ], "_internal": { "source": { "agents": [ @@ -919,6 +955,18 @@ def expected_rdm_draft_entry(): "title": "A book title", }, "thesis:university": "Test University", + "legacy:subjects": [ + { + "term": "Astronomy", + "identifier": "http://id.loc.gov/authorities/subjects/sh85009003", + "scheme": "url", + }, + { + "term": "Klassenrat", + "identifier": "gnd:4180044-8", + "scheme": "gnd", + }, + ], # dwc "dwc:basisOfRecord": ["foo", "bar"], "dwc:catalogNumber": ["foo", "bar"], diff --git a/migrator/zenodo_rdm_migrator/transform/entries/communities.py b/migrator/zenodo_rdm_migrator/transform/entries/communities.py index c7e4ade3..dc19d346 100644 --- 
a/migrator/zenodo_rdm_migrator/transform/entries/communities.py +++ b/migrator/zenodo_rdm_migrator/transform/entries/communities.py @@ -68,6 +68,10 @@ def _bucket_id(self, entry): """Returns the community bucket id.""" return None + def _deletion_status(self, entry): + """Returns the community's deletion status.""" + return "P" + def _metadata(self, entry): """Returns community metadata.""" # Clean-up description diff --git a/migrator/zenodo_rdm_migrator/transform/entries/parents.py b/migrator/zenodo_rdm_migrator/transform/entries/parents.py index 2d1be518..6c4cf41f 100644 --- a/migrator/zenodo_rdm_migrator/transform/entries/parents.py +++ b/migrator/zenodo_rdm_migrator/transform/entries/parents.py @@ -8,6 +8,7 @@ """Zenodo migrator parent record transformer entries.""" +from invenio_rdm_migrator.state import STATE from invenio_rdm_migrator.transform import Entry from ...errors import NoConceptRecidForDraft @@ -76,15 +77,28 @@ def transform(self, entry): # we raise so the error logger writes these cases in the log file raise NoConceptRecidForDraft(draft=entry) + communities = self._communities(entry) transformed["json"] = { # loader is responsible for creating/updating if the PID exists. "id": parent_pid, - "communities": self._communities(entry), + "communities": communities, "pids": self._pids(entry), } owner = next(iter(entry["json"].get("owners", [])), None) if owner is not None: transformed["json"]["access"] = {"owned_by": {"user": owner}} + + permission_flags = {} + owner_comm_slugs = { + comm["slug"] + for comm in ( + STATE.COMMUNITIES.search("owner_id", owner) if owner else [] + ) + } + comm_slugs = set(communities.get("ids", [])) + has_only_managed_communities = comm_slugs < owner_comm_slugs + if not has_only_managed_communities: + permission_flags["can_community_manage_record"] = False elif not self.partial: raise KeyError("json") # else, pass diff --git a/migrator/zenodo_rdm_migrator/transform/entries/records/custom_fields.py b/migrator/zenodo_rdm_migrator/transform/entries/records/custom_fields.py index 7297114b..a948c5a0 100644 --- a/migrator/zenodo_rdm_migrator/transform/entries/records/custom_fields.py +++ b/migrator/zenodo_rdm_migrator/transform/entries/records/custom_fields.py @@ -13,6 +13,20 @@ class ZenodoCustomFieldsEntry(Entry): """Custom fields entry transform.""" + @classmethod + def _subjects(cls, subjects): + """Parse subjects.""" + res = [] + for s in subjects or []: + res.append( + { + "term": s.get("term"), + "identifier": s.get("identifier"), + "scheme": s.get("scheme"), + } + ) + return res or None + @classmethod def _journal(cls, journal): """Parse journal fields.""" @@ -131,6 +145,7 @@ def transform(cls, entry): entry.get("imprint", {}), entry.get("part_of", {}) ), "thesis:university": entry.get("thesis", {}).get("university"), + "legacy:subjects": cls._subjects(entry.get("subjects", [])), "openbiodiv:TaxonomicConceptLabel": entry.get("custom", {}).get( "openbiodiv:TaxonomicConceptLabel" ), diff --git a/scripts/admin/community_manage_requests.py b/scripts/admin/community_manage_requests.py index 9609b005..39d62ac5 100644 --- a/scripts/admin/community_manage_requests.py +++ b/scripts/admin/community_manage_requests.py @@ -17,7 +17,7 @@ def create_community_manage_record_request(record_id): # add a permission flag to db (make record a legacy one) db_record = RDMRecord.get_record(record_id) - db_record.parent.permission_flags = {"can_community_manage_record": True} + db_record.parent.permission_flags = {"can_community_manage_record": False} 
db_record.parent.commit() db.session.commit() diff --git a/site/tests/legacy/deposits/conftest.py b/site/tests/legacy/deposits/conftest.py index 7d402b6b..9f664ea8 100644 --- a/site/tests/legacy/deposits/conftest.py +++ b/site/tests/legacy/deposits/conftest.py @@ -149,11 +149,10 @@ def expected_record_metadata(): journal_title="Some journal name", journal_volume="Some volume", keywords=["Keyword 1", "keyword 2"], - # TODO uncomment when subjects are implemented - # subjects=[ - # dict(scheme="gnd", identifier="gnd:1234567899", term="Astronaut"), - # dict(scheme="gnd", identifier="gnd:1234567898", term="Amish"), - # ], + subjects=[ + dict(scheme="gnd", identifier="gnd:1234567899", term="Astronaut"), + dict(scheme="gnd", identifier="gnd:1234567898", term="Amish"), + ], license="cc-zero", notes="Some notes", partof_pages="SOme part of", diff --git a/site/tests/requests/test_community_manage_record_request.py b/site/tests/requests/test_community_manage_record_request.py index 83a44e39..1001f8cd 100644 --- a/site/tests/requests/test_community_manage_record_request.py +++ b/site/tests/requests/test_community_manage_record_request.py @@ -87,7 +87,7 @@ def test_submit_a_request(uploader): assert db_request["title"] == "Communities manage legacy records" assert db_request["expires_at"] is not None assert db_request["description"].startswith( - "
\n\n
Some of your records, that are going through migration" + "\n\n
Some of your records, that are going through the migration" ) diff --git a/site/zenodo_rdm/custom_fields.py b/site/zenodo_rdm/custom_fields.py index 148a81ec..2a4c6616 100644 --- a/site/zenodo_rdm/custom_fields.py +++ b/site/zenodo_rdm/custom_fields.py @@ -43,8 +43,39 @@ from marshmallow import fields from marshmallow_utils.fields import SanitizedUnicode + +class SubjectListCF(BaseCF): + """Subject list custom field.""" + + @property + def mapping(self): + """Search mapping.""" + return { + "type": "object", + "properties": { + "term": {"type": "keyword"}, + "identifier": {"type": "keyword"}, + "scheme": {"type": "keyword"}, + }, + } + + @property + def field(self): + """Marshmallow field.""" + return fields.List( + fields.Nested( + { + "term": SanitizedUnicode(), + "identifier": SanitizedUnicode(), + "scheme": SanitizedUnicode(), + } + ) + ) + + LEGACY_CUSTOM_FIELDS = [ KeywordCF(name="legacy:communities", multiple=True), + SubjectListCF(name="legacy:subjects"), ] """Legacy compatibility custom fields.""" diff --git a/site/zenodo_rdm/legacy/deserializers/metadata.py b/site/zenodo_rdm/legacy/deserializers/metadata.py index cfbb4299..443360a8 100644 --- a/site/zenodo_rdm/legacy/deserializers/metadata.py +++ b/site/zenodo_rdm/legacy/deserializers/metadata.py @@ -158,7 +158,7 @@ def split_identifiers(self, data, **kwargs): contributors = fields.List(fields.Dict()) additional_descriptions = fields.List(fields.Dict()) locations = fields.Method(deserialize="load_locations") - subjects = fields.List(fields.Dict()) + subjects = fields.Method(deserialize="load_subjects", data_key="keywords") version = SanitizedUnicode() dates = fields.Method(deserialize="load_dates") references = fields.Method(deserialize="load_references") @@ -301,37 +301,14 @@ def load_locations(self, obj): return {"features": features} - @post_load(pass_original=True) - def _subjects(self, result, original, **kwargs): + def load_subjects(self, obj): """Transform subjects of a legacy record. - RDM subjects translate to either legacy keywords or subjects. + RDM subjects translate to legacy keywords. """ - - def _from_keywords(keywords): - """Legacy keywords are free text strings. - - They map to custom subjects. - """ - return [{"subject": kw} for kw in keywords] - - def _from_subjects(data): - """Maps RDM subjects to legacy subjects. - - Legacy subjects are custom vocabularies. - """ - # TODO we still did not define a strategy to map legacy subjects to rdm. 
- return [] - - keywords = original.get("keywords", []) - subjects = original.get("subjects", []) - - if keywords or subjects: - rdm_subjects = _from_keywords(keywords) + _from_subjects(subjects) - - result["subjects"] = rdm_subjects - - return result + if not obj: + return missing + return [{"subject": kw} for kw in obj] def load_dates(self, obj): """Transform dates of a legacy record.""" diff --git a/site/zenodo_rdm/legacy/deserializers/schemas.py b/site/zenodo_rdm/legacy/deserializers/schemas.py index bf4adf44..86ee573a 100644 --- a/site/zenodo_rdm/legacy/deserializers/schemas.py +++ b/site/zenodo_rdm/legacy/deserializers/schemas.py @@ -213,3 +213,19 @@ def load_communities(self, result, original, **kwargs): result.setdefault("custom_fields", {}) result["custom_fields"].update({"legacy:communities": community_ids}) return result + + @post_load(pass_original=True) + def load_subjects(self, result, original, **kwargs): + """Store legacy subjects as a custom field.""" + subjects = original.get("metadata", {}).get("subjects", []) + if subjects: + result.setdefault("custom_fields", {}) + result["custom_fields"]["legacy:subjects"] = [ + { + "term": s.get("term"), + "identifier": s.get("identifier"), + "scheme": s.get("scheme"), + } + for s in subjects + ] + return result diff --git a/site/zenodo_rdm/legacy/requests/community_manage_record.py b/site/zenodo_rdm/legacy/requests/community_manage_record.py index a1cacce3..fbc780e8 100644 --- a/site/zenodo_rdm/legacy/requests/community_manage_record.py +++ b/site/zenodo_rdm/legacy/requests/community_manage_record.py @@ -96,13 +96,13 @@ def execute(self, identity, uow): # example: "May 11, 2024" expires_at = self.request.expires_at.strftime("%B %d, %Y") self.request["description"] = ( - "
\n\n
Some of your records, that are going through migration process are part " "of the communities that don't belong to you.\nAccept this request to keep the old " "behaviour and allow community curators to manage (edit, create new version, add to " "another community, etc.) corresponding record.\nIn case of declining this " + "\n\nSome of your records, that are going through the migration process are part " "of communities that don't belong to you.\nAccept this request to adopt the new " "behaviour and allow community curators to manage (edit, create new versions, add to " "another community, etc.) your corresponding records.\nIn case of declining this " "request all your legacy records will be removed from all communities " "that you are not an owner of.\n\n
If you do not perform any action by " - f"{expires_at}, the permission for community curators to manage the record " + f"{expires_at}, the permission for community curators to manage your records " "will automatically be fully granted.

" ) diff --git a/site/zenodo_rdm/legacy/serializers/schemas/common.py b/site/zenodo_rdm/legacy/serializers/schemas/common.py index db060910..ab8cae57 100644 --- a/site/zenodo_rdm/legacy/serializers/schemas/common.py +++ b/site/zenodo_rdm/legacy/serializers/schemas/common.py @@ -166,6 +166,9 @@ class MetadataSchema(Schema): creators = fields.List(fields.Nested(CreatorSchema), dump_only=True) contributors = fields.List(fields.Nested(ContributorSchema), dump_only=True) + keywords = fields.Method("dump_keywords") + subjects = fields.Raw(attribute="custom_fields.legacy:subjects") + related_identifiers = fields.List(fields.Nested(RelatedIdentifierSchema)) locations = fields.Method("dump_locations") @@ -212,11 +215,9 @@ def resolve_license(self, data, **kwargs): data["license"] = rdm_to_legacy(license["id"]) return data - @post_dump(pass_original=True) - def dump_subjects(self, result, original, **kwargs): - """Dumps subjects.""" - subjects = original.get("subjects", []) - serialized_subjects = [] + def dump_keywords(self, obj): + """Dumps keywords from RDM subjects.""" + subjects = obj.get("subjects", []) serialized_keywords = [] if subjects: for _sbj in subjects: @@ -224,19 +225,12 @@ def dump_subjects(self, result, original, **kwargs): _subject = _sbj.get("subject") # If subject has an id, it's a controlled vocabulary if _id: - # TODO we still did not define a strategy to map legacy subjects to rdm. pass # Otherwise it's a free text string (keyword) elif _subject: serialized_keywords.append(_subject) - if serialized_keywords: - result["keywords"] = serialized_keywords - - if serialized_subjects: - result["subjects"] = serialized_subjects - - return result + return serialized_keywords or missing def dump_reference(self, obj): """Dumps reference."""