Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Legacy and migration fixes #520

Merged
merged 7 commits into from
Sep 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion legacy/zenodo_legacy/funders.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
"10.13039/501100000038": "01h531d29",
"10.13039/100000001": "021nxhr62",
"10.13039/501100003246": "04jsz6e67",
# NOTE: RCUK (10.13039/100014013) was succeeded by UKRI (10.13039/501100000690).
# NOTE: RCUK (10.13039/501100000690) was succeeded by UKRI (10.13039/100014013).
# All awards/grants were transferred, so we're also remapping the funder IDs to
# point to the UKRI ROR ID (001aqnf71).
"10.13039/501100000690": "001aqnf71",
Expand Down
13 changes: 12 additions & 1 deletion migrator/scripts/backup_indices.sql
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
-- From https://www.postgresql.org/message-id/flat/877em2racj.fsf%40gmail.com#36a9eba4b16b8172e379b2a19f403939
DROP TABLE rdm_index_backup;
CREATE TABLE rdm_index_backup AS
SELECT *
FROM pg_indexes
Expand All @@ -14,6 +15,9 @@ WHERE
'accounts_user_login_information',
'accounts_user_session_activity',
'oauth2server_token',
'oauth2server_client',
'oauthclient_remoteaccount',
'oauthclient_remotetoken',
'accounts_useridentity',
-- communities
'communities_metadata',
Expand All @@ -27,12 +31,19 @@ WHERE
'rdm_drafts_metadata',
'rdm_versions_state',
'rdm_records_files',
'rdm_records_media_files',
'rdm_drafts_files',
'rdm_drafts_media_files',
'pidstore_pid',
-- requests
'request_metadata',
-- github
'webhooks_events',
'github_repositories',
'github_releases'
'github_releases',
-- vocabularies
'name_metadata',
'affiliation_metadata',
'award_metadata',
'funder_metadata'
);
84 changes: 84 additions & 0 deletions migrator/scripts/dump_affiliations_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""Parse affiliations from ROR dumps into CSV format, importable via COPY.

To use call ``load_file(DATA_PATH, "affiliations.csv")``.
"""

import csv
import orjson as json
import uuid

from idutils import normalize_ror
from invenio_rdm_migrator.utils import ts

# Path of the ROR data dump to convert (see the linked Zenodo record).
# NOTE(review): this points at a .zip, but ``load_file`` reads the file
# verbatim as JSON — presumably the extracted dump is passed in practice.
DATA_PATH = "v1.32-2023-09-14-ror-data.zip"  # https://zenodo.org/record/8346986


# Identifier schemes kept for affiliations; any other ``external_ids``
# scheme present in the ROR dump is ignored by the transform.
VOCABULARIES_AFFILIATION_SCHEMES = {
    "grid",
    "gnd",
    "isni",
    "ror",
}


def transform_affiliation(data):
    """Transform one ROR dump entry into an affiliation vocabulary record.

    Returns ``None`` when the entry has no normalizable ROR id or no name,
    signalling the caller to skip it.
    """
    ror_id = normalize_ror(data.get("id"))
    if not ror_id:
        return

    name = data.get("name")
    if not name:
        return

    result = {
        "$schema": "local://affiliations/affiliation-v1.0.0.json",
        "id": ror_id,
        "name": name,
    }

    acronyms = data.get("acronyms") or []
    if acronyms:
        result["acronym"] = acronyms[0]

    # English title from the primary name, plus any translated labels.
    titles = {"en": name}
    for label in data.get("labels", []):
        titles[label["iso639"]] = label["label"]
    result["title"] = titles

    # The ROR is always listed in identifiers, expected by serialization
    identifiers = [{"identifier": ror_id, "scheme": "ror"}]
    for raw_scheme, identifier in data.get("external_ids", {}).items():
        scheme = raw_scheme.lower()
        if scheme in VOCABULARIES_AFFILIATION_SCHEMES:
            value = identifier.get("preferred") or identifier.get("all")[0]
            identifiers.append({"identifier": value, "scheme": scheme})
    result["identifiers"] = identifiers

    return result


def load_file(datafile, outpath):
    """Load the ROR data file and dump it as CSV, importable via ``COPY``.

    Each written row follows the vocabulary table layout used by the other
    dump scripts: ``(id, pid, json, created, updated, version_id)``.
    Entries that fail to transform are logged and skipped so a single bad
    record cannot abort the whole dump.
    """
    with open(outpath, "w") as fout, open(datafile, "rb") as fp:
        print(f"[{ts()}] loading {datafile}")
        writer = csv.writer(fout)
        entries = json.loads(fp.read())
        for idx, data in enumerate(entries):
            if idx % 1000 == 0:
                # Progress heartbeat every 1000 entries.
                print(f"[{ts()}] {idx}")
            try:
                affiliation = transform_affiliation(data)
                if not affiliation:
                    print(f"[{ts()}] Failed to transform #{idx}:\n{data}\n")
                    continue
                affiliation_id = affiliation.pop("id")
                creation_ts = ts()
                writer.writerow(
                    (
                        str(uuid.uuid4()),  # id
                        affiliation_id,  # pid
                        # orjson.dumps returns bytes; decode so the CSV cell
                        # holds the JSON text, not a ``b'...'`` repr.
                        json.dumps(affiliation).decode("utf-8"),  # json
                        creation_ts,  # created
                        creation_ts,  # updated (same as created)
                        1,  # version_id
                    )
                )
            except Exception as ex:
                print(f"[{ts()}] Exception for line {idx}:\n{data}\n\n{ex}\n")
12 changes: 6 additions & 6 deletions migrator/scripts/dump_awards_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@
from invenio_rdm_migrator.utils import ts

DATA_PATHS = [
"awards-2023-08.jsonl.gz" # https://zenodo.org/record/8224080
"awards-2023-04.jsonl.gz" # https://zenodo.org/record/7870151
"awards-2023-03.jsonl.gz" # https://zenodo.org/record/7803150
"awards-2023-02.jsonl.gz" # https://zenodo.org/record/7683844
"awards-2023-01.jsonl.gz" # https://zenodo.org/record/7561801
"awards-2022-12.jsonl.gz" # https://zenodo.org/record/7745773
"awards-2023-08.jsonl.gz", # https://zenodo.org/record/8224080
"awards-2023-04.jsonl.gz", # https://zenodo.org/record/7870151
"awards-2023-03.jsonl.gz", # https://zenodo.org/record/7803150
"awards-2023-02.jsonl.gz", # https://zenodo.org/record/7683844
"awards-2023-01.jsonl.gz", # https://zenodo.org/record/7561801
"awards-2022-12.jsonl.gz", # https://zenodo.org/record/7745773
]

VOCABULARIES_AWARDS_OPENAIRE_FUNDERS = {
Expand Down
7 changes: 4 additions & 3 deletions migrator/scripts/dump_funders_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,14 @@
"""

import csv
import json
import orjson as json
import uuid

from idutils import normalize_ror
from invenio_rdm_migrator.utils import ts

DATA_PATH = "v1.25-2023-05-11-ror-data.zip" # https://zenodo.org/record/7926988
DATA_PATH = "v1.32-2023-09-14-ror-data.zip" # https://zenodo.org/record/8346986


VOCABULARIES_FUNDER_SCHEMES = {
"grid",
Expand Down Expand Up @@ -72,7 +73,7 @@ def load_file(datafile, outpath):
with open(outpath, "w") as fout, open(datafile, "rb") as fp:
print(f"[{ts()}] loading {datafile}")
writer = csv.writer(fout)
entries = json.load(fp)
entries = json.loads(fp.read())
for idx, data in enumerate(entries):
if idx % 1000 == 0:
print(f"[{ts()}] {idx}")
Expand Down
2 changes: 1 addition & 1 deletion migrator/tests/actions/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def state(tmp_dir):
state_db = StateDB(
db_dir=tmp_dir.name, validators={"parents": ParentModelValidator}
)
STATE.initialized_state(state_db, cache=False)
STATE.initialized_state(state_db, cache=False, search_cache=False)

return STATE

Expand Down
1 change: 1 addition & 0 deletions migrator/tests/transform/test_community_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def expected_rdm_community():
},
},
"bucket_id": None,
"deletion_status": "P",
}


Expand Down
48 changes: 48 additions & 0 deletions migrator/tests/transform/test_record_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,18 @@ def zenodo_record_data():
],
"references": [{"raw_reference": "Test reference"}],
"keywords": ["migration", "test", "Zenodo", "RDM"],
"subjects": [
{
"term": "Astronomy",
"identifier": "http://id.loc.gov/authorities/subjects/sh85009003",
"scheme": "url",
},
{
"term": "Klassenrat",
"identifier": "gnd:4180044-8",
"scheme": "gnd",
},
],
"_internal": {
"source": {
"agents": [
Expand Down Expand Up @@ -455,6 +467,18 @@ def expected_rdm_record_entry():
"title": "A book title",
},
"thesis:university": "Test University",
"legacy:subjects": [
{
"term": "Astronomy",
"identifier": "http://id.loc.gov/authorities/subjects/sh85009003",
"scheme": "url",
},
{
"term": "Klassenrat",
"identifier": "gnd:4180044-8",
"scheme": "gnd",
},
],
# dwc
"dwc:basisOfRecord": ["foo", "bar"],
"dwc:catalogNumber": ["foo", "bar"],
Expand Down Expand Up @@ -656,6 +680,18 @@ def zenodo_draft_data():
},
],
"keywords": ["migration", "test", "Zenodo", "RDM"],
"subjects": [
{
"term": "Astronomy",
"identifier": "http://id.loc.gov/authorities/subjects/sh85009003",
"scheme": "url",
},
{
"term": "Klassenrat",
"identifier": "gnd:4180044-8",
"scheme": "gnd",
},
],
"_internal": {
"source": {
"agents": [
Expand Down Expand Up @@ -919,6 +955,18 @@ def expected_rdm_draft_entry():
"title": "A book title",
},
"thesis:university": "Test University",
"legacy:subjects": [
{
"term": "Astronomy",
"identifier": "http://id.loc.gov/authorities/subjects/sh85009003",
"scheme": "url",
},
{
"term": "Klassenrat",
"identifier": "gnd:4180044-8",
"scheme": "gnd",
},
],
# dwc
"dwc:basisOfRecord": ["foo", "bar"],
"dwc:catalogNumber": ["foo", "bar"],
Expand Down
4 changes: 4 additions & 0 deletions migrator/zenodo_rdm_migrator/transform/entries/communities.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ def _bucket_id(self, entry):
"""Returns the community bucket id."""
return None

def _deletion_status(self, entry):
"""Returns the community's deletion status."""
return "P"

def _metadata(self, entry):
"""Returns community metadata."""
# Clean-up description
Expand Down
16 changes: 15 additions & 1 deletion migrator/zenodo_rdm_migrator/transform/entries/parents.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"""Zenodo migrator parent record transformer entries."""


from invenio_rdm_migrator.state import STATE
from invenio_rdm_migrator.transform import Entry

from ...errors import NoConceptRecidForDraft
Expand Down Expand Up @@ -76,15 +77,28 @@ def transform(self, entry):
# we raise so the error logger writes these cases in the log file
raise NoConceptRecidForDraft(draft=entry)

communities = self._communities(entry)
transformed["json"] = {
# loader is responsible for creating/updating if the PID exists.
"id": parent_pid,
"communities": self._communities(entry),
"communities": communities,
"pids": self._pids(entry),
}
owner = next(iter(entry["json"].get("owners", [])), None)
if owner is not None:
transformed["json"]["access"] = {"owned_by": {"user": owner}}

permission_flags = {}
owner_comm_slugs = {
comm["slug"]
for comm in (
STATE.COMMUNITIES.search("owner_id", owner) if owner else []
)
}
comm_slugs = set(communities.get("ids", []))
has_only_managed_communities = comm_slugs < owner_comm_slugs
if not has_only_managed_communities:
permission_flags["can_community_manage_record"] = False
elif not self.partial:
raise KeyError("json")
# else, pass
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,20 @@
class ZenodoCustomFieldsEntry(Entry):
"""Custom fields entry transform."""

@classmethod
def _subjects(cls, subjects):
    """Parse legacy subjects.

    Keeps only the ``term``, ``identifier`` and ``scheme`` keys of each
    subject (missing keys default to ``None``). Returns ``None`` instead
    of an empty list so the custom field is omitted when there are no
    subjects.
    """
    # Comprehension instead of a manual append loop (same output).
    res = [
        {
            "term": s.get("term"),
            "identifier": s.get("identifier"),
            "scheme": s.get("scheme"),
        }
        for s in subjects or []
    ]
    return res or None

@classmethod
def _journal(cls, journal):
"""Parse journal fields."""
Expand Down Expand Up @@ -131,6 +145,7 @@ def transform(cls, entry):
entry.get("imprint", {}), entry.get("part_of", {})
),
"thesis:university": entry.get("thesis", {}).get("university"),
"legacy:subjects": cls._subjects(entry.get("subjects", [])),
"openbiodiv:TaxonomicConceptLabel": entry.get("custom", {}).get(
"openbiodiv:TaxonomicConceptLabel"
),
Expand Down
2 changes: 1 addition & 1 deletion scripts/admin/community_manage_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def create_community_manage_record_request(record_id):

# add a permission flag to db (make record a legacy one)
db_record = RDMRecord.get_record(record_id)
db_record.parent.permission_flags = {"can_community_manage_record": True}
db_record.parent.permission_flags = {"can_community_manage_record": False}
db_record.parent.commit()
db.session.commit()

Expand Down
9 changes: 4 additions & 5 deletions site/tests/legacy/deposits/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,11 +149,10 @@ def expected_record_metadata():
journal_title="Some journal name",
journal_volume="Some volume",
keywords=["Keyword 1", "keyword 2"],
# TODO uncomment when subjects are implemented
# subjects=[
# dict(scheme="gnd", identifier="gnd:1234567899", term="Astronaut"),
# dict(scheme="gnd", identifier="gnd:1234567898", term="Amish"),
# ],
subjects=[
dict(scheme="gnd", identifier="gnd:1234567899", term="Astronaut"),
dict(scheme="gnd", identifier="gnd:1234567898", term="Amish"),
],
license="cc-zero",
notes="Some notes",
partof_pages="SOme part of",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def test_submit_a_request(uploader):
assert db_request["title"] == "Communities manage legacy records"
assert db_request["expires_at"] is not None
assert db_request["description"].startswith(
"<h4>Some of your records, that are going through migration"
"<h4>Some of your records, that are going through the migration"
)


Expand Down
Loading
Loading