Migration fixes #512

Merged 4 commits on Sep 18, 2023
37 changes: 24 additions & 13 deletions migrator/README.md
@@ -194,28 +194,39 @@ directly on the legacy database, in the following manner:
 DB_URI="postgresql://zenodo@zenodo-legacy-db-host:5432/zenodo"
 
 # Users, ~30min
-psql $DB_URI -f scripts/users_dump.sql | sed 's/\\\\/\\/g' | gzip -c > "dumps/users-$(date -I).jsonl.gz"
+psql $DB_URI -f scripts/users_dump.sql | sed 's/\\\\/\\/g' > "dumps/users-$(date -I).jsonl"
 # Communities, ~5min
-psql $DB_URI -f scripts/communities_dump.sql | sed 's/\\\\/\\/g' | gzip -c > "dumps/communities-$(date -I).jsonl.gz"
+psql $DB_URI -f scripts/communities_dump.sql | sed 's/\\\\/\\/g' > "dumps/communities-$(date -I).jsonl"
 # Community record inclusion requests, ~10min
-psql $DB_URI -f scripts/requests_dump.sql | sed 's/\\\\/\\/g' | gzip -c > "dumps/requests-$(date -I).jsonl.gz"
+psql $DB_URI -f scripts/requests_dump.sql | sed 's/\\\\/\\/g' > "dumps/requests-$(date -I).jsonl"
 # Records, ~2-3h
-psql $DB_URI -f scripts/records_dump.sql | sed 's/\\\\/\\/g' | gzip -c > "dumps/records-$(date -I).jsonl.gz"
+psql $DB_URI -f scripts/records_dump.sql | sed 's/\\\\/\\/g' > "dumps/records-$(date -I).jsonl"
 # Deposits/drafts, ~30min
-psql $DB_URI -f scripts/deposits_dump.sql | sed 's/\\\\/\\/g' | gzip -c > "dumps/deposits-$(date -I).jsonl.gz"
-
-# For dumping files we use a different style, since we're not filtering anything:
+psql $DB_URI -f scripts/deposits_dump.sql | sed 's/\\\\/\\/g' > "dumps/deposits-$(date -I).jsonl"
+# Deleted records, ~3-4h
+psql $DB_URI -f scripts/deleted_records_dump.sql | sed 's/\\\\/\\/g' > "dumps/deleted-records-$(date -I).jsonl"
+
+# Oauth2 server clients
+psql $DB_URI -f scripts/oauth2server_clients_dump.sql | sed 's/\\\\/\\/g' > "dumps/oauth2server-clients-$(date -I).jsonl"
+# Oauth2 server tokens
+psql $DB_URI -f scripts/oauth2server_tokens_dump.sql | sed 's/\\\\/\\/g' > "dumps/oauth2server-tokens-$(date -I).jsonl"
+
+# NOTE: For OAuth client and files we use the CSV format, since we're not transforming anything
+# Oauth-Client accounts
+psql $DB_URI -f scripts/oauthclient_remoteaccount_dump.sql | sed 's/\\\\/\\/g' > "dumps/oauthclient_remoteaccount.csv"
+# Oauth-Client tokens
+psql $DB_URI -f scripts/oauthclient_remotetoken_dump.sql | sed 's/\\\\/\\/g' > "dumps/oauthclient_remotetoken.csv"
 # File instances, ~10min
-psql $DB_URI -f scripts/files_files_dump.sql | gzip -c > "dumps/files_files-$(date -I).csv.gz"
+psql $DB_URI -f scripts/files_files_dump.sql > "dumps/files_files-$(date -I).csv"
 # Buckets, ~1min
-psql $DB_URI -f scripts/files_bucket_dump.sql | gzip -c > "dumps/files_bucket-$(date -I).csv.gz"
+psql $DB_URI -f scripts/files_bucket_dump.sql > "dumps/files_bucket-$(date -I).csv"
 # File object versions, ~3min
-psql $DB_URI -f scripts/files_object_dump.sql | gzip -c > "dumps/files_object-$(date -I).csv.gz"
+psql $DB_URI -f scripts/files_object_dump.sql > "dumps/files_object-$(date -I).csv"
 
 # Webhook Events
-psql $DB_URI -f scripts/webhook_events_dump.sql | gzip -c > "dumps/webhook_events-$(date -I).csv.gz"
+psql $DB_URI -f scripts/webhook_events_dump.sql > "dumps/webhook_events-$(date -I).csv"
 # GitHub repositories
-psql $DB_URI -f scripts/github_repositories_dump.sql | gzip -c > "dumps/github_repositories-$(date -I).csv.gz"
+psql $DB_URI -f scripts/github_repositories_dump.sql > "dumps/github_repositories-$(date -I).csv"
 # GitHub releases
-psql $DB_URI -f scripts/github_releases_dump.sql | gzip -c > "dumps/github_releases-$(date -I).jsonl.gz"
+psql $DB_URI -f scripts/github_releases_dump.sql > "dumps/github_releases-$(date -I).jsonl"
 ```
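The `sed 's/\\\\/\\/g'` step collapses the doubled backslashes that Postgres' text-format `COPY` adds, so every dump line should be standalone JSON. A minimal smoke test for a dump, not part of the PR (the file name is an example; substitute the dump produced above):

```python
import json
import sys

# Hypothetical sanity check: every line of a JSONL dump must parse as JSON.
path = "dumps/users-2023-09-18.jsonl"  # example name; use the real dump

with open(path) as f:
    for lineno, line in enumerate(f, 1):
        try:
            json.loads(line)
        except json.JSONDecodeError as exc:
            sys.exit(f"{path}:{lineno}: invalid JSON ({exc})")
print("all lines parse as JSON")
```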
21 changes: 16 additions & 5 deletions migrator/scripts/dump_awards_db.py
@@ -13,11 +13,12 @@
 from invenio_rdm_migrator.utils import ts
 
 DATA_PATHS = [
-    "awards-apr-2023.jsonl.gz",  # https://zenodo.org/record/7870151
-    "awards-mar-2023.jsonl.gz",  # https://zenodo.org/record/7803150
-    "awards-feb-2023.jsonl.gz",  # https://zenodo.org/record/7683844
-    "awards-jan-2023.jsonl.gz",  # https://zenodo.org/record/7561801
-    "awards-dec-2022.jsonl.gz",  # https://zenodo.org/record/7745773
+    "awards-2023-08.jsonl.gz",  # https://zenodo.org/record/8224080
+    "awards-2023-04.jsonl.gz",  # https://zenodo.org/record/7870151
+    "awards-2023-03.jsonl.gz",  # https://zenodo.org/record/7803150
+    "awards-2023-02.jsonl.gz",  # https://zenodo.org/record/7683844
+    "awards-2023-01.jsonl.gz",  # https://zenodo.org/record/7561801
+    "awards-2022-12.jsonl.gz",  # https://zenodo.org/record/7745773
 ]
 
 VOCABULARIES_AWARDS_OPENAIRE_FUNDERS = {
@@ -64,6 +65,16 @@ def transform_openaire_grant(data):
         return
 
     award["id"] = f"{funder_id}::{code}"
+
+    funding = next(iter(data.get("funding", [])), None)
+    if funding:
+        funding_stream_id = funding.get("funding_stream", {}).get("id", "")
+        # Example funding stream ID: `EC::HE::HORIZON-AG-UN`. We need the `HE`
+        # string, i.e. the second "part" of the identifier.
+        program = next(iter(funding_stream_id.split("::")[1:2]), "")
+        if program:
+            award["program"] = program
+
     identifiers = []
     if funder_id == VOCABULARIES_AWARDS_EC_ROR_ID:
         identifiers.append(
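For illustration, the `split("::")[1:2]` idiom above takes the second `::`-separated part of the funding stream ID and falls back to an empty string when it is missing; `extract_program` below is a hypothetical stand-alone helper, not part of the diff:

```python
def extract_program(funding_stream_id: str) -> str:
    """Return the second `::`-separated part of the ID, or "" if absent."""
    return next(iter(funding_stream_id.split("::")[1:2]), "")

assert extract_program("EC::HE::HORIZON-AG-UN") == "HE"  # the program part
assert extract_program("EC") == ""  # no second part: falls back to ""
assert extract_program("") == ""
```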
20 changes: 20 additions & 0 deletions migrator/scripts/oauth2server_clients_dump.sql
@@ -0,0 +1,20 @@
+COPY (
+    SELECT
+        row_to_json(clients)
+    FROM
+        (
+            SELECT
+                name,
+                description,
+                website,
+                user_id,
+                client_id,
+                client_secret,
+                is_confidential,
+                is_internal,
+                _redirect_uris,
+                _default_scopes
+            FROM
+                oauth2server_client
+        ) as clients
+) TO STDOUT;
20 changes: 20 additions & 0 deletions migrator/scripts/oauth2server_tokens_dump.sql
@@ -0,0 +1,20 @@
+COPY (
+    SELECT
+        row_to_json(tokens)
+    FROM
+        (
+            SELECT
+                id,
+                client_id,
+                user_id,
+                token_type,
+                access_token,
+                refresh_token,
+                expires,
+                _scopes,
+                is_personal,
+                is_internal
+            FROM
+                oauth2server_token
+        ) as tokens
+) TO STDOUT;
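Both new queries wrap each row in `row_to_json`, so every output line is a single JSON object that the JSONL extract step can consume. A hedged sketch of reading such a dump (the path is an example produced by the README commands; the field names come from the SELECT above):

```python
import json

# Hypothetical post-processing of the tokens dump above.
path = "dumps/oauth2server-tokens-2023-09-18.jsonl"  # example name

with open(path) as f:
    tokens = [json.loads(line) for line in f]

# `is_personal` and `is_internal` are columns selected by the dump query.
personal = [t for t in tokens if t["is_personal"] and not t["is_internal"]]
print(f"{len(personal)} personal tokens out of {len(tokens)}")
```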
10 changes: 10 additions & 0 deletions migrator/scripts/oauthclient_remoteaccount_dump.sql
@@ -0,0 +1,10 @@
+COPY (
+    SELECT
+        id,
+        user_id,
+        client_id,
+        extra_data,
+        created,
+        updated
+    FROM oauthclient_remoteaccount
+) TO STDOUT WITH (FORMAT csv);
11 changes: 11 additions & 0 deletions migrator/scripts/oauthclient_remotetoken_dump.sql
@@ -0,0 +1,11 @@
+COPY (
+    SELECT
+        id_remote_account,
+        token_type,
+        access_token,
+        secret,
+        created,
+        updated
+    FROM
+        oauthclient_remotetoken
+) TO STDOUT WITH (FORMAT csv);
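Since these two dumps are plain CSV and need no transformation, they can be copied straight back into the target database. A minimal sketch with psycopg 3, assuming a target table with the same columns; the connection string is an example:

```python
import psycopg

# Hypothetical reload of the remote-account CSV dump.
with psycopg.connect("postgresql://localhost/zenodo-rdm") as conn:
    with conn.cursor() as cur:
        with cur.copy(
            "COPY oauthclient_remoteaccount "
            "(id, user_id, client_id, extra_data, created, updated) "
            "FROM STDIN WITH (FORMAT csv)"
        ) as copy:
            with open("dumps/oauthclient_remoteaccount.csv", "rb") as f:
                while data := f.read(8192):
                    copy.write(data)
```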
12 changes: 0 additions & 12 deletions migrator/scripts/users_dump.sql
@@ -6,23 +6,11 @@ COPY (
         up.*,
         coalesce(se.created, null) as safelisted_at,
         coalesce(se.notes, null) as safelisted_notes,
-        coalesce(user_t.tokens, null) AS tokens,
         coalesce(user_i.identities, null) AS identities,
         coalesce(user_sa.session_activity, null) AS session_activity
     FROM accounts_user AS u
     LEFT JOIN userprofiles_userprofile up ON u.id = up.user_id
     LEFT JOIN safelist_entries se ON u.id = se.user_id
-    LEFT JOIN LATERAL (
-        SELECT json_agg(row_to_json(_t)) AS tokens
-        FROM (
-            SELECT t.*, c.name
-            FROM oauth2server_token AS t
-            JOIN oauth2server_client c ON t.client_id = c.client_id
-            WHERE t.user_id = u.id
-            AND t.is_personal = true
-            AND t.is_internal = false
-        ) as _t
-    ) AS user_t ON true
     LEFT JOIN LATERAL (
         SELECT json_agg(row_to_json(i)) AS identities
         FROM oauthclient_useridentity AS i
28 changes: 0 additions & 28 deletions migrator/zenodo_rdm_migrator/extract/sql.py

This file was deleted.

29 changes: 17 additions & 12 deletions migrator/zenodo_rdm_migrator/stream.py
@@ -21,11 +21,12 @@
 )
 from invenio_rdm_migrator.streams.oauth import (
     OAuthClientCopyLoad,
-    OAuthRemoteTokenTransform,
-    OAuthServerCopyLoad,
+    OAuthServerClientCopyLoad,
+    OAuthServerClientTransform,
+    OAuthServerTokenCopyLoad,
     OAuthServerTokenTransform,
 )
-from invenio_rdm_migrator.streams.records import RDMRecordCopyLoad
+from invenio_rdm_migrator.streams.records import RDMDraftCopyLoad, RDMRecordCopyLoad
 from invenio_rdm_migrator.streams.requests import RequestCopyLoad
 from invenio_rdm_migrator.streams.users import UserCopyLoad
 
@@ -58,7 +59,7 @@
     name="drafts",
     extract_cls=JSONLExtract,
     transform_cls=ZenodoRecordTransform,
-    load_cls=RDMRecordCopyLoad,
+    load_cls=RDMDraftCopyLoad,
 )
 """ETL stream for Zenodo to RDM drafts."""
 
@@ -104,21 +105,25 @@
 
 OAuthClientStreamDefinition = StreamDefinition(
     name="oauthclient",
-    # only the tokens need loading from file, the rest are existing data
-    extract_cls=JSONLExtract,
-    # will use transform the tokens and forward on the rest
-    transform_cls=OAuthRemoteTokenTransform,
+    extract_cls=None,
+    transform_cls=None,
     load_cls=OAuthClientCopyLoad,
 )
 """ETL stream for Zenodo to import OAuth clients related information."""
 
-OAuthServerStreamDefinition = StreamDefinition(
+OAuthServerClientStreamDefinition = StreamDefinition(
+    name="oauthserver",
+    extract_cls=JSONLExtract,
+    transform_cls=OAuthServerClientTransform,
+    load_cls=OAuthServerClientCopyLoad,
+)
+"""ETL stream for Zenodo to import OAuth clients related information."""
+
+OAuthServerTokenStreamDefinition = StreamDefinition(
     name="oauthserver",
-    # only the tokens need loading from file, the rest are existing data
     extract_cls=JSONLExtract,
-    # will use transform the tokens and forward on the rest
     transform_cls=OAuthServerTokenTransform,
-    load_cls=OAuthServerCopyLoad,
+    load_cls=OAuthServerTokenCopyLoad,
 )
 """ETL stream for Zenodo to import OAuth clients related information."""
 
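With the server stream split in two, clients can be loaded before the tokens that reference them. A hedged sketch of wiring both definitions into a run; the `Runner` import path and constructor arguments are assumptions based on invenio-rdm-migrator, not something this diff shows:

```python
from invenio_rdm_migrator.streams import Runner  # assumed import path

from zenodo_rdm_migrator.stream import (
    OAuthServerClientStreamDefinition,
    OAuthServerTokenStreamDefinition,
)

# Order matters: tokens reference clients, so clients load first.
runner = Runner(
    stream_definitions=[
        OAuthServerClientStreamDefinition,
        OAuthServerTokenStreamDefinition,
    ],
    config_filepath="streams.yaml",  # assumed configuration file
)
runner.run()
```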
10 changes: 8 additions & 2 deletions migrator/zenodo_rdm_migrator/transform/entries/parents.py
@@ -26,11 +26,17 @@ def __init__(self, partial=False):
         self.partial = partial
 
     def _communities(self, entry):
+        result = {}
         communities = entry["json"].get("communities")
         if communities:
             slugs = [slug for slug in communities]
-            return {"ids": slugs, "default": slugs[0]}
-        return {}
+            result["ids"] = slugs
+            # If there's only one community, we set it as the default also
+            if len(slugs) == 1:
+                result["default"] = slugs[0]
+            else:
+                result["default"] = None
+        return result
 
     def _pids(self, entry):
         pids = {}
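A few spot-checks of the `_communities` logic above (not part of the PR), with the method's body inlined as a standalone function for illustration:

```python
def communities_for(entry):
    """Replicates the `_communities` logic from the diff."""
    result = {}
    communities = entry["json"].get("communities")
    if communities:
        slugs = list(communities)
        result["ids"] = slugs
        result["default"] = slugs[0] if len(slugs) == 1 else None
    return result

assert communities_for({"json": {"communities": ["zenodo"]}}) == {
    "ids": ["zenodo"],
    "default": "zenodo",
}
assert communities_for({"json": {"communities": ["a", "b"]}}) == {
    "ids": ["a", "b"],
    "default": None,
}
assert communities_for({"json": {}}) == {}  # no communities: empty result
```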
18 changes: 0 additions & 18 deletions migrator/zenodo_rdm_migrator/transform/records.py
@@ -10,7 +10,6 @@
 from invenio_rdm_migrator.streams.records import RDMRecordTransform
 
 from .entries.parents import ParentRecordEntry
-from .entries.records.record_files import ZenodoRDMRecordFileEntry
 from .entries.records.records import ZenodoDraftEntry, ZenodoRecordEntry
 
 
@@ -29,21 +28,6 @@ def _draft(self, entry):
         """Extract a draft."""
         return ZenodoDraftEntry().transform(entry)
 
-    def _file_records(self, entry):
-        """Transform file records for an entry."""
-        files = entry["json"].get("_files", [])
-        return list(map(ZenodoRDMRecordFileEntry(context=entry).transform, files))
-
-    def _draft_files(self, entry):
-        """Transform file records of a record."""
-        # draft files are migrated as post load since new versions/new drafts
-        # do not have _files until they are saved so we cannot rely on it
-        return {}
-
-    def _record_files(self, entry):
-        """Transform file records of a record."""
-        return self._file_records(entry)
-
     def _transform(self, entry):
         """Transform a single entry."""
         is_draft = "deposits" in entry["json"]["$schema"]
@@ -60,11 +44,9 @@ def _transform(self, entry):
             return {
                 "draft": draft,
                 "parent": parent,
-                "draft_files": self._draft_files(entry),
             }
 
         return {
             "record": self._record(entry),
             "parent": self._parent(entry),
-            "record_files": self._record_files(entry),
         }