Migration fixes #512

Merged 4 commits on Sep 18, 2023
37 changes: 24 additions & 13 deletions migrator/README.md
@@ -194,28 +194,39 @@ directly on the legacy database, in the following manner:
 DB_URI="postgresql://zenodo@zenodo-legacy-db-host:5432/zenodo"
 
 # Users, ~30min
-psql $DB_URI -f scripts/users_dump.sql | sed 's/\\\\/\\/g' | gzip -c > "dumps/users-$(date -I).jsonl.gz"
+psql $DB_URI -f scripts/users_dump.sql | sed 's/\\\\/\\/g' > "dumps/users-$(date -I).jsonl"
 # Communities, ~5min
-psql $DB_URI -f scripts/communities_dump.sql | sed 's/\\\\/\\/g' | gzip -c > "dumps/communities-$(date -I).jsonl.gz"
+psql $DB_URI -f scripts/communities_dump.sql | sed 's/\\\\/\\/g' > "dumps/communities-$(date -I).jsonl"
 # Community record inclusion requests, ~10min
-psql $DB_URI -f scripts/requests_dump.sql | sed 's/\\\\/\\/g' | gzip -c > "dumps/requests-$(date -I).jsonl.gz"
+psql $DB_URI -f scripts/requests_dump.sql | sed 's/\\\\/\\/g' > "dumps/requests-$(date -I).jsonl"
 # Records, ~2-3h
-psql $DB_URI -f scripts/records_dump.sql | sed 's/\\\\/\\/g' | gzip -c > "dumps/records-$(date -I).jsonl.gz"
+psql $DB_URI -f scripts/records_dump.sql | sed 's/\\\\/\\/g' > "dumps/records-$(date -I).jsonl"
 # Deposits/drafts, ~30min
-psql $DB_URI -f scripts/deposits_dump.sql | sed 's/\\\\/\\/g' | gzip -c > "dumps/deposits-$(date -I).jsonl.gz"
-
-# For dumping files we use a different style, since we're not filtering anything:
+psql $DB_URI -f scripts/deposits_dump.sql | sed 's/\\\\/\\/g' > "dumps/deposits-$(date -I).jsonl"
+# Deleted records, ~3-4h
+psql $DB_URI -f scripts/deleted_records_dump.sql | sed 's/\\\\/\\/g' > "dumps/deleted-records-$(date -I).jsonl"
+
+# Oauth2 server clients
+psql $DB_URI -f scripts/oauth2server_clients_dump.sql | sed 's/\\\\/\\/g' > "dumps/oauth2server-clients-$(date -I).jsonl"
+# Oauth2 server tokens
+psql $DB_URI -f scripts/oauth2server_tokens_dump.sql | sed 's/\\\\/\\/g' > "dumps/oauth2server-tokens-$(date -I).jsonl"
+
+# NOTE: For OAuth client and files we use the CSV format, since we're not transforming anything
+# Oauth-Client accounts
+psql $DB_URI -f scripts/oauthclient_remoteaccount_dump.sql | sed 's/\\\\/\\/g' > "dumps/oauthclient_remoteaccount.csv"
+# Oauth-Client tokens
+psql $DB_URI -f scripts/oauthclient_remotetoken_dump.sql | sed 's/\\\\/\\/g' > "dumps/oauthclient_remotetoken.csv"
 # File instances, ~10min
-psql $DB_URI -f scripts/files_files_dump.sql | gzip -c > "dumps/files_files-$(date -I).csv.gz"
+psql $DB_URI -f scripts/files_files_dump.sql > "dumps/files_files-$(date -I).csv"
 # Buckets, ~1min
-psql $DB_URI -f scripts/files_bucket_dump.sql | gzip -c > "dumps/files_bucket-$(date -I).csv.gz"
+psql $DB_URI -f scripts/files_bucket_dump.sql > "dumps/files_bucket-$(date -I).csv"
 # File object versions, ~3min
-psql $DB_URI -f scripts/files_object_dump.sql | gzip -c > "dumps/files_object-$(date -I).csv.gz"
+psql $DB_URI -f scripts/files_object_dump.sql > "dumps/files_object-$(date -I).csv"
 
 # Webhook Events
-psql $DB_URI -f scripts/webhook_events_dump.sql | gzip -c > "dumps/webhook_events-$(date -I).csv.gz"
+psql $DB_URI -f scripts/webhook_events_dump.sql > "dumps/webhook_events-$(date -I).csv"
 # GitHub repositories
-psql $DB_URI -f scripts/github_repositories_dump.sql | gzip -c > "dumps/github_repositories-$(date -I).csv.gz"
+psql $DB_URI -f scripts/github_repositories_dump.sql > "dumps/github_repositories-$(date -I).csv"
 # GitHub releases
-psql $DB_URI -f scripts/github_releases_dump.sql | gzip -c > "dumps/github_releases-$(date -I).jsonl.gz"
+psql $DB_URI -f scripts/github_releases_dump.sql > "dumps/github_releases-$(date -I).jsonl"
 ```
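The `sed 's/\\\\/\\/g'` step collapses the doubled backslashes that Postgres' text-format `COPY` adds, so every dump line should be standalone JSON. A minimal smoke test for a dump, not part of the PR (the file name is an example; substitute the dump produced above):

```python
import json
import sys

# Hypothetical sanity check: every line of a JSONL dump must parse as JSON.
path = "dumps/users-2023-09-18.jsonl"  # example name; use the real dump

with open(path) as f:
    for lineno, line in enumerate(f, 1):
        try:
            json.loads(line)
        except json.JSONDecodeError as exc:
            sys.exit(f"{path}:{lineno}: invalid JSON ({exc})")
print("all lines parse as JSON")
```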
21 changes: 16 additions & 5 deletions migrator/scripts/dump_awards_db.py
@@ -13,11 +13,12 @@
 from invenio_rdm_migrator.utils import ts
 
 DATA_PATHS = [
-    "awards-apr-2023.jsonl.gz",  # https://zenodo.org/record/7870151
-    "awards-mar-2023.jsonl.gz",  # https://zenodo.org/record/7803150
-    "awards-feb-2023.jsonl.gz",  # https://zenodo.org/record/7683844
-    "awards-jan-2023.jsonl.gz",  # https://zenodo.org/record/7561801
-    "awards-dec-2022.jsonl.gz",  # https://zenodo.org/record/7745773
+    "awards-2023-08.jsonl.gz",  # https://zenodo.org/record/8224080
+    "awards-2023-04.jsonl.gz",  # https://zenodo.org/record/7870151
+    "awards-2023-03.jsonl.gz",  # https://zenodo.org/record/7803150
+    "awards-2023-02.jsonl.gz",  # https://zenodo.org/record/7683844
+    "awards-2023-01.jsonl.gz",  # https://zenodo.org/record/7561801
+    "awards-2022-12.jsonl.gz",  # https://zenodo.org/record/7745773
 ]
 
 VOCABULARIES_AWARDS_OPENAIRE_FUNDERS = {
@@ -64,6 +65,16 @@ def transform_openaire_grant(data):
         return
 
     award["id"] = f"{funder_id}::{code}"
+
+    funding = next(iter(data.get("funding", [])), None)
+    if funding:
+        funding_stream_id = funding.get("funding_stream", {}).get("id", "")
+        # Example funding stream ID: `EC::HE::HORIZON-AG-UN`. We need the `HE`
+        # string, i.e. the second "part" of the identifier.
+        program = next(iter(funding_stream_id.split("::")[1:2]), "")
+        if program:
+            award["program"] = program
+
     identifiers = []
     if funder_id == VOCABULARIES_AWARDS_EC_ROR_ID:
         identifiers.append(
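For illustration, the `split("::")[1:2]` idiom above takes the second `::`-separated part of the funding stream ID and falls back to an empty string when it is missing; `extract_program` below is a hypothetical stand-alone helper, not part of the diff:

```python
def extract_program(funding_stream_id: str) -> str:
    """Return the second `::`-separated part of the ID, or "" if absent."""
    return next(iter(funding_stream_id.split("::")[1:2]), "")

assert extract_program("EC::HE::HORIZON-AG-UN") == "HE"  # the program part
assert extract_program("EC") == ""  # no second part: falls back to ""
assert extract_program("") == ""
```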
20 changes: 20 additions & 0 deletions migrator/scripts/oauth2server_clients_dump.sql
@@ -0,0 +1,20 @@
+COPY (
+    SELECT
+        row_to_json(clients)
+    FROM
+        (
+            SELECT
+                name,
+                description,
+                website,
+                user_id,
+                client_id,
+                client_secret,
+                is_confidential,
+                is_internal,
+                _redirect_uris,
+                _default_scopes
+            FROM
+                oauth2server_client
+        ) as clients
+) TO STDOUT;
20 changes: 20 additions & 0 deletions migrator/scripts/oauth2server_tokens_dump.sql
@@ -0,0 +1,20 @@
+COPY (
+    SELECT
+        row_to_json(tokens)
+    FROM
+        (
+            SELECT
+                id,
+                client_id,
+                user_id,
+                token_type,
+                access_token,
+                refresh_token,
+                expires,
+                _scopes,
+                is_personal,
+                is_internal
+            FROM
+                oauth2server_token
+        ) as tokens
+) TO STDOUT;
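Both new queries wrap each row in `row_to_json`, so every output line is a single JSON object that the JSONL extract step can consume. A hedged sketch of reading such a dump (the path is an example produced by the README commands; the field names come from the SELECT above):

```python
import json

# Hypothetical post-processing of the tokens dump above.
path = "dumps/oauth2server-tokens-2023-09-18.jsonl"  # example name

with open(path) as f:
    tokens = [json.loads(line) for line in f]

# `is_personal` and `is_internal` are columns selected by the dump query.
personal = [t for t in tokens if t["is_personal"] and not t["is_internal"]]
print(f"{len(personal)} personal tokens out of {len(tokens)}")
```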
10 changes: 10 additions & 0 deletions migrator/scripts/oauthclient_remoteaccount_dump.sql
@@ -0,0 +1,10 @@
+COPY (
+    SELECT
+        id,
+        user_id,
+        client_id,
+        extra_data,
+        created,
+        updated
+    FROM oauthclient_remoteaccount
+) TO STDOUT WITH (FORMAT csv);
11 changes: 11 additions & 0 deletions migrator/scripts/oauthclient_remotetoken_dump.sql
@@ -0,0 +1,11 @@
+COPY (
+    SELECT
+        id_remote_account,
+        token_type,
+        access_token,
+        secret,
+        created,
+        updated
+    FROM
+        oauthclient_remotetoken
+) TO STDOUT WITH (FORMAT csv);
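Since these two dumps are plain CSV and need no transformation, they can be copied straight back into the target database. A minimal sketch with psycopg 3, assuming a target table with the same columns; the connection string is an example:

```python
import psycopg

# Hypothetical reload of the remote-account CSV dump.
with psycopg.connect("postgresql://localhost/zenodo-rdm") as conn:
    with conn.cursor() as cur:
        with cur.copy(
            "COPY oauthclient_remoteaccount "
            "(id, user_id, client_id, extra_data, created, updated) "
            "FROM STDIN WITH (FORMAT csv)"
        ) as copy:
            with open("dumps/oauthclient_remoteaccount.csv", "rb") as f:
                while data := f.read(8192):
                    copy.write(data)
```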
12 changes: 0 additions & 12 deletions migrator/scripts/users_dump.sql
@@ -6,23 +6,11 @@ COPY (
         up.*,
         coalesce(se.created, null) as safelisted_at,
         coalesce(se.notes, null) as safelisted_notes,
-        coalesce(user_t.tokens, null) AS tokens,
         coalesce(user_i.identities, null) AS identities,
         coalesce(user_sa.session_activity, null) AS session_activity
     FROM accounts_user AS u
     LEFT JOIN userprofiles_userprofile up ON u.id = up.user_id
     LEFT JOIN safelist_entries se ON u.id = se.user_id
-    LEFT JOIN LATERAL (
-        SELECT json_agg(row_to_json(_t)) AS tokens
-        FROM (
-            SELECT t.*, c.name
-            FROM oauth2server_token AS t
-            JOIN oauth2server_client c ON t.client_id = c.client_id
-            WHERE t.user_id = u.id
-            AND t.is_personal = true
-            AND t.is_internal = false
-        ) as _t
-    ) AS user_t ON true
     LEFT JOIN LATERAL (
         SELECT json_agg(row_to_json(i)) AS identities
         FROM oauthclient_useridentity AS i
28 changes: 0 additions & 28 deletions migrator/zenodo_rdm_migrator/extract/sql.py

This file was deleted.

29 changes: 17 additions & 12 deletions migrator/zenodo_rdm_migrator/stream.py
@@ -21,11 +21,12 @@
 )
 from invenio_rdm_migrator.streams.oauth import (
     OAuthClientCopyLoad,
-    OAuthRemoteTokenTransform,
-    OAuthServerCopyLoad,
+    OAuthServerClientCopyLoad,
+    OAuthServerClientTransform,
+    OAuthServerTokenCopyLoad,
     OAuthServerTokenTransform,
 )
-from invenio_rdm_migrator.streams.records import RDMRecordCopyLoad
+from invenio_rdm_migrator.streams.records import RDMDraftCopyLoad, RDMRecordCopyLoad
 from invenio_rdm_migrator.streams.requests import RequestCopyLoad
 from invenio_rdm_migrator.streams.users import UserCopyLoad
 
@@ -58,7 +59,7 @@
     name="drafts",
     extract_cls=JSONLExtract,
     transform_cls=ZenodoRecordTransform,
-    load_cls=RDMRecordCopyLoad,
+    load_cls=RDMDraftCopyLoad,
 )
 """ETL stream for Zenodo to RDM drafts."""
 
@@ -104,21 +105,25 @@
 
 OAuthClientStreamDefinition = StreamDefinition(
     name="oauthclient",
-    # only the tokens need loading from file, the rest are existing data
-    extract_cls=JSONLExtract,
-    # will use transform the tokens and forward on the rest
-    transform_cls=OAuthRemoteTokenTransform,
+    extract_cls=None,
+    transform_cls=None,
     load_cls=OAuthClientCopyLoad,
 )
 """ETL stream for Zenodo to import OAuth clients related information."""
 
-OAuthServerStreamDefinition = StreamDefinition(
+OAuthServerClientStreamDefinition = StreamDefinition(
+    name="oauthserver",
+    extract_cls=JSONLExtract,
+    transform_cls=OAuthServerClientTransform,
+    load_cls=OAuthServerClientCopyLoad,
+)
+"""ETL stream for Zenodo to import OAuth clients related information."""
+
+OAuthServerTokenStreamDefinition = StreamDefinition(
     name="oauthserver",
-    # only the tokens need loading from file, the rest are existing data
     extract_cls=JSONLExtract,
-    # will use transform the tokens and forward on the rest
     transform_cls=OAuthServerTokenTransform,
-    load_cls=OAuthServerCopyLoad,
+    load_cls=OAuthServerTokenCopyLoad,
 )
 """ETL stream for Zenodo to import OAuth clients related information."""
 
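With the server stream split in two, clients can be loaded before the tokens that reference them. A hedged sketch of wiring both definitions into a run; the `Runner` import path and constructor arguments are assumptions based on invenio-rdm-migrator, not something this diff shows:

```python
from invenio_rdm_migrator.streams import Runner  # assumed import path

from zenodo_rdm_migrator.stream import (
    OAuthServerClientStreamDefinition,
    OAuthServerTokenStreamDefinition,
)

# Order matters: tokens reference clients, so clients load first.
runner = Runner(
    stream_definitions=[
        OAuthServerClientStreamDefinition,
        OAuthServerTokenStreamDefinition,
    ],
    config_filepath="streams.yaml",  # assumed configuration file
)
runner.run()
```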
10 changes: 8 additions & 2 deletions migrator/zenodo_rdm_migrator/transform/entries/parents.py
@@ -26,11 +26,17 @@ def __init__(self, partial=False):
         self.partial = partial
 
     def _communities(self, entry):
+        result = {}
         communities = entry["json"].get("communities")
         if communities:
             slugs = [slug for slug in communities]
-            return {"ids": slugs, "default": slugs[0]}
-        return {}
+            result["ids"] = slugs
+            # If there's only one community, we set it as the default also
+            if len(slugs) == 1:
+                result["default"] = slugs[0]
+            else:
+                result["default"] = None
+        return result
 
     def _pids(self, entry):
         pids = {}
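A few spot-checks of the `_communities` logic above (not part of the PR), with the method's body inlined as a standalone function for illustration:

```python
def communities_for(entry):
    """Replicates the `_communities` logic from the diff."""
    result = {}
    communities = entry["json"].get("communities")
    if communities:
        slugs = list(communities)
        result["ids"] = slugs
        result["default"] = slugs[0] if len(slugs) == 1 else None
    return result

assert communities_for({"json": {"communities": ["zenodo"]}}) == {
    "ids": ["zenodo"],
    "default": "zenodo",
}
assert communities_for({"json": {"communities": ["a", "b"]}}) == {
    "ids": ["a", "b"],
    "default": None,
}
assert communities_for({"json": {}}) == {}  # no communities: empty result
```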
18 changes: 0 additions & 18 deletions migrator/zenodo_rdm_migrator/transform/records.py
@@ -10,7 +10,6 @@
 from invenio_rdm_migrator.streams.records import RDMRecordTransform
 
 from .entries.parents import ParentRecordEntry
-from .entries.records.record_files import ZenodoRDMRecordFileEntry
 from .entries.records.records import ZenodoDraftEntry, ZenodoRecordEntry
 
 
@@ -29,21 +28,6 @@ def _draft(self, entry):
         """Extract a draft."""
         return ZenodoDraftEntry().transform(entry)
 
-    def _file_records(self, entry):
-        """Transform file records for an entry."""
-        files = entry["json"].get("_files", [])
-        return list(map(ZenodoRDMRecordFileEntry(context=entry).transform, files))
-
-    def _draft_files(self, entry):
-        """Transform file records of a record."""
-        # draft files are migrated as post load since new versions/new drafts
-        # do not have _files until they are saved so we cannot rely on it
-        return {}
-
-    def _record_files(self, entry):
-        """Transform file records of a record."""
-        return self._file_records(entry)
-
     def _transform(self, entry):
         """Transform a single entry."""
         is_draft = "deposits" in entry["json"]["$schema"]
@@ -60,11 +44,9 @@ def _transform(self, entry):
             return {
                 "draft": draft,
                 "parent": parent,
-                "draft_files": self._draft_files(entry),
             }
 
         return {
             "record": self._record(entry),
             "parent": self._parent(entry),
-            "record_files": self._record_files(entry),
         }