Commit

Merge pull request #3083 from responsible-ai-collaborative/staging
Deploy to Production
kepae authored Sep 10, 2024
2 parents 8262d88 + 2174d92 commit 0e0ef1b
Showing 16 changed files with 2,435 additions and 2,658 deletions.
57 changes: 57 additions & 0 deletions .github/workflows/db-backup.yml
@@ -0,0 +1,57 @@
name: Public backup to the cloud

on:
  schedule:
    - cron: "0 10 * * 1" # At 10:00 UTC on Monday.
  workflow_dispatch:
    inputs:
      environment:
        description: The GitHub environment to load secrets from
        type: string
        required: true

defaults:
  run:
    shell: bash

jobs:
  build-and-run-backups:
    # If the execution is triggered by the schedule, the environment is production.
    environment: ${{ inputs.environment || 'production' }}
    name: Backup
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y \
            coreutils \
            bash \
            tzdata \
            python3-pip \
            curl \
            npm
          wget -qO - https://www.mongodb.org/static/pgp/server-6.0.asc | gpg --dearmor | sudo tee /usr/share/keyrings/mongodb.gpg > /dev/null
          echo "deb [ arch=amd64 signed-by=/usr/share/keyrings/mongodb.gpg ] https://repo.mongodb.org/apt/ubuntu jammy/mongodb-org/6.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-6.0.list
          sudo apt update
          sudo apt install -y mongodb-database-tools

      - name: Install boto3
        run: pip install boto3

      - name: Generate public backup
        run: |
          ./bin/backup.sh
          ./bin/prune.sh
          ./bin/list.sh
        working-directory: site/db-backup
        env:
          CLOUDFLARE_R2_ACCOUNT_ID: ${{ vars.CLOUDFLARE_R2_ACCOUNT_ID }}
          CLOUDFLARE_R2_WRITE_ACCESS_KEY_ID: ${{ secrets.CLOUDFLARE_R2_WRITE_ACCESS_KEY_ID }}
          CLOUDFLARE_R2_WRITE_SECRET_ACCESS_KEY: ${{ secrets.CLOUDFLARE_R2_WRITE_SECRET_ACCESS_KEY }}
          CLOUDFLARE_R2_BUCKET_NAME: ${{ vars.CLOUDFLARE_R2_BUCKET_NAME }}
          MONGODB_URI: ${{ secrets.MONGODB_CONNECTION_STRING }}
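
For a manual run, the `workflow_dispatch` trigger takes an `environment` input naming the GitHub environment to load secrets from. A sketch of starting one with the GitHub CLI; the `staging` environment name here is an assumption, so substitute whichever environment holds your secrets:

    # Trigger the backup manually against an assumed "staging" environment.
    gh workflow run db-backup.yml -f environment=staging

    # Follow the run that was just started.
    gh run watch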
33 changes: 33 additions & 0 deletions site/db-backup/README.md
@@ -0,0 +1,33 @@
This is a quick port of the forked project to support JSON and CSV backups of the [AIID](https://incidentdatabase.ai/).

The complete state of the database is backed up weekly in both JSON and CSV form. The backups can be downloaded from [here](https://incidentdatabase.ai/research/snapshots/).

Requirements
------------

- Cloudflare R2 Access Key ID/Secret Access Key, which must have the access rights of the target Cloudflare R2 bucket.
- MongoDB credentials with read access to the target database.

Usage
-----

The GitHub Action "Public backup to the cloud" [/.github/workflows/db-backup.yml](/.github/workflows/db-backup.yml) runs the backup script at 10:00 UTC every Monday.

After a run completes, `backup-YYYYMMDDHHMMSS.tar.bz2` is placed in the Cloudflare R2 bucket.


Required environment variables
------------------------------

| Variable | Description |
| --------------------- | ------------------------------------------------------------------------------ |
| CLOUDFLARE_R2_ACCOUNT_ID | Cloudflare R2 account ID |
| CLOUDFLARE_R2_BUCKET_NAME | Cloudflare R2 public bucket name (e.g., "aiid-public") |

Required environment secrets
----------------------------

| Secret | Description |
| --------------------- | ------------------------------------------------------------------------------ |
| CLOUDFLARE_R2_WRITE_ACCESS_KEY_ID | Cloudflare R2 Access Key ID with write permission |
| CLOUDFLARE_R2_WRITE_SECRET_ACCESS_KEY | Cloudflare R2 Secret Access Key with write permission |
| MONGODB_CONNECTION_STRING | mongodb+srv://[username]:[password]@aiiddev.[CLUSTER].mongodb.net |
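
For reference, a minimal sketch of running the backup locally, assuming `mongodb-database-tools`, `jq`, and `boto3` are already installed; all values are placeholders:

    # Placeholder credentials -- substitute real values before running.
    export CLOUDFLARE_R2_ACCOUNT_ID="<account-id>"
    export CLOUDFLARE_R2_WRITE_ACCESS_KEY_ID="<access-key-id>"
    export CLOUDFLARE_R2_WRITE_SECRET_ACCESS_KEY="<secret-access-key>"
    export CLOUDFLARE_R2_BUCKET_NAME="aiid-public"
    export MONGODB_URI="mongodb+srv://<username>:<password>@<cluster>.mongodb.net"

    cd site/db-backup
    ./bin/backup.sh   # backup.sh appends the database name to MONGODB_URI itself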
130 changes: 130 additions & 0 deletions site/db-backup/bin/backup.sh
@@ -0,0 +1,130 @@
#!/bin/bash -e

echo "--------------------------------------"
echo "Starting backup.sh script execution..."
echo "--------------------------------------"

# settings
BACKUPFILE_PREFIX="backup"
CLOUDFLARE_R2_ACCOUNT_ID=${CLOUDFLARE_R2_ACCOUNT_ID}
MONGODB_DBNAME="aiidprod"
MONGODB_DBNAME_TRANSLATIONS="translations"

# start script
CWD=$(/usr/bin/dirname $0)
cd $CWD

. ./functions.sh
NOW=$(create_current_yyyymmddhhmmss)

echo "=== $0 started at $(/bin/date "+%Y/%m/%d %H:%M:%S") ==="

TMPDIR="/tmp"
TARGET_DIRNAME="mongodump_full_snapshot"
TARGET="${TMPDIR}/${TARGET_DIRNAME}"
TAR_CMD="/bin/tar"
TAR_OPTS="jcvf"

DIRNAME=$(/usr/bin/dirname ${TARGET})
BASENAME=$(/usr/bin/basename ${TARGET})
TARBALL="${BACKUPFILE_PREFIX}-${NOW}.tar.bz2"
TARBALL_FULLPATH="${TMPDIR}/${TARBALL}"

# check parameters
# deprecate the old option
if [ "x${CLOUDFLARE_R2_ACCOUNT_ID}" == "x" ]; then
echo "ERROR: CLOUDFLARE_R2_ACCOUNT_ID must be specified." 1>&2
exit 1
fi
if [ -z "${CLOUDFLARE_R2_WRITE_ACCESS_KEY_ID}" ]; then
echo "ERROR: If CLOUDFLARE_R2_ACCOUNT_ID environment variable is defined, you have to define the CLOUDFLARE_R2_WRITE_ACCESS_KEY_ID as well" 1>&2
exit 1
fi
if [ -z "${CLOUDFLARE_R2_WRITE_SECRET_ACCESS_KEY}" ]; then
echo "ERROR: If CLOUDFLARE_R2_ACCOUNT_ID environment variable is defined, you have to define the CLOUDFLARE_R2_WRITE_SECRET_ACCESS_KEY as well" 1>&2
exit 1
fi
if [ -z "${CLOUDFLARE_R2_BUCKET_NAME}" ]; then
echo "ERROR: If CLOUDFLARE_R2_ACCOUNT_ID environment variable is defined, you have to define the CLOUDFLARE_R2_BUCKET_NAME as well" 1>&2
exit 1
fi

echo "Dump MongoDB 'aiidprod' database..."
mongodump -o ${TARGET} --uri=${MONGODB_URI}/${MONGODB_DBNAME}

echo "Dump MongoDB 'translations' database..."
mongodump -o ${TARGET} --uri=${MONGODB_URI}/${MONGODB_DBNAME_TRANSLATIONS}

echo "Export collections as CSV files..."
mongoexport -o ${TARGET}/incidents.csv --uri=${MONGODB_URI}/${MONGODB_DBNAME} -v --type=csv --collection=incidents --fields=_id,incident_id,date,reports,Alleged\ deployer\ of\ AI\ system,Alleged\ developer\ of\ AI\ system,Alleged\ harmed\ or\ nearly\ harmed\ parties,description,title
mongoexport -o ${TARGET}/duplicates.csv --uri=${MONGODB_URI}/${MONGODB_DBNAME} -v --type=csv --collection=duplicates --fields=duplicate_incident_number,true_incident_number
mongoexport -o ${TARGET}/quickadd.csv --uri=${MONGODB_URI}/${MONGODB_DBNAME} -v --type=csv --collection=quickadd --fields=incident_id,url,date_submitted,source_domain
mongoexport -o ${TARGET}/submissions.csv --uri=${MONGODB_URI}/${MONGODB_DBNAME} -v --type=csv --collection=submissions --fields=authors,date_downloaded,date_modified,date_published,date_submitted,image_url,incident_date,incident_id,language,mongodb_id,source_domain,submitters,text,title,url
mongoexport -o ${TARGET}/reports.csv --uri=${MONGODB_URI}/${MONGODB_DBNAME} -v --type=csv --collection=reports --fields=_id,incident_id,authors,date_downloaded,date_modified,date_published,date_submitted,description,epoch_date_downloaded,epoch_date_modified,epoch_date_published,epoch_date_submitted,image_url,language,ref_number,report_number,source_domain,submitters,text,title,url,tags

###### Begin Taxa CSV Export ######

# Temporary file name to store MongoDB export of "taxa"
taxa_json="taxa_items.json"

# Export all documents from the "taxa" collection to a temporary JSON file.
# The export runs inside "if !" so the failure check still fires under "bash -e",
# which would otherwise abort the script before a separate $? test is reached.
if ! mongoexport --uri=${MONGODB_URI}/${MONGODB_DBNAME} -c taxa --type=json -o $taxa_json --jsonArray --quiet; then
  echo "Error executing mongoexport for the 'taxa' collection. Check the MongoDB URI and other parameters."
  exit 1
fi

# Get all unique namespaces from the taxa JSON file
namespaces=$(jq -r '.[].namespace' "$taxa_json" | sort | uniq)

# Iterate over each namespace and execute the corresponding process
for namespace in $namespaces; do

  # Temporary JSON file name to store MongoDB export of "classifications"
  classification_json="classifications_${namespace}.json"

  # Export documents from the "classifications" collection; the "if !" form
  # keeps the error handling reachable under "bash -e".
  if ! mongoexport --uri=${MONGODB_URI}/${MONGODB_DBNAME} -c classifications --type=json -o $classification_json --query="{\"namespace\": \"$namespace\"}" --jsonArray --quiet; then
    echo "Error executing mongoexport for the namespace $namespace. Check the MongoDB URI and other parameters."
    continue # Skip to the next namespace if there is an error
  fi

  # Invoke the Python script with the provided parameters
  if ! python3 taxonomy_csv_export.py "$namespace" "$taxa_json" "$classification_json" ${TARGET}; then
    echo "Error executing taxonomy_csv_export.py for the namespace $namespace."
    continue # Skip to the next namespace if there is an error
  fi

  # Delete the temporary JSON file
  rm -f $classification_json
done

# Delete the temporary JSON file
rm -f $taxa_json

echo "All namespaces have completed processing."

###### End Taxa CSV Export ######

## Create a license file
echo "Report contents are subject to their own intellectual property rights. Unless otherwise noted, the database is shared under (CC BY-SA 4.0). See: https://creativecommons.org/licenses/by-sa/4.0/" >${TARGET}/license.txt

# run tar command
echo "Start backup ${TARGET} into ${CLOUDFLARE_R2_BUCKET_NAME} ..."
time ${TAR_CMD} ${TAR_OPTS} ${TARBALL_FULLPATH} -C ${DIRNAME} ${BASENAME}

# upload tarball to Cloudflare R2
r2_copy_file ${CLOUDFLARE_R2_ACCOUNT_ID} ${CLOUDFLARE_R2_WRITE_ACCESS_KEY_ID} ${CLOUDFLARE_R2_WRITE_SECRET_ACCESS_KEY} ${CLOUDFLARE_R2_BUCKET_NAME} ${TARBALL_FULLPATH} ${TARBALL}

# call healthchecks url for successful backup
if [ "x${HEALTHCHECKS_URL}" != "x" ]; then
curl -fsS --retry 3 ${HEALTHCHECKS_URL} >/dev/null
fi
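
Note that `r2_copy_file` and `create_current_yyyymmddhhmmss` come from the sourced `functions.sh`, which is not part of this diff. A hypothetical sketch of `r2_copy_file`, assuming it simply wraps the `cloudflare_operations.py` client shown below:

    # Hypothetical sketch of r2_copy_file (the real definition lives in functions.sh).
    # Args: account_id access_key secret_key bucket_name local_file object_key
    r2_copy_file() {
      python3 cloudflare_operations.py \
        --operation upload \
        --account_id "$1" \
        --access_key "$2" \
        --secret_key "$3" \
        --bucket_name "$4" \
        --file_path "$5" \
        --object_key "$6"
    }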
117 changes: 117 additions & 0 deletions site/db-backup/bin/cloudflare_operations.py
@@ -0,0 +1,117 @@
#!/usr/bin/env python3

import sys

import argparse
import boto3


def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Simple client for uploading, deleting, listing, and checking objects in Cloudflare R2 buckets."
    )

    parser.add_argument(
        "--operation",
        choices=["list", "upload", "delete", "check_exists"],
        required=True,
        help="Operation to perform on the bucket.",
    )

    # Arguments that are always required.
    parser.add_argument("--account_id", required=True, help="Cloudflare account ID")
    parser.add_argument(
        "--access_key", required=True, help="Cloudflare R2 bucket access key"
    )
    parser.add_argument(
        "--secret_key", required=True, help="Cloudflare R2 bucket secret key"
    )
    parser.add_argument(
        "--bucket_name", required=True, help="Cloudflare R2 bucket name"
    )

    parser.add_argument(
        "--file_path",
        required=False,
        help="Path to the file to be uploaded or deleted.",
    )
    parser.add_argument(
        "--object_key",
        required=False,
        help="Key under which the object should be stored in the bucket.",
    )

    args = parser.parse_args()

    # Arguments required for only some operations.
    if args.operation == "upload":
        if args.file_path is None:
            parser.error("--operation={upload} requires --file_path.")

    if args.operation in ["upload", "delete", "check_exists"]:
        if args.object_key is None:
            parser.error(
                "--operation={delete,upload,check_exists} requires --object_key."
            )

    return args


def create_cloudflare_client(account_id, access_key, secret_key, region="auto"):
    endpoint_url = f"https://{account_id}.r2.cloudflarestorage.com"
    cloudflare_client = boto3.client(
        service_name="s3",
        endpoint_url=endpoint_url,
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
        region_name=region,
    )
    return cloudflare_client


def main(args):
    cloudflare_client = create_cloudflare_client(
        args.account_id, args.access_key, args.secret_key
    )

    if args.operation == "list":
        response = cloudflare_client.list_objects_v2(Bucket=args.bucket_name)

        if "Contents" in response:
            for obj in response["Contents"]:
                print(obj["Key"], "size:", obj["Size"])

    elif args.operation == "upload":
        cloudflare_client.upload_file(
            args.file_path,
            args.bucket_name,
            args.object_key,
            ExtraArgs={"ContentType": "application/x-bzip2"},
        )
        print("-----------------------------")
        print(
            f"Successfully uploaded file {args.file_path} (key: {args.object_key}) to bucket {args.bucket_name}"
        )
        print("-----------------------------")

    elif args.operation == "delete":
        cloudflare_client.delete_object(Bucket=args.bucket_name, Key=args.object_key)
        print("-----------------------------")
        print(
            f"Successfully deleted file {args.object_key} from bucket {args.bucket_name}"
        )
        print("-----------------------------")

    elif args.operation == "check_exists":
        # Raises an error (non-zero exit) if the object doesn't exist;
        # otherwise succeeds silently.
        cloudflare_client.get_object(Bucket=args.bucket_name, Key=args.object_key)

    else:
        raise NotImplementedError

    sys.exit()


if __name__ == "__main__":
    args = parse_arguments()
    main(args)
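
An illustrative invocation of the client, with placeholder credentials and an example object key:

    # Upload a tarball, then list the bucket contents (all values are placeholders).
    python3 cloudflare_operations.py --operation upload \
      --account_id "<account-id>" --access_key "<key>" --secret_key "<secret>" \
      --bucket_name aiid-public --file_path /tmp/backup-20240909100000.tar.bz2 \
      --object_key backup-20240909100000.tar.bz2

    python3 cloudflare_operations.py --operation list \
      --account_id "<account-id>" --access_key "<key>" --secret_key "<secret>" \
      --bucket_name aiid-public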
