diff --git a/deploy/data_exporter/.gitignore b/deploy/data_exporter/.gitignore
new file mode 100644
index 000000000..0a03531c6
--- /dev/null
+++ b/deploy/data_exporter/.gitignore
@@ -0,0 +1 @@
+.aws-sam
diff --git a/deploy/data_exporter/__init__.py b/deploy/data_exporter/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/deploy/data_exporter/data_export_function/Dockerfile b/deploy/data_exporter/data_export_function/Dockerfile
new file mode 100644
index 000000000..c8da43e18
--- /dev/null
+++ b/deploy/data_exporter/data_export_function/Dockerfile
@@ -0,0 +1,15 @@
+FROM public.ecr.aws/docker/library/ubuntu:24.04
+
+RUN apt update && \
+    apt install -y postgresql-client-16 python3.12 python3-pip curl unzip python3.12-venv && \
+    rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt .
+RUN python3 -m venv .venv
+RUN ./.venv/bin/pip install -r requirements.txt
+RUN ./.venv/bin/pip install awslambdaric
+
+COPY . .
+
+ENTRYPOINT [".venv/bin/python", "-m", "awslambdaric" ]
+CMD [ "app.lambda_handler" ]
diff --git a/deploy/data_exporter/data_export_function/app.py b/deploy/data_exporter/data_export_function/app.py
new file mode 100644
index 000000000..68ec122b5
--- /dev/null
+++ b/deploy/data_exporter/data_export_function/app.py
@@ -0,0 +1,216 @@
+import os
+import subprocess
+from datetime import datetime, timedelta, timezone
+
+import boto3
+import psycopg
+from psycopg import sql
+
+ssm = boto3.client("ssm")
+s3 = boto3.client("s3", region_name="eu-west-1")
+BUCKET_NAME = "dc-ynr-short-term-backups"
+PREFIX = "ynr-export"
+FILENAME_FORMAT = "{PREFIX}-{CURRENT_TIME}.dump"
+
+
+def get_parameter(name):
+    response = ssm.get_parameter(Name=name)
+    return response["Parameter"]["Value"]
+
+
+SOURCE_DATABASE = "ynr"
+TMP_DATABASE_NAME = "ynr-for-dev-export"
+DB_HOST = get_parameter("/ynr/production/POSTGRES_HOST")
+DB_USER = get_parameter("/ynr/production/POSTGRES_USERNAME")
+DB_PASSWORD = get_parameter("/ynr/production/POSTGRES_PASSWORD")
+DB_PORT = "5432"
+os.environ["PGPASSWORD"] = DB_PASSWORD
+
+
+def get_db_conn(db_name):
+    conn = psycopg.connect(
+        dbname=db_name,
+        user=DB_USER,
+        password=DB_PASSWORD,
+        host=DB_HOST,
+        port=DB_PORT,
+    )
+    conn.autocommit = True
+    return conn
+
+
+def create_database_and_restore():
+    conn = get_db_conn(SOURCE_DATABASE)
+    try:
+        with conn.cursor() as cur:
+            print(f"Deleting {TMP_DATABASE_NAME}")
+            cur.execute(
+                sql.SQL("DROP DATABASE IF EXISTS {};").format(
+                    sql.Identifier(TMP_DATABASE_NAME)
+                )
+            )
+        with conn.cursor() as cur:
+            # SQL to create the new database from the source
+            print(f"Creating {TMP_DATABASE_NAME}")
+            cur.execute(
+                sql.SQL("CREATE DATABASE {} ;").format(
+                    sql.Identifier(TMP_DATABASE_NAME),
+                )
+            )
+            print(
+                f"Database '{TMP_DATABASE_NAME}' created successfully from '{SOURCE_DATABASE}'."
+            )
+    except psycopg.Error as e:
+        print(f"Error creating database: {e}")
+        raise
+
+    finally:
+        conn.close()
+
+    # Dump and restore the source DB to the temp one
+    dump_command = [
+        "pg_dump",
+        "-h",
+        DB_HOST,
+        "-U",
+        DB_USER,
+        "-d",
+        SOURCE_DATABASE,
+        "-Fc",
+    ]
+
+    restore_command = [
+        "pg_restore",
+        "-h",
+        DB_HOST,
+        "-U",
+        DB_USER,
+        "-d",
+        TMP_DATABASE_NAME,
+    ]
+
+    print("Populating new database (pg_dump | pg_restore)")
+    with subprocess.Popen(
+        dump_command,
+        stdout=subprocess.PIPE,
+    ) as dump_proc:
+        subprocess.run(
+            restore_command,
+            stdin=dump_proc.stdout,
+            check=True,
+        )
+        dump_proc.stdout.close()
+
+
+def clean_database():
+    conn = get_db_conn(db_name=TMP_DATABASE_NAME)
+    with conn.cursor() as cur:
+        print("Cleaning Users table")
+        cur.execute(
+            """UPDATE auth_user SET
+            email = CONCAT('anon_', id, '@example.com'),
+            password = md5(random()::text);
+            """
+        )
+        print("Cleaning Account email table")
+        cur.execute(
+            """UPDATE account_emailaddress SET
+            email = CONCAT('anon_', id, '@example.com');
+            """
+        )
+        print("Cleaning IP addresses from LoggedActions")
+        cur.execute(
+            """UPDATE candidates_loggedaction SET
+            ip_address = '127.0.0.1';
+            """
+        )
+        print("Cleaning API tokens")
+        cur.execute(
+            """UPDATE authtoken_token SET
+            key = md5(random()::text);
+            """
+        )
+        print("Cleaning sessions")
+        cur.execute("""TRUNCATE TABLE django_session;""")
+
+
+def get_filename():
+    return FILENAME_FORMAT.format(
+        PREFIX=PREFIX, CURRENT_TIME=datetime.now().isoformat().replace(":", "-")
+    )
+
+
+def dump_and_export():
+    dump_file = "/tmp/db_dump.sql"  # Temporary file for the dump
+
+    print("Run pg_dump to create the database dump")
+    try:
+        subprocess.run(
+            [
+                "pg_dump",
+                "-h",
+                DB_HOST,
+                "-U",
+                DB_USER,
+                "-d",
+                TMP_DATABASE_NAME,
+                "-Fc",
+                "-f",
+                dump_file,
+            ],
+            check=True,
+        )
+
+        file_name = get_filename()
+
+        print("Upload the dump to S3")
+        s3.upload_file(dump_file, BUCKET_NAME, file_name)
+
+        print("Generate a presigned URL for downloading the dump")
+        presigned_url = s3.generate_presigned_url(
+            "get_object",
+            Params={"Bucket": BUCKET_NAME, "Key": file_name},
+            ExpiresIn=3600,  # URL expires in 1 hour
+        )
+        print("Finished")
+        return presigned_url
+
+    except subprocess.CalledProcessError as e:
+        return f"Error generating database dump: {str(e)}"
+
+
+def check_for_recent_exports():
+    """
+    If we've exported a file in the last hour, don't export another one
+
+    """
+    one_hour_ago = datetime.now(timezone.utc) - timedelta(hours=1)
+    response = s3.list_objects_v2(Bucket=BUCKET_NAME, Prefix=PREFIX)
+    if "Contents" in response:
+        recent_files = [
+            obj
+            for obj in response["Contents"]
+            if obj["LastModified"] >= one_hour_ago
+        ]
+
+        recent_files.sort(key=lambda obj: obj["LastModified"], reverse=True)
+
+        if recent_files:
+            return s3.generate_presigned_url(
+                "get_object",
+                Params={"Bucket": BUCKET_NAME, "Key": recent_files[0]["Key"]},
+                ExpiresIn=3600,  # URL expires in 1 hour
+            )
+    return None
+
+
+def lambda_handler(event, context):
+    if recent_export := check_for_recent_exports():
+        return recent_export
+
+    print("Creating temp database")
+    create_database_and_restore()
+    print("Cleaning temp database")
+    clean_database()
+    print("Dumping and exporting")
+    return dump_and_export()
diff --git a/deploy/data_exporter/data_export_function/requirements.txt b/deploy/data_exporter/data_export_function/requirements.txt
new file mode 100644
index 000000000..934ff63fb
--- /dev/null
+++ b/deploy/data_exporter/data_export_function/requirements.txt
@@ -0,0 +1,2 @@
+boto3==1.35.56
+psycopg[binary]==3.2.3
diff --git a/deploy/data_exporter/samconfig.toml b/deploy/data_exporter/samconfig.toml
new file mode 100644
index 000000000..7eefffa43
--- /dev/null
+++ b/deploy/data_exporter/samconfig.toml
@@ -0,0 +1,33 @@
+# More information about the configuration file can be found here:
+# https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-cli-config.html
+version = 0.1
+
+[default.global.parameters]
+stack_name = "ynr-data-exporter"
+
+[default.build.parameters]
+cached = true
+parallel = true
+
+[default.validate.parameters]
+lint = true
+
+[default.deploy.parameters]
+capabilities = "CAPABILITY_IAM"
+confirm_changeset = true
+resolve_s3 = true
+s3_prefix = "ynr-data-exporter"
+region = "eu-west-2"
+image_repositories = ["DataExportFunction=929325949831.dkr.ecr.eu-west-2.amazonaws.com/ynrdataexporter736bb2dc/dataexportfunctionb95e9e19repo"]
+
+[default.package.parameters]
+resolve_s3 = true
+
+[default.sync.parameters]
+watch = true
+
+[default.local_start_api.parameters]
+warm_containers = "EAGER"
+
+[default.local_start_lambda.parameters]
+warm_containers = "EAGER"
diff --git a/deploy/data_exporter/template.yaml b/deploy/data_exporter/template.yaml
new file mode 100644
index 000000000..a88c8a578
--- /dev/null
+++ b/deploy/data_exporter/template.yaml
@@ -0,0 +1,49 @@
+AWSTemplateFormatVersion: '2010-09-09'
+Transform: AWS::Serverless-2016-10-31
+Description: >
+  data_exporter
+
+  Exports data from the prod database, cleans it and puts the resulting dump in an S3 bucket
+
+Globals:
+  Function:
+    Timeout: 600 # 10 minutes
+    MemorySize: 1024
+
+    LoggingConfig:
+      LogFormat: JSON
+Resources:
+  DataExportFunction:
+    Type: AWS::Serverless::Function
+    Properties:
+      FunctionName: ynr-data-exporter
+      PackageType: Image
+      ImageUri: data_export_function
+      # Needs to be at least as big as the DB export, currently at around 350mb
+      EphemeralStorage:
+        Size: 1024
+      # Don't allow more than one export job to run at a time
+      ReservedConcurrentExecutions: 1
+      Policies:
+        - Statement:
+            - Sid: S3Access
+              Effect: Allow
+              Action:
+                - s3:*
+              Resource:
+                - 'arn:aws:s3:::dc-ynr-short-term-backups'
+                - 'arn:aws:s3:::dc-ynr-short-term-backups/*'
+            - Sid: SSM
+              Effect: Allow
+              Action:
+                - ssm:*
+              Resource:
+                - 'arn:aws:ssm:*:*:parameter/ynr/*'
+
+Outputs:
+  DataExportFunction:
+    Description: Data export Lambda function ARN
+    Value: !GetAtt DataExportFunction.Arn
+  DataExportFunctionIamRole:
+    Description: Implicit IAM Role created for the data export function
+    Value: !GetAtt DataExportFunctionRole.Arn
diff --git a/scripts/check-database-url.sh b/scripts/check-database-url.sh
new file mode 100755
index 000000000..7dacba9bb
--- /dev/null
+++ b/scripts/check-database-url.sh
@@ -0,0 +1,95 @@
+# This script does three things:
+#
+# 1. Gets a DATABASE_URL from the environment or the first argument and
+#    normalizes it so that Postgres's CLI tools can connect to it
+# 2. Validates that it's possible to connect to the URL provided
+# 3. Sets the validated URL as the `_SCRIPT_DATABASE_URL` environment variable for
+#    use in other scripts. This only happens if the script detects it's not
+#    being invoked directly.
+#
+# This script can be used on its own for validating connections (useful for
+# debugging different environments and catching problems early) or as a
+# utility script in other scripts that need to connect to a database.
+
+REQUIRED_POSTGRES_VERSION="16"
+
+# Check for required tools
+REQUIRED_TOOLS="createdb psql"
+for tool in $REQUIRED_TOOLS; do
+    if ! command -v "$tool" >/dev/null 2>&1; then
+        echo "Error: $tool is required but not installed." >&2
+        exit 1
+    fi
+done
+
+
+# Get the database URL
+# TODO: we might want this to be its own script
+# 1. Check if DATABASE_URL is provided as the first argument
+if [ -n "${1:-}" ]; then
+    echo "Getting DATABASE_URL from the provided argument"
+    DATABASE_URL="$1"
+# 2. Check DATABASE_URL is set in the environment
+elif [ -n "${DATABASE_URL:-}" ]; then
+    echo "Getting DATABASE_URL from the environment"
+    DATABASE_URL="$DATABASE_URL"
+fi
+
+# Normalize if DATABASE_URL starts with "postgis://"
+# We do this because `dj-database-url` uses "postgis://"
+# to alter the Django engine that's used, but the postgres
+# cli tools don't support this protocol.
+case "${DATABASE_URL:-}" in postgis://*)
+    DATABASE_URL="postgres://${DATABASE_URL#postgis://}"
+    ;;
+esac
+
+# Check if DATABASE_URL is set after all attempts
+if [ -z "${DATABASE_URL:-}" ]; then
+    echo "Error: DATABASE_URL is not provided."
+    echo "Please set the DATABASE_URL environment variable or pass it in as an argument"
+    echo "The format must comply with \033[4mhttps://www.postgresql.org/docs/$REQUIRED_POSTGRES_VERSION/libpq-connect.html#LIBPQ-CONNSTRING-URIS\033[0m"
+    exit 1
+fi
+
+# Extract the database name from the database URL.
+# 1. Use sed to remove any trailing slashes
+# 2. Use `tr` to replace slashes with newlines
+# 3. Use tail to get the last line, i.e. the last element after a slash
+# 4. Use the same method to strip off any query arguments after a `?`
+DB_NAME=$(echo "$DATABASE_URL" | sed 's:/*$::' | tr "/" "\n" | tail -n 1 | tr "?" "\n" | head -n 1)
+
+# Create the database if it doesn't exist.
+# If it already exists, we don't fail. At this point,
+# we're only making a DB to ensure that we can connect to the
+# database URL in the next step, so we can ignore failures here.
+# Because of this, we route the output of `createdb` to /dev/null.
+# Without this, the script prints an error that might confuse users.
+echo "Creating the DB if it doesn't exist."
+createdb $DB_NAME >/dev/null 2>&1 || true
+
+# Check that we can connect to the local DB before returning
+psql $DATABASE_URL -c "\q"
+if [ $? -ne 0 ]; then
+    echo "❌ Failed to connect to $DATABASE_URL"
+    exit 1
+fi
+
+
+# Check the server version
+SERVER_POSTGRES_VERSION=$(psql -t -c "SHOW server_version;" -d $DATABASE_URL | cut -d '.' -f 1)
+if [ $SERVER_POSTGRES_VERSION != $REQUIRED_POSTGRES_VERSION ]; then
+    echo "❌ Postgres version $REQUIRED_POSTGRES_VERSION required, found $SERVER_POSTGRES_VERSION"
+fi
+
+echo "✅ Successfully connected to the local database '$DB_NAME'"
+
+
+# Check if the basename of $0 (the file that was executed) is the same
+# as this file name. If not, this script is being called as a 'utility'
+# so we should set an environment variable.
+if [ "${0##*/}" != "check-database-url.sh" ]; then
+    # Script is being sourced, export a "private" DATABASE URL
+    # that we can use in other scripts
+    export _SCRIPT_DATABASE_URL=$DATABASE_URL
+fi
diff --git a/scripts/get-prod-db.sh b/scripts/get-prod-db.sh
new file mode 100755
index 000000000..562e6b574
--- /dev/null
+++ b/scripts/get-prod-db.sh
@@ -0,0 +1,71 @@
+#!/bin/sh
+set -eux
+
+# This script invokes an AWS Lambda function to retrieve a URL for downloading
+# a cleaned version of the production database and then restores
+# that data locally. By default the db name is "ynr-prod" but you can change the
+# local name by passing it as the first argument to the script.
+#
+# This script requires access to the YNR production AWS account
+#
+# Usage:
+#   ./scripts/get-prod-db.sh [LOCAL_DB_NAME]
+#
+# Arguments:
+#   LOCAL_DB_NAME: Optional. Name of the local database to restore data to.
+#                  Defaults to 'ynr-prod' if not specified.
+
+# Configurable variables
+LAMBDA_FUNCTION_NAME="ynr-data-exporter"
+LOCAL_DB_NAME="${1:-ynr-prod}"
+
+# Check for required tools
+REQUIRED_TOOLS="aws dropdb createdb pg_restore wget"
+for tool in $REQUIRED_TOOLS; do
+    if ! command -v "$tool" >/dev/null 2>&1; then
+        echo "Error: $tool is required but not installed." >&2
+        exit 1
+    fi
+done
+
+# Check the DB URL and get the validated $_SCRIPT_DATABASE_URL and $DB_NAME
+. ./scripts/check-database-url.sh
+
+
+# Create a temporary file and set up cleanup on script exit
+TEMP_FILE=$(mktemp)
+trap 'rm -f "$TEMP_FILE"' EXIT
+
+# Invoke AWS Lambda and store the result in the temp file
+# The result is a pre-signed URL to the dump file on S3
+echo "Invoking Lambda to get DB URL. This might take a few minutes..."
+aws lambda invoke \
+    --function-name "$LAMBDA_FUNCTION_NAME" \
+    --cli-read-timeout=0 \
+    --no-cli-pager \
+    --output text \
+    --query 'Payload' \
+    "$TEMP_FILE"
+
+# Extract the URL from the response
+# The response is quoted, so we just need to remove the quotation marks
+URL=$(sed 's/^"\(.*\)"$/\1/' "$TEMP_FILE")
+case "$URL" in
+    https://*)
+        echo "Got URL: $URL"
+
+        ;;
+    *)
+        echo "The received URL looks invalid. This might mean the database export failed."
+        echo "Check the logs of the '$LAMBDA_FUNCTION_NAME' Lambda function"
+        exit 1
+        ;;
+esac
+
+echo "Dropping DB $DB_NAME"
+dropdb --if-exists "$DB_NAME"
+echo "Creating DB $DB_NAME"
+createdb "$DB_NAME"
+
+echo "Downloading and restoring DB $DB_NAME"
+wget -qO- "$URL" | pg_restore -d "$_SCRIPT_DATABASE_URL" -Fc --no-owner --no-privileges
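
A rough end-to-end sketch of how the pieces above fit together (this assumes the SAM CLI, Docker, local Postgres 16 client tools, and AWS credentials for the YNR production account; the `prod` profile name and the example DATABASE_URL are illustrative only and are not defined anywhere in this change):

    # Build and deploy the exporter Lambda; samconfig.toml supplies the stack
    # name, region and ECR repository, so `sam deploy` needs no extra arguments.
    cd deploy/data_exporter
    sam build
    sam deploy

    # Back at the repo root, point the scripts at a local database and pull a
    # cleaned copy of production into it.
    cd ../..
    export DATABASE_URL=postgres://localhost/ynr-prod
    AWS_PROFILE=prod ./scripts/get-prod-db.sh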