diff --git a/deploy/data_exporter/.gitignore b/deploy/data_exporter/.gitignore
new file mode 100644
index 000000000..0a03531c6
--- /dev/null
+++ b/deploy/data_exporter/.gitignore
@@ -0,0 +1 @@
+.aws-sam
diff --git a/deploy/data_exporter/__init__.py b/deploy/data_exporter/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/deploy/data_exporter/data_export_function/Dockerfile b/deploy/data_exporter/data_export_function/Dockerfile
new file mode 100644
index 000000000..63a5fd7ec
--- /dev/null
+++ b/deploy/data_exporter/data_export_function/Dockerfile
@@ -0,0 +1,14 @@
+FROM public.ecr.aws/docker/library/ubuntu:24.04
+
+RUN apt update && \
+    apt install -y postgresql-client-16 python3.12 python3-pip curl unzip && \
+    rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt .
+RUN pip3 install --break-system-packages -r requirements.txt
+RUN pip3 install --break-system-packages awslambdaric
+
+COPY . .
+
+ENTRYPOINT ["python3", "-m", "awslambdaric"]
+CMD ["app.lambda_handler"]
diff --git a/deploy/data_exporter/data_export_function/app.py b/deploy/data_exporter/data_export_function/app.py
new file mode 100644
index 000000000..ec8f1c3cd
--- /dev/null
+++ b/deploy/data_exporter/data_export_function/app.py
@@ -0,0 +1,178 @@
+import os
+import subprocess
+from datetime import datetime, timedelta, timezone
+
+import boto3
+import psycopg
+from psycopg import sql
+
+ssm = boto3.client("ssm")
+s3 = boto3.client("s3", region_name="eu-west-1")
+bucket_name = "dc-ynr-short-term-backups"
+current_time = datetime.now().isoformat()
+PREFIX = "ynr-export"
+FILENAME = f"{PREFIX}-{current_time.replace(':', '-')}.dump"
+
+
+def get_parameter(name):
+    response = ssm.get_parameter(Name=name)
+    return response["Parameter"]["Value"]
+
+
+SOURCE_DATABASE = "ynr_export_test"
+TMP_DATABASE_NAME = "ynr-for-dev-export"
+DB_HOST = get_parameter("/ynr/production/POSTGRES_HOST")
+DB_USER = get_parameter("/ynr/production/POSTGRES_USERNAME")
+DB_PASSWORD = get_parameter("/ynr/production/POSTGRES_PASSWORD")
+DB_PORT = "5432"
+os.environ["PGPASSWORD"] = DB_PASSWORD
+
+
+def get_db_conn(db_name):
+    conn = psycopg.connect(
+        dbname=db_name,
+        user=DB_USER,
+        password=DB_PASSWORD,
+        host=DB_HOST,
+        port=DB_PORT,
+    )
+    conn.autocommit = True
+    return conn
+
+
+def create_database_from_template():
+    # Connect to the source database. The connection has autocommit enabled,
+    # which is required because CREATE DATABASE and DROP DATABASE cannot run
+    # inside a transaction block.
+    conn = get_db_conn(SOURCE_DATABASE)
+    try:
+        with conn.cursor() as cur:
+            print(f"Deleting {TMP_DATABASE_NAME}")
+            cur.execute(
+                sql.SQL("DROP DATABASE IF EXISTS {};").format(
+                    sql.Identifier(TMP_DATABASE_NAME)
+                )
+            )
+        with conn.cursor() as cur:
+            # Create the new database from the template
+            print(f"Creating {TMP_DATABASE_NAME}")
+            cur.execute(
+                sql.SQL("CREATE DATABASE {} TEMPLATE {};").format(
+                    sql.Identifier(TMP_DATABASE_NAME),
+                    sql.Identifier(SOURCE_DATABASE),
+                )
+            )
+            print(
+                f"Database '{TMP_DATABASE_NAME}' created successfully from template '{SOURCE_DATABASE}'."
+            )
+    except psycopg.Error as e:
+        print(f"Error creating database: {e}")
+    finally:
+        conn.close()
+
+
+def clean_database():
+    conn = get_db_conn(db_name=TMP_DATABASE_NAME)
+    with conn.cursor() as cur:
+        print("Cleaning Users table")
+        cur.execute(
+            """UPDATE auth_user SET
+               email = CONCAT('anon_', id, '@example.com'),
+               password = md5(random()::text);
+            """
+        )
+        print("Cleaning Account email table")
+        cur.execute(
+            """UPDATE auth_user SET
+               email = CONCAT('anon_', id, '@example.com');
+            """
+        )
+        print("Cleaning IP addresses from LoggedActions")
+        cur.execute(
+            """UPDATE candidates_loggedaction SET
+               ip_address = '127.0.0.1';
+            """
+        )
+        print("Cleaning API tokens")
+        cur.execute(
+            """UPDATE authtoken_token SET
+               key = md5(random()::text);
+            """
+        )
+        print("Cleaning sessions")
+        cur.execute("""TRUNCATE TABLE django_session;""")
+
+
+def dump_and_export():
+    dump_file = "/tmp/db_dump.sql"  # Temporary file for the dump
+
+    print("Run pg_dump to create the database dump")
+    try:
+        subprocess.run(
+            [
+                "pg_dump",
+                "-h",
+                DB_HOST,
+                "-U",
+                DB_USER,
+                "-d",
+                TMP_DATABASE_NAME,
+                "-Fc",
+                "-f",
+                dump_file,
+            ],
+            check=True,
+        )
+
+        print("Upload the dump to S3")
+        s3.upload_file(dump_file, bucket_name, FILENAME)
+
+        print("Generate a presigned URL for downloading the dump")
+        presigned_url = s3.generate_presigned_url(
+            "get_object",
+            Params={"Bucket": bucket_name, "Key": FILENAME},
+            ExpiresIn=3600,  # URL expires in 1 hour
+        )
+        print("Finished")
+        return presigned_url
+
+    except subprocess.CalledProcessError as e:
+        return f"Error generating database dump: {str(e)}"
+
+
+def check_for_recent_exports():
+    """
+    If we've exported a dump in the last hour, return a presigned URL for the
+    most recent one instead of exporting another.
+    """
+    one_hour_ago = datetime.now(timezone.utc) - timedelta(hours=1)
+    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=PREFIX)
+    if "Contents" in response:
+        recent_files = [
+            obj
+            for obj in response["Contents"]
+            if obj["LastModified"] >= one_hour_ago
+        ]
+
+        recent_files.sort(key=lambda obj: obj["LastModified"], reverse=True)
+
+        if recent_files:
+            return s3.generate_presigned_url(
+                "get_object",
+                Params={"Bucket": bucket_name, "Key": recent_files[0]["Key"]},
+                ExpiresIn=3600,  # URL expires in 1 hour
+            )
+    return None
+
+
+def lambda_handler(event, context):
+    if recent_export := check_for_recent_exports():
+        return recent_export
+
+    print("Creating temp database")
+    create_database_from_template()
+    print("Cleaning temp database")
+    clean_database()
+    print("Dumping and exporting")
+    return dump_and_export()
diff --git a/deploy/data_exporter/data_export_function/requirements.txt b/deploy/data_exporter/data_export_function/requirements.txt
new file mode 100644
index 000000000..934ff63fb
--- /dev/null
+++ b/deploy/data_exporter/data_export_function/requirements.txt
@@ -0,0 +1,2 @@
+boto3==1.35.56
+psycopg[binary]==3.2.3
diff --git a/deploy/data_exporter/samconfig.toml b/deploy/data_exporter/samconfig.toml
new file mode 100644
index 000000000..7eefffa43
--- /dev/null
+++ b/deploy/data_exporter/samconfig.toml
@@ -0,0 +1,33 @@
+# More information about the configuration file can be found here:
+# https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-cli-config.html
+version = 0.1
+
+[default.global.parameters]
+stack_name = "ynr-data-exporter"
+
+[default.build.parameters]
+cached = true
+parallel = true
+
+[default.validate.parameters]
+lint = true
+
+[default.deploy.parameters]
"CAPABILITY_IAM" +confirm_changeset = true +resolve_s3 = true +s3_prefix = "ynr-data-exporter" +region = "eu-west-2" +image_repositories = ["DataExportFunction=929325949831.dkr.ecr.eu-west-2.amazonaws.com/ynrdataexporter736bb2dc/dataexportfunctionb95e9e19repo"] + +[default.package.parameters] +resolve_s3 = true + +[default.sync.parameters] +watch = true + +[default.local_start_api.parameters] +warm_containers = "EAGER" + +[default.local_start_lambda.parameters] +warm_containers = "EAGER" diff --git a/deploy/data_exporter/template.yaml b/deploy/data_exporter/template.yaml new file mode 100644 index 000000000..a88c8a578 --- /dev/null +++ b/deploy/data_exporter/template.yaml @@ -0,0 +1,49 @@ +AWSTemplateFormatVersion: '2010-09-09' +Transform: AWS::Serverless-2016-10-31 +Description: > + data_exporter + + Exports data from the prod database, cleans it and puts the resulting dump in an S3 bucket + +Globals: + Function: + Timeout: 600 # 10 minutes + MemorySize: 1024 + + LoggingConfig: + LogFormat: JSON +Resources: + DataExportFunction: + Type: AWS::Serverless::Function + Properties: + FunctionName: ynr-data-exporter + PackageType: Image + ImageUri: data_export_function + # Needs to be at least as big as the DB export, currently at around 350mb + EphemeralStorage: + Size: 1024 + # Don't allow more than one export job to run at a time + ReservedConcurrentExecutions: 1 + Policies: + - Statement: + - Sid: S3Access + Effect: Allow + Action: + - s3:* + Resource: + - 'arn:aws:s3:::dc-ynr-short-term-backups' + - 'arn:aws:s3:::dc-ynr-short-term-backups/*' + - Sid: SSM + Effect: Allow + Action: + - ssm:* + Resource: + - 'arn:aws:ssm:*:*:parameter/ynr/*' + +Outputs: + DataExportFunction: + Description: Hello World Lambda Function ARN + Value: !GetAtt DataExportFunction.Arn + DataExportFunctionIamRole: + Description: Implicit IAM Role created for Hello World function + Value: !GetAtt DataExportFunctionRole.Arn diff --git a/scripts/get-prod-db.sh b/scripts/get-prod-db.sh new file mode 100755 index 000000000..1b94a765c --- /dev/null +++ b/scripts/get-prod-db.sh @@ -0,0 +1,57 @@ +#!/bin/sh +set -euxo + +# This script invokes an AWS Lambda function to retrieve a URL for downloading +# a cleaned version of the production database and then restores +# that data locally. By default the db name is "ynr-prod" but you can change the +# local name by passing it as the first argument to the script. +# +# This script requires access to the YNR production AWS account +# +# Usage: +# ./script.sh [LOCAL_DB_NAME] +# +# Arguments: +# LOCAL_DB_NAME: Optional. Name of the local database to restore data to. +# Defaults to 'ynr-prod' if not specified. + +# Configurable variables +LAMBDA_FUNCTION_NAME="ynr-data-exporter" +LOCAL_DB_NAME="${1:-ynr-prod}" + +# Check for required tools +REQUIRED_TOOLS="aws dropdb createdb pg_restore wget" +for tool in $REQUIRED_TOOLS; do + if ! command -v "$tool" >/dev/null 2>&1; then + echo "Error: $tool is required but not installed." >&2 + exit 1 + fi +done + +# Create a temporary file and set up clean up on script exit +TEMP_FILE=$(mktemp) +trap 'rm -f "$TEMP_FILE"' EXIT + +# Invoke AWS Lambda and store the result in the temp file +# The result is a presigned URL to the dump file on S3 +echo "Invoking Lambda to get DB URL. This might take a few minutes..." 
+aws lambda invoke \
+    --function-name "$LAMBDA_FUNCTION_NAME" \
+    --cli-read-timeout=0 \
+    --no-cli-pager \
+    --output text \
+    --query 'Payload' \
+    "$TEMP_FILE"
+
+# Extract the URL from the response. The returned payload is a JSON-encoded
+# (quoted) string, so we just need to strip the surrounding quotation marks.
+URL=$(sed 's/^"\(.*\)"$/\1/' "$TEMP_FILE")
+echo "Got URL: ${URL}"
+
+echo "Dropping DB ${LOCAL_DB_NAME}"
+dropdb --if-exists "$LOCAL_DB_NAME"
+echo "Creating DB ${LOCAL_DB_NAME}"
+createdb "$LOCAL_DB_NAME"
+
+echo "Downloading and restoring DB ${LOCAL_DB_NAME}"
+wget -qO- "$URL" | pg_restore -d "$LOCAL_DB_NAME" -Fc --no-owner --no-privileges