From 8a729cd9e4ec423b74d50fdd4a2ea085a387e5fb Mon Sep 17 00:00:00 2001 From: Sym Roe Date: Tue, 12 Nov 2024 13:21:47 +0000 Subject: [PATCH 01/10] Spike data export for restoring prod db to local --- deploy/data_exporter/.gitignore | 1 + deploy/data_exporter/__init__.py | 0 .../data_export_function/Dockerfile | 14 ++ .../data_exporter/data_export_function/app.py | 178 ++++++++++++++++++ .../data_export_function/requirements.txt | 2 + deploy/data_exporter/samconfig.toml | 33 ++++ deploy/data_exporter/template.yaml | 49 +++++ scripts/get-prod-db.sh | 57 ++++++ 8 files changed, 334 insertions(+) create mode 100644 deploy/data_exporter/.gitignore create mode 100644 deploy/data_exporter/__init__.py create mode 100644 deploy/data_exporter/data_export_function/Dockerfile create mode 100644 deploy/data_exporter/data_export_function/app.py create mode 100644 deploy/data_exporter/data_export_function/requirements.txt create mode 100644 deploy/data_exporter/samconfig.toml create mode 100644 deploy/data_exporter/template.yaml create mode 100755 scripts/get-prod-db.sh diff --git a/deploy/data_exporter/.gitignore b/deploy/data_exporter/.gitignore new file mode 100644 index 000000000..0a03531c6 --- /dev/null +++ b/deploy/data_exporter/.gitignore @@ -0,0 +1 @@ +.aws-sam diff --git a/deploy/data_exporter/__init__.py b/deploy/data_exporter/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/deploy/data_exporter/data_export_function/Dockerfile b/deploy/data_exporter/data_export_function/Dockerfile new file mode 100644 index 000000000..63a5fd7ec --- /dev/null +++ b/deploy/data_exporter/data_export_function/Dockerfile @@ -0,0 +1,14 @@ +FROM public.ecr.aws/docker/library/ubuntu:24.04 + +RUN apt update && \ + apt install -y postgresql-client-16 python3.12 python3-pip curl unzip && \ + rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN pip3 install --break-system-packages -r requirements.txt +RUN pip3 install --break-system-packages awslambdaric + +COPY . . 
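+
+# awslambdaric is the AWS Lambda Runtime Interface Client: running it as the
+# entrypoint is what lets this plain Ubuntu image (chosen so we can apt
+# install postgresql-client-16) act as a Lambda container image instead of
+# an AWS Lambda base image.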
+ +ENTRYPOINT ["python3", "-m", "awslambdaric" ] +CMD [ "app.lambda_handler" ] diff --git a/deploy/data_exporter/data_export_function/app.py b/deploy/data_exporter/data_export_function/app.py new file mode 100644 index 000000000..8d6704be5 --- /dev/null +++ b/deploy/data_exporter/data_export_function/app.py @@ -0,0 +1,178 @@ +import os +import subprocess +from datetime import datetime, timedelta, timezone + +import boto3 +import psycopg +from psycopg import sql + +ssm = boto3.client("ssm") +s3 = boto3.client("s3", region_name="eu-west-1") +bucket_name = "dc-ynr-short-term-backups" +current_time = datetime.now().isoformat() +PREFIX = "ynr-export" +FILENAME = f"{PREFIX}-{current_time.replace(':', '-')}.dump" + + +def get_parameter(name): + response = ssm.get_parameter(Name=name) + return response["Parameter"]["Value"] + + +SOURCE_DATABASE = "ynr" +TMP_DATABASE_NAME = "ynr-for-dev-export" +DB_HOST = get_parameter("/ynr/production/POSTGRES_HOST") +DB_USER = get_parameter("/ynr/production/POSTGRES_USERNAME") +DB_PASSWORD = get_parameter("/ynr/production/POSTGRES_PASSWORD") +DB_PORT = "5432" +os.environ["PGPASSWORD"] = DB_PASSWORD + + +def get_db_conn(db_name): + conn = psycopg.connect( + dbname=db_name, + user=DB_USER, + password=DB_PASSWORD, + host=DB_HOST, + port=DB_PORT, + ) + conn.autocommit = True + return conn + + +def create_database_from_template(): + # Connect to the PostgreSQL server (usually to the 'postgres' database for administrative tasks) + conn = get_db_conn(SOURCE_DATABASE) + # Enable autocommit to run CREATE DATABASE commands + try: + with conn.cursor() as cur: + print(f"Deleting {TMP_DATABASE_NAME}") + cur.execute( + sql.SQL("DROP DATABASE IF EXISTS {};").format( + sql.Identifier(TMP_DATABASE_NAME) + ) + ) + with conn.cursor() as cur: + # SQL to create the new database from the template + print(f"Creating {TMP_DATABASE_NAME}") + cur.execute( + sql.SQL("CREATE DATABASE {} TEMPLATE {};").format( + sql.Identifier(TMP_DATABASE_NAME), + sql.Identifier(SOURCE_DATABASE), + ) + ) + print( + f"Database '{TMP_DATABASE_NAME}' created successfully from template '{SOURCE_DATABASE}'." 
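+            # Note: CREATE DATABASE ... TEMPLATE requires that no other
+            # sessions are connected to the source database while it is being
+            # copied, which is likely why a later commit in this series swaps
+            # this approach for a pg_dump | pg_restore pipeline.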
+ ) + except psycopg.Error as e: + print(f"Error creating database: {e}") + finally: + conn.close() + + +def clean_database(): + conn = get_db_conn(db_name=TMP_DATABASE_NAME) + with conn.cursor() as cur: + print("Cleaning Users table") + cur.execute( + """UPDATE auth_user SET + email = CONCAT('anon_', id, '@example.com'), + password = md5(random()::text); + """ + ) + print("Cleaning Account email table") + cur.execute( + """UPDATE auth_user SET + email = CONCAT('anon_', id, '@example.com'); + """ + ) + print("Cleaning IP addresses from LoggedActions") + cur.execute( + """UPDATE candidates_loggedaction SET + ip_address = '127.0.0.1'; + """ + ) + print("Cleaning API tokens") + cur.execute( + """UPDATE authtoken_token SET + key = md5(random()::text); + """ + ) + print("Cleaning sessions") + cur.execute("""TRUNCATE TABLE django_session;""") + + +def dump_and_export(): + dump_file = "/tmp/db_dump.sql" # Temporary file for the dump + + # Database credentials and parameters + + print("Run pg_dump to create the database dump") + try: + subprocess.run( + [ + "pg_dump", + "-h", + DB_HOST, + "-U", + DB_USER, + "-d", + TMP_DATABASE_NAME, + "-Fc", + "-f", + dump_file, + ], + check=True, + ) + + print("Upload the dump to S3") + s3.upload_file(dump_file, bucket_name, FILENAME) + + print("Generate a presigned URL for downloading the dump") + presigned_url = s3.generate_presigned_url( + "get_object", + Params={"Bucket": bucket_name, "Key": FILENAME}, + ExpiresIn=3600, # URL expires in 1 hour + ) + print("Finished") + return presigned_url + + except subprocess.CalledProcessError as e: + return f"Error generating database dump: {str(e)}" + + +def check_for_recent_exports(): + """ + If we've exported a file in the last hour, don't export another one + + """ + one_hour_ago = datetime.now(timezone.utc) - timedelta(hours=1) + response = s3.list_objects_v2(Bucket=bucket_name, Prefix=PREFIX) + if "Contents" in response: + recent_files = [ + obj + for obj in response["Contents"] + if obj["LastModified"] >= one_hour_ago + ] + + recent_files.sort(key=lambda obj: obj["LastModified"], reverse=True) + + if recent_files: + return s3.generate_presigned_url( + "get_object", + Params={"Bucket": bucket_name, "Key": recent_files[0]["Key"]}, + ExpiresIn=3600, # URL expires in 1 hour + ) + return None + + +def lambda_handler(event, context): + if recent_export := check_for_recent_exports(): + return recent_export + + print("Creating temp database") + create_database_from_template() + print("Cleaning temp database") + clean_database() + print("Dumping and exporting") + return dump_and_export() diff --git a/deploy/data_exporter/data_export_function/requirements.txt b/deploy/data_exporter/data_export_function/requirements.txt new file mode 100644 index 000000000..934ff63fb --- /dev/null +++ b/deploy/data_exporter/data_export_function/requirements.txt @@ -0,0 +1,2 @@ +boto3===1.35.56 +psycopg[binary]==3.2.3 diff --git a/deploy/data_exporter/samconfig.toml b/deploy/data_exporter/samconfig.toml new file mode 100644 index 000000000..7eefffa43 --- /dev/null +++ b/deploy/data_exporter/samconfig.toml @@ -0,0 +1,33 @@ +# More information about the configuration file can be found here: +# https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-cli-config.html +version = 0.1 + +[default.global.parameters] +stack_name = "ynr-data-exporter" + +[default.build.parameters] +cached = true +parallel = true + +[default.validate.parameters] +lint = true + +[default.deploy.parameters] +capabilities = 
"CAPABILITY_IAM" +confirm_changeset = true +resolve_s3 = true +s3_prefix = "ynr-data-exporter" +region = "eu-west-2" +image_repositories = ["DataExportFunction=929325949831.dkr.ecr.eu-west-2.amazonaws.com/ynrdataexporter736bb2dc/dataexportfunctionb95e9e19repo"] + +[default.package.parameters] +resolve_s3 = true + +[default.sync.parameters] +watch = true + +[default.local_start_api.parameters] +warm_containers = "EAGER" + +[default.local_start_lambda.parameters] +warm_containers = "EAGER" diff --git a/deploy/data_exporter/template.yaml b/deploy/data_exporter/template.yaml new file mode 100644 index 000000000..a88c8a578 --- /dev/null +++ b/deploy/data_exporter/template.yaml @@ -0,0 +1,49 @@ +AWSTemplateFormatVersion: '2010-09-09' +Transform: AWS::Serverless-2016-10-31 +Description: > + data_exporter + + Exports data from the prod database, cleans it and puts the resulting dump in an S3 bucket + +Globals: + Function: + Timeout: 600 # 10 minutes + MemorySize: 1024 + + LoggingConfig: + LogFormat: JSON +Resources: + DataExportFunction: + Type: AWS::Serverless::Function + Properties: + FunctionName: ynr-data-exporter + PackageType: Image + ImageUri: data_export_function + # Needs to be at least as big as the DB export, currently at around 350mb + EphemeralStorage: + Size: 1024 + # Don't allow more than one export job to run at a time + ReservedConcurrentExecutions: 1 + Policies: + - Statement: + - Sid: S3Access + Effect: Allow + Action: + - s3:* + Resource: + - 'arn:aws:s3:::dc-ynr-short-term-backups' + - 'arn:aws:s3:::dc-ynr-short-term-backups/*' + - Sid: SSM + Effect: Allow + Action: + - ssm:* + Resource: + - 'arn:aws:ssm:*:*:parameter/ynr/*' + +Outputs: + DataExportFunction: + Description: Hello World Lambda Function ARN + Value: !GetAtt DataExportFunction.Arn + DataExportFunctionIamRole: + Description: Implicit IAM Role created for Hello World function + Value: !GetAtt DataExportFunctionRole.Arn diff --git a/scripts/get-prod-db.sh b/scripts/get-prod-db.sh new file mode 100755 index 000000000..1b94a765c --- /dev/null +++ b/scripts/get-prod-db.sh @@ -0,0 +1,57 @@ +#!/bin/sh +set -euxo + +# This script invokes an AWS Lambda function to retrieve a URL for downloading +# a cleaned version of the production database and then restores +# that data locally. By default the db name is "ynr-prod" but you can change the +# local name by passing it as the first argument to the script. +# +# This script requires access to the YNR production AWS account +# +# Usage: +# ./script.sh [LOCAL_DB_NAME] +# +# Arguments: +# LOCAL_DB_NAME: Optional. Name of the local database to restore data to. +# Defaults to 'ynr-prod' if not specified. + +# Configurable variables +LAMBDA_FUNCTION_NAME="ynr-data-exporter" +LOCAL_DB_NAME="${1:-ynr-prod}" + +# Check for required tools +REQUIRED_TOOLS="aws dropdb createdb pg_restore wget" +for tool in $REQUIRED_TOOLS; do + if ! command -v "$tool" >/dev/null 2>&1; then + echo "Error: $tool is required but not installed." >&2 + exit 1 + fi +done + +# Create a temporary file and set up clean up on script exit +TEMP_FILE=$(mktemp) +trap 'rm -f "$TEMP_FILE"' EXIT + +# Invoke AWS Lambda and store the result in the temp file +# The result is a presigned URL to the dump file on S3 +echo "Invoking Lambda to get DB URL. This might take a few minutes..." 
+aws lambda invoke \
+    --function-name "$LAMBDA_FUNCTION_NAME" \
+    --cli-read-timeout=0 \
+    --no-cli-pager \
+    --output text \
+    --query 'Payload' \
+    "$TEMP_FILE"
+
+# Extract the URL from the response
+# This is because the response is quoted, so we just need to remove the quotation marks
+URL=$(sed 's/^"\(.*\)"$/\1/' "$TEMP_FILE")
+echo "Got URL: $(URL)"
+
+echo "Dropping DB $(LOCAL_DB_NAME)"
+dropdb --if-exists "$LOCAL_DB_NAME"
+echo "Creating DB $(LOCAL_DB_NAME)"
+createdb "$LOCAL_DB_NAME"
+
+echo "Downloading and restoring DB $(LOCAL_DB_NAME)"
+wget -qO- "$URL" | pg_restore -d "$LOCAL_DB_NAME" -Fc --no-owner --no-privileges

From db04b9deb2180b2bb4289e6081cd1a77ff2c5381 Mon Sep 17 00:00:00 2001
From: Sym Roe
Date: Thu, 14 Nov 2024 13:30:04 +0000
Subject: [PATCH 02/10] Use pg_dump/restore rather than a template DB

---
 .../data_exporter/data_export_function/app.py | 43 ++++++++++++++++---
 1 file changed, 38 insertions(+), 5 deletions(-)

diff --git a/deploy/data_exporter/data_export_function/app.py b/deploy/data_exporter/data_export_function/app.py
index 8d6704be5..c437e2782 100644
--- a/deploy/data_exporter/data_export_function/app.py
+++ b/deploy/data_exporter/data_export_function/app.py
@@ -40,7 +40,7 @@ def get_db_conn(db_name):
     return conn
 
 
-def create_database_from_template():
+def create_database_and_restore():
     # Connect to the PostgreSQL server (usually to the 'postgres' database for administrative tasks)
     conn = get_db_conn(SOURCE_DATABASE)
     # Enable autocommit to run CREATE DATABASE commands
@@ -56,19 +56,52 @@ def create_database_and_restore():
             # SQL to create the new database from the template
             print(f"Creating {TMP_DATABASE_NAME}")
             cur.execute(
-                sql.SQL("CREATE DATABASE {} TEMPLATE {};").format(
+                sql.SQL("CREATE DATABASE {} ;").format(
                     sql.Identifier(TMP_DATABASE_NAME),
-                    sql.Identifier(SOURCE_DATABASE),
                 )
             )
         print(
-            f"Database '{TMP_DATABASE_NAME}' created successfully from template '{SOURCE_DATABASE}'."
+            f"Database '{TMP_DATABASE_NAME}' created successfully from '{SOURCE_DATABASE}'."
)
     except psycopg.Error as e:
         print(f"Error creating database: {e}")
     finally:
         conn.close()
 
+    # Dump and restore the source DB to the temp one
+    dump_command = [
+        "pg_dump",
+        "-h",
+        DB_HOST,
+        "-U",
+        DB_USER,
+        "-d",
+        SOURCE_DATABASE,
+        "-Fc",
+    ]
+
+    restore_command = [
+        "pg_restore",
+        "-h",
+        DB_HOST,
+        "-U",
+        DB_USER,
+        "-d",
+        TMP_DATABASE_NAME,
+    ]
+
+    print("Populating new database (pg_dump | pg_restore)")
+    with subprocess.Popen(
+        dump_command,
+        stdout=subprocess.PIPE,
+    ) as dump_proc:
+        subprocess.run(
+            restore_command,
+            stdin=dump_proc.stdout,
+            check=True,
+        )
+        dump_proc.stdout.close()
+
 
 def clean_database():
     conn = get_db_conn(db_name=TMP_DATABASE_NAME)
@@ -171,7 +204,7 @@ def lambda_handler(event, context):
         return recent_export
 
     print("Creating temp database")
-    create_database_from_template()
+    create_database_and_restore()
     print("Cleaning temp database")
     clean_database()
     print("Dumping and exporting")
     return dump_and_export()

From 7405f180e74b8e47b9d1fc3ba1112be1895cc1e1 Mon Sep 17 00:00:00 2001
From: Sym Roe
Date: Thu, 14 Nov 2024 13:32:37 +0000
Subject: [PATCH 03/10] Upper case BUCKET_NAME

---
 deploy/data_exporter/data_export_function/app.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/deploy/data_exporter/data_export_function/app.py b/deploy/data_exporter/data_export_function/app.py
index c437e2782..057df0997 100644
--- a/deploy/data_exporter/data_export_function/app.py
+++ b/deploy/data_exporter/data_export_function/app.py
@@ -8,7 +8,7 @@
 ssm = boto3.client("ssm")
 s3 = boto3.client("s3", region_name="eu-west-1")
-bucket_name = "dc-ynr-short-term-backups"
+BUCKET_NAME = "dc-ynr-short-term-backups"
 current_time = datetime.now().isoformat()
 PREFIX = "ynr-export"
 FILENAME = f"{PREFIX}-{current_time.replace(':', '-')}.dump"
@@ -159,12 +159,12 @@ def dump_and_export():
         )
 
         print("Upload the dump to S3")
-        s3.upload_file(dump_file, bucket_name, FILENAME)
+        s3.upload_file(dump_file, BUCKET_NAME, FILENAME)
 
         print("Generate a presigned URL for downloading the dump")
         presigned_url = s3.generate_presigned_url(
             "get_object",
-            Params={"Bucket": bucket_name, "Key": FILENAME},
+            Params={"Bucket": BUCKET_NAME, "Key": FILENAME},
             ExpiresIn=3600,  # URL expires in 1 hour
         )
         print("Finished")
@@ -180,7 +180,7 @@ def check_for_recent_exports():
 
     """
     one_hour_ago = datetime.now(timezone.utc) - timedelta(hours=1)
-    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=PREFIX)
+    response = s3.list_objects_v2(Bucket=BUCKET_NAME, Prefix=PREFIX)
     if "Contents" in response:
         recent_files = [
             obj
@@ -193,7 +193,7 @@ def check_for_recent_exports():
         if recent_files:
             return s3.generate_presigned_url(
                 "get_object",
-                Params={"Bucket": bucket_name, "Key": recent_files[0]["Key"]},
+                Params={"Bucket": BUCKET_NAME, "Key": recent_files[0]["Key"]},
                 ExpiresIn=3600,  # URL expires in 1 hour
             )
     return None

From 4ef5a0e531bf6cb36ca8152286683115af1ff7a3 Mon Sep 17 00:00:00 2001
From: Sym Roe
Date: Thu, 14 Nov 2024 13:33:48 +0000
Subject: [PATCH 04/10] Fix comment

---
 scripts/get-prod-db.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/get-prod-db.sh b/scripts/get-prod-db.sh
index 1b94a765c..4452a4259 100755
--- a/scripts/get-prod-db.sh
+++ b/scripts/get-prod-db.sh
@@ -9,7 +9,7 @@ set -eux
 # This script requires access to the YNR production AWS account
 #
 # Usage:
-#   ./script.sh [LOCAL_DB_NAME]
+#   ./scripts/get-prod-db.sh [LOCAL_DB_NAME]
 #
 # Arguments:
 #   LOCAL_DB_NAME: Optional. Name of the local database to restore data to.
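A note on the pipe introduced in PATCH 02 above: subprocess.run(restore_command, ..., check=True) raises only when pg_restore exits non-zero, while a failure of pg_dump on the writing end of the pipe goes unchecked. A minimal sketch of one way to surface both exit codes (the helper name is illustrative and not part of this series):

    import subprocess

    def piped_dump_restore(dump_command, restore_command):
        # Stream pg_dump straight into pg_restore without a temp file
        with subprocess.Popen(dump_command, stdout=subprocess.PIPE) as dump_proc:
            # check=True raises CalledProcessError if pg_restore fails
            subprocess.run(restore_command, stdin=dump_proc.stdout, check=True)
            # Close our end of the pipe so pg_dump is not blocked on a reader
            dump_proc.stdout.close()
        # Popen's context manager has waited for pg_dump by this point,
        # so its exit status is reliable here
        if dump_proc.returncode != 0:
            raise subprocess.CalledProcessError(dump_proc.returncode, dump_command)
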
From 7996447c9c41ed5b61270da9292020616e41fe48 Mon Sep 17 00:00:00 2001
From: Sym Roe
Date: Thu, 14 Nov 2024 13:38:28 +0000
Subject: [PATCH 05/10] Use venv inside Docker

---
 deploy/data_exporter/data_export_function/Dockerfile | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/deploy/data_exporter/data_export_function/Dockerfile b/deploy/data_exporter/data_export_function/Dockerfile
index 63a5fd7ec..c8da43e18 100644
--- a/deploy/data_exporter/data_export_function/Dockerfile
+++ b/deploy/data_exporter/data_export_function/Dockerfile
@@ -1,14 +1,15 @@
 FROM public.ecr.aws/docker/library/ubuntu:24.04
 
 RUN apt update && \
-    apt install -y postgresql-client-16 python3.12 python3-pip curl unzip && \
+    apt install -y postgresql-client-16 python3.12 python3-pip curl unzip python3.12-venv && \
     rm -rf /var/lib/apt/lists/*
 
 COPY requirements.txt .
-RUN pip3 install --break-system-packages -r requirements.txt
-RUN pip3 install --break-system-packages awslambdaric
+RUN python3 -m venv .venv
+RUN ./.venv/bin/pip install -r requirements.txt
+RUN ./.venv/bin/pip install awslambdaric
 
 COPY . .
 
-ENTRYPOINT ["python3", "-m", "awslambdaric" ]
+ENTRYPOINT [".venv/bin/python", "-m", "awslambdaric" ]
 CMD [ "app.lambda_handler" ]

From c1ab7c0fb120e9d16c8aceabaa016106662dcf65 Mon Sep 17 00:00:00 2001
From: Sym Roe
Date: Thu, 14 Nov 2024 13:42:29 +0000
Subject: [PATCH 06/10] Clean up comments

---
 deploy/data_exporter/data_export_function/app.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/deploy/data_exporter/data_export_function/app.py b/deploy/data_exporter/data_export_function/app.py
index 057df0997..e2f07d004 100644
--- a/deploy/data_exporter/data_export_function/app.py
+++ b/deploy/data_exporter/data_export_function/app.py
@@ -41,9 +41,7 @@ def get_db_conn(db_name):
 
 
 def create_database_and_restore():
-    # Connect to the PostgreSQL server (usually to the 'postgres' database for administrative tasks)
     conn = get_db_conn(SOURCE_DATABASE)
-    # Enable autocommit to run CREATE DATABASE commands
     try:
         with conn.cursor() as cur:
             print(f"Deleting {TMP_DATABASE_NAME}")
@@ -53,7 +51,7 @@ def create_database_and_restore():
             )
         with conn.cursor() as cur:
-            # SQL to create the new database from the template
+            # SQL to create the new database from the source
             print(f"Creating {TMP_DATABASE_NAME}")
             cur.execute(
                 sql.SQL("CREATE DATABASE {} ;").format(
@@ -138,8 +136,6 @@ def clean_database():
 def dump_and_export():
     dump_file = "/tmp/db_dump.sql"  # Temporary file for the dump
 
-    # Database credentials and parameters
-
     print("Run pg_dump to create the database dump")
     try:
         subprocess.run(

From 71e4a2f6986d271805dc850ddc61ddcf17837486 Mon Sep 17 00:00:00 2001
From: Sym Roe
Date: Thu, 14 Nov 2024 13:42:36 +0000
Subject: [PATCH 07/10] Raise on error

---
 deploy/data_exporter/data_export_function/app.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/deploy/data_exporter/data_export_function/app.py b/deploy/data_exporter/data_export_function/app.py
index e2f07d004..29c2a6583 100644
--- a/deploy/data_exporter/data_export_function/app.py
+++ b/deploy/data_exporter/data_export_function/app.py
@@ -63,6 +63,8 @@ def create_database_and_restore():
         )
     except psycopg.Error as e:
         print(f"Error creating database: {e}")
+        raise
+
     finally:
         conn.close()

From c36c93afa078fcf6a4c30133bcc6be57d7f7bfa5 Mon Sep 17 00:00:00 2001
From: Sym Roe
Date: Thu, 14 Nov 2024 13:54:27 +0000
Subject: [PATCH 08/10] Fix table name

---
 deploy/data_exporter/data_export_function/app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deploy/data_exporter/data_export_function/app.py b/deploy/data_exporter/data_export_function/app.py
index 29c2a6583..8b1c5c214 100644
--- a/deploy/data_exporter/data_export_function/app.py
+++ b/deploy/data_exporter/data_export_function/app.py
@@ -115,7 +115,7 @@ def clean_database():
         )
         print("Cleaning Account email table")
         cur.execute(
-            """UPDATE auth_user SET
+            """UPDATE account_emailaddress SET
             email = CONCAT('anon_', id, '@example.com');
             """
         )

From 51b35706346ae55da7dfb02e385935da7b1d293c Mon Sep 17 00:00:00 2001
From: Sym Roe
Date: Fri, 15 Nov 2024 12:56:31 +0000
Subject: [PATCH 09/10] A more robust solution to getting a database connection

---
 scripts/check-database-url.sh | 95 +++++++++++++++++++++++++++++++++++
 scripts/get-prod-db.sh        | 30 ++++++++---
 2 files changed, 117 insertions(+), 8 deletions(-)
 create mode 100755 scripts/check-database-url.sh

diff --git a/scripts/check-database-url.sh b/scripts/check-database-url.sh
new file mode 100755
index 000000000..7dacba9bb
--- /dev/null
+++ b/scripts/check-database-url.sh
@@ -0,0 +1,95 @@
+# This script does three things:
+#
+# 1. Gets a DATABASE_URL from the environment or the first argument and
+#    normalizes it so it works with postgres's CLI tools
+# 2. Validates that it's possible to connect to the URL provided
+# 3. Sets the validated URL as the `_SCRIPT_DATABASE_URL` environment variable for
+#    use in other scripts. This only happens if the script detects it's not
+#    being invoked directly.
+#
+# This script can be used on its own for validating connections (useful for
+# debugging different environments and catching problems early) or as a
+# utility script in other scripts that need to connect to a database.

+REQUIRED_POSTGRES_VERSION="16"
+
+# Check for required tools
+REQUIRED_TOOLS="createdb psql"
+for tool in $REQUIRED_TOOLS; do
+    if ! command -v "$tool" >/dev/null 2>&1; then
+        echo "Error: $tool is required but not installed." >&2
+        exit 1
+    fi
+done
+
+
+# Get the database URL
+# TODO: we might want this to be its own script
+# 1. Check if DATABASE_URL is provided as the first argument
+if [ -n "${1:-}" ]; then
+    echo "Getting DATABASE_URL from the provided argument"
+    DATABASE_URL="$1"
+# 2. Check DATABASE_URL is set in the environment
+elif [ -n "${DATABASE_URL:-}" ]; then
+    echo "Getting DATABASE_URL from the environment"
+    DATABASE_URL="$DATABASE_URL"
+fi
+
+# Normalize if DATABASE_URL starts with "postgis://"
+# We do this because `dj-database-url` uses "postgis://"
+# to alter the Django engine that's used, but the postgres
+# cli tools don't support this protocol.
+case "${DATABASE_URL:-}" in postgis://*)
+    DATABASE_URL="postgres://${DATABASE_URL#postgis://}"
+    ;;
+esac
+
+# Check if DATABASE_URL is set after all attempts
+if [ -z "${DATABASE_URL:-}" ]; then
+    echo "Error: DATABASE_URL is not provided."
+    echo "Please set the environment variable DATABASE_URL or pass it in as an argument"
+    echo "The format must comply with \033[4mhttps://www.postgresql.org/docs/$REQUIRED_POSTGRES_VERSION/libpq-connect.html#LIBPQ-CONNSTRING-URIS\033[0m"
+    exit 1
+fi
+
+# Extract the database name from the database URL.
+# 1. Use sed to remove any trailing slashes
+# 2. Use `tr` to replace slashes with newlines
+# 3. Use tail to get the last line, i.e. the last element after a slash
+# 4. Use the same method to strip off any query arguments after a `?`
+DB_NAME=$(echo "$DATABASE_URL" | sed 's:/*$::' | tr "/" "\n" | tail -n 1 | tr "?" "\n" | head -n 1)
+
+# Create the database if it doesn't exist.
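+# Note that createdb is given only the database name, so it connects to the
+# local server using libpq defaults (PGHOST, PGPORT, PGUSER and so on),
+# not necessarily the host named in DATABASE_URL.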
+# If it already exists, we don't fail. At this point,
+# we're only making a DB to ensure that we can connect to the
+# database URL in the next step, so we can ignore fails here.
+# Because of this, we route the output of `createdb` to /dev/null.
+# Without this, the script prints an error that might confuse users.
+echo "Creating the DB if it doesn't exist."
+createdb "$DB_NAME" >/dev/null 2>&1 || true
+
+# Check that we can connect to the local DB before returning
+if ! psql "$DATABASE_URL" -c "\q"; then
+    echo "❌ Failed to connect to $DATABASE_URL"
+    exit 1
+fi
+
+
+# Check the server version
+SERVER_POSTGRES_VERSION=$(psql -t -c "SHOW server_version;" -d "$DATABASE_URL" | cut -d '.' -f 1)
+if [ $SERVER_POSTGRES_VERSION != $REQUIRED_POSTGRES_VERSION ]; then
+    echo "❌ Postgres version $REQUIRED_POSTGRES_VERSION required, found $SERVER_POSTGRES_VERSION"
+    exit 1
+fi
+
+echo "✅ Successfully connected to the local database '$DB_NAME'"
+
+
+# Check if the basename of $0 (the file that was executed) is the same
+# as this file name. If not, this script is being called as a 'utility'
+# so we should set an environment variable.
+if [ "${0##*/}" != "check-database-url.sh" ]; then
+    # Script is being sourced, export a "private" DATABASE URL
+    # that we can use in other scripts
+    export _SCRIPT_DATABASE_URL=$DATABASE_URL
+fi
diff --git a/scripts/get-prod-db.sh b/scripts/get-prod-db.sh
index 4452a4259..562e6b574 100755
--- a/scripts/get-prod-db.sh
+++ b/scripts/get-prod-db.sh
@@ -28,12 +28,16 @@ for tool in $REQUIRED_TOOLS; do
     fi
 done
 
+# Check the DB URL and get the cleaned $_SCRIPT_DATABASE_URL
+. ./scripts/check-database-url.sh
+
+
 # Create a temporary file and set up cleanup on script exit
 TEMP_FILE=$(mktemp)
 trap 'rm -f "$TEMP_FILE"' EXIT
 
 # Invoke AWS Lambda and store the result in the temp file
-# The result is a presigned URL to the dump file on S3
+# The result is a pre-signed URL to the dump file on S3
 echo "Invoking Lambda to get DB URL. This might take a few minutes..."
 aws lambda invoke \
     --function-name "$LAMBDA_FUNCTION_NAME" \
@@ -46,12 +50,22 @@ aws lambda invoke \
 
 # Extract the URL from the response
 # This is because the response is quoted, so we just need to remove the quotation marks
 URL=$(sed 's/^"\(.*\)"$/\1/' "$TEMP_FILE")
-echo "Got URL: $(URL)"
+case "$URL" in
+  https://*)
+    echo "Got URL: $URL"
+
+    ;;
+  *)
+    echo "The received URL looks invalid. This might mean the database export failed."
+    echo "Check the logs of the '$LAMBDA_FUNCTION_NAME' Lambda function"
+    exit 1
+    ;;
+esac
 
-echo "Dropping DB $(LOCAL_DB_NAME)"
-dropdb --if-exists "$LOCAL_DB_NAME"
-echo "Creating DB $(LOCAL_DB_NAME)"
-createdb "$LOCAL_DB_NAME"
+echo "Dropping DB $_SCRIPT_DATABASE_URL"
+dropdb --if-exists "$_SCRIPT_DATABASE_URL"
+echo "Creating DB $_SCRIPT_DATABASE_URL"
+createdb "$_SCRIPT_DATABASE_URL"
 
-echo "Downloading and restoring DB $(LOCAL_DB_NAME)"
-wget -qO- "$URL" | pg_restore -d "$LOCAL_DB_NAME" -Fc --no-owner --no-privileges
+echo "Downloading and restoring DB $_SCRIPT_DATABASE_URL"
+wget -qO- "$URL" | pg_restore -d "$_SCRIPT_DATABASE_URL" -Fc --no-owner --no-privileges

From 577723cd64dc414a4d0a52e1488480b33b91796c Mon Sep 17 00:00:00 2001
From: Sym Roe
Date: Fri, 15 Nov 2024 13:01:37 +0000
Subject: [PATCH 10/10] Build the file name in a function

---
 deploy/data_exporter/data_export_function/app.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/deploy/data_exporter/data_export_function/app.py b/deploy/data_exporter/data_export_function/app.py
index 8b1c5c214..68ec122b5 100644
--- a/deploy/data_exporter/data_export_function/app.py
+++ b/deploy/data_exporter/data_export_function/app.py
@@ -9,9 +9,8 @@
 ssm = boto3.client("ssm")
 s3 = boto3.client("s3", region_name="eu-west-1")
 BUCKET_NAME = "dc-ynr-short-term-backups"
-current_time = datetime.now().isoformat()
 PREFIX = "ynr-export"
-FILENAME = f"{PREFIX}-{current_time.replace(':', '-')}.dump"
+FILENAME_FORMAT = "{PREFIX}-{CURRENT_TIME}.dump"
 
 
 def get_parameter(name):
@@ -135,6 +134,12 @@ def clean_database():
         cur.execute("""TRUNCATE TABLE django_session;""")
 
 
+def get_filename():
+    return FILENAME_FORMAT.format(
+        PREFIX=PREFIX, CURRENT_TIME=datetime.now().isoformat().replace(":", "-")
+    )
+
+
 def dump_and_export():
     dump_file = "/tmp/db_dump.sql"  # Temporary file for the dump
 
@@ -156,13 +161,15 @@ def dump_and_export():
         )
 
+        file_name = get_filename()
+
         print("Upload the dump to S3")
-        s3.upload_file(dump_file, BUCKET_NAME, FILENAME)
+        s3.upload_file(dump_file, BUCKET_NAME, file_name)
 
         print("Generate a presigned URL for downloading the dump")
         presigned_url = s3.generate_presigned_url(
             "get_object",
-            Params={"Bucket": BUCKET_NAME, "Key": FILENAME},
+            Params={"Bucket": BUCKET_NAME, "Key": file_name},
             ExpiresIn=3600,  # URL expires in 1 hour
         )
         print("Finished")
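
As a usage note, the export can also be driven from Python rather than via scripts/get-prod-db.sh. A minimal sketch, assuming the same ynr-data-exporter function name and eu-west-2 region as samconfig.toml:

    import json

    import boto3
    from botocore.config import Config

    # Allow for the function's 10 minute timeout instead of botocore's
    # default 60 second read timeout (mirrors --cli-read-timeout=0)
    client = boto3.client(
        "lambda", region_name="eu-west-2", config=Config(read_timeout=900)
    )
    response = client.invoke(FunctionName="ynr-data-exporter")
    # The handler returns the presigned URL as a string, which Lambda
    # JSON-encodes, hence json.loads rather than a plain .decode()
    url = json.loads(response["Payload"].read())
    print(url)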