Spike data export for restoring prod db to local
symroe committed Nov 12, 2024
1 parent 219f6db commit 8a729cd
Showing 8 changed files with 334 additions and 0 deletions.
1 change: 1 addition & 0 deletions deploy/data_exporter/.gitignore
@@ -0,0 +1 @@
.aws-sam
Empty file.
14 changes: 14 additions & 0 deletions deploy/data_exporter/data_export_function/Dockerfile
@@ -0,0 +1,14 @@
FROM public.ecr.aws/docker/library/ubuntu:24.04

RUN apt update && \
    apt install -y postgresql-client-16 python3.12 python3-pip curl unzip && \
    rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip3 install --break-system-packages -r requirements.txt
RUN pip3 install --break-system-packages awslambdaric

COPY . .

ENTRYPOINT ["python3", "-m", "awslambdaric" ]
CMD [ "app.lambda_handler" ]
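The image is a stock Ubuntu base with awslambdaric as the runtime interface client, so it can be smoke-tested locally with the SAM CLI before deploying. A minimal sketch, assuming Docker is running and the active AWS credentials can read the /ynr/production/* SSM parameters (app.py fetches them at import time):

# from the SAM project directory
cd deploy/data_exporter
sam build
sam local invoke DataExportFunction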
178 changes: 178 additions & 0 deletions deploy/data_exporter/data_export_function/app.py
@@ -0,0 +1,178 @@
import os
import subprocess
from datetime import datetime, timedelta, timezone

import boto3
import psycopg
from psycopg import sql

ssm = boto3.client("ssm")
s3 = boto3.client("s3", region_name="eu-west-1")
bucket_name = "dc-ynr-short-term-backups"
current_time = datetime.now().isoformat()
PREFIX = "ynr-export"
FILENAME = f"{PREFIX}-{current_time.replace(':', '-')}.dump"


def get_parameter(name):
    response = ssm.get_parameter(Name=name)
    return response["Parameter"]["Value"]


SOURCE_DATABASE = "ynr"
TMP_DATABASE_NAME = "ynr-for-dev-export"
DB_HOST = get_parameter("/ynr/production/POSTGRES_HOST")
DB_USER = get_parameter("/ynr/production/POSTGRES_USERNAME")
DB_PASSWORD = get_parameter("/ynr/production/POSTGRES_PASSWORD")
DB_PORT = "5432"
os.environ["PGPASSWORD"] = DB_PASSWORD


def get_db_conn(db_name):
    conn = psycopg.connect(
        dbname=db_name,
        user=DB_USER,
        password=DB_PASSWORD,
        host=DB_HOST,
        port=DB_PORT,
    )
    conn.autocommit = True
    return conn


def create_database_from_template():
    # Connect to the source database to run the administrative commands
    conn = get_db_conn(SOURCE_DATABASE)
    # The connection is already in autocommit mode (see get_db_conn), which is
    # required for DROP DATABASE / CREATE DATABASE commands
    try:
        with conn.cursor() as cur:
            print(f"Deleting {TMP_DATABASE_NAME}")
            cur.execute(
                sql.SQL("DROP DATABASE IF EXISTS {};").format(
                    sql.Identifier(TMP_DATABASE_NAME)
                )
            )
        with conn.cursor() as cur:
            # SQL to create the new database from the template
            print(f"Creating {TMP_DATABASE_NAME}")
            cur.execute(
                sql.SQL("CREATE DATABASE {} TEMPLATE {};").format(
                    sql.Identifier(TMP_DATABASE_NAME),
                    sql.Identifier(SOURCE_DATABASE),
                )
            )
            print(
                f"Database '{TMP_DATABASE_NAME}' created successfully from template '{SOURCE_DATABASE}'."
            )
    except psycopg.Error as e:
        print(f"Error creating database: {e}")
    finally:
        conn.close()


def clean_database():
    conn = get_db_conn(db_name=TMP_DATABASE_NAME)
    with conn.cursor() as cur:
        print("Cleaning Users table")
        cur.execute(
            """UPDATE auth_user SET
            email = CONCAT('anon_', id, '@example.com'),
            password = md5(random()::text);
            """
        )
        print("Cleaning Account email table")
        cur.execute(
            """UPDATE auth_user SET
            email = CONCAT('anon_', id, '@example.com');
            """
        )
        print("Cleaning IP addresses from LoggedActions")
        cur.execute(
            """UPDATE candidates_loggedaction SET
            ip_address = '127.0.0.1';
            """
        )
        print("Cleaning API tokens")
        cur.execute(
            """UPDATE authtoken_token SET
            key = md5(random()::text);
            """
        )
        print("Cleaning sessions")
        cur.execute("""TRUNCATE TABLE django_session;""")


def dump_and_export():
    dump_file = "/tmp/db_dump.sql"  # Temporary file for the dump

    print("Run pg_dump to create the database dump")
    try:
        subprocess.run(
            [
                "pg_dump",
                "-h",
                DB_HOST,
                "-U",
                DB_USER,
                "-d",
                TMP_DATABASE_NAME,
                "-Fc",
                "-f",
                dump_file,
            ],
            check=True,
        )

        print("Upload the dump to S3")
        s3.upload_file(dump_file, bucket_name, FILENAME)

        print("Generate a presigned URL for downloading the dump")
        presigned_url = s3.generate_presigned_url(
            "get_object",
            Params={"Bucket": bucket_name, "Key": FILENAME},
            ExpiresIn=3600,  # URL expires in 1 hour
        )
        print("Finished")
        return presigned_url

    except subprocess.CalledProcessError as e:
        return f"Error generating database dump: {str(e)}"


def check_for_recent_exports():
    """
    If we've exported a file in the last hour, don't export another one
    """
    one_hour_ago = datetime.now(timezone.utc) - timedelta(hours=1)
    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=PREFIX)
    if "Contents" in response:
        recent_files = [
            obj
            for obj in response["Contents"]
            if obj["LastModified"] >= one_hour_ago
        ]

        recent_files.sort(key=lambda obj: obj["LastModified"], reverse=True)

        if recent_files:
            return s3.generate_presigned_url(
                "get_object",
                Params={"Bucket": bucket_name, "Key": recent_files[0]["Key"]},
                ExpiresIn=3600,  # URL expires in 1 hour
            )
    return None


def lambda_handler(event, context):
    if recent_export := check_for_recent_exports():
        return recent_export

    print("Creating temp database")
    create_database_from_template()
    print("Cleaning temp database")
    clean_database()
    print("Dumping and exporting")
    return dump_and_export()
2 changes: 2 additions & 0 deletions deploy/data_exporter/data_export_function/requirements.txt
@@ -0,0 +1,2 @@
boto3==1.35.56
psycopg[binary]==3.2.3
33 changes: 33 additions & 0 deletions deploy/data_exporter/samconfig.toml
@@ -0,0 +1,33 @@
# More information about the configuration file can be found here:
# https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-cli-config.html
version = 0.1

[default.global.parameters]
stack_name = "ynr-data-exporter"

[default.build.parameters]
cached = true
parallel = true

[default.validate.parameters]
lint = true

[default.deploy.parameters]
capabilities = "CAPABILITY_IAM"
confirm_changeset = true
resolve_s3 = true
s3_prefix = "ynr-data-exporter"
region = "eu-west-2"
image_repositories = ["DataExportFunction=929325949831.dkr.ecr.eu-west-2.amazonaws.com/ynrdataexporter736bb2dc/dataexportfunctionb95e9e19repo"]

[default.package.parameters]
resolve_s3 = true

[default.sync.parameters]
watch = true

[default.local_start_api.parameters]
warm_containers = "EAGER"

[default.local_start_lambda.parameters]
warm_containers = "EAGER"
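With this samconfig.toml in place, deployment is the usual SAM cycle; a sketch, assuming the eu-west-2 ECR repository named in image_repositories already exists and the active AWS profile can push to it:

cd deploy/data_exporter
sam validate --lint   # lint = true above
sam build             # cached, parallel builds
sam deploy            # deploys the ynr-data-exporter stack to eu-west-2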
49 changes: 49 additions & 0 deletions deploy/data_exporter/template.yaml
@@ -0,0 +1,49 @@
AWSTemplateFormatVersion: '2010-09-09'
Transform: AWS::Serverless-2016-10-31
Description: >
  data_exporter
  Exports data from the prod database, cleans it and puts the resulting dump in an S3 bucket
Globals:
  Function:
    Timeout: 600 # 10 minutes
    MemorySize: 1024

    LoggingConfig:
      LogFormat: JSON
Resources:
  DataExportFunction:
    Type: AWS::Serverless::Function
    Properties:
      FunctionName: ynr-data-exporter
      PackageType: Image
      ImageUri: data_export_function
      # Needs to be at least as big as the DB export, currently at around 350mb
      EphemeralStorage:
        Size: 1024
      # Don't allow more than one export job to run at a time
      ReservedConcurrentExecutions: 1
      Policies:
        - Statement:
            - Sid: S3Access
              Effect: Allow
              Action:
                - s3:*
              Resource:
                - 'arn:aws:s3:::dc-ynr-short-term-backups'
                - 'arn:aws:s3:::dc-ynr-short-term-backups/*'
            - Sid: SSM
              Effect: Allow
              Action:
                - ssm:*
              Resource:
                - 'arn:aws:ssm:*:*:parameter/ynr/*'

Outputs:
  DataExportFunction:
    Description: Data Export Lambda Function ARN
    Value: !GetAtt DataExportFunction.Arn
  DataExportFunctionIamRole:
    Description: Implicit IAM Role created for the Data Export function
    Value: !GetAtt DataExportFunctionRole.Arn
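After a deploy, the stack outputs can be read back with the AWS CLI to confirm the function and role ARNs (a sketch, using the stack name from samconfig.toml):

aws cloudformation describe-stacks \
    --stack-name ynr-data-exporter \
    --region eu-west-2 \
    --query 'Stacks[0].Outputs' \
    --output table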
57 changes: 57 additions & 0 deletions scripts/get-prod-db.sh
@@ -0,0 +1,57 @@
#!/bin/sh
set -eux

# This script invokes an AWS Lambda function to retrieve a URL for downloading
# a cleaned version of the production database and then restores
# that data locally. By default the db name is "ynr-prod" but you can change the
# local name by passing it as the first argument to the script.
#
# This script requires access to the YNR production AWS account
#
# Usage:
# ./script.sh [LOCAL_DB_NAME]
#
# Arguments:
# LOCAL_DB_NAME: Optional. Name of the local database to restore data to.
# Defaults to 'ynr-prod' if not specified.

# Configurable variables
LAMBDA_FUNCTION_NAME="ynr-data-exporter"
LOCAL_DB_NAME="${1:-ynr-prod}"

# Check for required tools
REQUIRED_TOOLS="aws dropdb createdb pg_restore wget"
for tool in $REQUIRED_TOOLS; do
    if ! command -v "$tool" >/dev/null 2>&1; then
        echo "Error: $tool is required but not installed." >&2
        exit 1
    fi
done

# Create a temporary file and set up clean up on script exit
TEMP_FILE=$(mktemp)
trap 'rm -f "$TEMP_FILE"' EXIT

# Invoke AWS Lambda and store the result in the temp file
# The result is a presigned URL to the dump file on S3
echo "Invoking Lambda to get DB URL. This might take a few minutes..."
aws lambda invoke \
    --function-name "$LAMBDA_FUNCTION_NAME" \
    --cli-read-timeout=0 \
    --no-cli-pager \
    --output text \
    --query 'Payload' \
    "$TEMP_FILE"

# Extract the URL from the Lambda response.
# The payload is a JSON-encoded string, so strip the surrounding quotation marks.
URL=$(sed 's/^"\(.*\)"$/\1/' "$TEMP_FILE")
echo "Got URL: $URL"

echo "Dropping DB $LOCAL_DB_NAME"
dropdb --if-exists "$LOCAL_DB_NAME"
echo "Creating DB $LOCAL_DB_NAME"
createdb "$LOCAL_DB_NAME"

echo "Downloading and restoring DB $LOCAL_DB_NAME"
wget -qO- "$URL" | pg_restore -d "$LOCAL_DB_NAME" -Fc --no-owner --no-privileges
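A typical run, restoring into a local database called ynr rather than the default ynr-prod (a sketch, assuming AWS credentials for the YNR production account are already configured):

./scripts/get-prod-db.sh ynr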
