-
Notifications
You must be signed in to change notification settings - Fork 27
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Spike data export for restoring prod db to local
- Loading branch information
Showing
8 changed files
with
334 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
.aws-sam |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# Container image for the YNR data-export Lambda function.
FROM public.ecr.aws/docker/library/ubuntu:24.04

# postgresql-client-16 provides pg_dump; python3/pip run the handler.
# apt-get is used instead of apt because apt's command-line interface is not
# stable for scripted use. The package lists are removed in the same layer
# so they do not end up in the image.
RUN apt-get update && \
    apt-get install -y postgresql-client-16 python3.12 python3-pip curl unzip && \
    rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
# --break-system-packages: the distro Python is marked as externally managed,
# so pip refuses to install into it without this flag.
# --no-cache-dir: do not keep pip's download cache in the image layer.
RUN pip3 install --break-system-packages --no-cache-dir -r requirements.txt
# awslambdaric is the Lambda Runtime Interface Client started by ENTRYPOINT.
RUN pip3 install --break-system-packages --no-cache-dir awslambdaric

COPY . .

# The runtime interface client loads the handler named in CMD.
ENTRYPOINT ["python3", "-m", "awslambdaric" ]
CMD [ "app.lambda_handler" ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,178 @@ | ||
import os | ||
import subprocess | ||
from datetime import datetime, timedelta, timezone | ||
|
||
import boto3 | ||
import psycopg | ||
from psycopg import sql | ||
|
||
# AWS clients are created once, when the module is imported.
ssm = boto3.client("ssm")
s3 = boto3.client("s3", region_name="eu-west-1")
# Bucket that receives the dump files.
bucket_name = "dc-ynr-short-term-backups"
# NOTE: evaluated once at import time (not on every lambda_handler call);
# the value is a naive, local-time ISO-8601 timestamp.
current_time = datetime.now().isoformat()
# Key prefix shared by all export objects; check_for_recent_exports lists by it.
PREFIX = "ynr-export"
# Object key for the dump; colons in the timestamp are replaced with dashes.
FILENAME = f"{PREFIX}-{current_time.replace(':', '-')}.dump"
|
||
|
||
def get_parameter(name):
    """Return the value of the SSM Parameter Store entry called ``name``.

    ``WithDecryption=True`` makes SecureString parameters (such as a database
    password) come back as plaintext instead of ciphertext. The flag is
    ignored for plain String / StringList parameters, so it is safe to pass
    unconditionally.

    Args:
        name: Full parameter name, e.g. ``/ynr/production/POSTGRES_HOST``.

    Returns:
        The parameter value as a string.
    """
    response = ssm.get_parameter(Name=name, WithDecryption=True)
    return response["Parameter"]["Value"]
|
||
|
||
# Database that is copied (used as the TEMPLATE in create_database_from_template).
SOURCE_DATABASE = "ynr"
# Scratch copy that is dropped, recreated, scrubbed and then dumped.
TMP_DATABASE_NAME = "ynr-for-dev-export"
# Connection details are fetched from SSM when the module is imported.
DB_HOST = get_parameter("/ynr/production/POSTGRES_HOST")
DB_USER = get_parameter("/ynr/production/POSTGRES_USERNAME")
DB_PASSWORD = get_parameter("/ynr/production/POSTGRES_PASSWORD")
DB_PORT = "5432"
# Exported into the process environment; presumably so the pg_dump subprocess
# (which is not given a password argument) can authenticate without a prompt.
os.environ["PGPASSWORD"] = DB_PASSWORD
|
||
|
||
def get_db_conn(db_name):
    """Open a psycopg connection to ``db_name`` with autocommit switched on.

    Host, port and credentials come from the module-level DB_* settings.
    Autocommit is required by callers that issue statements which cannot run
    inside a transaction block (DROP/CREATE DATABASE).

    Args:
        db_name: Name of the database to connect to.

    Returns:
        An open ``psycopg.Connection``; the caller is responsible for closing it.
    """
    return psycopg.connect(
        host=DB_HOST,
        port=DB_PORT,
        user=DB_USER,
        password=DB_PASSWORD,
        dbname=db_name,
        autocommit=True,
    )
|
||
|
||
def create_database_from_template():
    """Recreate the scratch database as a copy of the source database.

    Drops ``TMP_DATABASE_NAME`` if it exists and then creates it again using
    ``SOURCE_DATABASE`` as the template. The connection is made to the source
    database itself and runs in autocommit mode (see ``get_db_conn``), because
    DROP/CREATE DATABASE cannot run inside a transaction block.

    Raises:
        psycopg.Error: if either statement fails. The error is logged and then
            re-raised so that the caller does not go on to scrub and export a
            missing or stale database.
    """
    conn = get_db_conn(SOURCE_DATABASE)
    try:
        with conn.cursor() as cur:
            print(f"Deleting {TMP_DATABASE_NAME}")
            cur.execute(
                sql.SQL("DROP DATABASE IF EXISTS {};").format(
                    sql.Identifier(TMP_DATABASE_NAME)
                )
            )
            print(f"Creating {TMP_DATABASE_NAME}")
            cur.execute(
                sql.SQL("CREATE DATABASE {} TEMPLATE {};").format(
                    sql.Identifier(TMP_DATABASE_NAME),
                    sql.Identifier(SOURCE_DATABASE),
                )
            )
        print(
            f"Database '{TMP_DATABASE_NAME}' created successfully from template '{SOURCE_DATABASE}'."
        )
    except psycopg.Error as e:
        print(f"Error creating database: {e}")
        raise
    finally:
        conn.close()
|
||
|
||
def clean_database():
    """Scrub personal and secret data from the scratch database.

    Runs against ``TMP_DATABASE_NAME`` only. User e-mail addresses are
    replaced with ``anon_<id>@example.com``, passwords and API token keys are
    overwritten with random md5 strings, logged IP addresses are set to
    127.0.0.1 and the session table is emptied.

    The connection is always closed, even when a statement fails, so that no
    session is left attached to the scratch database.
    """
    conn = get_db_conn(db_name=TMP_DATABASE_NAME)
    try:
        with conn.cursor() as cur:
            print("Cleaning Users table")
            cur.execute(
                """UPDATE auth_user SET
                email = CONCAT('anon_', id, '@example.com'),
                password = md5(random()::text);
                """
            )
            # NOTE(review): this step is labelled "Account email table" but
            # updates auth_user a second time. If a separate account e-mail
            # table exists in the schema it is NOT being scrubbed here -
            # confirm the intended table name.
            print("Cleaning Account email table")
            cur.execute(
                """UPDATE auth_user SET
                email = CONCAT('anon_', id, '@example.com');
                """
            )
            print("Cleaning IP addresses from LoggedActions")
            cur.execute(
                """UPDATE candidates_loggedaction SET
                ip_address = '127.0.0.1';
                """
            )
            print("Cleaning API tokens")
            cur.execute(
                """UPDATE authtoken_token SET
                key = md5(random()::text);
                """
            )
            print("Cleaning sessions")
            cur.execute("""TRUNCATE TABLE django_session;""")
    finally:
        conn.close()
|
||
|
||
def dump_and_export():
    """Dump the scratch database, upload it to S3 and return a download URL.

    ``pg_dump`` writes a custom-format (``-Fc``) archive to a temporary file,
    which is uploaded to ``bucket_name`` and then deleted locally.

    Returns:
        A presigned GET URL for the uploaded dump, valid for one hour. If
        ``pg_dump`` exits with a non-zero status, a string starting with
        ``"Error generating database dump"`` is returned instead.
    """
    dump_file = "/tmp/db_dump.sql"  # Temporary file for the dump

    # Build the object key per call. The module-level FILENAME is computed
    # once at import time, so a long-lived process would otherwise keep
    # re-using the same (stale) timestamp and overwrite the same object.
    timestamp = datetime.now().isoformat().replace(":", "-")
    export_key = f"{PREFIX}-{timestamp}.dump"

    print("Run pg_dump to create the database dump")
    try:
        subprocess.run(
            [
                "pg_dump",
                "-h",
                DB_HOST,
                "-p",
                DB_PORT,
                "-U",
                DB_USER,
                "-d",
                TMP_DATABASE_NAME,
                "-Fc",
                "-f",
                dump_file,
            ],
            check=True,
        )

        print("Upload the dump to S3")
        s3.upload_file(dump_file, bucket_name, export_key)

        print("Generate a presigned URL for downloading the dump")
        presigned_url = s3.generate_presigned_url(
            "get_object",
            Params={"Bucket": bucket_name, "Key": export_key},
            ExpiresIn=3600,  # URL expires in 1 hour
        )
        print("Finished")
        return presigned_url

    except subprocess.CalledProcessError as e:
        return f"Error generating database dump: {str(e)}"
    finally:
        # Free the temporary storage whether or not the export succeeded.
        try:
            os.remove(dump_file)
        except FileNotFoundError:
            pass
|
||
|
||
def check_for_recent_exports():
    """Return a download URL for the newest export made within the last hour.

    Lists the objects under ``PREFIX`` in ``bucket_name`` and keeps those
    whose ``LastModified`` is no older than one hour.

    Returns:
        A presigned GET URL (valid for one hour) for the most recently
        modified of those objects, or ``None`` when there is none, in which
        case the caller should produce a fresh export.
    """
    cutoff = datetime.now(timezone.utc) - timedelta(hours=1)
    listing = s3.list_objects_v2(Bucket=bucket_name, Prefix=PREFIX)

    fresh = [
        entry
        for entry in listing.get("Contents", [])
        if entry["LastModified"] >= cutoff
    ]
    if not fresh:
        return None

    newest = max(fresh, key=lambda entry: entry["LastModified"])
    return s3.generate_presigned_url(
        "get_object",
        Params={"Bucket": bucket_name, "Key": newest["Key"]},
        ExpiresIn=3600,  # URL expires in 1 hour
    )
|
||
|
||
def lambda_handler(event, context):
    """Lambda entry point: return a URL to a scrubbed database dump.

    If an export from the last hour already exists its URL is returned
    straight away. Otherwise the scratch database is rebuilt from the source
    database, scrubbed, dumped and uploaded.

    Args:
        event: Invocation payload (unused).
        context: Lambda context object (unused).

    Returns:
        Whatever ``check_for_recent_exports`` or ``dump_and_export`` returns.
    """
    existing_url = check_for_recent_exports()
    if existing_url:
        return existing_url

    steps = (
        ("Creating temp database", create_database_from_template),
        ("Cleaning temp database", clean_database),
    )
    for message, step in steps:
        print(message)
        step()

    print("Dumping and exporting")
    return dump_and_export()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# "==" (version matching) rather than "===" (arbitrary string equality),
# which is intended only for legacy, non-standard version strings.
boto3==1.35.56
psycopg[binary]==3.2.3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# More information about the configuration file can be found here:
# https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-cli-config.html
version = 0.1

# Settings shared by every SAM CLI command in the "default" environment.
[default.global.parameters]
stack_name = "ynr-data-exporter"

[default.build.parameters]
cached = true
parallel = true

[default.validate.parameters]
lint = true

# Deployment target: region and the image repository used for the function.
[default.deploy.parameters]
capabilities = "CAPABILITY_IAM"
confirm_changeset = true
resolve_s3 = true
s3_prefix = "ynr-data-exporter"
region = "eu-west-2"
image_repositories = ["DataExportFunction=929325949831.dkr.ecr.eu-west-2.amazonaws.com/ynrdataexporter736bb2dc/dataexportfunctionb95e9e19repo"]

[default.package.parameters]
resolve_s3 = true

[default.sync.parameters]
watch = true

[default.local_start_api.parameters]
warm_containers = "EAGER"

[default.local_start_lambda.parameters]
warm_containers = "EAGER"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
AWSTemplateFormatVersion: '2010-09-09'
Transform: AWS::Serverless-2016-10-31
Description: >
  data_exporter
  Exports data from the prod database, cleans it and puts the resulting dump in an S3 bucket
Globals:
  Function:
    Timeout: 600 # 10 minutes
    MemorySize: 1024

    LoggingConfig:
      LogFormat: JSON
Resources:
  DataExportFunction:
    Type: AWS::Serverless::Function
    Properties:
      FunctionName: ynr-data-exporter
      PackageType: Image
      ImageUri: data_export_function
      # Needs to be at least as big as the DB export, currently at around 350mb
      EphemeralStorage:
        Size: 1024
      # Don't allow more than one export job to run at a time
      ReservedConcurrentExecutions: 1
      Policies:
        - Statement:
            # Least privilege: only the S3 calls the function actually makes.
            # Listing is a bucket-level action; reading/writing objects
            # (including multipart uploads) are object-level actions.
            - Sid: S3List
              Effect: Allow
              Action:
                - s3:ListBucket
              Resource:
                - 'arn:aws:s3:::dc-ynr-short-term-backups'
            - Sid: S3Access
              Effect: Allow
              Action:
                - s3:GetObject
                - s3:PutObject
                - s3:AbortMultipartUpload
              Resource:
                - 'arn:aws:s3:::dc-ynr-short-term-backups/*'
            # The function only reads parameters; it never writes them.
            - Sid: SSM
              Effect: Allow
              Action:
                - ssm:GetParameter
              Resource:
                - 'arn:aws:ssm:*:*:parameter/ynr/*'

Outputs:
  DataExportFunction:
    Description: Data export Lambda Function ARN
    Value: !GetAtt DataExportFunction.Arn
  DataExportFunctionIamRole:
    Description: Implicit IAM Role created for the data export function
    Value: !GetAtt DataExportFunctionRole.Arn
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
#!/bin/sh
# -e: stop on the first failing command, -u: unset variables are errors,
# -x: trace commands. No pipelines are used below, so the script does not
# depend on "pipefail", which not every /bin/sh provides.
set -eux

# This script invokes an AWS Lambda function to retrieve a URL for downloading
# a cleaned version of the production database and then restores
# that data locally. By default the db name is "ynr-prod" but you can change the
# local name by passing it as the first argument to the script.
#
# This script requires access to the YNR production AWS account
#
# Usage:
#   ./script.sh [LOCAL_DB_NAME]
#
# Arguments:
#   LOCAL_DB_NAME: Optional. Name of the local database to restore data to.
#                  Defaults to 'ynr-prod' if not specified.

# Configurable variables
LAMBDA_FUNCTION_NAME="ynr-data-exporter"
LOCAL_DB_NAME="${1:-ynr-prod}"

# Check for required tools
REQUIRED_TOOLS="aws dropdb createdb pg_restore wget"
for tool in $REQUIRED_TOOLS; do
    if ! command -v "$tool" >/dev/null 2>&1; then
        echo "Error: $tool is required but not installed." >&2
        exit 1
    fi
done

# Create temporary files (Lambda response and downloaded dump) and set up
# clean up on script exit
TEMP_FILE=$(mktemp)
DUMP_FILE=$(mktemp)
trap 'rm -f "$TEMP_FILE" "$DUMP_FILE"' EXIT

# Invoke AWS Lambda and store the result in the temp file
# The result is a presigned URL to the dump file on S3
echo "Invoking Lambda to get DB URL. This might take a few minutes..."
aws lambda invoke \
    --function-name "$LAMBDA_FUNCTION_NAME" \
    --cli-read-timeout=0 \
    --no-cli-pager \
    --output text \
    --query 'Payload' \
    "$TEMP_FILE"

# Extract the URL from the response
# This is because the response is quoted, so we just need to remove the quotation marks
URL=$(sed 's/^"\(.*\)"$/\1/' "$TEMP_FILE")

# The Lambda reports failures in its payload (an error string or an error
# document) rather than through the CLI exit status, so make sure we really
# got a URL before touching the local database.
case "$URL" in
    https://*) ;;
    *)
        echo "Error: Lambda did not return a download URL: ${URL}" >&2
        exit 1
        ;;
esac
echo "Got URL: ${URL}"

# Download first, so a failed download does not leave the local DB dropped.
echo "Downloading dump for ${LOCAL_DB_NAME}"
wget -qO "$DUMP_FILE" "$URL"

echo "Dropping DB ${LOCAL_DB_NAME}"
dropdb --if-exists "$LOCAL_DB_NAME"
echo "Creating DB ${LOCAL_DB_NAME}"
createdb "$LOCAL_DB_NAME"

echo "Restoring DB ${LOCAL_DB_NAME}"
pg_restore -d "$LOCAL_DB_NAME" -Fc --no-owner --no-privileges "$DUMP_FILE"