From 28dc960baa398fbcb6dd12d7bb2005636821a87b Mon Sep 17 00:00:00 2001 From: Sym Roe Date: Tue, 12 Nov 2024 13:21:47 +0000 Subject: [PATCH] Spike data export for restoring prod db to local --- deploy/data_exporter/.gitignore | 1 + deploy/data_exporter/README.md | 130 +++++++++++++ deploy/data_exporter/__init__.py | 0 .../data_export_function/Dockerfile | 14 ++ .../data_exporter/data_export_function/app.py | 178 ++++++++++++++++++ .../data_export_function/requirements.txt | 2 + deploy/data_exporter/samconfig.toml | 33 ++++ deploy/data_exporter/template.yaml | 49 +++++ scripts/get-prod-db.sh | 57 ++++++ 9 files changed, 464 insertions(+) create mode 100644 deploy/data_exporter/.gitignore create mode 100644 deploy/data_exporter/README.md create mode 100644 deploy/data_exporter/__init__.py create mode 100644 deploy/data_exporter/data_export_function/Dockerfile create mode 100644 deploy/data_exporter/data_export_function/app.py create mode 100644 deploy/data_exporter/data_export_function/requirements.txt create mode 100644 deploy/data_exporter/samconfig.toml create mode 100644 deploy/data_exporter/template.yaml create mode 100755 scripts/get-prod-db.sh diff --git a/deploy/data_exporter/.gitignore b/deploy/data_exporter/.gitignore new file mode 100644 index 000000000..0a03531c6 --- /dev/null +++ b/deploy/data_exporter/.gitignore @@ -0,0 +1 @@ +.aws-sam diff --git a/deploy/data_exporter/README.md b/deploy/data_exporter/README.md new file mode 100644 index 000000000..c0026321c --- /dev/null +++ b/deploy/data_exporter/README.md @@ -0,0 +1,130 @@ +# data_exporter + +This project contains source code and supporting files for a serverless application that you can deploy with the SAM CLI. It includes the following files and folders. + +- hello_world - Code for the application's Lambda function. +- events - Invocation events that you can use to invoke the function. +- tests - Unit tests for the application code. 
+- template.yaml - A template that defines the application's AWS resources. + +The application uses several AWS resources, including Lambda functions and an API Gateway API. These resources are defined in the `template.yaml` file in this project. You can update the template to add AWS resources through the same deployment process that updates your application code. + +If you prefer to use an integrated development environment (IDE) to build and test your application, you can use the AWS Toolkit. +The AWS Toolkit is an open source plug-in for popular IDEs that uses the SAM CLI to build and deploy serverless applications on AWS. The AWS Toolkit also adds a simplified step-through debugging experience for Lambda function code. See the following links to get started. + +* [CLion](https://docs.aws.amazon.com/toolkit-for-jetbrains/latest/userguide/welcome.html) +* [GoLand](https://docs.aws.amazon.com/toolkit-for-jetbrains/latest/userguide/welcome.html) +* [IntelliJ](https://docs.aws.amazon.com/toolkit-for-jetbrains/latest/userguide/welcome.html) +* [WebStorm](https://docs.aws.amazon.com/toolkit-for-jetbrains/latest/userguide/welcome.html) +* [Rider](https://docs.aws.amazon.com/toolkit-for-jetbrains/latest/userguide/welcome.html) +* [PhpStorm](https://docs.aws.amazon.com/toolkit-for-jetbrains/latest/userguide/welcome.html) +* [PyCharm](https://docs.aws.amazon.com/toolkit-for-jetbrains/latest/userguide/welcome.html) +* [RubyMine](https://docs.aws.amazon.com/toolkit-for-jetbrains/latest/userguide/welcome.html) +* [DataGrip](https://docs.aws.amazon.com/toolkit-for-jetbrains/latest/userguide/welcome.html) +* [VS Code](https://docs.aws.amazon.com/toolkit-for-vscode/latest/userguide/welcome.html) +* [Visual Studio](https://docs.aws.amazon.com/toolkit-for-visual-studio/latest/user-guide/welcome.html) + +## Deploy the sample application + +The Serverless Application Model Command Line Interface (SAM CLI) is an extension of the AWS CLI that adds functionality for building and 
testing Lambda applications. It uses Docker to run your functions in an Amazon Linux environment that matches Lambda. It can also emulate your application's build environment and API. + +To use the SAM CLI, you need the following tools. + +* SAM CLI - [Install the SAM CLI](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-cli-install.html) +* [Python 3 installed](https://www.python.org/downloads/) +* Docker - [Install Docker community edition](https://hub.docker.com/search/?type=edition&offering=community) + +To build and deploy your application for the first time, run the following in your shell: + +```bash +sam build --use-container +sam deploy --guided +``` + +The first command will build the source of your application. The second command will package and deploy your application to AWS, with a series of prompts: + +* **Stack Name**: The name of the stack to deploy to CloudFormation. This should be unique to your account and region, and a good starting point would be something matching your project name. +* **AWS Region**: The AWS region you want to deploy your app to. +* **Confirm changes before deploy**: If set to yes, any change sets will be shown to you before execution for manual review. If set to no, the AWS SAM CLI will automatically deploy application changes. +* **Allow SAM CLI IAM role creation**: Many AWS SAM templates, including this example, create AWS IAM roles required for the AWS Lambda function(s) included to access AWS services. By default, these are scoped down to minimum required permissions. To deploy an AWS CloudFormation stack which creates or modifies IAM roles, the `CAPABILITY_IAM` value for `capabilities` must be provided. If permission isn't provided through this prompt, to deploy this example you must explicitly pass `--capabilities CAPABILITY_IAM` to the `sam deploy` command. 
+* **Save arguments to samconfig.toml**: If set to yes, your choices will be saved to a configuration file inside the project, so that in the future you can just re-run `sam deploy` without parameters to deploy changes to your application. + +You can find your API Gateway Endpoint URL in the output values displayed after deployment. + +## Use the SAM CLI to build and test locally + +Build your application with the `sam build --use-container` command. + +```bash +data_exporter$ sam build --use-container +``` + +The SAM CLI installs dependencies defined in `hello_world/requirements.txt`, creates a deployment package, and saves it in the `.aws-sam/build` folder. + +Test a single function by invoking it directly with a test event. An event is a JSON document that represents the input that the function receives from the event source. Test events are included in the `events` folder in this project. + +Run functions locally and invoke them with the `sam local invoke` command. + +```bash +data_exporter$ sam local invoke HelloWorldFunction --event events/event.json +``` + +The SAM CLI can also emulate your application's API. Use the `sam local start-api` to run the API locally on port 3000. + +```bash +data_exporter$ sam local start-api +data_exporter$ curl http://localhost:3000/ +``` + +The SAM CLI reads the application template to determine the API's routes and the functions that they invoke. The `Events` property on each function's definition includes the route and method for each path. + +```yaml + Events: + HelloWorld: + Type: Api + Properties: + Path: /hello + Method: get +``` + +## Add a resource to your application +The application template uses AWS Serverless Application Model (AWS SAM) to define application resources. AWS SAM is an extension of AWS CloudFormation with a simpler syntax for configuring common serverless application resources such as functions, triggers, and APIs. 
For resources not included in [the SAM specification](https://github.com/awslabs/serverless-application-model/blob/master/versions/2016-10-31.md), you can use standard [AWS CloudFormation](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-template-resource-type-ref.html) resource types. + +## Fetch, tail, and filter Lambda function logs + +To simplify troubleshooting, SAM CLI has a command called `sam logs`. `sam logs` lets you fetch logs generated by your deployed Lambda function from the command line. In addition to printing the logs on the terminal, this command has several nifty features to help you quickly find the bug. + +`NOTE`: This command works for all AWS Lambda functions; not just the ones you deploy using SAM. + +```bash +data_exporter$ sam logs -n HelloWorldFunction --stack-name "data_exporter" --tail +``` + +You can find more information and examples about filtering Lambda function logs in the [SAM CLI Documentation](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-cli-logging.html). + +## Tests + +Tests are defined in the `tests` folder in this project. Use PIP to install the test dependencies and run tests. + +```bash +data_exporter$ pip install -r tests/requirements.txt --user +# unit test +data_exporter$ python -m pytest tests/unit -v +# integration test, requiring deploying the stack first. +# Create the env variable AWS_SAM_STACK_NAME with the name of the stack we are testing +data_exporter$ AWS_SAM_STACK_NAME="data_exporter" python -m pytest tests/integration -v +``` + +## Cleanup + +To delete the sample application that you created, use the AWS CLI. 
Assuming you used your project name for the stack name, you can run the following: + +```bash +sam delete --stack-name "data_exporter" +``` + +## Resources + +See the [AWS SAM developer guide](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/what-is-sam.html) for an introduction to SAM specification, the SAM CLI, and serverless application concepts. + +Next, you can use AWS Serverless Application Repository to deploy ready to use Apps that go beyond hello world samples and learn how authors developed their applications: [AWS Serverless Application Repository main page](https://aws.amazon.com/serverless/serverlessrepo/) diff --git a/deploy/data_exporter/__init__.py b/deploy/data_exporter/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/deploy/data_exporter/data_export_function/Dockerfile b/deploy/data_exporter/data_export_function/Dockerfile new file mode 100644 index 000000000..63a5fd7ec --- /dev/null +++ b/deploy/data_exporter/data_export_function/Dockerfile @@ -0,0 +1,14 @@ +FROM public.ecr.aws/docker/library/ubuntu:24.04 + +RUN apt update && \ + apt install -y postgresql-client-16 python3.12 python3-pip curl unzip && \ + rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN pip3 install --break-system-packages -r requirements.txt +RUN pip3 install --break-system-packages awslambdaric + +COPY . . 
+ +ENTRYPOINT ["python3", "-m", "awslambdaric" ] +CMD [ "app.lambda_handler" ] diff --git a/deploy/data_exporter/data_export_function/app.py b/deploy/data_exporter/data_export_function/app.py new file mode 100644 index 000000000..ec8f1c3cd --- /dev/null +++ b/deploy/data_exporter/data_export_function/app.py @@ -0,0 +1,178 @@ +import os +import subprocess +from datetime import datetime, timedelta, timezone + +import boto3 +import psycopg +from psycopg import sql + +ssm = boto3.client("ssm") +s3 = boto3.client("s3", region_name="eu-west-1") +bucket_name = "dc-ynr-short-term-backups" +current_time = datetime.now().isoformat() +PREFIX = "ynr-export" +FILENAME = f"{PREFIX}-{current_time.replace(':', '-')}.dump" + + +def get_parameter(name): + response = ssm.get_parameter(Name=name) + return response["Parameter"]["Value"] + + +SOURCE_DATABASE = "ynr_export_test" +TMP_DATABASE_NAME = "ynr-for-dev-export" +DB_HOST = get_parameter("/ynr/production/POSTGRES_HOST") +DB_USER = get_parameter("/ynr/production/POSTGRES_USERNAME") +DB_PASSWORD = get_parameter("/ynr/production/POSTGRES_PASSWORD") +DB_PORT = "5432" +os.environ["PGPASSWORD"] = DB_PASSWORD + + +def get_db_conn(db_name): + conn = psycopg.connect( + dbname=db_name, + user=DB_USER, + password=DB_PASSWORD, + host=DB_HOST, + port=DB_PORT, + ) + conn.autocommit = True + return conn + + +def create_database_from_template(): + # Connect to the PostgreSQL server (usually to the 'postgres' database for administrative tasks) + conn = get_db_conn(SOURCE_DATABASE) + # Enable autocommit to run CREATE DATABASE commands + try: + with conn.cursor() as cur: + print(f"Deleting {TMP_DATABASE_NAME}") + cur.execute( + sql.SQL("DROP DATABASE IF EXISTS {};").format( + sql.Identifier(TMP_DATABASE_NAME) + ) + ) + with conn.cursor() as cur: + # SQL to create the new database from the template + print(f"Creating {TMP_DATABASE_NAME}") + cur.execute( + sql.SQL("CREATE DATABASE {} TEMPLATE {};").format( + sql.Identifier(TMP_DATABASE_NAME), + 
sql.Identifier(SOURCE_DATABASE), + ) + ) + print( + f"Database '{TMP_DATABASE_NAME}' created successfully from template '{SOURCE_DATABASE}'." + ) + except psycopg.Error as e: + print(f"Error creating database: {e}") + finally: + conn.close() + + +def clean_database(): + conn = get_db_conn(db_name=TMP_DATABASE_NAME) + with conn.cursor() as cur: + print("Cleaning Users table") + cur.execute( + """UPDATE auth_user SET + email = CONCAT('anon_', id, '@example.com'), + password = md5(random()::text); + """ + ) + print("Cleaning Account email table") + cur.execute( + """UPDATE auth_user SET + email = CONCAT('anon_', id, '@example.com'); + """ + ) + print("Cleaning IP addresses from LoggedActions") + cur.execute( + """UPDATE candidates_loggedaction SET + ip_address = '127.0.0.1'; + """ + ) + print("Cleaning API tokens") + cur.execute( + """UPDATE authtoken_token SET + key = md5(random()::text); + """ + ) + print("Cleaning sessions") + cur.execute("""TRUNCATE TABLE django_session;""") + + +def dump_and_export(): + dump_file = "/tmp/db_dump.sql" # Temporary file for the dump + + # Database credentials and parameters + + print("Run pg_dump to create the database dump") + try: + subprocess.run( + [ + "pg_dump", + "-h", + DB_HOST, + "-U", + DB_USER, + "-d", + TMP_DATABASE_NAME, + "-Fc", + "-f", + dump_file, + ], + check=True, + ) + + print("Upload the dump to S3") + s3.upload_file(dump_file, bucket_name, FILENAME) + + print("Generate a presigned URL for downloading the dump") + presigned_url = s3.generate_presigned_url( + "get_object", + Params={"Bucket": bucket_name, "Key": FILENAME}, + ExpiresIn=3600, # URL expires in 1 hour + ) + print("Finished") + return presigned_url + + except subprocess.CalledProcessError as e: + return f"Error generating database dump: {str(e)}" + + +def check_for_recent_exports(): + """ + If we've exported a file in the last hour, don't export another one + + """ + one_hour_ago = datetime.now(timezone.utc) - timedelta(hours=1) + response = 
s3.list_objects_v2(Bucket=bucket_name, Prefix=PREFIX) + if "Contents" in response: + recent_files = [ + obj + for obj in response["Contents"] + if obj["LastModified"] >= one_hour_ago + ] + + recent_files.sort(key=lambda obj: obj["LastModified"], reverse=True) + + if recent_files: + return s3.generate_presigned_url( + "get_object", + Params={"Bucket": bucket_name, "Key": recent_files[0]["Key"]}, + ExpiresIn=3600, # URL expires in 1 hour + ) + return None + + +def lambda_handler(event, context): + if recent_export := check_for_recent_exports(): + return recent_export + + print("Creating temp database") + create_database_from_template() + print("Cleaning temp database") + clean_database() + print("Dumping and exporting") + return dump_and_export() diff --git a/deploy/data_exporter/data_export_function/requirements.txt b/deploy/data_exporter/data_export_function/requirements.txt new file mode 100644 index 000000000..934ff63fb --- /dev/null +++ b/deploy/data_exporter/data_export_function/requirements.txt @@ -0,0 +1,2 @@ +boto3===1.35.56 +psycopg[binary]==3.2.3 diff --git a/deploy/data_exporter/samconfig.toml b/deploy/data_exporter/samconfig.toml new file mode 100644 index 000000000..7eefffa43 --- /dev/null +++ b/deploy/data_exporter/samconfig.toml @@ -0,0 +1,33 @@ +# More information about the configuration file can be found here: +# https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-cli-config.html +version = 0.1 + +[default.global.parameters] +stack_name = "ynr-data-exporter" + +[default.build.parameters] +cached = true +parallel = true + +[default.validate.parameters] +lint = true + +[default.deploy.parameters] +capabilities = "CAPABILITY_IAM" +confirm_changeset = true +resolve_s3 = true +s3_prefix = "ynr-data-exporter" +region = "eu-west-2" +image_repositories = ["DataExportFunction=929325949831.dkr.ecr.eu-west-2.amazonaws.com/ynrdataexporter736bb2dc/dataexportfunctionb95e9e19repo"] + +[default.package.parameters] 
+resolve_s3 = true + +[default.sync.parameters] +watch = true + +[default.local_start_api.parameters] +warm_containers = "EAGER" + +[default.local_start_lambda.parameters] +warm_containers = "EAGER" diff --git a/deploy/data_exporter/template.yaml b/deploy/data_exporter/template.yaml new file mode 100644 index 000000000..a88c8a578 --- /dev/null +++ b/deploy/data_exporter/template.yaml @@ -0,0 +1,49 @@ +AWSTemplateFormatVersion: '2010-09-09' +Transform: AWS::Serverless-2016-10-31 +Description: > + data_exporter + + Exports data from the prod database, cleans it and puts the resulting dump in an S3 bucket + +Globals: + Function: + Timeout: 600 # 10 minutes + MemorySize: 1024 + + LoggingConfig: + LogFormat: JSON +Resources: + DataExportFunction: + Type: AWS::Serverless::Function + Properties: + FunctionName: ynr-data-exporter + PackageType: Image + ImageUri: data_export_function + # Needs to be at least as big as the DB export, currently at around 350mb + EphemeralStorage: + Size: 1024 + # Don't allow more than one export job to run at a time + ReservedConcurrentExecutions: 1 + Policies: + - Statement: + - Sid: S3Access + Effect: Allow + Action: + - s3:* + Resource: + - 'arn:aws:s3:::dc-ynr-short-term-backups' + - 'arn:aws:s3:::dc-ynr-short-term-backups/*' + - Sid: SSM + Effect: Allow + Action: + - ssm:* + Resource: + - 'arn:aws:ssm:*:*:parameter/ynr/*' + +Outputs: + DataExportFunction: + Description: Data Export Lambda Function ARN + Value: !GetAtt DataExportFunction.Arn + DataExportFunctionIamRole: + Description: Implicit IAM Role created for the Data Export function + Value: !GetAtt DataExportFunctionRole.Arn diff --git a/scripts/get-prod-db.sh b/scripts/get-prod-db.sh new file mode 100755 index 000000000..1b94a765c --- /dev/null +++ b/scripts/get-prod-db.sh @@ -0,0 +1,57 @@ +#!/bin/sh +set -eux + +# This script invokes an AWS Lambda function to retrieve a URL for downloading +# a cleaned version of the production database and then restores +# that data locally. 
By default the db name is "ynr-prod" but you can change the +# local name by passing it as the first argument to the script. +# +# This script requires access to the YNR production AWS account +# +# Usage: +# ./script.sh [LOCAL_DB_NAME] +# +# Arguments: +# LOCAL_DB_NAME: Optional. Name of the local database to restore data to. +# Defaults to 'ynr-prod' if not specified. + +# Configurable variables +LAMBDA_FUNCTION_NAME="ynr-data-exporter" +LOCAL_DB_NAME="${1:-ynr-prod}" + +# Check for required tools +REQUIRED_TOOLS="aws dropdb createdb pg_restore wget" +for tool in $REQUIRED_TOOLS; do + if ! command -v "$tool" >/dev/null 2>&1; then + echo "Error: $tool is required but not installed." >&2 + exit 1 + fi +done + +# Create a temporary file and set up clean up on script exit +TEMP_FILE=$(mktemp) +trap 'rm -f "$TEMP_FILE"' EXIT + +# Invoke AWS Lambda and store the result in the temp file +# The result is a presigned URL to the dump file on S3 +echo "Invoking Lambda to get DB URL. This might take a few minutes..." +aws lambda invoke \ --function-name "$LAMBDA_FUNCTION_NAME" \ --cli-read-timeout=0 \ --no-cli-pager \ --output text \ --query 'Payload' \ "$TEMP_FILE" + +# Extract the URL from the response +# This is because the response is quoted, so we just need to remove the quotation marks +URL=$(sed 's/^"\(.*\)"$/\1/' "$TEMP_FILE") +echo "Got URL: ${URL}" + +echo "Dropping DB ${LOCAL_DB_NAME}" +dropdb --if-exists "$LOCAL_DB_NAME" +echo "Creating DB ${LOCAL_DB_NAME}" +createdb "$LOCAL_DB_NAME" + +echo "Downloading and restoring DB ${LOCAL_DB_NAME}" +wget -qO- "$URL" | pg_restore -d "$LOCAL_DB_NAME" -Fc --no-owner --no-privileges