Skip to content

Commit

Permalink
[App Signals] E2E Testing: EC2 Use Case (#615)
Browse files Browse the repository at this point in the history
  • Loading branch information
majanjua-amzn authored Nov 22, 2023
1 parent 096b4ba commit f1f2dc8
Show file tree
Hide file tree
Showing 41 changed files with 2,002 additions and 44 deletions.
20 changes: 20 additions & 0 deletions .github/workflows/appsignals-e2e-ec2-canary-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
## Canary for the Application Signals end-to-end tests. It checks the artifacts for
## App Signals enablement by delegating to the reusable EC2 test workflow, which deploys
## a sample app and remote service on two EC2 instances, calls the APIs, and validates
## the generated telemetry, including logs, metrics, and traces.
name: App Signals Enablement - E2E EC2 Canary Testing
on:
  schedule:
    # Run the workflow every 15 minutes.
    - cron: '0/15 * * * *'
  # Allow the workflow to be started on demand.
  workflow_dispatch:

permissions:
  id-token: write
  contents: read

jobs:
  e2e-canary-test:
    # All secrets of this workflow are handed to the reusable workflow.
    uses: ./.github/workflows/appsignals-e2e-ec2-test.yml
    secrets: inherit
    with:
      caller-workflow-name: 'appsignals-e2e-ec2-canary-test'
166 changes: 166 additions & 0 deletions .github/workflows/appsignals-e2e-ec2-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
# Reusable workflow that runs the App Signals E2E test for the EC2 use case.
# It is only triggered through `workflow_call`, i.e. from another workflow.
# Read more about reusable workflows: https://docs.github.com/en/actions/using-workflows/reusing-workflows#overview
name: App Signals Enablement E2E Testing - EC2 Use Case
on:
  workflow_call:
    inputs:
      # Name of the calling workflow; reported as the `workflow` dimension of the
      # test-result metric published by the job.
      caller-workflow-name:
        required: true
        type: string

permissions:
  id-token: write
  contents: read

env:
  # Region used for credentials, terraform, the validator, and metric publishing.
  AWS_DEFAULT_REGION: us-east-1
  TEST_ACCOUNT: ${{ secrets.APP_SIGNALS_E2E_TEST_ACC }}
  # Artifacts handed to terraform as input variables.
  SAMPLE_APP_FRONTEND_SERVICE_JAR: 's3://aws-appsignals-sample-app/main-service.jar'
  SAMPLE_APP_REMOTE_SERVICE_JAR: 's3://aws-appsignals-sample-app/remote-service.jar'
  APP_SIGNALS_CW_AGENT_RPM: 'https://amazoncloudwatch-agent-us-east-1.s3.amazonaws.com/amazon_linux/amd64/latest/amazon-cloudwatch-agent.rpm'
  APP_SIGNALS_ADOT_JAR: 'https://github.com/aws-observability/aws-otel-java-instrumentation/releases/latest/download/aws-opentelemetry-agent.jar'
  # Values passed to the validator runs.
  METRIC_NAMESPACE: AppSignals
  LOG_GROUP_NAME: /aws/appsignals/generic

jobs:
  # Single job: deploy the sample services, validate their telemetry, publish the
  # result, and tear the deployment down again.
  e2e-ec2-test:
    runs-on: ubuntu-latest
    steps:
      # fetch-depth 0 checks out the full history instead of a shallow clone.
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

# Build a per-run identifier from the run id and run number and export it to later steps.
- name: Generate testing id
  run: echo TESTING_ID="${{ github.run_id }}-${{ github.run_number }}" >> $GITHUB_ENV

# Assume the role named by the E2E_TEST_ROLE_ARN secret in the test region.
- name: Configure AWS Credentials
  uses: aws-actions/configure-aws-credentials@v4
  with:
    role-to-assume: ${{ secrets.E2E_TEST_ROLE_ARN }}
    aws-region: ${{ env.AWS_DEFAULT_REGION }}

# Install terraform with its wrapper script disabled; a later step captures
# `terraform output` through command substitution.
- name: Set up terraform
  uses: hashicorp/setup-terraform@v2
  with:
    terraform_wrapper: false

# Initialise, validate, and apply the EC2 terraform configuration, forwarding the
# workflow-level settings and the generated testing id as input variables.
- name: Deploy sample app via terraform
  working-directory: testing/terraform/ec2
  run: |
    terraform init
    terraform validate
    terraform apply -auto-approve \
      -var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \
      -var="test_id=${{ env.TESTING_ID }}" \
      -var="sample_app_jar=${{ env.SAMPLE_APP_FRONTEND_SERVICE_JAR }}" \
      -var="sample_remote_app_jar=${{ env.SAMPLE_APP_REMOTE_SERVICE_JAR }}" \
      -var="cw_agent_rpm=${{ env.APP_SIGNALS_CW_AGENT_RPM }}" \
      -var="adot_jar=${{ env.APP_SIGNALS_ADOT_JAR }}"
# Export the deployed endpoints for the following steps.
# `-raw` prints the bare string value. Plain `terraform output <name>` renders string
# values wrapped in double quotes, and those quotes would become part of the exported
# environment values.
- name: Get the sample app endpoint
  working-directory: testing/terraform/ec2
  run: |
    echo "MAIN_SERVICE_ENDPOINT=$(terraform output -raw sample_app_main_service_public_dns):8080" >> "$GITHUB_ENV"
    echo "REMOTE_SERVICE_IP=$(terraform output -raw sample_app_remote_service_public_ip)" >> "$GITHUB_ENV"

# Poll the main service with HEAD requests until one succeeds. Probes are 10 seconds
# apart and the step fails once max_attempts retries have been used up.
- name: Wait for app endpoint to come online
  id: endpoint-check
  run: |
    attempt_counter=0
    max_attempts=30
    # curl is invoked directly so that its exit status drives the loop (no command
    # substitution). --max-time bounds each probe so a hanging connection cannot
    # stall the step.
    until curl --output /dev/null --silent --head --fail --max-time 10 "http://${{ env.MAIN_SERVICE_ENDPOINT }}"; do
      if [ "${attempt_counter}" -eq "${max_attempts}" ]; then
        echo "Max attempts reached"
        exit 1
      fi
      printf '.'
      attempt_counter=$((attempt_counter + 1))
      sleep 10
    done
# This step speeds up the validation by creating the telemetry data in advance.
# A failure here is tolerated (continue-on-error).
- name: Call all test APIs
  continue-on-error: true
  run: |
    # The URLs are quoted so the shell never treats `?` as a glob character.
    curl -S -s -o /dev/null "http://${{ env.MAIN_SERVICE_ENDPOINT }}/outgoing-http-call/"
    curl -S -s -o /dev/null "http://${{ env.MAIN_SERVICE_ENDPOINT }}/aws-sdk-call/"
    curl -S -s -o /dev/null "http://${{ env.MAIN_SERVICE_ENDPOINT }}/remote-service?ip=${{ env.REMOTE_SERVICE_IP }}/"
    curl -S -s -o /dev/null "http://${{ env.MAIN_SERVICE_ENDPOINT }}/client-call/"
# Telemetry validation, part 1: run the validator with the EC2 log-validation config.
- name: Validate generated EMF logs
  id: log-validation
  # Folded scalar: the lines below are joined with single spaces into one command.
  run: >-
    ./gradlew testing:validator:run --args='-c ec2/log-validation.yml
    --testing-id ${{ env.TESTING_ID }}
    --endpoint http://${{ env.MAIN_SERVICE_ENDPOINT }}
    --remote-service-deployment-name ${{ env.REMOTE_SERVICE_IP }}:8080
    --region ${{ env.AWS_DEFAULT_REGION }}
    --account-id ${{ env.TEST_ACCOUNT }}
    --metric-namespace ${{ env.METRIC_NAMESPACE }}
    --log-group ${{ env.LOG_GROUP_NAME }}
    --service-name sample-application-${{ env.TESTING_ID }}
    --remote-service-name sample-remote-application-${{ env.TESTING_ID }}
    --request-body ip=${{ env.REMOTE_SERVICE_IP }}
    --rollup'

# Telemetry validation, part 2: run the validator with the EC2 metric-validation config.
# Still runs when the log validation failed, unless the run was cancelled.
- name: Validate generated metrics
  id: metric-validation
  if: (success() || steps.log-validation.outcome == 'failure') && !cancelled()
  # Folded scalar: the lines below are joined with single spaces into one command.
  run: >-
    ./gradlew testing:validator:run --args='-c ec2/metric-validation.yml
    --testing-id ${{ env.TESTING_ID }}
    --endpoint http://${{ env.MAIN_SERVICE_ENDPOINT }}
    --remote-service-deployment-name ${{ env.REMOTE_SERVICE_IP }}:8080
    --region ${{ env.AWS_DEFAULT_REGION }}
    --account-id ${{ env.TEST_ACCOUNT }}
    --metric-namespace ${{ env.METRIC_NAMESPACE }}
    --log-group ${{ env.LOG_GROUP_NAME }}
    --service-name sample-application-${{ env.TESTING_ID }}
    --remote-service-name sample-remote-application-${{ env.TESTING_ID }}
    --request-body ip=${{ env.REMOTE_SERVICE_IP }}
    --rollup'

# Telemetry validation, part 3: run the validator with the EC2 trace-validation config.
# Still runs when the log or metric validation failed, unless the run was cancelled.
- name: Validate generated traces
  id: trace-validation
  if: (success() || steps.log-validation.outcome == 'failure' || steps.metric-validation.outcome == 'failure') && !cancelled()
  # Folded scalar: the lines below are joined with single spaces into one command.
  run: >-
    ./gradlew testing:validator:run --args='-c ec2/trace-validation.yml
    --testing-id ${{ env.TESTING_ID }}
    --endpoint http://${{ env.MAIN_SERVICE_ENDPOINT }}
    --remote-service-deployment-name ${{ env.REMOTE_SERVICE_IP }}:8080
    --region ${{ env.AWS_DEFAULT_REGION }}
    --account-id ${{ env.TEST_ACCOUNT }}
    --metric-namespace ${{ env.METRIC_NAMESPACE }}
    --log-group ${{ env.LOG_GROUP_NAME }}
    --service-name sample-application-${{ env.TESTING_ID }}
    --remote-service-name sample-remote-application-${{ env.TESTING_ID }}
    --request-body ip=${{ env.REMOTE_SERVICE_IP }}
    --rollup'

# Report the overall result as the `Failure` metric in CloudWatch: 0.0 when all three
# validation steps succeeded, 1.0 in every other case. Runs regardless of how the
# earlier steps ended.
- name: Publish metric on test result
  if: always()
  run: |
    # Assume failure, and clear it only when every validation outcome is "success".
    failure_value=1.0
    if [[ "${{ steps.log-validation.outcome }}" == "success" && "${{ steps.metric-validation.outcome }}" == "success" && "${{ steps.trace-validation.outcome }}" == "success" ]]; then
      failure_value=0.0
    fi
    aws cloudwatch put-metric-data --namespace 'ADOT/GitHubActions' \
      --metric-name Failure \
      --dimensions repository=${{ github.repository }},branch=${{ github.ref_name }},workflow=${{ inputs.caller-workflow-name }} \
      --value "${failure_value}" \
      --region ${{ env.AWS_DEFAULT_REGION }}
# Clean-up procedures

# Always attempt the teardown; a teardown failure does not fail the job
# (continue-on-error).
# NOTE(review): only test_id is passed here, while the apply step passes six variables.
# Presumably the remaining variables have defaults; confirm in the module's variable
# definitions.
- name: Terraform destroy
  if: always()
  continue-on-error: true
  working-directory: testing/terraform/ec2
  run: terraform destroy -auto-approve -var="test_id=${{ env.TESTING_ID }}"
6 changes: 4 additions & 2 deletions .github/workflows/appsignals-e2e-eks-canary-test.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
## This workflow aims to run the end-to-end tests in canary fashion to
## test the prod artifacts for App Signals enablement.
## This workflow aims to run the Application Signals end-to-end tests as a canary to
## test the artifacts for App Signals enablement. It will deploy a sample app and remote
## service onto an EKS cluster, call the APIs, and validate the generated telemetry,
## including logs, metrics, and traces.
name: App Signals Enablement - E2E EKS Canary Testing
on:
schedule:
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/appsignals-e2e-eks-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ jobs:
- name: Call endpoint and validate generated EMF logs
id: log-validation
if: steps.endpoint-check.outcome == 'success' && !cancelled()
run: ./gradlew testing:validator:run --args='-c log-validation.yml
run: ./gradlew testing:validator:run --args='-c eks/log-validation.yml
--testing-id ${{ env.TESTING_ID }}
--endpoint http://${{ env.APP_ENDPOINT }}
--region ${{ env.AWS_DEFAULT_REGION }}
Expand All @@ -176,7 +176,7 @@ jobs:
- name: Call endpoints and validate generated metrics
id: metric-validation
if: (success() || steps.log-validation.outcome == 'failure') && !cancelled()
run: ./gradlew testing:validator:run --args='-c metric-validation.yml
run: ./gradlew testing:validator:run --args='-c eks/metric-validation.yml
--testing-id ${{ env.TESTING_ID }}
--endpoint http://${{ env.APP_ENDPOINT }}
--region ${{ env.AWS_DEFAULT_REGION }}
Expand All @@ -194,7 +194,7 @@ jobs:
- name: Call endpoints and validate generated traces
id: trace-validation
if: (success() || steps.log-validation.outcome == 'failure' || steps.metric-validation.outcome == 'failure') && !cancelled()
run: ./gradlew testing:validator:run --args='-c trace-validation.yml
run: ./gradlew testing:validator:run --args='-c eks/trace-validation.yml
--testing-id ${{ env.TESTING_ID }}
--endpoint http://${{ env.APP_ENDPOINT }}
--region ${{ env.AWS_DEFAULT_REGION }}
Expand Down
20 changes: 20 additions & 0 deletions testing/terraform/ec2/amazon-cloudwatch-agent.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"agent": {
"debug": true,
"region": "$REGION"
},
"traces": {
"traces_collected": {
"app_signals": {
"enabled": true
}
}
},
"logs": {
"metrics_collected": {
"app_signals": {
"enabled": true
}
}
}
}
143 changes: 143 additions & 0 deletions testing/terraform/ec2/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
# Provider requirements for this module. Every provider whose resources are used in
# this file is declared explicitly rather than resolved implicitly.
terraform {
  required_providers {
    aws = {
      source = "hashicorp/aws"
    }
    # Needed for tls_private_key.
    tls = {
      source = "hashicorp/tls"
    }
    # Needed for null_resource.
    null = {
      source = "hashicorp/null"
    }
  }
}

# Define the provider for AWS; no explicit arguments are set here.
provider "aws" {}

# The default VPC; its default security group is attached to both instances.
resource "aws_default_vpc" "default" {}

# 4096-bit RSA key generated by this configuration. Its private half is used by the
# provisioner SSH connections.
resource "tls_private_key" "ssh_key" {
  algorithm = "RSA"
  rsa_bits  = 4096
}

# Registers the public half of the generated key under a name unique to the test id.
resource "aws_key_pair" "aws_ssh_key" {
  key_name   = "instance_key-${var.test_id}"
  public_key = tls_private_key.ssh_key.public_key_openssh
}

# Shorthand for the key material shared by both instances and their setup resources.
locals {
  ssh_key_name        = aws_key_pair.aws_ssh_key.key_name
  private_key_content = tls_private_key.ssh_key.private_key_pem
}

# EC2 instance that hosts the main sample service. It gets a public IP, uses the
# generated SSH key, and sits in the default VPC's default security group.
resource "aws_instance" "main_service_instance" {
  ami                                  = "ami-0b021814637c6d457" # Amazon Linux 2 (free tier)
  instance_type                        = "t2.micro"
  key_name                             = local.ssh_key_name
  iam_instance_profile                 = "APP_SIGNALS_EC2_TEST_ROLE"
  vpc_security_group_ids               = [aws_default_vpc.default.default_security_group_id]
  associate_public_ip_address          = true
  instance_initiated_shutdown_behavior = "terminate"

  tags = {
    Name = "main-service-${var.test_id}"
  }
}

# Sets up the main service over SSH: installs Java, installs and runs the CloudWatch
# agent, downloads the ADOT agent jar, and launches the sample application in the
# background.
resource "null_resource" "main_service_setup" {
  connection {
    type        = "ssh"
    user        = var.user
    private_key = local.private_key_content
    host        = aws_instance.main_service_instance.public_ip
  }

  provisioner "remote-exec" {
    inline = [
      # Install Java 11
      "yes | sudo amazon-linux-extras install java-openjdk11",

      # Write the CW Agent configuration: the local JSON file with all whitespace removed
      # and the $REGION placeholder replaced by the target region
      "agent_config='${replace(replace(file("./amazon-cloudwatch-agent.json"), "/\\s+/", ""), "$REGION", var.aws_region)}'",
      "echo $agent_config > amazon-cloudwatch-agent.json",

      # Get and run CW agent rpm
      "wget -O cw-agent.rpm ${var.cw_agent_rpm}",
      "sudo rpm -U ./cw-agent.rpm",
      "sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c file:./amazon-cloudwatch-agent.json",

      # Get ADOT
      "wget -O adot.jar ${var.adot_jar}",

      # Get the sample application
      "aws s3 cp ${var.sample_app_jar} ./main-service.jar",

      # Launch it with the ADOT agent attached; the trailing backslashes join the
      # following entries into one shell command
      "JAVA_TOOL_OPTIONS=' -javaagent:/home/ec2-user/adot.jar' \\",
      "OTEL_METRICS_EXPORTER=none \\",
      "OTEL_SMP_ENABLED=true \\",
      "OTEL_AWS_SMP_EXPORTER_ENDPOINT=http://localhost:4315 \\",
      "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://localhost:4315 \\",
      "OTEL_RESOURCE_ATTRIBUTES=aws.hostedin.environment=EC2,service.name=sample-application-${var.test_id} \\",
      "nohup java -jar main-service.jar &> nohup.out &",

      # The application needs time to come up and reach a steady state, this should not take longer than 30 seconds
      "sleep 30"
    ]
  }

  depends_on = [aws_instance.main_service_instance]
}

# EC2 instance that hosts the remote sample service. It is configured like the main
# service instance and differs only in its Name tag.
resource "aws_instance" "remote_service_instance" {
  ami                                  = "ami-0b021814637c6d457" # Amazon Linux 2 (free tier)
  instance_type                        = "t2.micro"
  key_name                             = local.ssh_key_name
  iam_instance_profile                 = "APP_SIGNALS_EC2_TEST_ROLE"
  vpc_security_group_ids               = [aws_default_vpc.default.default_security_group_id]
  associate_public_ip_address          = true
  instance_initiated_shutdown_behavior = "terminate"

  tags = {
    Name = "remote-service-${var.test_id}"
  }
}

# Sets up the remote service over SSH: installs Java, installs and runs the CloudWatch
# agent, downloads the ADOT agent jar, and launches the remote sample application in
# the background.
resource "null_resource" "remote_service_setup" {
  connection {
    type        = "ssh"
    user        = var.user
    private_key = local.private_key_content
    host        = aws_instance.remote_service_instance.public_ip
  }

  provisioner "remote-exec" {
    inline = [
      # Install Java 11
      "yes | sudo amazon-linux-extras install java-openjdk11",

      # Write the CW Agent configuration: the local JSON file with all whitespace removed
      # and the $REGION placeholder replaced by the target region
      "agent_config='${replace(replace(file("./amazon-cloudwatch-agent.json"), "/\\s+/", ""), "$REGION", var.aws_region)}'",
      "echo $agent_config > amazon-cloudwatch-agent.json",

      # Get and run CW agent rpm
      "wget -O cw-agent.rpm ${var.cw_agent_rpm}",
      "sudo rpm -U ./cw-agent.rpm",
      "sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -s -c file:./amazon-cloudwatch-agent.json",

      # Get ADOT
      "wget -O adot.jar ${var.adot_jar}",

      # Get the remote sample application
      "aws s3 cp ${var.sample_remote_app_jar} ./remote-service.jar",

      # Launch it with the ADOT agent attached; the trailing backslashes join the
      # following entries into one shell command
      "JAVA_TOOL_OPTIONS=' -javaagent:/home/ec2-user/adot.jar' \\",
      "OTEL_METRICS_EXPORTER=none \\",
      "OTEL_SMP_ENABLED=true \\",
      "OTEL_AWS_SMP_EXPORTER_ENDPOINT=http://localhost:4315 \\",
      "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://localhost:4315 \\",
      "OTEL_RESOURCE_ATTRIBUTES=aws.hostedin.environment=EC2,service.name=sample-remote-application-${var.test_id} \\",
      "nohup java -jar remote-service.jar &> nohup.out &",

      # The application needs time to come up and reach a steady state, this should not take longer than 30 seconds
      "sleep 30"
    ]
  }

  depends_on = [aws_instance.remote_service_instance]
}
Loading

0 comments on commit f1f2dc8

Please sign in to comment.