From 6cbd1b60fdd21bc9e8c9dad82688267a7b0a79e2 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 13 Nov 2024 15:20:14 -0700 Subject: [PATCH] [IT-3975] Deploy opentelemetry collector (#14) **Problem:** 1. An Opentelemetry collector needs to be deployed to ECS to support forwarding telemetry data for long-term storage and analysis. 2. A configuration file for the collector needs to be sourced from AWS secrets manager and injected into the Otel collector to configure the service. 3. Environment variables need to be updated in the schematic container to support configuring it to send telemetry data to the Otel collector. 4. A container level health check is needed as this service is not fronted by a load balancer. **Solution:** 1. Deploying the otel collector contributor container to ECS. I had attempted to use the AWS otel collector, however, it does not support the Oauth2 extension that we will use to attach an Auth header on outgoing requests: https://github.com/aws-observability/aws-otel-collector/issues/1492 2. Storing the otel config file in AWS Secret manager and injecting it into the Otel collector by overriding the docker CMD command on the container. 3. Setting environment variables on the schematic container to configure it to send telemetry data to the otel collector. 4. Pointing to our sage specific docker image that contains a binary compiled from golang that does container level health checks. **Testing:** 1. I verified that I was able to deploy both schematic and the otel collector to AWS ECS. 2. I verified that by setting the environment variables in schematic that it was able to produce and forward its data to the otel collector. 3. I verified that the otel collector was able to perform the oauth2 client credential exchange with Auth0 to obtain an access token. 4. 
I verified that both logs AND traces were forwarded to the kubernetes cluster/SigNoz and ingested into Clickhouse for long-term storage. 5. I verified that the telemetry data showed up in the SigNoz UI as expected. --- README.md | 145 +++++++++++++++++++++++++++++++++++++++++-- app.py | 58 ++++++++++++++--- src/service_props.py | 42 ++++++++++++- src/service_stack.py | 35 ++++++----- 4 files changed, 244 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 14ec428..e729fed 100644 --- a/README.md +++ b/README.md @@ -130,15 +130,30 @@ Once created take the ARN of the certificate and set that ARN in environment_var # Secrets Secrets can be manually created in the -[AWS Secrets Manager](https://docs.aws.amazon.com/secretsmanager/latest/userguide/create_secret.html) - -To pass secrets to a container set the secrets manager `secret name` -when creating a ServiceProp objects: +[AWS Secrets Manager](https://docs.aws.amazon.com/secretsmanager/latest/userguide/create_secret.html). +When naming your secret make sure that the secret does not end in a pattern that matches +`-??????`, this will cause issues with how AWS CDK looks up secrets. +To pass secrets to a container set the secrets manager `container_secrets` +when creating a `ServiceProp` object. 
You'll be creating a list of `ServiceSecret` objects: ```python +from src.service_props import ServiceProps, ServiceSecret + app_service_props = ServiceProps( - "app", 443, 1024, f"ghcr.io/sage-bionetworks/app:v1.0", container_env_vars={}, - container_secret_name="app/dev/DATABASE" + container_name="app", + container_port=443, + container_memory=1024, + container_location="ghcr.io/sage-bionetworks/app:v1.0", + container_secrets=[ + ServiceSecret( + secret_name="app/dev/DATABASE", + environment_key="NAME_OF_ENVIRONMENT_VARIABLE_SET_FOR_CONTAINER", + ), + ServiceSecret( + secret_name="app/dev/PASSWORD", + environment_key="SINGLE_VALUE_SECRET", + ) + ] ) ``` @@ -150,6 +165,26 @@ For example, the KVs for `app/dev/DATABASE` could be: } ``` +And the value for `app/dev/PASSWORD` could be: `password` + +In the application (Python) code the secrets may be loaded into a dict using code like: + +```python +import json +import os + +all_secrets_dict = json.loads(os.environ["NAME_OF_ENVIRONMENT_VARIABLE_SET_FOR_CONTAINER"]) +``` + +In the case of a single value you may load the value like: + +```python +import os + +my_secret = os.environ.get("SINGLE_VALUE_SECRET", None) +``` + + > [!NOTE] > Retrieving secrets requires access to the AWS Secrets Manager @@ -247,3 +282,101 @@ The workflow for continuous integration: * CI deploys changes to the staging environment (stage.app.io) in the AWS prod account. * Changes are promoted (or merged) to the git prod branch. * CI deploys changes to the prod environment (prod.app.io) in the AWS prod account. + +# Creation/Forwarding of OpenTelemetry data +Schematic has been instrumented with a mix of +[automatically instrumented libraries](https://github.com/open-telemetry/opentelemetry-python-contrib/tree/main/instrumentation) +and [manual traces](https://opentelemetry-python.readthedocs.io/en/latest/api/trace.html). 
+In addition, it has been configured at startup to [conditionally turn on trace/log exports](https://github.com/Sage-Bionetworks/schematic/blob/778bf54db9c5b4de0af334c4efe034b3dde0b348/schematic/__init__.py#L82-L139) +depending on how a few environment variables are set. The combination of these items lets +the schematic container running in ECS export telemetry data out of the container to be +ingested somewhere else for long-term storage. + + +Schematic is configured to send its telemetry data to the OpenTelemetry Collector +which then handles forwarding that data on to its final destination. This is +accomplished by setting a few environment variables on the Schematic container such as: + +```python +from src.service_props import ServiceProps + +telemetry_environment_variables = { + "TRACING_EXPORT_FORMAT": "otlp", + "LOGGING_EXPORT_FORMAT": "otlp", + "TRACING_SERVICE_NAME": "schematic", + "LOGGING_SERVICE_NAME": "schematic", + "DEPLOYMENT_ENVIRONMENT": environment, + "OTEL_EXPORTER_OTLP_ENDPOINT": "http://otel-collector:4318", +} + +app_service_props = ServiceProps( + container_name="schematic-app", + container_location="ghcr.io/sage-bionetworks/app:v1.0", + container_port=443, + container_memory=1024, + container_env_vars=telemetry_environment_variables, +) +``` + + +## OpenTelemetry Collector +The OpenTelemetry collector is deployed into ECS and is running in +[Gateway mode](https://opentelemetry.io/docs/collector/deployment/gateway/). This +configuration allows for a single collector to be the central point for all telemetry +data leaving the context of this deployed infrastructure. 
This central point allows us +to configure where [authorization can be attached](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/extension/oauth2clientauthextension), +[requests batched up](https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/batchprocessor/README.md), or [sensitive data be stripped](https://docs.honeycomb.io/send-data/opentelemetry/collector/handle-sensitive-information/). + + +The configuration of all of these elements stems from [supplying a `config.yaml` file](https://opentelemetry.io/docs/collector/configuration/) as +an environment variable to the otel collector container at startup. This config file is +set up to be sourced from AWS Secrets manager. To accomplish this a filled out copy of +the following configuration file is stored in AWS Secrets manager (As Plaintext) +with the name `f"{stack_name_prefix}-DockerFargateStack/{environment}/opentelemetry-collector-configuration"`: + +``` +extensions: + health_check: + endpoint: "0.0.0.0:13133" + path: "/" + check_collector_pipeline: + enabled: true + interval: "5m" + exporter_failure_threshold: 5 + oauth2client: + client_id: FILL_ME_IN + client_secret: FILL_ME_IN + endpoint_params: + audience: FILL_ME_IN + token_url: FILL_ME_IN + # timeout for the token client + timeout: 2s + +receivers: + otlp: + protocols: + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + send_batch_size: 50 + +exporters: + otlphttp/withauth: + endpoint: FILL_ME_IN + auth: + authenticator: oauth2client + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [otlphttp/withauth] + logs: + receivers: [otlp] + processors: [batch] + exporters: [otlphttp/withauth] + extensions: [health_check, oauth2client] +``` diff --git a/app.py b/app.py index a5b9e3c..55d41a1 100644 --- a/app.py +++ b/app.py @@ -1,11 +1,12 @@ +from os import environ + import aws_cdk as cdk -from os import environ -from src.network_stack import NetworkStack from 
src.ecs_stack import EcsStack -from src.service_stack import LoadBalancedServiceStack from src.load_balancer_stack import LoadBalancerStack -from src.service_props import ServiceProps +from src.network_stack import NetworkStack +from src.service_props import ServiceProps, ServiceSecret +from src.service_stack import LoadBalancedServiceStack, ServiceStack # get the environment and set environment specific variables VALID_ENVIRONMENTS = ["dev", "stage", "prod"] @@ -35,7 +36,7 @@ case _: valid_envs_str = ",".join(VALID_ENVIRONMENTS) raise SystemExit( - f"Must set environment variable `ENV` to one of {valid_envs_str}" + f"Must set environment variable `ENV` to one of {valid_envs_str}. Currently set to {environment}." ) stack_name_prefix = f"schematic-{environment}" @@ -68,12 +69,27 @@ cdk_app, f"{stack_name_prefix}-load-balancer", network_stack.vpc ) +telemetry_environment_variables = { + "TRACING_EXPORT_FORMAT": "otlp", + "LOGGING_EXPORT_FORMAT": "otlp", + "TRACING_SERVICE_NAME": "schematic", + "LOGGING_SERVICE_NAME": "schematic", + "DEPLOYMENT_ENVIRONMENT": environment, + "OTEL_EXPORTER_OTLP_ENDPOINT": "http://otel-collector:4318", +} + app_service_props = ServiceProps( - "schematic-app", - "ghcr.io/sage-bionetworks/schematic:v0.1.90-beta", - 443, + container_name="schematic-app", + container_location="ghcr.io/sage-bionetworks/schematic:v0.1.94-beta", + container_port=443, container_memory=1024, - container_secret_name=f"{stack_name_prefix}-DockerFargateStack/{environment}/ecs", + container_env_vars=telemetry_environment_variables, + container_secrets=[ + ServiceSecret( + secret_name=f"{stack_name_prefix}-DockerFargateStack/{environment}/ecs", + environment_key="SECRETS_MANAGER_SECRETS", + ) + ], ) app_service_stack = LoadBalancedServiceStack( @@ -88,5 +104,27 @@ health_check_interval=5, ) -# Generate stacks +app_service_props_otel_collector = ServiceProps( + container_name="otel-collector", + container_port=4318, + container_memory=512, + 
container_location="ghcr.io/sage-bionetworks/sage-otel-collector:0.0.1", + container_secrets=[ + ServiceSecret( + secret_name=f"{stack_name_prefix}-DockerFargateStack/{environment}/opentelemetry-collector-configuration", + environment_key="CONFIG_CONTENT", + ) + ], + container_command=["--config", "env:CONFIG_CONTENT"], + container_healthcheck=cdk.aws_ecs.HealthCheck(command=["CMD", "/healthcheck"]), +) + +app_service_stack_otel_collector = ServiceStack( + scope=cdk_app, + construct_id=f"{stack_name_prefix}-otel-collector", + vpc=network_stack.vpc, + cluster=ecs_stack.cluster, + props=app_service_props_otel_collector, +) + cdk_app.synth() diff --git a/src/service_props.py b/src/service_props.py index 6b6b2f8..75c29cd 100644 --- a/src/service_props.py +++ b/src/service_props.py @@ -1,6 +1,28 @@ +from dataclasses import dataclass +from typing import List, Optional, Sequence + +from aws_cdk import aws_ecs as ecs + CONTAINER_LOCATION_PATH_ID = "path://" +@dataclass +class ServiceSecret: + """ + Holds onto configuration for the secrets to be used in the container. + + Attributes: + secret_name: The name of the secret as stored in the AWS Secrets Manager. + environment_key: The name of the environment variable to be set within the container. + """ + + secret_name: str + """The name of the secret as stored in the AWS Secrets Manager.""" + + environment_key: str + """The name of the environment variable to be set within the container.""" + + class ServiceProps: """ ECS service properties @@ -13,9 +35,11 @@ class ServiceProps: container_memory: the container application memory container_env_vars: a json dictionary of environment variables to pass into the container i.e. 
{"EnvA": "EnvValueA", "EnvB": "EnvValueB"} - container_secret_name: the secret's name in the AWS secrets manager + container_secrets: List of `ServiceSecret` resources to pull from AWS secrets manager auto_scale_min_capacity: the fargate auto scaling minimum capacity auto_scale_max_capacity: the fargate auto scaling maximum capacity + container_command: Optional commands to run during the container startup + container_healthcheck: Optional health check configuration for the container """ def __init__( @@ -25,9 +49,11 @@ def __init__( container_port: int, container_memory: int = 512, container_env_vars: dict = None, - container_secret_name: str = None, + container_secrets: List[ServiceSecret] = None, auto_scale_min_capacity: int = 1, auto_scale_max_capacity: int = 1, + container_command: Optional[Sequence[str]] = None, + container_healthcheck: Optional[ecs.HealthCheck] = None, ) -> None: self.container_name = container_name self.container_port = container_port @@ -37,8 +63,18 @@ def __init__( CONTAINER_LOCATION_PATH_ID ) self.container_location = container_location + if container_env_vars is None: self.container_env_vars = {} - self.container_secret_name = container_secret_name + else: + self.container_env_vars = container_env_vars + + if container_secrets is None: + self.container_secrets = [] + else: + self.container_secrets = container_secrets + self.auto_scale_min_capacity = auto_scale_min_capacity self.auto_scale_max_capacity = auto_scale_max_capacity + self.container_command = container_command + self.container_healthcheck = container_healthcheck diff --git a/src/service_stack.py b/src/service_stack.py index 3fa0ee7..eeb3614 100644 --- a/src/service_stack.py +++ b/src/service_stack.py @@ -1,17 +1,14 @@ import aws_cdk as cdk - -from aws_cdk import ( - Duration as duration, - aws_ecs as ecs, - aws_ec2 as ec2, - aws_logs as logs, - aws_elasticloadbalancingv2 as elbv2, - aws_certificatemanager as acm, - aws_iam as iam, - aws_secretsmanager as sm, -) - +from 
aws_cdk import Duration as duration +from aws_cdk import aws_certificatemanager as acm +from aws_cdk import aws_ec2 as ec2 +from aws_cdk import aws_ecs as ecs +from aws_cdk import aws_elasticloadbalancingv2 as elbv2 +from aws_cdk import aws_iam as iam +from aws_cdk import aws_logs as logs +from aws_cdk import aws_secretsmanager as sm from constructs import Construct + from src.service_props import ServiceProps ALB_HTTP_LISTENER_PORT = 80 @@ -79,16 +76,18 @@ def _get_secret(scope: Construct, id: str, name: str) -> sm.Secret: isecret = sm.Secret.from_secret_name_v2(scope, id, name) return ecs.Secret.from_secrets_manager(isecret) + secrets = {} + for secret in props.container_secrets: + secrets[secret.environment_key] = _get_secret( + self, f"sm-secrets-{secret.environment_key}", secret.secret_name + ) + self.container = self.task_definition.add_container( props.container_name, image=image, memory_limit_mib=props.container_memory, environment=props.container_env_vars, - secrets={ - "SECRETS_MANAGER_SECRETS": _get_secret( - self, "sm-secrets", props.container_secret_name - ) - }, + secrets=secrets, port_mappings=[ ecs.PortMapping( name=props.container_name, @@ -100,6 +99,8 @@ def _get_secret(scope: Construct, id: str, name: str) -> sm.Secret: stream_prefix=f"{construct_id}", log_retention=logs.RetentionDays.FOUR_MONTHS, ), + command=props.container_command, + health_check=props.container_healthcheck, ) self.security_group = ec2.SecurityGroup(self, "SecurityGroup", vpc=vpc)