From 6fb7fae8511edb4b9c7ef3fbc3373f7de5dd90a0 Mon Sep 17 00:00:00 2001 From: Patrick Date: Tue, 14 Nov 2023 05:57:27 -0500 Subject: [PATCH] feature/tracing-data: Trace data as artifact in CI (#11113) * WIP: generate traces as artifact * Splitting otel collector into dev and ci environments * feature/tracing-data: minor cleanup --- .github/tracing/README.md | 43 ++++++++- .../tracing/local-smoke-docker-compose.yaml | 4 +- .github/tracing/otel-collector-ci.yaml | 22 +++++ ...collector.yaml => otel-collector-dev.yaml} | 5 + .github/tracing/replay.sh | 6 ++ .github/workflows/integration-tests.yml | 92 +++++++------------ 6 files changed, 111 insertions(+), 61 deletions(-) create mode 100644 .github/tracing/otel-collector-ci.yaml rename .github/tracing/{otel-collector.yaml => otel-collector-dev.yaml} (67%) create mode 100644 .github/tracing/replay.sh diff --git a/.github/tracing/README.md b/.github/tracing/README.md index 6988383ca7b..eb757384295 100644 --- a/.github/tracing/README.md +++ b/.github/tracing/README.md @@ -1,5 +1,44 @@ # Distributed Tracing -These config files are for an OTEL collector, grafana Tempo, and a grafana UI instance to run as containers on the same network. +As part of the LOOP plugin effort, we've added distributed tracing to the core node. This is helpful for initial development and maintenance of LOOPs, but will also empower product teams building on top of core. + +## Dev environment + +One way to generate traces locally today is with the OCR2 basic smoke test. + +1. navigate to `.github/tracing/` and then run `docker compose --file local-smoke-docker-compose.yaml up` +2. setup a local docker registry at `127.0.0.1:5000` (https://www.docker.com/blog/how-to-use-your-own-registry-2/) +3. run `make build_push_plugin_docker_image` in `chainlink/integration-tests/Makefile` +4. run `SELECTED_NETWORKS=SIMULATED CHAINLINK_IMAGE="127.0.0.1:5000/chainlink" CHAINLINK_VERSION="develop" go test -run TestOCRv2Basic ./smoke/ocr2_test.go` +5. 
navigate to `localhost:3000/explore` in a web browser to query for traces + +Core and the median plugins are instrumented with OpenTelemetry traces, which are sent to the OTEL collector and forwarded to the Tempo backend. The Grafana UI can then read the trace data from the Tempo backend. + + -A localhost client can send gRPC calls to the server. The gRPC server is instrumented with open telemetry traces, which are sent to the OTEL collector and forwarded to the Tempo backend. The grafana UI can then read the trace data from the Tempo backend. \ No newline at end of file +## CI environment + +Another way to generate traces is by enabling traces for PRs. This will instrument traces for `TestOCRv2Basic` in the CI run. + +1. Cut a PR in the core repo +2. Add the `enable tracing` label to the PR +3. Navigate to `Integration Tests / ETH Smoke Tests ocr2-plugins (pull_request)` details +4. Navigate to the summary of the integration tests +5. After the test completes, the generated trace data will be saved as an artifact, currently called `trace-data` +6. Download the artifact to this directory (`chainlink/.github/tracing`) +7. Run `docker compose --file local-smoke-docker-compose.yaml up` +8. Run `sh replay.sh` to replay those traces to the otel-collector container that was spun up in the last step. +9. Navigate to `localhost:3000/explore` in a web browser to query for traces + +The artifact as a whole is not a single JSON document - it is newline-delimited JSON, where each individual line is a well-formed and complete JSON object. + +## Configuration +This folder contains the following config files: +* otel-collector-ci.yaml +* otel-collector-dev.yaml +* tempo.yaml +* grafana-datasources.yaml + +These config files are for an OTEL collector, Grafana Tempo, and a Grafana UI instance to run as containers on the same network. +`otel-collector-dev.yaml` is the configuration for dev (i.e. your local machine) environments, and forwards traces from the OTEL collector to the Grafana Tempo instance on the same network. 
+`otel-collector-ci.yaml` is the configuration for the CI runs, and exports the trace data to the artifact from the github run. \ No newline at end of file diff --git a/.github/tracing/local-smoke-docker-compose.yaml b/.github/tracing/local-smoke-docker-compose.yaml index e0e60a675e5..744ba88ef69 100644 --- a/.github/tracing/local-smoke-docker-compose.yaml +++ b/.github/tracing/local-smoke-docker-compose.yaml @@ -6,9 +6,11 @@ services: image: otel/opentelemetry-collector:0.61.0 command: [ "--config=/etc/otel-collector.yaml" ] volumes: - - ./otel-collector.yaml:/etc/otel-collector.yaml + - ./otel-collector-dev.yaml:/etc/otel-collector.yaml + - ../../integration-tests/smoke/traces/trace-data.json:/etc/trace-data.json # local trace data stored consistent with smoke/logs ports: - "4317:4317" # otlp grpc + - "3100:3100" depends_on: - tempo networks: diff --git a/.github/tracing/otel-collector-ci.yaml b/.github/tracing/otel-collector-ci.yaml new file mode 100644 index 00000000000..0bf123d29b5 --- /dev/null +++ b/.github/tracing/otel-collector-ci.yaml @@ -0,0 +1,22 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:4317" + http: + endpoint: "0.0.0.0:3100" +exporters: + file: + path: /tracing/trace-data.json + otlp: + endpoint: tempo:4317 + tls: + insecure: true +service: + telemetry: + logs: + level: "debug" # Set log level to debug + pipelines: + traces: + receivers: [otlp] + exporters: [file,otlp] \ No newline at end of file diff --git a/.github/tracing/otel-collector.yaml b/.github/tracing/otel-collector-dev.yaml similarity index 67% rename from .github/tracing/otel-collector.yaml rename to .github/tracing/otel-collector-dev.yaml index fb8721cba20..dd059127b81 100644 --- a/.github/tracing/otel-collector.yaml +++ b/.github/tracing/otel-collector-dev.yaml @@ -3,12 +3,17 @@ receivers: protocols: grpc: endpoint: "0.0.0.0:4317" + http: + endpoint: "0.0.0.0:3100" exporters: otlp: endpoint: tempo:4317 tls: insecure: true service: + telemetry: + logs: + level: 
"debug" # Set log level to debug pipelines: traces: receivers: [otlp] diff --git a/.github/tracing/replay.sh b/.github/tracing/replay.sh new file mode 100644 index 00000000000..b2e564567c4 --- /dev/null +++ b/.github/tracing/replay.sh @@ -0,0 +1,6 @@ +# Read JSON file and loop through each trace +while IFS= read -r trace; do + curl -X POST http://localhost:3100/v1/traces \ + -H "Content-Type: application/json" \ + -d "$trace" +done < "trace-data" diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 9294dceae6d..17f571fd636 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -368,67 +368,33 @@ jobs: # Create network docker network create --driver bridge tracing - # Start Grafana - cd ./.github/tracing - docker run -d --network=tracing --name=grafana -p 3000:3000 -v $PWD/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml -e GF_AUTH_ANONYMOUS_ENABLED=true -e GF_AUTH_ANONYMOUS_ORG_ROLE=Admin -e GF_AUTH_DISABLE_LOGIN_FORM=true -e GF_FEATURE_TOGGLES_ENABLE=traceqlEditor grafana/grafana:9.4.3 + # Make trace directory + cd integration-tests/smoke/ + mkdir ./traces + chmod -R 777 ./traces - # Start Tempo - docker run -d --network=tracing --name=tempo -v ./tempo.yaml:/etc/tempo.yaml -v $PWD/tempo-data:/tmp/tempo grafana/tempo:latest -config.file=/etc/tempo.yaml + # Switch directory + cd ../../.github/tracing - # Start OpenTelemetry Collector - docker run -d --network=tracing --name=otel-collector -v $PWD/otel-collector.yaml:/etc/otel-collector.yaml -p 4317:4317 otel/opentelemetry-collector:0.61.0 --config=/etc/otel-collector.yaml - - - name: Generate port - id: generate-port - if: steps.check-label.outputs.trace == 'true' && matrix.product.name == 'ocr2' && matrix.product.tag_suffix == '-plugins' - env: - GITHUB_PR_NUMBER: ${{ github.event.number }} - run: | - PORT_BASE=3001 - MAX_PORT=8000 + # Create a Docker volume for traces + # docker volume 
create otel-traces - # Use PR number as offset. Given GitHub PRs are incremental, this guarantees uniqueness for at least 5000 PRs. - OFFSET=$GITHUB_PR_NUMBER - echo "PR Number: $OFFSET" - - # Ensure that we don't exceed the max port - if (( OFFSET > (MAX_PORT - PORT_BASE) )); then - OFFSET=$((OFFSET % (MAX_PORT - PORT_BASE))) - fi - - # Map the offset to the port range - REMOTE_PORT=$((PORT_BASE + OFFSET)) - echo "REMOTE_PORT=$REMOTE_PORT" >> $GITHUB_OUTPUT - - name: Reverse SSH Tunneling - if: steps.check-label.outputs.trace == 'true' && matrix.product.name == 'ocr2' && matrix.product.tag_suffix == '-plugins' - env: - TRACING_SSH_KEY: ${{ secrets.TRACING_SSH_KEY }} - TRACING_SSH_SERVER: ${{ secrets.TRACING_SSH_SERVER }} - REMOTE_PORT: ${{ steps.generate-port.outputs.REMOTE_PORT }} - run: | - eval $(ssh-agent) - echo "test" - echo "$TRACING_SSH_KEY" | wc -c - echo "$TRACING_SSH_KEY" | tr -d '\r' | wc -c - echo "$TRACING_SSH_KEY" | tr -d '\r' | base64 --decode | ssh-add - - # f: background process - # N: do not execute a remote command - # R: remote port forwarding - ssh -o StrictHostKeyChecking=no -f -N -R $REMOTE_PORT:127.0.0.1:3000 user-gha@$TRACING_SSH_SERVER - echo "To view Grafana locally:" - echo "ssh -N -L 8000:localhost:$REMOTE_PORT user-gha@$TRACING_SSH_SERVER" - echo "Then visit http://localhost:8000 in a browser." - echo "If you are unable to connect, check with the security team that you have access to the tracing server." 
- - name: Show Grafana Logs - if: steps.check-label.outputs.trace == 'true' && matrix.product.name == 'ocr2' && matrix.product.tag_suffix == '-plugins' + # Start OpenTelemetry Collector + # Note the user must be set to the same user as the runner for the trace data to be accessible + docker run -d --network=tracing --name=otel-collector \ + -v $PWD/otel-collector-ci.yaml:/etc/otel-collector.yaml \ + -v $PWD/../../integration-tests/smoke/traces:/tracing \ + --user "$(id -u):$(id -g)" \ + -p 4317:4317 otel/opentelemetry-collector:0.88.0 --config=/etc/otel-collector.yaml + - name: Locate Docker Volume + id: locate-volume + if: false run: | - docker logs grafana - docker logs tempo - docker logs otel-collector - - name: Set sleep time to use in future steps + echo "VOLUME_PATH=$(docker volume inspect --format '{{ .Mountpoint }}' otel-traces)" >> $GITHUB_OUTPUT + - name: Show Otel-Collector Logs if: steps.check-label.outputs.trace == 'true' && matrix.product.name == 'ocr2' && matrix.product.tag_suffix == '-plugins' run: | - echo "SLEEP_TIME=2400" >> "$GITHUB_ENV" + docker logs otel-collector ## Run this step when changes that require tests to be run are made - name: Run Tests if: needs.changes.outputs.src == 'true' @@ -465,6 +431,10 @@ jobs: QA_AWS_REGION: ${{ secrets.QA_AWS_REGION }} QA_AWS_ROLE_TO_ASSUME: ${{ secrets.QA_AWS_ROLE_TO_ASSUME }} QA_KUBECONFIG: ${{ secrets.QA_KUBECONFIG }} + - name: Show Otel-Collector Logs + if: steps.check-label.outputs.trace == 'true' && matrix.product.name == 'ocr2' && matrix.product.tag_suffix == '-plugins' + run: | + docker logs otel-collector - name: Collect Metrics if: always() id: collect-gha-metrics @@ -475,11 +445,17 @@ jobs: this-job-name: ETH Smoke Tests ${{ matrix.product.name }}${{ matrix.product.tag_suffix }} test-results-file: '{"testType":"go","filePath":"/tmp/gotest.log"}' continue-on-error: true - - name: Keep action running to view traces + - name: Permissions on traces if: steps.check-label.outputs.trace == 'true' && 
matrix.product.name == 'ocr2' && matrix.product.tag_suffix == '-plugins' run: | - echo "Sleeping for $SLEEP_TIME seconds..." - sleep $SLEEP_TIME + ls -l ./integration-tests/smoke/traces + - name: Upload Trace Data + if: steps.check-label.outputs.trace == 'true' && matrix.product.name == 'ocr2' && matrix.product.tag_suffix == '-plugins' + uses: actions/upload-artifact@v3 + with: + name: trace-data + path: ./integration-tests/smoke/traces/trace-data.json + ### Used to check the required checks box when the matrix completes eth-smoke-tests: