From 8e1ea7e20be4fa39b40fad4272f913b8a53488db Mon Sep 17 00:00:00 2001 From: Tim McCormack Date: Thu, 1 Aug 2024 21:02:02 +0000 Subject: [PATCH] feat: Detect and log anomalous traces in DatadogDiagnosticMiddleware Manual testing: 1. Install in devstack, add middleware to lms/envs/devstack.py, and enable Waffle flag. 2. Make some requests and observe lack of new logging. 3. Add `self.dd_tracer.current_root_span().finish()` to start of `process_view` (faking an anomalous trace) and reinstall in devstack 4. Make some more requests and observe log output like the following: `Anomalous Datadog local root span (duration already set): id = 66abf66b000000001168265a391d04a3; duration = 0.108 sec; worker age = 1.560 sec` --- CHANGELOG.rst | 6 ++++ edx_arch_experiments/__init__.py | 2 +- .../datadog_diagnostics/README.rst | 7 ---- .../datadog_diagnostics/middleware.py | 33 ++++++++++++++++++- 4 files changed, 39 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 8a601f0..74d6e57 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,6 +14,12 @@ Change Log Unreleased ~~~~~~~~~~ +[3.7.0] - 2024-08-02 +~~~~~~~~~~~~~~~~~~~~ +Added +----- +* ``DatadogDiagnosticMiddleware`` can now detect and log anomalous traces, enabled by Waffle flag ``datadog.diagnostics.detect_anomalous_trace`` + [3.6.0] - 2024-07-24 ~~~~~~~~~~~~~~~~~~~~ Added diff --git a/edx_arch_experiments/__init__.py b/edx_arch_experiments/__init__.py index ebdb6b8..6ab9526 100644 --- a/edx_arch_experiments/__init__.py +++ b/edx_arch_experiments/__init__.py @@ -2,4 +2,4 @@ A plugin to include applications under development by the architecture team at 2U. """ -__version__ = '3.6.0' +__version__ = '3.7.0' diff --git a/edx_arch_experiments/datadog_diagnostics/README.rst b/edx_arch_experiments/datadog_diagnostics/README.rst index c51dd6d..364ad6d 100644 --- a/edx_arch_experiments/datadog_diagnostics/README.rst +++ b/edx_arch_experiments/datadog_diagnostics/README.rst @@ -4,10 +4,3 @@ Datadog Diagnostics When installed in the LMS as a plugin app, the ``datadog_diagnostics`` app adds additional logging for debugging our Datadog integration. This is intended as a temporary situation while we debug the `trace concatenation issue `_. - -Usage -***** - -In LMS: - -- Install ``edx-arch-experiments`` as a dependency diff --git a/edx_arch_experiments/datadog_diagnostics/middleware.py b/edx_arch_experiments/datadog_diagnostics/middleware.py index ae3ffed..844695c 100644 --- a/edx_arch_experiments/datadog_diagnostics/middleware.py +++ b/edx_arch_experiments/datadog_diagnostics/middleware.py @@ -1,14 +1,31 @@ """ Diagnostic middleware for Datadog. + +To use, install edx-arch-experiments and add +``edx_arch_experiments.datadog_diagnostics.middleware.DatadogDiagnosticMiddleware`` +to ``MIDDLEWARE``, then set the below settings as needed. """ import logging +import time from django.core.exceptions import MiddlewareNotUsed from edx_toggles.toggles import WaffleFlag log = logging.getLogger(__name__) +# .. toggle_name: datadog.diagnostics.detect_anomalous_trace +# .. toggle_implementation: WaffleFlag +# .. toggle_default: False +# .. toggle_description: Enables logging of anomalous Datadog traces for web requests. +# .. toggle_warning: This is a noisy feature and so it should only be enabled +# for a percentage of requests. +# .. toggle_use_cases: temporary +# .. toggle_creation_date: 2024-08-01 +# .. toggle_target_removal_date: 2024-11-01 +# .. toggle_tickets: https://github.com/edx/edx-arch-experiments/issues/692 +DETECT_ANOMALOUS_TRACE = WaffleFlag('datadog.diagnostics.detect_anomalous_trace', module_name=__name__) + # .. toggle_name: datadog.diagnostics.log_root_span # .. toggle_implementation: WaffleFlag # .. toggle_default: False @@ -43,6 +60,8 @@ def __init__(self, get_response): # If import fails, don't even load this middleware. raise MiddlewareNotUsed # pylint: disable=raise-missing-from + self.worker_start_epoch = time.time() + def __call__(self, request): return self.get_response(request) @@ -63,9 +82,21 @@ def log_diagnostics(self, request): """ Contains all the actual logging logic. """ + local_root_span = self.dd_tracer.current_root_span() + + if DETECT_ANOMALOUS_TRACE.is_enabled(): + root_duration_s = local_root_span.duration + if root_duration_s is not None: + worker_run_time_s = time.time() - self.worker_start_epoch + log.warning( + f"Anomalous Datadog local root span (duration already set): " + f"id = {local_root_span.trace_id:x}; " + f"duration = {root_duration_s:0.3f} sec; " + f"worker age = {worker_run_time_s:0.3f} sec" + ) + if LOG_ROOT_SPAN.is_enabled(): route_pattern = getattr(request.resolver_match, 'route', None) - local_root_span = self.dd_tracer.current_root_span() current_span = self.dd_tracer.current_span() # pylint: disable=protected-access log.info(