From 63a247893933be138dffdc534849aef411e230ea Mon Sep 17 00:00:00 2001 From: Tim McCormack Date: Thu, 11 Jul 2024 16:33:54 +0000 Subject: [PATCH] feat: Robustness improvements to datadog_diagnostics plugin - Add `DATADOG_DIAGNOSTICS_ENABLE` for quick disable if needed - Limit spans with `DATADOG_DIAGNOSTICS_MAX_SPANS` (default 100) - Fix scope of member variables - Add unit tests Manual testing: - Modify `common.djangoapps.student.views.dashboard.student_dashboard` in edx-platform to call `import time; time.sleep(10)` at the start of the view. - Start server and log - Visit /dashboard - While the browser is waiting, quickly make a small edit to an edx-platform file, causing an autoreload. - Confirm that spans are logged. --- CHANGELOG.rst | 11 ++++ edx_arch_experiments/__init__.py | 2 +- .../datadog_diagnostics/apps.py | 34 +++++++++++-- .../datadog_diagnostics/tests/__init__.py | 0 .../datadog_diagnostics/tests/test_app.py | 51 +++++++++++++++++++ 5 files changed, 92 insertions(+), 6 deletions(-) create mode 100644 edx_arch_experiments/datadog_diagnostics/tests/__init__.py create mode 100644 edx_arch_experiments/datadog_diagnostics/tests/test_app.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 16ee9ed..726482d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,6 +14,17 @@ Change Log Unreleased ~~~~~~~~~~ +[3.5.0] - 2024-07-11 +~~~~~~~~~~~~~~~~~~~~ +Added +----- +* Toggle ``DATADOG_DIAGNOSTICS_ENABLE`` for disabling that plugin quickly if needed. (Feature remains enabled by default.) + +Fixed +----- +* Limit the number of spans collected via new setting ``DATADOG_DIAGNOSTICS_MAX_SPANS``, defaulting to 100. This may help avoid memory leaks. +* Make accidental class variables into member variables in ``datadog_diagnostics`` + [3.4.0] - 2024-07-10 ~~~~~~~~~~~~~~~~~~~~ Added diff --git a/edx_arch_experiments/__init__.py b/edx_arch_experiments/__init__.py index 37183df..5196718 100644 --- a/edx_arch_experiments/__init__.py +++ b/edx_arch_experiments/__init__.py @@ -2,4 +2,4 @@ A plugin to include applications under development by the architecture team at 2U. """ -__version__ = '3.4.0' +__version__ = '3.5.0' diff --git a/edx_arch_experiments/datadog_diagnostics/apps.py b/edx_arch_experiments/datadog_diagnostics/apps.py index 86494a8..de29169 100644 --- a/edx_arch_experiments/datadog_diagnostics/apps.py +++ b/edx_arch_experiments/datadog_diagnostics/apps.py @@ -5,23 +5,44 @@ import logging from django.apps import AppConfig +from django.conf import settings log = logging.getLogger(__name__) +# .. toggle_name: DATADOG_DIAGNOSTICS_ENABLE +# .. toggle_implementation: DjangoSetting +# .. toggle_default: True +# .. toggle_description: Enables logging of Datadog diagnostics information. +# .. toggle_use_cases: circuit_breaker +# .. toggle_creation_date: 2024-07-11 +# .. toggle_tickets: https://github.com/edx/edx-arch-experiments/issues/692 +DATADOG_DIAGNOSTICS_ENABLE = getattr(settings, 'DATADOG_DIAGNOSTICS_ENABLE', True) + +# .. setting_name: DATADOG_DIAGNOSTICS_MAX_SPANS +# .. setting_default: 100 +# .. setting_description: Limit of how many spans to hold onto and log +# when diagnosing Datadog tracing issues. This limits memory consumption +# avoids logging more data than is actually needed for diagnosis. +DATADOG_DIAGNOSTICS_MAX_SPANS = getattr(settings, 'DATADOG_DIAGNOSTICS_MAX_SPANS', 100) + + class MissingSpanProcessor: """Datadog span processor that logs unfinished spans at shutdown.""" - spans_started = 0 - spans_finished = 0 - open_spans = {} + + def __init__(self): + self.spans_started = 0 + self.spans_finished = 0 + self.open_spans = {} def on_span_start(self, span): self.spans_started += 1 - self.open_spans[span.span_id] = span + if len(self.open_spans) < DATADOG_DIAGNOSTICS_MAX_SPANS: + self.open_spans[span.span_id] = span def on_span_finish(self, span): self.spans_finished += 1 - del self.open_spans[span.span_id] + self.open_spans.pop(span.span_id, None) # "delete if present" def shutdown(self, _timeout): log.info(f"Spans created = {self.spans_started}; spans finished = {self.spans_finished}") @@ -39,6 +60,9 @@ class DatadogDiagnostics(AppConfig): plugin_app = {} def ready(self): + if not DATADOG_DIAGNOSTICS_ENABLE: + return + try: from ddtrace import tracer # pylint: disable=import-outside-toplevel tracer._span_processors.append(MissingSpanProcessor()) # pylint: disable=protected-access diff --git a/edx_arch_experiments/datadog_diagnostics/tests/__init__.py b/edx_arch_experiments/datadog_diagnostics/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/edx_arch_experiments/datadog_diagnostics/tests/test_app.py b/edx_arch_experiments/datadog_diagnostics/tests/test_app.py new file mode 100644 index 0000000..b0ce611 --- /dev/null +++ b/edx_arch_experiments/datadog_diagnostics/tests/test_app.py @@ -0,0 +1,51 @@ +""" +Tests for plugin app. +""" + +from unittest.mock import patch + +from django.test import TestCase + +from .. import apps + + +class FakeSpan: + """A fake Span instance that just carries a span_id.""" + def __init__(self, span_id): + self.span_id = span_id + + def _pprint(self): + return f"span_id={self.span_id}" + + +class TestMissingSpanProcessor(TestCase): + """Tests for MissingSpanProcessor.""" + + @patch.object(apps, 'DATADOG_DIAGNOSTICS_MAX_SPANS', new=3) + def test_metrics(self): + proc = apps.MissingSpanProcessor() + ids = [2, 4, 6, 8, 10] + + for span_id in ids: + proc.on_span_start(FakeSpan(span_id)) + + assert {(sk, sv.span_id) for sk, sv in proc.open_spans.items()} == {(2, 2), (4, 4), (6, 6)} + assert proc.spans_started == 5 + assert proc.spans_finished == 0 + + for span_id in ids: + proc.on_span_finish(FakeSpan(span_id)) + + assert proc.open_spans.keys() == set() + assert proc.spans_started == 5 + assert proc.spans_finished == 5 + + @patch('edx_arch_experiments.datadog_diagnostics.apps.log.info') + @patch('edx_arch_experiments.datadog_diagnostics.apps.log.error') + def test_logging(self, mock_log_error, mock_log_info): + proc = apps.MissingSpanProcessor() + proc.on_span_start(FakeSpan(17)) + proc.shutdown(0) + + mock_log_info.assert_called_once_with("Spans created = 1; spans finished = 0") + mock_log_error.assert_called_once_with("Span created but not finished: span_id=17")