Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Remove early span-start logging; fix loading of Django settings #772

Merged
merged 1 commit into from
Aug 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,22 @@ Change Log
Unreleased
~~~~~~~~~~

[4.2.0] - 2024-08-13
~~~~~~~~~~~~~~~~~~~~
Fixed
-----
* Fixed loading of ``DATADOG_DIAGNOSTICS_ENABLE``, which was previously not loaded properly and therefore was always True. Also fixed loading of ``DATADOG_DIAGNOSTICS_MAX_SPANS``, which was presumably broken as well.

Removed
-------
* Removed early span-start logging. It never worked properly, possibly because workers are continually being destroyed and created, leading to high log volume.

[4.1.0] - 2024-08-09
~~~~~~~~~~~~~~~~~~~~
Changed
-------
* Datadog diagnostics will now log all span-starts for the first minute after server startup
* **WARNING**: Do not use this version; see 4.2.0 release notes.

[4.0.0] - 2024-08-05
~~~~~~~~~~~~~~~~~~~~
Expand Down
2 changes: 1 addition & 1 deletion edx_arch_experiments/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
A plugin to include applications under development by the architecture team at 2U.
"""

__version__ = '4.1.0'
__version__ = '4.2.0'
54 changes: 17 additions & 37 deletions edx_arch_experiments/datadog_diagnostics/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,41 +3,13 @@
"""

import logging
import time

from django.apps import AppConfig
from django.conf import settings

log = logging.getLogger(__name__)


# .. toggle_name: DATADOG_DIAGNOSTICS_ENABLE
# .. toggle_implementation: DjangoSetting
# .. toggle_default: True
# .. toggle_description: Enables logging of Datadog diagnostics information.
# .. toggle_use_cases: circuit_breaker
# .. toggle_creation_date: 2024-07-11
# .. toggle_tickets: https://github.com/edx/edx-arch-experiments/issues/692
DATADOG_DIAGNOSTICS_ENABLE = getattr(settings, 'DATADOG_DIAGNOSTICS_ENABLE', True)

# .. setting_name: DATADOG_DIAGNOSTICS_MAX_SPANS
# .. setting_default: 100
# .. setting_description: Limit of how many spans to hold onto and log
# when diagnosing Datadog tracing issues. This limits memory consumption
# avoids logging more data than is actually needed for diagnosis.
DATADOG_DIAGNOSTICS_MAX_SPANS = getattr(settings, 'DATADOG_DIAGNOSTICS_MAX_SPANS', 100)

# .. setting_name: DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD
# .. setting_default: 60
# .. setting_description: Log all span starts for this many seconds after worker
# startup.
DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD = getattr(
settings,
'DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD',
60
)


# pylint: disable=missing-function-docstring
class MissingSpanProcessor:
"""Datadog span processor that logs unfinished spans at shutdown."""
Expand All @@ -46,20 +18,19 @@ def __init__(self):
self.spans_started = 0
self.spans_finished = 0
self.open_spans = {}
self.log_all_until = time.time() + DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD

# .. setting_name: DATADOG_DIAGNOSTICS_MAX_SPANS
# .. setting_default: 100
# .. setting_description: Limit of how many spans to hold onto and log
# when diagnosing Datadog tracing issues. This limits memory consumption
# avoids logging more data than is actually needed for diagnosis.
self.DATADOG_DIAGNOSTICS_MAX_SPANS = getattr(settings, 'DATADOG_DIAGNOSTICS_MAX_SPANS', 100)

def on_span_start(self, span):
self.spans_started += 1
if len(self.open_spans) < DATADOG_DIAGNOSTICS_MAX_SPANS:
if len(self.open_spans) < self.DATADOG_DIAGNOSTICS_MAX_SPANS:
self.open_spans[span.span_id] = span

# We believe that the anomalous traces always come from a
# single span that is created early in the lifetime of a
# gunicorn worker. If we log *every* span-start in this early
# period, we may be able to observe something interesting.
if time.time() <= self.log_all_until:
log.info(f"Early span-start sample: {span._pprint()}") # pylint: disable=protected-access

def on_span_finish(self, span):
self.spans_finished += 1
self.open_spans.pop(span.span_id, None) # "delete if present"
Expand All @@ -80,6 +51,15 @@ class DatadogDiagnostics(AppConfig):
plugin_app = {}

def ready(self):
# .. toggle_name: DATADOG_DIAGNOSTICS_ENABLE
# .. toggle_implementation: DjangoSetting
# .. toggle_default: True
# .. toggle_description: Enables logging of Datadog diagnostics information.
# .. toggle_use_cases: circuit_breaker
# .. toggle_creation_date: 2024-07-11
# .. toggle_tickets: https://github.com/edx/edx-arch-experiments/issues/692
DATADOG_DIAGNOSTICS_ENABLE = getattr(settings, 'DATADOG_DIAGNOSTICS_ENABLE', True)

if not DATADOG_DIAGNOSTICS_ENABLE:
return

Expand Down
62 changes: 33 additions & 29 deletions edx_arch_experiments/datadog_diagnostics/tests/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
Tests for plugin app.
"""

from unittest.mock import call, patch
from unittest.mock import patch

from django.test import TestCase
from ddtrace import tracer
from django.test import TestCase, override_settings

from .. import apps

Expand All @@ -21,7 +22,35 @@ def _pprint(self):
class TestMissingSpanProcessor(TestCase):
"""Tests for MissingSpanProcessor."""

@patch.object(apps, 'DATADOG_DIAGNOSTICS_MAX_SPANS', new=3)
def test_feature_switch(self):
"""
Regression test -- the use of override_settings ensures that we read
the setting as needed, and not once at module load time (when it's
not guaranteed to be available.)
"""
def initialize():
apps.DatadogDiagnostics('edx_arch_experiments.datadog_diagnostics', apps).ready()

def get_processor_list():
# pylint: disable=protected-access
return [type(sp).__name__ for sp in tracer._span_processors]

with override_settings(DATADOG_DIAGNOSTICS_ENABLE=False):
initialize()
assert sorted(get_processor_list()) == [
'EndpointCallCounterProcessor', 'TopLevelSpanProcessor',
]

# The True case needs to come second because the initializer
# appends to the list and there isn't an immediately obvious
# way of resetting it.
with override_settings(DATADOG_DIAGNOSTICS_ENABLE=True):
initialize()
assert sorted(get_processor_list()) == [
'EndpointCallCounterProcessor', 'MissingSpanProcessor', 'TopLevelSpanProcessor',
]

@override_settings(DATADOG_DIAGNOSTICS_MAX_SPANS=3)
def test_metrics(self):
proc = apps.MissingSpanProcessor()
ids = [2, 4, 6, 8, 10]
Expand All @@ -47,30 +76,5 @@ def test_logging(self, mock_log_error, mock_log_info):
proc.on_span_start(FakeSpan(17))
proc.shutdown(0)

assert mock_log_info.call_args_list == [
call("Early span-start sample: span_id=17"),
call("Spans created = 1; spans finished = 0"),
]
mock_log_info.assert_called_once_with("Spans created = 1; spans finished = 0")
mock_log_error.assert_called_once_with("Span created but not finished: span_id=17")

@patch('edx_arch_experiments.datadog_diagnostics.apps.log.info')
def test_early_span_logging_cutoff(self, mock_log_info):
with patch('edx_arch_experiments.datadog_diagnostics.apps.time.time', side_effect=[
# Setup
1700000000,
# Span-start time checks
1700000020,
1700000040,
1700010000,
]):
proc = apps.MissingSpanProcessor()
# Three spans are started
proc.on_span_start(FakeSpan(44))
proc.on_span_start(FakeSpan(45))
proc.on_span_start(FakeSpan(46))

# Just two early span-starts are logged
assert mock_log_info.call_args_list == [
call("Early span-start sample: span_id=44"),
call("Early span-start sample: span_id=45"),
]
Loading