Skip to content

Commit

Permalink
Merge branch 'main' into huniafatima/deprecate-edx-sphinx-theme
Browse files Browse the repository at this point in the history
  • Loading branch information
huniafatima-arbi authored Oct 2, 2024
2 parents c9792be + e32434c commit b6953f6
Show file tree
Hide file tree
Showing 18 changed files with 451 additions and 165 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,18 @@ Change Log
Unreleased
~~~~~~~~~~

[4.5.0] - 2024-09-19
~~~~~~~~~~~~~~~~~~~~
Added
-----
* Datadog diagnostics middleware can now attempt to close anomalous spans. Can be enabled via Waffle flag ``datadog.diagnostics.close_anomalous_spans`` (controlled separately from logging feature).

[4.4.0] - 2024-09-10
~~~~~~~~~~~~~~~~~~~~
Changed
-------
* Datadog diagnostics now logs ancestor spans when an anomaly is encountered, up to a limit of 10 (controlled by new Django setting ``DATADOG_DIAGNOSTICS_LOG_SPAN_DEPTH``). Spans are logged in full and on separate lines, so this logging is now much more verbose; consider only enabling this logging for short periods. Log format of first line has also changed slightly.

[4.3.0] - 2024-08-22
~~~~~~~~~~~~~~~~~~~~
Added
Expand Down
2 changes: 1 addition & 1 deletion edx_arch_experiments/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
A plugin to include applications under development by the architecture team at 2U.
"""

__version__ = '4.3.0'
__version__ = '4.5.0'
111 changes: 103 additions & 8 deletions edx_arch_experiments/datadog_diagnostics/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import logging
import time

from django.conf import settings
from django.core.exceptions import MiddlewareNotUsed
from edx_toggles.toggles import WaffleFlag

Expand All @@ -26,6 +27,16 @@
# .. toggle_tickets: https://github.com/edx/edx-arch-experiments/issues/692
DETECT_ANOMALOUS_TRACE = WaffleFlag('datadog.diagnostics.detect_anomalous_trace', module_name=__name__)

# .. toggle_name: datadog.diagnostics.close_anomalous_spans
# .. toggle_implementation: WaffleFlag
# .. toggle_default: False
# .. toggle_description: Close anomalous spans that are ancestors of the django.request span.
# .. toggle_use_cases: temporary
# .. toggle_creation_date: 2024-09-19
# .. toggle_target_removal_date: 2024-12-01
# .. toggle_tickets: https://github.com/edx/edx-arch-experiments/issues/692
CLOSE_ANOMALOUS_SPANS = WaffleFlag('datadog.diagnostics.close_anomalous_spans', module_name=__name__)

# .. toggle_name: datadog.diagnostics.log_root_span
# .. toggle_implementation: WaffleFlag
# .. toggle_default: False
Expand Down Expand Up @@ -61,13 +72,22 @@ def __init__(self, get_response):
raise MiddlewareNotUsed # pylint: disable=raise-missing-from

self.worker_start_epoch = time.time()
# .. setting_name: DATADOG_DIAGNOSTICS_LOG_SPAN_DEPTH
# .. setting_default: 10
# .. setting_description: Controls how many ancestors spans are logged
# when anomalous traces are detected. This limits log size when very
# deep span trees are present (such as in anomalous traces, or even
# just when each middleware is given a span).
self.log_span_ancestors_depth = getattr(settings, "DATADOG_DIAGNOSTICS_LOG_SPAN_DEPTH", 10)

def __call__(self, request):
return self.get_response(request)

def process_view(self, request, _view_func, _view_args, _view_kwargs):
try:
self.log_diagnostics(request)
if CLOSE_ANOMALOUS_SPANS.is_enabled():
self.close_anomalous_spans(request)
except BaseException as e:
# If there's an error, it will probably hit every request,
# so let's just log it once.
Expand All @@ -78,29 +98,104 @@ def process_view(self, request, _view_func, _view_args, _view_kwargs):
f"(suppressing further errors): {e!r}"
)

# pylint: disable=protected-access
def close_anomalous_spans(self, request):
"""
Detect anomalous spans and close them.
This closes any open spans that are ancestors of the current
request. The trace will still have two requests concatenated
together, but the problematic spans should not affect
future requests.
Only activates if the root span is itself closed, which is a
cheap thing to check.
"""
# If the root span is still open, probably not an anomalous trace.
if self.dd_tracer.current_root_span().duration is None:
return # nothing to do!

# Walk upwards until we find the django.request span.
walk_span = self.dd_tracer.current_span()
while walk_span.name != 'django.request':
walk_span = walk_span._parent
if walk_span is None:
# If we can't find the django.request root, there's
# something bad about our assumptions and we should
# not attempt a fix.
log.error(
"Did not find django.request span when walking anomalous trace "
"to root. Not attempting a fix."
)
return

# Go "above" the request span
walk_span = walk_span._parent

# Now close everything above the current request span that's
# still open, logging as we go.
while walk_span is not None:
# We call finish() individually rather than
# finish_with_ancestors() because this gives us a chance
# to log each one.
if walk_span.duration is None:
walk_span.finish()
log.info(
f"Closed span in anomalous trace: name={walk_span.name} "
f"id={walk_span.span_id:x} trace={walk_span.trace_id:x}"
)
# Keep walking up even if we discover closed spans; we've
# previously seen multiple contiguous segments of open
# spans separated by closed ones.
walk_span = walk_span._parent

def log_diagnostics(self, request):
"""
Contains all the actual logging logic.
"""
current_span = self.dd_tracer.current_span()
local_root_span = self.dd_tracer.current_root_span()

if DETECT_ANOMALOUS_TRACE.is_enabled():
# For testing, uncomment this line to fake an anomalous span:
# local_root_span.finish()
root_duration_s = local_root_span.duration
if root_duration_s is not None:
worker_run_time_s = time.time() - self.worker_start_epoch
log.warning(
f"Anomalous Datadog local root span (duration already set): "
f"id = {local_root_span.trace_id:x}; "
f"duration = {root_duration_s:0.3f} sec; "
f"worker age = {worker_run_time_s:0.3f} sec"
)
self.log_anomalous_trace(current_span, local_root_span)

if LOG_ROOT_SPAN.is_enabled():
route_pattern = getattr(request.resolver_match, 'route', None)
current_span = self.dd_tracer.current_span()
# pylint: disable=protected-access
log.info(
f"Datadog span diagnostics: Route = {route_pattern}; "
f"local root span = {local_root_span._pprint()}; "
f"current span = {current_span._pprint()}"
)

def log_anomalous_trace(self, current_span, local_root_span):
worker_run_time_s = time.time() - self.worker_start_epoch

# Build up a list of spans from current back towards the root, up to a limit.
ancestors = []
walk_span = current_span
while len(ancestors) < self.log_span_ancestors_depth and walk_span is not None:
ancestors.insert(0, walk_span._pprint()) # pylint: disable=protected-access
walk_span = walk_span._parent # pylint: disable=protected-access

trunc = "(ancestors truncated)\n" if walk_span else ""

if local_root_span.duration:
duration_msg = f"duration={local_root_span.duration:0.3f}"
else:
# Should only occur during local testing of this
# middleware, when forcing this code path to run.
duration_msg = "duration not set"

msg = (
"Anomalous Datadog local root span: "
f"trace_id={local_root_span.trace_id:x}; "
f"{duration_msg}; "
f"worker_age={worker_run_time_s:0.3f}; span ancestry:"
)

log.warning(msg + "\n" + trunc + "\n".join(ancestors)) # pylint: disable=logging-not-lazy
5 changes: 5 additions & 0 deletions edx_arch_experiments/datadog_diagnostics/tests/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ def _pprint(self):
class TestMissingSpanProcessor(TestCase):
"""Tests for MissingSpanProcessor."""

def setUp(self):
# Remove custom span processor from previous runs.
# pylint: disable=protected-access
tracer._span_processors = [sp for sp in tracer._span_processors if type(sp).__name__ != 'MissingSpanProcessor']

def test_feature_switch(self):
"""
Regression test -- the use of override_settings ensures that we read
Expand Down
Loading

0 comments on commit b6953f6

Please sign in to comment.