Merge branch 'main' into huniafatima/deprecate-edx-sphinx-theme

edx · Sep 2, 2024 · c9792be · c9792be
2 parents d49edda + 560a6fb
commit c9792be
Show file tree

Hide file tree

Showing 14 changed files with 587 additions and 150 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -14,6 +14,29 @@ Change Log
 Unreleased
 ~~~~~~~~~~
 
+[4.3.0] - 2024-08-22
+~~~~~~~~~~~~~~~~~~~~
+Added
+-----
+* Added celery lifecycle logging for Datadog diagnostics, to be enabled using ``DATADOG_DIAGNOSTICS_CELERY_LOG_SIGNALS``.
+
+[4.2.0] - 2024-08-13
+~~~~~~~~~~~~~~~~~~~~
+Fixed
+-----
+* Fixed loading of ``DATADOG_DIAGNOSTICS_ENABLE``, which was previously not loaded properly and therefore was always True. Also fixed loading of ``DATADOG_DIAGNOSTICS_MAX_SPANS``, which was presumably broken as well.
+
+Removed
+-------
+* Removed early span-start logging. It never worked properly, possibly because workers are continually being destroyed and created, leading to high log volume.
+
+[4.1.0] - 2024-08-09
+~~~~~~~~~~~~~~~~~~~~
+Changed
+-------
+* Datadog diagnostics will now log all span-starts for the first minute after server startup
+* **WARNING**: Do not use this version; see 4.2.0 release notes.
+
 [4.0.0] - 2024-08-05
 ~~~~~~~~~~~~~~~~~~~~
 Changed

diff --git a/edx_arch_experiments/__init__.py b/edx_arch_experiments/__init__.py
@@ -2,4 +2,4 @@
 A plugin to include applications under development by the architecture team at 2U.
 """
 
-__version__ = '4.0.0'
+__version__ = '4.3.0'
diff --git a/edx_arch_experiments/datadog_diagnostics/apps.py b/edx_arch_experiments/datadog_diagnostics/apps.py
@@ -3,30 +3,15 @@
 """
 
 import logging
+import re
 
 from django.apps import AppConfig
 from django.conf import settings
 
 log = logging.getLogger(__name__)
 
 
-# .. toggle_name: DATADOG_DIAGNOSTICS_ENABLE
-# .. toggle_implementation: DjangoSetting
-# .. toggle_default: True
-# .. toggle_description: Enables logging of Datadog diagnostics information.
-# .. toggle_use_cases: circuit_breaker
-# .. toggle_creation_date: 2024-07-11
-# .. toggle_tickets: https://github.com/edx/edx-arch-experiments/issues/692
-DATADOG_DIAGNOSTICS_ENABLE = getattr(settings, 'DATADOG_DIAGNOSTICS_ENABLE', True)
-
-# .. setting_name: DATADOG_DIAGNOSTICS_MAX_SPANS
-# .. setting_default: 100
-# .. setting_description: Limit of how many spans to hold onto and log
-#   when diagnosing Datadog tracing issues. This limits memory consumption
-#   avoids logging more data than is actually needed for diagnosis.
-DATADOG_DIAGNOSTICS_MAX_SPANS = getattr(settings, 'DATADOG_DIAGNOSTICS_MAX_SPANS', 100)
-
-
+# pylint: disable=missing-function-docstring
 class MissingSpanProcessor:
     """Datadog span processor that logs unfinished spans at shutdown."""
 
@@ -35,9 +20,16 @@ def __init__(self):
         self.spans_finished = 0
         self.open_spans = {}
 
+        # .. setting_name: DATADOG_DIAGNOSTICS_MAX_SPANS
+        # .. setting_default: 100
+        # .. setting_description: Limit of how many spans to hold onto and log
+        #   when diagnosing Datadog tracing issues. This limits memory consumption
+        #   avoids logging more data than is actually needed for diagnosis.
+        self.DATADOG_DIAGNOSTICS_MAX_SPANS = getattr(settings, 'DATADOG_DIAGNOSTICS_MAX_SPANS', 100)
+
     def on_span_start(self, span):
         self.spans_started += 1
-        if len(self.open_spans) < DATADOG_DIAGNOSTICS_MAX_SPANS:
+        if len(self.open_spans) < self.DATADOG_DIAGNOSTICS_MAX_SPANS:
             self.open_spans[span.span_id] = span
 
     def on_span_finish(self, span):
@@ -50,6 +42,112 @@ def shutdown(self, _timeout):
             log.error(f"Span created but not finished: {span._pprint()}")  # pylint: disable=protected-access
 
 
+# Dictionary of Celery signal names to a task information extractor.
+# The latter is a lambda accepting the signal receiver's kwargs dict
+# and returning a minimal dict of info for tracking task lifecycle.
+# Celery signal params vary quite a bit in how they convey the
+# information we need, so this is probably better than trying to use
+# one set of heuristics to get the task ID and name from all signals.
+#
+# Docs: https://docs.celeryq.dev/en/stable/userguide/signals.html
+CELERY_SIGNAL_CONFIG = {
+    'before_task_publish': lambda kwargs: {'name': kwargs['sender']},
+    'after_task_publish': lambda kwargs: {'name': kwargs['sender']},
+    'task_prerun': lambda kwargs: {'name': kwargs['task'].name, 'id': kwargs['task_id']},
+    'task_postrun': lambda kwargs: {'name': kwargs['task'].name, 'id': kwargs['task_id']},
+    'task_retry': lambda kwargs: {'name': kwargs['sender'].name, 'id': kwargs['request'].id},
+    'task_success': lambda kwargs: {'name': kwargs['sender'].name},
+    'task_failure': lambda kwargs: {'name': kwargs['sender'].name, 'id': kwargs['task_id']},
+    'task_internal_error': lambda kwargs: {'name': kwargs['sender'].name, 'id': kwargs['task_id']},
+    'task_received': lambda kwargs: {'name': kwargs['request'].name, 'id': kwargs['request'].id},
+    'task_revoked': lambda kwargs: {'name': kwargs['sender'].name, 'id': kwargs['request'].id},
+    'task_unknown': lambda kwargs: {'name': kwargs['name'], 'id': kwargs['id']},
+    'task_rejected': lambda _kwargs: {},
+}
+
+
+def _connect_celery_handler(signal, signal_name, extractor):
+    """
+    Register one Celery signal receiver.
+
+    This serves as a closure to capture the config (and some state) for one signal.
+    If the extractor ever throws, log the error just once and don't try calling it
+    again for the remaining life of the process (as it will likely continue failing
+    the same way.)
+
+    Args:
+        signal: Django Signal instance to register a handler for
+        signal_name: Name of signal in Celery signals module (used for logging)
+        extractor: Function to take signal receiver's entire kwargs and return
+            a dict optionally containing 'id' and 'name' keys.
+    """
+    errored = False
+
+    def log_celery_signal(**kwargs):
+        nonlocal errored
+        info = None
+        try:
+            if not errored:
+                info = extractor(kwargs)
+        except BaseException:
+            errored = True
+            log.exception(
+                f"Error while extracting Celery signal info for '{signal_name}'; "
+                "will not attempt for future calls to this signal."
+            )
+
+        if info is None:
+            extra = "(skipped data extraction)"
+        else:
+            extra = f"with name={info.get('name')} id={info.get('id')}"
+        log.info(f"Celery signal called: '{signal_name}' {extra}")
+
+    signal.connect(log_celery_signal, weak=False)
+
+
+def connect_celery_logging():
+    """
+    Set up logging of configured Celery signals.
+
+    Throws if celery is not installed.
+    """
+    import celery.signals  # pylint: disable=import-outside-toplevel
+
+    # .. setting_name: DATADOG_DIAGNOSTICS_CELERY_LOG_SIGNALS
+    # .. setting_default: ''
+    # .. setting_description: Log calls to these Celery signals (signal name as well
+    #   as task name and id, if available). Specify as a comma and/or whitespace delimited
+    #   list of names from the celery.signals module. Full list of available signals:
+    #   "after_task_publish, before_task_publish, task_failure, task_internal_error,
+    #   task_postrun, task_prerun, task_received, task_rejected, task_retry,
+    #   task_revoked, task_success, task_unknown"
+    DATADOG_DIAGNOSTICS_CELERY_LOG_SIGNALS = getattr(
+        settings,
+        'DATADOG_DIAGNOSTICS_CELERY_LOG_SIGNALS',
+        ''
+    )
+
+    connected_names = []
+    for signal_name in re.split(r'[,\s]+', DATADOG_DIAGNOSTICS_CELERY_LOG_SIGNALS):
+        if not signal_name:  # artifacts from splitting
+            continue
+
+        signal = getattr(celery.signals, signal_name, None)
+        if not signal:
+            log.warning(f"Could not connect receiver to unknown Celery signal '{signal_name}'")
+            continue
+
+        extractor = CELERY_SIGNAL_CONFIG.get(signal_name)
+        if not extractor:
+            log.warning(f"Don't know how to extract info for Celery signal '{signal_name}'; ignoring.")
+            continue
+
+        _connect_celery_handler(signal, signal_name, extractor)
+        connected_names.append(signal_name)
+
+    log.info(f"Logging lifecycle info for these celery signals: {connected_names!r}")
+
+
 class DatadogDiagnostics(AppConfig):
     """
     Django application to log diagnostic information for Datadog.
@@ -60,6 +158,15 @@ class DatadogDiagnostics(AppConfig):
     plugin_app = {}
 
     def ready(self):
+        # .. toggle_name: DATADOG_DIAGNOSTICS_ENABLE
+        # .. toggle_implementation: DjangoSetting
+        # .. toggle_default: True
+        # .. toggle_description: Enables logging of Datadog diagnostics information.
+        # .. toggle_use_cases: circuit_breaker
+        # .. toggle_creation_date: 2024-07-11
+        # .. toggle_tickets: https://github.com/edx/edx-arch-experiments/issues/692
+        DATADOG_DIAGNOSTICS_ENABLE = getattr(settings, 'DATADOG_DIAGNOSTICS_ENABLE', True)
+
         if not DATADOG_DIAGNOSTICS_ENABLE:
             return
 
@@ -72,3 +179,12 @@ def ready(self):
                 "Unable to attach MissingSpanProcessor for Datadog diagnostics"
                 " -- ddtrace module not found."
             )
+
+        # We think that something related to Celery instrumentation is involved
+        # in causing trace concatenation in Datadog. DD Support has requested that
+        # we log lifecycle information of Celery tasks to see if all of the needed
+        # signals are being emitted for their span construction.
+        try:
+            connect_celery_logging()
+        except BaseException:
+            log.exception("Unable to subscribe to Celery task signals")