Skip to content

Commit

Permalink
feat(alerts): check alerts earlier (#26843)
Browse files Browse the repository at this point in the history
  • Loading branch information
anirudhpillai authored Dec 12, 2024
1 parent aad8698 commit 0c6535e
Show file tree
Hide file tree
Showing 4 changed files with 345 additions and 97 deletions.
39 changes: 2 additions & 37 deletions posthog/tasks/alerts/checks.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import time
import traceback

from datetime import datetime, timedelta, UTC
Expand All @@ -25,16 +24,15 @@
AlertState,
)
from posthog.utils import get_from_dict_or_attr
from prometheus_client import Counter, Gauge
from django.db.models import Q, F
from collections import defaultdict
from posthog.tasks.alerts.utils import (
AlertEvaluationResult,
calculation_interval_to_order,
next_check_time,
send_notifications_for_breaches,
send_notifications_for_errors,
WRAPPER_NODE_KINDS,
alert_calculation_interval_to_relativedelta,
)
from posthog.tasks.alerts.trends import check_trends_alert
from posthog.ph_client import ph_us_client
Expand All @@ -54,26 +52,6 @@ def __init__(self, err: Exception):
self.__traceback__ = err.__traceback__


HOURLY_ALERTS_BACKLOG_GAUGE = Gauge(
"hourly_alerts_backlog",
"Number of hourly alerts that are not being checked in the last hour.",
)

DAILY_ALERTS_BACKLOG_GAUGE = Gauge(
"daily_alerts_backlog",
"Number of daily alerts that are not being checked in the last 24 hours.",
)

ALERT_CHECK_ERROR_COUNTER = Counter(
"alerts_check_failures",
"Number of alert check errors that don't notify the user",
)

ALERT_COMPUTED_COUNTER = Counter(
"alerts_computed",
"Number of alerts we calculated",
)

ANIRUDH_DISTINCT_ID = "wcPbDRs08GtNzrNIXfzHvYAkwUaekW7UrAo4y3coznT"


Expand Down Expand Up @@ -102,8 +80,6 @@ def alerts_backlog_task() -> None:
)
).count()

HOURLY_ALERTS_BACKLOG_GAUGE.set(hourly_alerts_breaching_sla)

now = datetime.now(UTC)

daily_alerts_breaching_sla = AlertConfiguration.objects.filter(
Expand All @@ -114,8 +90,6 @@ def alerts_backlog_task() -> None:
)
).count()

DAILY_ALERTS_BACKLOG_GAUGE.set(daily_alerts_breaching_sla)

with ph_us_client() as capture_ph_event:
capture_ph_event(
ANIRUDH_DISTINCT_ID,
Expand All @@ -135,9 +109,6 @@ def alerts_backlog_task() -> None:
},
)

# sleeping 30s for prometheus to pick up the metrics sent during task
time.sleep(30)


@shared_task(
ignore_result=True,
Expand Down Expand Up @@ -266,7 +237,6 @@ def check_alert(alert_id: str, capture_ph_event: Callable = lambda *args, **kwar
try:
check_alert_and_notify_atomically(alert, capture_ph_event)
except Exception as err:
ALERT_CHECK_ERROR_COUNTER.inc()
user = cast(User, alert.created_by)

capture_ph_event(
Expand Down Expand Up @@ -309,9 +279,6 @@ def check_alert_and_notify_atomically(alert: AlertConfiguration, capture_ph_even
so we can retry notification without re-computing insight.
"""
set_tag("alert_config_id", alert.id)

ALERT_COMPUTED_COUNTER.inc()

user = cast(User, alert.created_by)

# Event to count alert checks
Expand Down Expand Up @@ -426,9 +393,7 @@ def add_alert_check(

# IMPORTANT: update next_check_at according to interval
# ensure we don't recheck alert until the next interval is due
alert.next_check_at = (alert.next_check_at or now) + alert_calculation_interval_to_relativedelta(
cast(AlertCalculationInterval, alert.calculation_interval)
)
alert.next_check_at = next_check_time(alert)

if notify:
alert.last_notified_at = now
Expand Down
130 changes: 111 additions & 19 deletions posthog/tasks/alerts/test/test_trends_absolute_alerts.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from typing import Optional, Any
from unittest.mock import ANY, MagicMock, patch
import dateutil

from freezegun import freeze_time

import dateutil
import pytz
import datetime

from posthog.models.alert import AlertCheck
from posthog.models.instance_setting import set_instance_setting
from posthog.tasks.alerts.checks import check_alert
Expand Down Expand Up @@ -41,7 +43,12 @@ def setUp(self) -> None:
self.dashboard_api = DashboardAPI(self.client, self.team, self.assertEqual)

def create_alert(
self, insight: dict, series_index: int, lower: Optional[int] = None, upper: Optional[int] = None
self,
insight: dict,
series_index: int,
lower: Optional[int] = None,
upper: Optional[int] = None,
calculation_interval: AlertCalculationInterval = AlertCalculationInterval.DAILY,
) -> dict:
alert = self.client.post(
f"/api/projects/{self.team.id}/alerts",
Expand All @@ -54,7 +61,7 @@ def create_alert(
"series_index": series_index,
},
"condition": {"type": "absolute_value"},
"calculation_interval": AlertCalculationInterval.DAILY,
"calculation_interval": calculation_interval,
"threshold": {"configuration": {"type": "absolute", "bounds": {"lower": lower, "upper": upper}}},
},
).json()
Expand Down Expand Up @@ -134,7 +141,11 @@ def test_alert_lower_threshold_breached(self, mock_send_breaches: MagicMock, moc
assert updated_alert.state == AlertState.FIRING
assert updated_alert.last_checked_at == FROZEN_TIME
assert updated_alert.last_notified_at == FROZEN_TIME
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 0
Expand Down Expand Up @@ -168,7 +179,11 @@ def test_trend_high_threshold_breached(self, mock_send_breaches: MagicMock, mock

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 2
Expand All @@ -181,9 +196,11 @@ def test_trend_high_threshold_breached(self, mock_send_breaches: MagicMock, mock

def test_trend_no_threshold_breached(self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock) -> None:
insight = self.create_time_series_trend_insight()
alert = self.create_alert(insight, series_index=0, lower=0, upper=2)
alert = self.create_alert(
insight, series_index=0, lower=0, upper=2, calculation_interval=AlertCalculationInterval.MONTHLY
)

with freeze_time(FROZEN_TIME - dateutil.relativedelta.relativedelta(days=1)):
with freeze_time(FROZEN_TIME):
_create_event(
team=self.team,
event="signed_up",
Expand All @@ -196,10 +213,49 @@ def test_trend_no_threshold_breached(self, mock_send_breaches: MagicMock, mock_s

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.NOT_FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = datetime.datetime(2024, 7, 1, 4, 0, tzinfo=pytz.UTC)
# first day of next month at around 4 AM
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 1
assert alert_check.calculated_value == 0
assert alert_check.state == AlertState.NOT_FIRING
assert alert_check.error is None

def test_trend_no_threshold_breached_weekly(
self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock
) -> None:
# Checks that a WEEKLY alert whose value stays inside its bounds does not fire
# and gets its next check scheduled for the expected Monday.
# NOTE(review): the two mock arguments are unused in this body; presumably they are
# injected by patch decorators on the enclosing class, which is outside this view.
insight = self.create_time_series_trend_insight()
# Absolute-value alert on series 0 with bounds lower=0 / upper=2, evaluated weekly.
alert = self.create_alert(
insight, series_index=0, lower=0, upper=2, calculation_interval=AlertCalculationInterval.WEEKLY
)

# Record a single "signed_up" event with the clock frozen at FROZEN_TIME.
with freeze_time(FROZEN_TIME):
_create_event(
team=self.team,
event="signed_up",
distinct_id="1",
properties={"$browser": "Chrome"},
)
flush_persons_and_events()

# Run the alert check that is under test.
check_alert(alert["id"])

# Reload the alert from the database to observe what the check persisted.
updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.NOT_FIRING

# Expected schedule: relativedelta applies days=1 first, then MO(1) moves forward to the
# first Monday on or after that date (no change if it is already a Monday); the hour is
# then pinned to 03:00 UTC.
next_check = (
FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1, weekday=dateutil.relativedelta.MO(1))
).replace(hour=3, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
# Only the hour and the calendar date are compared; minutes/seconds are not constrained.
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

# The most recent AlertCheck row should record a value of 0, NOT_FIRING, and no error.
alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 0
assert alert_check.state == AlertState.NOT_FIRING
assert alert_check.error is None

Expand Down Expand Up @@ -234,7 +290,11 @@ def test_trend_breakdown_high_threshold_breached(

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 2
Expand Down Expand Up @@ -276,7 +336,11 @@ def test_trend_breakdown_low_threshold_breached(

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 1
Expand Down Expand Up @@ -318,7 +382,11 @@ def test_trend_breakdown_no_threshold_breached(

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.NOT_FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value is None
Expand Down Expand Up @@ -358,7 +426,11 @@ def test_aggregate_trend_high_threshold_breached(

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 3
Expand Down Expand Up @@ -400,7 +472,11 @@ def test_aggregate_trend_with_breakdown_high_threshold_breached(

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 2
Expand Down Expand Up @@ -437,7 +513,11 @@ def test_trend_current_interval_high_threshold_breached(

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 2
Expand Down Expand Up @@ -484,7 +564,11 @@ def test_trend_current_interval_fallback_to_previous_high_threshold_breached(

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 2
Expand Down Expand Up @@ -524,7 +608,11 @@ def test_trend_current_interval_no_threshold_breached(

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.NOT_FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
assert alert_check.calculated_value == 0
Expand All @@ -551,7 +639,11 @@ def test_trend_current_interval_low_threshold_breached(

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

next_check = (FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)).replace(hour=1, tzinfo=pytz.UTC)
assert updated_alert.next_check_at is not None
assert updated_alert.next_check_at.hour == next_check.hour
assert updated_alert.next_check_at.date() == next_check.date()

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
# will be 0 even though for current day it's 1
Expand Down
Loading

0 comments on commit 0c6535e

Please sign in to comment.