From 76d9ff3a5824013ad3bf091899c9cca2ccad7a77 Mon Sep 17 00:00:00 2001
From: Gagan Trivedi
Date: Fri, 15 Dec 2023 15:07:08 +0530
Subject: [PATCH] feat(postgres/analytics): Add task to clean-up old data

---
 api/app/settings/common.py                 |  4 +
 api/app_analytics/tasks.py                 | 26 ++++++
 api/tests/unit/app_analytics/test_tasks.py | 99 ++++++++++++++++++++++
 3 files changed, 129 insertions(+)

diff --git a/api/app/settings/common.py b/api/app/settings/common.py
index 4ebfe4163bcdd..7c292e3b6541e 100644
--- a/api/app/settings/common.py
+++ b/api/app/settings/common.py
@@ -916,6 +916,10 @@
 AWS_SSE_LOGS_BUCKET_NAME = env.str("AWS_SSE_LOGS_BUCKET_NAME", None)
 SSE_INFLUXDB_BUCKET = env.str("SSE_INFLUXDB_BUCKET", None)
 
+RAW_ANALYTICS_DATA_RETENTION_DAYS = env.int("RAW_ANALYTICS_DATA_RETENTION_DAYS", 30)
+BUCKETED_ANALYTICS_DATA_RETENTION_DAYS = env.int(
+    "BUCKETED_ANALYTICS_DATA_RETENTION_DAYS", 90
+)
 
 DISABLE_INVITE_LINKS = env.bool("DISABLE_INVITE_LINKS", False)
 
diff --git a/api/app_analytics/tasks.py b/api/app_analytics/tasks.py
index 0c638f605b333..90ae17bf5bfbc 100644
--- a/api/app_analytics/tasks.py
+++ b/api/app_analytics/tasks.py
@@ -35,6 +35,32 @@ def populate_bucket(
     populate_feature_evaluation_bucket(bucket_size, run_every, source_bucket_size)
 
 
+@register_recurring_task(
+    run_every=timedelta(days=1),
+)
+def clean_up_old_analytics_data():
+    # delete raw analytics data older than `RAW_ANALYTICS_DATA_RETENTION_DAYS`
+    APIUsageRaw.objects.filter(
+        created_at__lt=timezone.now()
+        - timedelta(days=settings.RAW_ANALYTICS_DATA_RETENTION_DAYS)
+    ).delete()
+    FeatureEvaluationRaw.objects.filter(
+        created_at__lt=timezone.now()
+        - timedelta(days=settings.RAW_ANALYTICS_DATA_RETENTION_DAYS)
+    ).delete()
+
+    # delete bucketed analytics data older than `BUCKETED_ANALYTICS_DATA_RETENTION_DAYS`
+    APIUsageBucket.objects.filter(
+        created_at__lt=timezone.now()
+        - timedelta(days=settings.BUCKETED_ANALYTICS_DATA_RETENTION_DAYS)
+    ).delete()
+
+    FeatureEvaluationBucket.objects.filter(
+        created_at__lt=timezone.now()
+        - timedelta(days=settings.BUCKETED_ANALYTICS_DATA_RETENTION_DAYS)
+    ).delete()
+
+
 @register_task_handler()
 def track_feature_evaluation(environment_id, feature_evaluations):
     feature_evaluation_objects = []
diff --git a/api/tests/unit/app_analytics/test_tasks.py b/api/tests/unit/app_analytics/test_tasks.py
index 5295fe049f697..d4b60e4a67310 100644
--- a/api/tests/unit/app_analytics/test_tasks.py
+++ b/api/tests/unit/app_analytics/test_tasks.py
@@ -9,6 +9,7 @@
     Resource,
 )
 from app_analytics.tasks import (
+    clean_up_old_analytics_data,
     populate_api_usage_bucket,
     populate_feature_evaluation_bucket,
     track_feature_evaluation,
@@ -16,6 +17,7 @@
 )
 from django.conf import settings
 from django.utils import timezone
+from pytest_django.fixtures import SettingsWrapper
 
 if "analytics" not in settings.DATABASES:
     pytest.skip(
@@ -315,3 +317,100 @@ def _create_feature_evaluation_event(environment_id, feature_name, count, when):
     event.save()
 
     return event
+
+
+@pytest.mark.django_db(databases=["analytics"])
+def test_clean_up_old_analytics_data_does_nothing_if_no_data() -> None:
+    # Given
+    # When
+    clean_up_old_analytics_data()
+
+    # Then
+    # no exception was raised
+
+
+@pytest.mark.django_db(databases=["analytics"])
+def test_clean_up_old_analytics_data_removes_old_data(
+    settings: SettingsWrapper,
+) -> None:
+    # Given
+    now = timezone.now()
+    settings.RAW_ANALYTICS_DATA_RETENTION_DAYS = 2
+    settings.BUCKETED_ANALYTICS_DATA_RETENTION_DAYS = 4
+
+    environment_id = 1
+
+    # APIUsageRaw data that should not be removed
+    new_api_usage_raw_data = []
+    new_api_usage_raw_data.append(_create_api_usage_event(environment_id, now))
+    new_api_usage_raw_data.append(
+        _create_api_usage_event(environment_id, now - timezone.timedelta(days=1))
+    )
+
+    # APIUsageRaw data that should be removed
+    _create_api_usage_event(environment_id, now - timezone.timedelta(days=2))
+    _create_api_usage_event(environment_id, now - timezone.timedelta(days=3))
+
+    # APIUsageBucket data that should not be removed
+    new_api_usage_bucket = APIUsageBucket.objects.create(
+        environment_id=environment_id,
+        resource=Resource.FLAGS,
+        total_count=100,
+        created_at=now,
+        bucket_size=5,
+    )
+    # APIUsageBucket data that should be removed
+    APIUsageBucket.objects.create(
+        environment_id=environment_id,
+        resource=Resource.FLAGS,
+        total_count=100,
+        created_at=now - timezone.timedelta(days=5),
+        bucket_size=5,
+    )
+
+    # FeatureEvaluationRaw data that should not be removed
+    new_feature_evaluation_raw_data = []
+    new_feature_evaluation_raw_data.append(
+        _create_feature_evaluation_event(environment_id, "feature1", 1, now)
+    )
+    new_feature_evaluation_raw_data.append(
+        _create_feature_evaluation_event(
+            environment_id, "feature1", 1, now - timezone.timedelta(days=1)
+        )
+    )
+
+    # FeatureEvaluationRaw data that should be removed
+    _create_feature_evaluation_event(
+        environment_id, "feature1", 1, now - timezone.timedelta(days=3)
+    )
+    _create_feature_evaluation_event(
+        environment_id, "feature1", 1, now - timezone.timedelta(days=2)
+    )
+
+    # FeatureEvaluationBucket data that should not be removed
+    new_feature_evaluation_bucket = FeatureEvaluationBucket.objects.create(
+        environment_id=environment_id,
+        feature_name="feature1",
+        total_count=100,
+        created_at=now,
+        bucket_size=5,
+    )
+
+    # FeatureEvaluationBucket data that should be removed
+    FeatureEvaluationBucket.objects.create(
+        environment_id=environment_id,
+        feature_name="feature1",
+        total_count=100,
+        created_at=now - timezone.timedelta(days=5),
+        bucket_size=5,
+    )
+    # When
+    clean_up_old_analytics_data()
+
+    # Then
+    assert list(APIUsageRaw.objects.all()) == new_api_usage_raw_data
+    assert list(FeatureEvaluationRaw.objects.all()) == new_feature_evaluation_raw_data
+    assert list(FeatureEvaluationBucket.objects.all()) == [
+        new_feature_evaluation_bucket
+    ]
+    assert list(APIUsageBucket.objects.all()) == [new_api_usage_bucket]
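
Note: a minimal sketch of what the two new retention settings mean in practice,
using plain stdlib datetime and hypothetical stand-in values rather than the
Django settings object or ORM. In the application, RAW_ANALYTICS_DATA_RETENTION_DAYS
and BUCKETED_ANALYTICS_DATA_RETENTION_DAYS come from environment variables via
env.int(...), and the daily clean_up_old_analytics_data task deletes rows whose
created_at falls before the corresponding cut-off.

    from datetime import datetime, timedelta, timezone

    # Hypothetical values mirroring the defaults added in common.py (30 and 90 days);
    # in the application they are read from environment variables.
    RAW_ANALYTICS_DATA_RETENTION_DAYS = 30
    BUCKETED_ANALYTICS_DATA_RETENTION_DAYS = 90

    now = datetime.now(timezone.utc)

    # Rows with created_at strictly earlier than these cut-offs are removed by the
    # daily task: raw_cutoff applies to the *Raw tables, bucketed_cutoff to the
    # *Bucket tables.
    raw_cutoff = now - timedelta(days=RAW_ANALYTICS_DATA_RETENTION_DAYS)
    bucketed_cutoff = now - timedelta(days=BUCKETED_ANALYTICS_DATA_RETENTION_DAYS)

    print(f"raw analytics rows older than {raw_cutoff:%Y-%m-%d} are deleted")
    print(f"bucketed analytics rows older than {bucketed_cutoff:%Y-%m-%d} are deleted")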