From c07bc24c2c8e4596111ccbfe6e8917d270929963 Mon Sep 17 00:00:00 2001 From: Pea Tyczynska Date: Tue, 3 Dec 2024 11:50:12 +0000 Subject: [PATCH 1/2] Change provider balance on slow delivery every 5 minutes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of every 10 minutes - this way we react faster, and fewer text messages get delayed. This becomes more important as we now send lots of login codes for various organisations. Here are some considerations for the change: 1. if we move from 10 minutes to 5 minutes, wouldn’t we use the notifications in our decision that were used in previous weighing too? This is a valid question. This is happening already with current set up tho - as we look for last 15 minutes of sms, but we are changing weighing every 10 minutes. So this wouldn’t be a new issue. Also this can be solved to either prioritise the best possible service for our users, or to prioritise fairest possible outcome for our providers. I believe getting text messages delivered promptly is more important than if a provider briefly loses more traffic because they were slow within last 15 minutes. Also if the slow down was just a short blip, on a subsequent delivery speed check some of the notifications will be from after the blip passed, and that should mean their priority has less chance of reducing further. 2. we should learn more before we merge it On one side, fair, on the other, I kinda think it’s not a fundamental change, but rather a tuning, and we did give it some thought already, no? We identified a problem (we adjust ratios too slowly when there is a period of slow delivery for one of our providers), we got an adjustment that should improve this, and we thought of potential flip-sides above (providers ratio will be lowered more quickly when they are slow, that means we will lower their ratio one more time than we would do before after their delivery speed recovers). --- app/celery/scheduled_tasks.py | 6 +++--- tests/app/celery/test_scheduled_tasks.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/app/celery/scheduled_tasks.py b/app/celery/scheduled_tasks.py index 1c73e168c9..3edc2f2051 100644 --- a/app/celery/scheduled_tasks.py +++ b/app/celery/scheduled_tasks.py @@ -131,8 +131,8 @@ def delete_invitations(): def switch_current_sms_provider_on_slow_delivery(): """ Reduce provider's priority if at least 15% of notifications took more than 5 minutes to be delivered - in the last ten minutes. If both providers are slow, don't do anything. If we changed the providers in the - last ten minutes, then don't update them again either. + in the last 15 minutes. If both providers are slow, don't do anything. If we changed the providers in the + last 5 minutes, then don't update them again either. """ slow_delivery_notifications = is_delivery_slow_for_providers( created_within_minutes=15, @@ -146,7 +146,7 @@ def switch_current_sms_provider_on_slow_delivery(): for provider_name, is_slow in slow_delivery_notifications.items(): if is_slow: current_app.logger.warning("Slow delivery notifications detected for provider %s", provider_name) - dao_reduce_sms_provider_priority(provider_name, time_threshold=timedelta(minutes=10)) + dao_reduce_sms_provider_priority(provider_name, time_threshold=timedelta(minutes=5)) def _check_slow_text_message_delivery_reports_and_raise_error_if_needed(reports: list[SlowProviderDeliveryReport]): diff --git a/tests/app/celery/test_scheduled_tasks.py b/tests/app/celery/test_scheduled_tasks.py index 771e4b8cea..3f64a80ace 100644 --- a/tests/app/celery/test_scheduled_tasks.py +++ b/tests/app/celery/test_scheduled_tasks.py @@ -139,7 +139,7 @@ def test_switch_current_sms_provider_on_slow_delivery_switches_when_one_provider switch_current_sms_provider_on_slow_delivery() mock_is_slow.assert_called_once_with(created_within_minutes=15, delivered_within_minutes=5, threshold=0.15) - mock_reduce.assert_called_once_with("firetext", time_threshold=timedelta(minutes=10)) + mock_reduce.assert_called_once_with("firetext", time_threshold=timedelta(minutes=5)) @freeze_time("2017-05-01 14:00:00") From c3822c1e92a0cdba78c92ecf346f3a10e6943c78 Mon Sep 17 00:00:00 2001 From: Mervi Tyczynska Date: Fri, 6 Dec 2024 17:16:53 +0000 Subject: [PATCH 2/2] Move sms providers balance back to the middle more frequently We used to only do it every one hour - let's go down to every 15 minutes, to balance with shorter intervals for taking traffic away on slow delivery. --- app/dao/provider_details_dao.py | 6 +++--- tests/app/dao/test_provider_details_dao.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/app/dao/provider_details_dao.py b/app/dao/provider_details_dao.py index eaccfdac15..81a9ba0187 100644 --- a/app/dao/provider_details_dao.py +++ b/app/dao/provider_details_dao.py @@ -107,11 +107,11 @@ def dao_reduce_sms_provider_priority(identifier, *, time_threshold): @autocommit def dao_adjust_provider_priority_back_to_resting_points(): """ - Provided that neither SMS provider has been modified in the last hour, move both providers by 10 percentage points - each towards their defined resting points (set in SMS_PROVIDER_RESTING_POINTS in config.py). + Provided that neither SMS provider has been modified in the last 15 minutes, move both providers + by 10 percentage points each towards their defined resting points (set in SMS_PROVIDER_RESTING_POINTS in config.py). """ amount_to_reduce_by = 10 - time_threshold = timedelta(hours=1) + time_threshold = timedelta(minutes=15) providers = _get_sms_providers_for_update(time_threshold) diff --git a/tests/app/dao/test_provider_details_dao.py b/tests/app/dao/test_provider_details_dao.py index 3849f2d0d0..6be52aab6c 100644 --- a/tests/app/dao/test_provider_details_dao.py +++ b/tests/app/dao/test_provider_details_dao.py @@ -285,7 +285,7 @@ def test_adjust_provider_priority_back_to_resting_points_updates_all_providers( dao_adjust_provider_priority_back_to_resting_points() - mock_get_providers.assert_called_once_with(timedelta(hours=1)) + mock_get_providers.assert_called_once_with(timedelta(minutes=15)) mock_adjust.assert_any_call(mmg, new_mmg) mock_adjust.assert_any_call(firetext, new_firetext)