From 66b73fba94a8b67c2daf651fb4a0fa150ae46901 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Klara=20R=C3=B6hrl?=
Date: Wed, 21 Jul 2021 09:27:19 +0200
Subject: [PATCH] Calculate rapid test rates after smoothing and averaging
 (#123)

---
 .../task_plot_rapid_test_statistics.py        |   8 +-
 .../task_process_rapid_test_statistics.py     | 137 ++++++++++++++----
 src/testing/create_rapid_test_statistics.py   |  20 +--
 ...nd_plot_share_of_tests_for_symptomatics.py |  18 +++
 tests/test_create_rapid_test_statistics.py    |  20 ---
 5 files changed, 139 insertions(+), 64 deletions(-)

diff --git a/src/plotting/task_plot_rapid_test_statistics.py b/src/plotting/task_plot_rapid_test_statistics.py
index 031093dc..14d939bb 100644
--- a/src/plotting/task_plot_rapid_test_statistics.py
+++ b/src/plotting/task_plot_rapid_test_statistics.py
@@ -15,9 +15,15 @@
 )
 from src.simulation.task_process_rapid_test_statistics import CHANNELS
 from src.simulation.task_process_rapid_test_statistics import OUTCOMES
-from src.simulation.task_process_rapid_test_statistics import RATES
 from src.simulation.task_process_rapid_test_statistics import SHARE_TYPES
 
+RATES = [
+    "true_positive_rate",
+    "false_positive_rate",
+    "true_negative_rate",
+    "false_negative_rate",
+]
+
 
 def _create_rapid_test_plot_parametrization():
     signature = "depends_on, plot_single_runs, ylabel, produces"
diff --git a/src/simulation/task_process_rapid_test_statistics.py b/src/simulation/task_process_rapid_test_statistics.py
index ba94eef9..b1d12001 100644
--- a/src/simulation/task_process_rapid_test_statistics.py
+++ b/src/simulation/task_process_rapid_test_statistics.py
@@ -5,7 +5,7 @@
 
 from src.config import BLD
 from src.simulation.scenario_config import (
-    create_path_to_rapid_test_statistic_time_series,
+    create_path_to_rapid_test_statistic_time_series as get_ts_path,
 )
 from src.simulation.scenario_config import create_path_to_raw_rapid_test_statistics
 from src.simulation.scenario_config import get_named_scenarios
@@ -29,32 +29,18 @@
 ]
 
 SHARE_TYPES = ["number", "popshare", "testshare"]
-RATES = [
-    "false_negative_rate",
-    "false_positive_rate",
-    "true_negative_rate",
-    "true_positive_rate",
-]
-
 RAPID_TEST_STATISTICS = []
 for out, channel, share_type in product(OUTCOMES, CHANNELS, SHARE_TYPES):
     RAPID_TEST_STATISTICS.append(f"{share_type}_{out}_by_{channel}")
-for out, channel in product(RATES, CHANNELS):
-    RAPID_TEST_STATISTICS.append(f"{out}_by_{channel}")
-
-_PARAMETRIZATION = [
-    (
-        column,
-        create_path_to_rapid_test_statistic_time_series("spring_baseline", column),
-    )
-    for column in RAPID_TEST_STATISTICS
+_SINGLE_COL_PARAMETRIZATION = [
+    (column, get_ts_path("spring_baseline", column)) for column in RAPID_TEST_STATISTICS
 ]
 
 
 @pytask.mark.skipif(_N_SEEDS == 0, reason="spring baseline did not run.")
 @pytask.mark.depends_on(_DEPENDENCIES)
-@pytask.mark.parametrize("column, produces", _PARAMETRIZATION)
+@pytask.mark.parametrize("column, produces", _SINGLE_COL_PARAMETRIZATION)
 def task_process_rapid_test_statistics(depends_on, column, produces):
     dfs = {
         seed: pd.read_csv(path, parse_dates=["date"], index_col="date")
@@ -70,6 +56,113 @@
     df.to_pickle(produces)
 
+
+def _get_rate_parametrization(channels):
+    rate_parametrization = []
+    for channel in channels:
+        rate_parametrization += [
+            (
+                f"true_positive_rate_by_{channel}",
+                {
+                    "numerator": get_ts_path(
+                        "spring_baseline", f"number_true_positive_by_{channel}"
+                    ),
+                    "denominator": get_ts_path(
+                        "spring_baseline", f"number_tested_positive_by_{channel}"
+                    ),
+                },
+                get_ts_path("spring_baseline", f"true_positive_rate_by_{channel}"),
+            ),
+            (
+                f"false_positive_rate_by_{channel}",
+                {
+                    "numerator": get_ts_path(
+                        "spring_baseline", f"number_false_positive_by_{channel}"
+                    ),
+                    "denominator": get_ts_path(
+                        "spring_baseline", f"number_tested_positive_by_{channel}"
+                    ),
+                },
+                get_ts_path("spring_baseline", f"false_positive_rate_by_{channel}"),
+            ),
+            (
+                f"true_negative_rate_by_{channel}",
+                {
+                    "numerator": get_ts_path(
+                        "spring_baseline", f"number_true_negative_by_{channel}"
+                    ),
+                    "denominator": get_ts_path(
+                        "spring_baseline", f"number_tested_negative_by_{channel}"
+                    ),
+                },
+                get_ts_path("spring_baseline", f"true_negative_rate_by_{channel}"),
+            ),
+            (
+                f"false_negative_rate_by_{channel}",
+                {
+                    "numerator": get_ts_path(
+                        "spring_baseline", f"number_false_negative_by_{channel}"
+                    ),
+                    "denominator": get_ts_path(
+                        "spring_baseline", f"number_tested_negative_by_{channel}"
+                    ),
+                },
+                get_ts_path("spring_baseline", f"false_negative_rate_by_{channel}"),
+            ),
+        ]
+    return rate_parametrization
+
+
+_RATE_PARAMETRIZATION = _get_rate_parametrization(CHANNELS)
+
+
+@pytask.mark.parametrize("name, depends_on, produces", _RATE_PARAMETRIZATION)
+def task_create_rapid_test_statistic_ratios(name, depends_on, produces):
+    numerator = pd.read_pickle(depends_on["numerator"])
+    denominator = pd.read_pickle(depends_on["denominator"])
+
+    seeds = list(range(_N_SEEDS))
+    rate_df = pd.DataFrame()
+    # the per-seed rates are needed for plotting single runs
+    for s in seeds:
+        smooth_num = numerator[s].rolling(window=7, min_periods=1, center=False).mean()
+        smooth_denom = (
+            denominator[s].rolling(window=7, min_periods=1, center=False).mean()
+        )
+        rate_df[s] = smooth_num / smooth_denom
+
+    # it is important to average and smooth first and divide **afterwards**;
+    # otherwise the division would amplify the noise in the daily counts.
+    rate_df[name] = (
+        # the mean column is appended last, i.e. **after** the seed columns
+        numerator[numerator.columns[-1]]
+        / denominator[denominator.columns[-1]]
+    )
+    rate_df.to_pickle(produces)
+
+
+_ALL_RAPID_TEST_STATISTICS = [path for col, path in _SINGLE_COL_PARAMETRIZATION] + [
+    spec[-1] for spec in _RATE_PARAMETRIZATION
+]
+
+
+@pytask.mark.depends_on(_ALL_RAPID_TEST_STATISTICS)
+@pytask.mark.produces(BLD / "tables" / "rapid_test_statistics.csv")
+def task_create_nice_rapid_test_statistic_table_for_lookup(produces):
+    column_names = [col for col, _ in _SINGLE_COL_PARAMETRIZATION] + [
+        spec[0] for spec in _RATE_PARAMETRIZATION
+    ]
+    assert len(set(column_names)) == len(column_names), (
+        "There are duplicate names in the rapid test statistic columns. "
+        "You probably forgot to specify a channel as part of the column name."
+    )
+
+    to_concat = [
+        pd.read_pickle(path)[[column]] for column, path in _SINGLE_COL_PARAMETRIZATION
+    ] + [pd.read_pickle(path)[[column]] for column, _, path in _RATE_PARAMETRIZATION]
+    df = pd.concat(to_concat, axis=1)
+    df.round(4).to_csv(produces)
+
+
 @pytask.mark.depends_on(_DEPENDENCIES)
 def task_check_that_a_table_was_created_for_each_rapid_test_statistic(depends_on):
     statistics_saved_by_sid = pd.read_csv(depends_on[0]).columns
@@ -78,11 +171,3 @@ def task_check_that_a_table_was_created_for_each_rapid_test_statistic(depends_on
     assert set(should_have_a_table) == set(
         RAPID_TEST_STATISTICS
     ), "Some rapid test statistic columns that should have a table do not."
-
-
-@pytask.mark.depends_on([path for col, path in _PARAMETRIZATION])
-@pytask.mark.produces(BLD / "tables" / "rapid_test_statistics.csv")
-def task_create_nice_rapid_test_statistic_table_for_lookup(produces):
-    to_concat = [pd.read_pickle(path)[[column]] for column, path in _PARAMETRIZATION]
-    df = pd.concat(to_concat, axis=1)
-    df.round(4).to_csv(produces)
diff --git a/src/testing/create_rapid_test_statistics.py b/src/testing/create_rapid_test_statistics.py
index 537ca011..433c86d0 100644
--- a/src/testing/create_rapid_test_statistics.py
+++ b/src/testing/create_rapid_test_statistics.py
@@ -67,10 +67,9 @@ def _calculate_rapid_test_statistics_by_channel(
 
     Naming convention for the denominators:
 
-        - testshare -> n_tests
-        - popshare -> n_people
-        - number -> n_people / POPULATION_GERMANY
-        - rate -> n_{pos/neg}_tests
+        - testshare -> n_tests
+        - popshare -> n_people
+        - number -> n_people / POPULATION_GERMANY
 
     Args:
         states (pandas.DataFrame): sid states DataFrame.
@@ -113,17 +112,4 @@ def _calculate_rapid_test_statistics_by_channel(
         statistics[f"popshare_{name}_by_{channel_name}"] = sr.sum() / n_obs
         statistics[f"testshare_{name}_by_{channel_name}"] = sr.sum() / n_tested
 
-    statistics[f"true_positive_rate_by_{channel_name}"] = (
-        individual_outcomes["true_positive"].sum() / n_tested_positive
-    )
-    statistics[f"true_negative_rate_by_{channel_name}"] = (
-        individual_outcomes["true_negative"].sum() / n_tested_negative
-    )
-    statistics[f"false_positive_rate_by_{channel_name}"] = (
-        individual_outcomes["false_positive"].sum() / n_tested_positive
-    )
-    statistics[f"false_negative_rate_by_{channel_name}"] = (
-        individual_outcomes["false_negative"].sum() / n_tested_negative
-    )
-
     return statistics
diff --git a/src/testing/task_get_and_plot_share_of_tests_for_symptomatics.py b/src/testing/task_get_and_plot_share_of_tests_for_symptomatics.py
index edea149e..4120724f 100644
--- a/src/testing/task_get_and_plot_share_of_tests_for_symptomatics.py
+++ b/src/testing/task_get_and_plot_share_of_tests_for_symptomatics.py
@@ -9,6 +9,7 @@
 from src.config import PLOT_SIZE
 from src.config import PLOT_START_DATE
 from src.config import SRC
+from src.plotting.plotting import BLUE
 from src.plotting.plotting import style_plot
 from src.testing.shared import convert_weekly_to_daily
 from src.testing.shared import get_date_from_year_and_week
@@ -46,6 +47,11 @@
         / "data"
         / "testing"
         / "share_of_pcr_tests_going_to_symptomatics.pdf",
+        "used_share_pcr_going_to_symptomatic": BLD
+        / "figures"
+        / "data"
+        / "testing"
+        / "used_share_of_pcr_tests_going_to_symptomatics.pdf",
     }
 )
 def task_prepare_characteristics_of_the_tested(depends_on, produces):
@@ -106,6 +112,18 @@ def task_prepare_characteristics_of_the_tested(depends_on, produces):
     df = df.reset_index().rename(columns={"index": "date"})
     df.to_csv(produces["data"])
 
+    fig, ax = plt.subplots(figsize=PLOT_SIZE)
+    sns.lineplot(
+        x=share_of_tests_for_symptomatics_series.index,
+        y=share_of_tests_for_symptomatics_series,
+        color=BLUE,
+        linewidth=3.0,
+        alpha=0.6,
+    )
+    fig, ax = style_plot(fig, ax)
+    fig.tight_layout()
+    fig.savefig(produces["used_share_pcr_going_to_symptomatic"])
+
 
 def _clean_data(df):
     share_sym_de = "Anteil keine, bzw. keine für COVID-19 bedeutsamen Symptome"
diff --git a/tests/test_create_rapid_test_statistics.py b/tests/test_create_rapid_test_statistics.py
index a38563aa..f9b721bf 100644
--- a/tests/test_create_rapid_test_statistics.py
+++ b/tests/test_create_rapid_test_statistics.py
@@ -1,4 +1,3 @@
-import numpy as np
 import pandas as pd
 from pandas.testing import assert_series_equal
 
@@ -61,21 +60,6 @@ def mocked_sample_test_outcome(states, receives_rapid_test, params, seed):
     {
         0: {
             "date": date,
-            # overall shares
-            "true_positive_rate_by_overall": 0.5,
-            "true_negative_rate_by_overall": 0.5,
-            "false_negative_rate_by_overall": 0.5,
-            "false_positive_rate_by_overall": 0.5,
-            # shares in a
-            "true_positive_rate_by_a": 0.5,
-            "true_negative_rate_by_a": 0.0,
-            "false_negative_rate_by_a": 1.0,
-            "false_positive_rate_by_a": 0.5,
-            # shares in b
-            "true_positive_rate_by_b": np.nan,
-            "true_negative_rate_by_b": 0.5,
-            "false_negative_rate_by_b": 0.5,
-            "false_positive_rate_by_b": np.nan,
             # numbers
             "number_false_negative_by_a": 2 * scaling,
             "number_false_negative_by_b": 2 * scaling,
@@ -197,10 +181,6 @@ def test_calculate_rapid_test_statistics_by_channel():
             "testshare_false_negative_by_channel": 2 / 5,
             "testshare_true_positive_by_channel": 1 / 5,
             "testshare_true_negative_by_channel": 1 / 5,
-            "true_positive_rate_by_channel": 1 / 2,
-            "true_negative_rate_by_channel": 1 / 3,
-            "false_positive_rate_by_channel": 1 / 2,
-            "false_negative_rate_by_channel": 2 / 3,
         }
     )
     assert_series_equal(res.loc[expected.index], expected, check_names=False)
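
Note on the approach, with a minimal sketch (everything below is illustrative and not part of the patch or the codebase): the commit computes each rate as a ratio of 7-day smoothed counts instead of smoothing the daily rates. Daily denominators such as number_tested_positive_by_{channel} can be tiny on some days, so the daily rates are very noisy, and a rolling mean of those rates weights noisy low-count days as heavily as informative high-count days; dividing the smoothed counts yields a count-weighted rate instead. The names (smooth, tested_positive, true_positive) and all numbers are made up:

import numpy as np
import pandas as pd


def smooth(sr):
    # the same 7-day rolling mean that task_create_rapid_test_statistic_ratios uses
    return sr.rolling(window=7, min_periods=1, center=False).mean()


rng = np.random.default_rng(0)
dates = pd.date_range("2021-03-01", periods=28)

# made-up counts: many tests on weekdays, few on weekends
lam = ([150] * 5 + [10] * 2) * 4
tested_positive = pd.Series(rng.poisson(lam), index=dates)
# a constant true-positive share of 80 percent
true_positive = pd.Series(rng.binomial(tested_positive.to_numpy(), 0.8), index=dates)

# what the patch does: smooth the counts first, divide afterwards;
# this weights each day by its number of tests
rate_smooth_first = smooth(true_positive) / smooth(tested_positive)

# the alternative (divide daily, then smooth) weights the noisy
# low-count weekend rates as heavily as the precise weekday rates
rate_divide_first = smooth(true_positive / tested_positive)

print(rate_smooth_first.std(), rate_divide_first.std())

The smooth-first series should fluctuate noticeably less around the true rate of 0.8, which is the noise reduction the commit message refers to.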