Calculate rapid test rates after smoothing and averaging (#123)
roecla authored Jul 21, 2021
1 parent baba22e commit 66b73fb
Showing 5 changed files with 139 additions and 64 deletions.
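
The gist of the change: rapid test rates such as the true positive rate are no longer written out by the simulation day by day; they are now computed afterwards from the smoothed and averaged count columns. A minimal sketch of why that order of operations matters, on made-up data (the 7-day rolling mean mirrors the new task; all names and numbers are hypothetical):

import numpy as np
import pandas as pd

idx = pd.date_range("2021-03-01", periods=14)
rng = np.random.default_rng(0)
# hypothetical daily counts: true positives and all positive tests
num = pd.Series(rng.poisson(4, 14), index=idx).astype(float)
denom = num + pd.Series(rng.poisson(16, 14), index=idx).astype(float)

# smooth numerator and denominator first, divide afterwards
stable_rate = (
    num.rolling(window=7, min_periods=1).mean()
    / denom.rolling(window=7, min_periods=1).mean()
)

# divide first, smooth afterwards: days with small denominators dominate
noisy_rate = (num / denom).rolling(window=7, min_periods=1).mean()

Smoothing first suppresses the day-to-day noise before the division; dividing the raw daily counts first lets single noisy days distort the averaged rate.
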
8 changes: 7 additions & 1 deletion src/plotting/task_plot_rapid_test_statistics.py
@@ -15,9 +15,15 @@
)
from src.simulation.task_process_rapid_test_statistics import CHANNELS
from src.simulation.task_process_rapid_test_statistics import OUTCOMES
from src.simulation.task_process_rapid_test_statistics import RATES
from src.simulation.task_process_rapid_test_statistics import SHARE_TYPES

RATES = [
    "true_positive_rate",
    "false_positive_rate",
    "true_negative_rate",
    "false_negative_rate",
]


def _create_rapid_test_plot_parametrization():
    signature = "depends_on, plot_single_runs, ylabel, produces"
137 changes: 111 additions & 26 deletions src/simulation/task_process_rapid_test_statistics.py
@@ -5,7 +5,7 @@

from src.config import BLD
from src.simulation.scenario_config import (
    create_path_to_rapid_test_statistic_time_series,
    create_path_to_rapid_test_statistic_time_series as get_ts_path,
)
from src.simulation.scenario_config import create_path_to_raw_rapid_test_statistics
from src.simulation.scenario_config import get_named_scenarios
@@ -29,32 +29,18 @@
]
SHARE_TYPES = ["number", "popshare", "testshare"]

RATES = [
    "false_negative_rate",
    "false_positive_rate",
    "true_negative_rate",
    "true_positive_rate",
]

RAPID_TEST_STATISTICS = []
for out, channel, share_type in product(OUTCOMES, CHANNELS, SHARE_TYPES):
    RAPID_TEST_STATISTICS.append(f"{share_type}_{out}_by_{channel}")
for out, channel in product(RATES, CHANNELS):
    RAPID_TEST_STATISTICS.append(f"{out}_by_{channel}")


_PARAMETRIZATION = [
    (
        column,
        create_path_to_rapid_test_statistic_time_series("spring_baseline", column),
    )
    for column in RAPID_TEST_STATISTICS
_SINGLE_COL_PARAMETRIZATION = [
    (column, get_ts_path("spring_baseline", column)) for column in RAPID_TEST_STATISTICS
]


@pytask.mark.skipif(_N_SEEDS == 0, reason="spring baseline did not run.")
@pytask.mark.depends_on(_DEPENDENCIES)
@pytask.mark.parametrize("column, produces", _PARAMETRIZATION)
@pytask.mark.parametrize("column, produces", _SINGLE_COL_PARAMETRIZATION)
def task_process_rapid_test_statistics(depends_on, column, produces):
    dfs = {
        seed: pd.read_csv(path, parse_dates=["date"], index_col="date")
@@ -70,6 +56,113 @@ def task_process_rapid_test_statistics(depends_on, column, produces):
    df.to_pickle(produces)


def _get_rate_parametrization(channels):
    rate_parametrization = []
    for channel in channels:
        rate_parametrization += [
            (
                f"true_positive_rate_by_{channel}",
                {
                    "numerator": get_ts_path(
                        "spring_baseline", f"number_true_positive_by_{channel}"
                    ),
                    "denominator": get_ts_path(
                        "spring_baseline", f"number_tested_positive_by_{channel}"
                    ),
                },
                get_ts_path("spring_baseline", f"true_positive_rate_by_{channel}"),
            ),
            (
                f"false_positive_rate_by_{channel}",
                {
                    "numerator": get_ts_path(
                        "spring_baseline", f"number_false_positive_by_{channel}"
                    ),
                    "denominator": get_ts_path(
                        "spring_baseline", f"number_tested_positive_by_{channel}"
                    ),
                },
                get_ts_path("spring_baseline", f"false_positive_rate_by_{channel}"),
            ),
            (
                f"true_negative_rate_by_{channel}",
                {
                    "numerator": get_ts_path(
                        "spring_baseline", f"number_true_negative_by_{channel}"
                    ),
                    "denominator": get_ts_path(
                        "spring_baseline", f"number_tested_negative_by_{channel}"
                    ),
                },
                get_ts_path("spring_baseline", f"true_negative_rate_by_{channel}"),
            ),
            (
                f"false_negative_rate_by_{channel}",
                {
                    "numerator": get_ts_path(
                        "spring_baseline", f"number_false_negative_by_{channel}"
                    ),
                    "denominator": get_ts_path(
                        "spring_baseline", f"number_tested_negative_by_{channel}"
                    ),
                },
                get_ts_path("spring_baseline", f"false_negative_rate_by_{channel}"),
            ),
        ]
    return rate_parametrization


_RATE_PARAMETRIZATION = _get_rate_parametrization(CHANNELS)


@pytask.mark.parametrize("name, depends_on, produces", _RATE_PARAMETRIZATION)
def task_create_rapid_test_statistic_ratios(name, depends_on, produces):
numerator = pd.read_pickle(depends_on["numerator"])
denominator = pd.read_pickle(depends_on["denominator"])

seeds = list(range(_N_SEEDS))
rate_df = pd.DataFrame()
# needed for plotting single runs
for s in seeds:
smooth_num = numerator[s].rolling(window=7, min_periods=1, center=False).mean()
smooth_denom = (
denominator[s].rolling(window=7, min_periods=1, center=False).mean()
)
rate_df[s] = smooth_num / smooth_denom

# it's important to first average and smooth and **then** divide to get rid of noise
# before the division.
rate_df[name] = (
# use that the mean is created **after** the seeds have been added
numerator[numerator.columns[-1]]
/ denominator[denominator.columns[-1]]
)
rate_df.to_pickle(produces)


_ALL_RAPID_TEST_STATISTICS = [path for col, path in _SINGLE_COL_PARAMETRIZATION] + [
    spec[-1] for spec in _RATE_PARAMETRIZATION
]


@pytask.mark.depends_on(_ALL_RAPID_TEST_STATISTICS)
@pytask.mark.produces(BLD / "tables" / "rapid_test_statistics.csv")
def task_create_nice_rapid_test_statistic_table_for_lookup(produces):
    column_names = [col for col, _ in _SINGLE_COL_PARAMETRIZATION] + [
        spec[0] for spec in _RATE_PARAMETRIZATION
    ]
    assert len(set(column_names)) == len(column_names), (
        "There are duplicate names in the rapid test statistic columns. "
        "You probably forgot to specify a channel as part of the column name."
    )

    to_concat = [
        pd.read_pickle(path)[[column]] for column, path in _SINGLE_COL_PARAMETRIZATION
    ] + [pd.read_pickle(path)[[column]] for column, _, path in _RATE_PARAMETRIZATION]
    df = pd.concat(to_concat, axis=1)
    df.round(4).to_csv(produces)


@pytask.mark.depends_on(_DEPENDENCIES)
def task_check_that_a_table_was_created_for_each_rapid_test_statistic(depends_on):
    statistics_saved_by_sid = pd.read_csv(depends_on[0]).columns
@@ -78,11 +171,3 @@ def task_check_that_a_table_was_created_for_each_rapid_test_statistic(depends_on
    assert set(should_have_a_table) == set(
        RAPID_TEST_STATISTICS
    ), "Some rapid test statistic columns that should have a table do not."


@pytask.mark.depends_on([path for col, path in _PARAMETRIZATION])
@pytask.mark.produces(BLD / "tables" / "rapid_test_statistics.csv")
def task_create_nice_rapid_test_statistic_table_for_lookup(produces):
    to_concat = [pd.read_pickle(path)[[column]] for column, path in _PARAMETRIZATION]
    df = pd.concat(to_concat, axis=1)
    df.round(4).to_csv(produces)
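
For orientation: task_create_rapid_test_statistic_ratios above relies on a convention established by task_process_rapid_test_statistics, namely that each pickled time series holds one column per seed plus an across-seed mean appended as the last column, which is why numerator.columns[-1] picks out the mean. A toy sketch of that assumed layout (the column name is hypothetical):

import pandas as pd

idx = pd.date_range("2021-03-01", periods=3)
# one column per seed (seeds 0 and 1 here), made-up counts
df = pd.DataFrame({0: [2.0, 4.0, 6.0], 1: [4.0, 6.0, 8.0]}, index=idx)
# the mean column is appended last, after all seed columns
df["number_true_positive_by_overall"] = df.mean(axis=1)

assert df.columns[-1] == "number_true_positive_by_overall"
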
20 changes: 3 additions & 17 deletions src/testing/create_rapid_test_statistics.py
@@ -67,10 +67,9 @@ def _calculate_rapid_test_statistics_by_channel(
    Naming convention for the denominators:

        - testshare -> n_tests
        - popshare -> n_people
        - number -> n_people / POPULATION_GERMANY
        - rate -> n_{pos/neg}_tests
        - testshare -> n_tests
        - popshare -> n_people
        - number -> n_people / POPULATION_GERMANY

    Args:
        states (pandas.DataFrame): sid states DataFrame.
@@ -113,17 +112,4 @@
statistics[f"popshare_{name}_by_{channel_name}"] = sr.sum() / n_obs
statistics[f"testshare_{name}_by_{channel_name}"] = sr.sum() / n_tested

statistics[f"true_positive_rate_by_{channel_name}"] = (
individual_outcomes["true_positive"].sum() / n_tested_positive
)
statistics[f"true_negative_rate_by_{channel_name}"] = (
individual_outcomes["true_negative"].sum() / n_tested_negative
)
statistics[f"false_positive_rate_by_{channel_name}"] = (
individual_outcomes["false_positive"].sum() / n_tested_positive
)
statistics[f"false_negative_rate_by_{channel_name}"] = (
individual_outcomes["false_negative"].sum() / n_tested_negative
)

    return statistics
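
As a reading aid for the docstring's naming convention, a toy sketch of how the three remaining denominators relate. POPULATION_GERMANY stands in for the constant defined elsewhere in the project; the sample and the test count are made up:

import pandas as pd

POPULATION_GERMANY = 83_000_000  # assumption: stand-in for the project-wide constant

sr = pd.Series([True, False, True, True, False])  # toy outcome indicator per person
n_obs = len(sr)  # individuals in the synthetic sample
n_tested = 4  # hypothetical number of rapid tests among them

number = sr.sum() / (n_obs / POPULATION_GERMANY)  # count scaled to the full population
popshare = sr.sum() / n_obs  # share of the population
testshare = sr.sum() / n_tested  # share of administered tests
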
18 changes: 18 additions & 0 deletions src/testing/task_get_and_plot_share_of_tests_for_symptomatics.py
@@ -9,6 +9,7 @@
from src.config import PLOT_SIZE
from src.config import PLOT_START_DATE
from src.config import SRC
from src.plotting.plotting import BLUE
from src.plotting.plotting import style_plot
from src.testing.shared import convert_weekly_to_daily
from src.testing.shared import get_date_from_year_and_week
@@ -46,6 +47,11 @@
/ "data"
/ "testing"
/ "share_of_pcr_tests_going_to_symptomatics.pdf",
"used_share_pcr_going_to_symptomatic": BLD
/ "figures"
/ "data"
/ "testing"
/ "used_share_of_pcr_tests_going_to_symptomatics.pdf",
}
)
def task_prepare_characteristics_of_the_tested(depends_on, produces):
@@ -106,6 +112,18 @@ def task_prepare_characteristics_of_the_tested(depends_on, produces):
    df = df.reset_index().rename(columns={"index": "date"})
    df.to_csv(produces["data"])

    fig, ax = plt.subplots(figsize=PLOT_SIZE)
    sns.lineplot(
        x=share_of_tests_for_symptomatics_series.index,
        y=share_of_tests_for_symptomatics_series,
        color=BLUE,
        linewidth=3.0,
        alpha=0.6,
    )
    fig, ax = style_plot(fig, ax)
    fig.tight_layout()
    fig.savefig(produces["used_share_pcr_going_to_symptomatic"])


def _clean_data(df):
    share_sym_de = "Anteil keine, bzw. keine für COVID-19 bedeutsamen Symptome"
20 changes: 0 additions & 20 deletions tests/test_create_rapid_test_statistics.py
@@ -1,4 +1,3 @@
import numpy as np
import pandas as pd
from pandas.testing import assert_series_equal

@@ -61,21 +60,6 @@ def mocked_sample_test_outcome(states, receives_rapid_test, params, seed):
        {
            0: {
                "date": date,
                # overall shares
                "true_positive_rate_by_overall": 0.5,
                "true_negative_rate_by_overall": 0.5,
                "false_negative_rate_by_overall": 0.5,
                "false_positive_rate_by_overall": 0.5,
                # shares in a
                "true_positive_rate_by_a": 0.5,
                "true_negative_rate_by_a": 0.0,
                "false_negative_rate_by_a": 1.0,
                "false_positive_rate_by_a": 0.5,
                # shares in b
                "true_positive_rate_by_b": np.nan,
                "true_negative_rate_by_b": 0.5,
                "false_negative_rate_by_b": 0.5,
                "false_positive_rate_by_b": np.nan,
                # numbers
                "number_false_negative_by_a": 2 * scaling,
                "number_false_negative_by_b": 2 * scaling,
@@ -197,10 +181,6 @@ def test_calculate_rapid_test_statistics_by_channel():
"testshare_false_negative_by_channel": 2 / 5,
"testshare_true_positive_by_channel": 1 / 5,
"testshare_true_negative_by_channel": 1 / 5,
"true_positive_rate_by_channel": 1 / 2,
"true_negative_rate_by_channel": 1 / 3,
"false_positive_rate_by_channel": 1 / 2,
"false_negative_rate_by_channel": 2 / 3,
}
)
assert_series_equal(res.loc[expected.index], expected, check_names=False)
