Explain confidence interval (#169)
* confidence now at the top level

* Add confidence interval note in UI and generated notebook
mccalluc authored Nov 21, 2024
1 parent 45cd514 commit d455df5
Showing 4 changed files with 25 additions and 21 deletions.
21 changes: 10 additions & 11 deletions dp_wizard/app/analysis_panel.py
@@ -6,6 +6,7 @@
from dp_wizard.app.components.inputs import log_slider
from dp_wizard.app.components.column_module import column_ui, column_server
from dp_wizard.utils.csv_helper import read_csv_ids_labels, read_csv_ids_names
from dp_wizard.utils.dp_helper import confidence
from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip
from dp_wizard.utils.code_generators import make_privacy_loss_block
from dp_wizard.app.components.column_module import col_widths
@@ -113,6 +114,14 @@ def columns_ui():
weights=weights,
is_demo=is_demo,
)
confidence_percent = f"{int(confidence * 100)}%"
note_md = f"""
This simulation assumes a normal distribution between the specified
lower and upper bounds. Your CSV has not been read except to
determine the columns.
The confidence interval is {confidence_percent}.
"""
return [
[
[
@@ -125,17 +134,7 @@
(
ui.layout_columns(
[],
[
ui.markdown(
"""
This simulation assumes a normal
distribution between the specified
lower and upper bounds. Your data
file has not been read except to
determine the columns.
"""
)
],
[ui.markdown(note_md)],
col_widths=col_widths, # type: ignore
)
if column_ids
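For reference, a minimal sketch (not part of the diff) of how the relocated note renders, using the module-level confidence of 0.95 now imported from dp_wizard.utils.dp_helper:

from dp_wizard.utils.dp_helper import confidence  # module-level constant, 0.95

confidence_percent = f"{int(confidence * 100)}%"
print(confidence_percent)  # "95%", so the UI note ends with "The confidence interval is 95%."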
4 changes: 2 additions & 2 deletions dp_wizard/app/components/column_module.py
@@ -2,7 +2,7 @@

from shiny import ui, render, module, reactive, Inputs, Outputs, Session

from dp_wizard.utils.dp_helper import make_confidence_accuracy_histogram
from dp_wizard.utils.dp_helper import make_accuracy_histogram
from dp_wizard.utils.shared import plot_histogram
from dp_wizard.utils.code_generators import make_column_config_block
from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip
@@ -155,7 +155,7 @@ def column_plot():
# This function is triggered when column is removed;
# Exit early to avoid divide-by-zero.
return None
_confidence, accuracy, histogram = make_confidence_accuracy_histogram(
accuracy, histogram = make_accuracy_histogram(
lower=lower_x,
upper=upper_x,
bin_count=bin_count,
7 changes: 6 additions & 1 deletion dp_wizard/utils/code_generators/__init__.py
@@ -4,6 +4,7 @@
import re
from dp_wizard.utils.csv_helper import name_to_identifier
from dp_wizard.utils.code_generators._template import Template
from dp_wizard.utils.dp_helper import confidence


class AnalysisPlanColumn(NamedTuple):
@@ -77,7 +78,11 @@ def _make_columns(self, columns: dict[str, AnalysisPlanColumn]):
)

def _make_queries(self, column_names: Iterable[str]):
return "confidence = 0.95\n\n" + "\n".join(
confidence_note = (
"The actual value is within the shown range "
f"with {int(confidence * 100)}% confidence."
)
return f"confidence = {confidence} # {confidence_note}\n\n" + "\n".join(
_make_query(column_name) for column_name in column_names
)

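With confidence = 0.95, the preamble that _make_queries writes into the generated notebook comes out as below (a sketch of the emitted text only; the per-column query blocks from _make_query are elided):

confidence = 0.95 # The actual value is within the shown range with 95% confidence.

# ...one query block per column follows, produced by _make_query(column_name).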
14 changes: 7 additions & 7 deletions dp_wizard/utils/dp_helper.py
@@ -9,19 +9,20 @@
dp.enable_features("contrib")


def make_confidence_accuracy_histogram(
confidence = 0.95


def make_accuracy_histogram(
lower: float,
upper: float,
bin_count: int,
contributions: int,
weighted_epsilon: float,
) -> tuple[float, float, Any]:
) -> tuple[float, Any]:
"""
Creates fake data between lower and upper, and then returns a DP histogram from it.
>>> confidence, accuracy, histogram = make_confidence_accuracy_histogram(
>>> accuracy, histogram = make_accuracy_histogram(
... lower=0, upper=10, bin_count=5, contributions=1, weighted_epsilon=1)
>>> confidence
0.95
>>> accuracy
3.37...
>>> histogram
@@ -74,9 +75,8 @@ def make_confidence_accuracy_histogram(
)
query = context.query().group_by("bin").agg(pl.len().dp.noise()) # type: ignore

confidence = 0.95
accuracy = query.summarize(alpha=1 - confidence)["accuracy"].item() # type: ignore
# The sort is alphabetical. df_to_columns needs to be used
# downstream to parse interval and sort by numeric value.
histogram = query.release().collect().sort("bin")
return (confidence, accuracy, histogram)
return (accuracy, histogram)
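A minimal usage sketch of the new signature, with values taken from the doctest above; since summarize is called with alpha = 1 - confidence, the reported accuracy is tied to the module-level 95% confidence:

from dp_wizard.utils.dp_helper import confidence, make_accuracy_histogram

accuracy, histogram = make_accuracy_histogram(
    lower=0, upper=10, bin_count=5, contributions=1, weighted_epsilon=1
)
# With confidence = 0.95, each released bin count is within +/- accuracy
# (about 3.37 for these inputs, per the doctest) of the true count
# with 95% confidence.
print(accuracy)
print(histogram)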
