Explain confidence interval (#169)
* confidence now at the top level

* Add confidence interval note in UI and generated notebook
mccalluc authored Nov 21, 2024
1 parent 45cd514 commit d455df5
Showing 4 changed files with 25 additions and 21 deletions.
21 changes: 10 additions & 11 deletions dp_wizard/app/analysis_panel.py
@@ -6,6 +6,7 @@
from dp_wizard.app.components.inputs import log_slider
from dp_wizard.app.components.column_module import column_ui, column_server
from dp_wizard.utils.csv_helper import read_csv_ids_labels, read_csv_ids_names
from dp_wizard.utils.dp_helper import confidence
from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip
from dp_wizard.utils.code_generators import make_privacy_loss_block
from dp_wizard.app.components.column_module import col_widths
@@ -113,6 +114,14 @@ def columns_ui():
weights=weights,
is_demo=is_demo,
)
confidence_percent = f"{int(confidence * 100)}%"
note_md = f"""
This simulation assumes a normal distribution between the specified
lower and upper bounds. Your CSV has not been read except to
determine the columns.
The confidence interval is {confidence_percent}.
"""
return [
[
[
@@ -125,17 +134,7 @@
(
ui.layout_columns(
[],
[
ui.markdown(
"""
This simulation assumes a normal
distribution between the specified
lower and upper bounds. Your data
file has not been read except to
determine the columns.
"""
)
],
[ui.markdown(note_md)],
col_widths=col_widths, # type: ignore
)
if column_ids
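For reference, a minimal sketch (not part of the diff) of how the relocated note renders, using the module-level confidence of 0.95 now imported from dp_wizard.utils.dp_helper:

from dp_wizard.utils.dp_helper import confidence  # module-level constant, 0.95

confidence_percent = f"{int(confidence * 100)}%"
print(confidence_percent)  # "95%", so the UI note ends with "The confidence interval is 95%."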
4 changes: 2 additions & 2 deletions dp_wizard/app/components/column_module.py
@@ -2,7 +2,7 @@

from shiny import ui, render, module, reactive, Inputs, Outputs, Session

from dp_wizard.utils.dp_helper import make_confidence_accuracy_histogram
from dp_wizard.utils.dp_helper import make_accuracy_histogram
from dp_wizard.utils.shared import plot_histogram
from dp_wizard.utils.code_generators import make_column_config_block
from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip
@@ -155,7 +155,7 @@ def column_plot():
# This function is triggered when column is removed;
# Exit early to avoid divide-by-zero.
return None
_confidence, accuracy, histogram = make_confidence_accuracy_histogram(
accuracy, histogram = make_accuracy_histogram(
lower=lower_x,
upper=upper_x,
bin_count=bin_count,
7 changes: 6 additions & 1 deletion dp_wizard/utils/code_generators/__init__.py
@@ -4,6 +4,7 @@
import re
from dp_wizard.utils.csv_helper import name_to_identifier
from dp_wizard.utils.code_generators._template import Template
from dp_wizard.utils.dp_helper import confidence


class AnalysisPlanColumn(NamedTuple):
@@ -77,7 +78,11 @@ def _make_columns(self, columns: dict[str, AnalysisPlanColumn]):
)

def _make_queries(self, column_names: Iterable[str]):
return "confidence = 0.95\n\n" + "\n".join(
confidence_note = (
"The actual value is within the shown range "
f"with {int(confidence * 100)}% confidence."
)
return f"confidence = {confidence} # {confidence_note}\n\n" + "\n".join(
_make_query(column_name) for column_name in column_names
)

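With confidence = 0.95, the preamble that _make_queries writes into the generated notebook comes out as below (a sketch of the emitted text only; the per-column query blocks from _make_query are elided):

confidence = 0.95 # The actual value is within the shown range with 95% confidence.

# ...one query block per column follows, produced by _make_query(column_name).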
14 changes: 7 additions & 7 deletions dp_wizard/utils/dp_helper.py
@@ -9,19 +9,20 @@
dp.enable_features("contrib")


def make_confidence_accuracy_histogram(
confidence = 0.95


def make_accuracy_histogram(
lower: float,
upper: float,
bin_count: int,
contributions: int,
weighted_epsilon: float,
) -> tuple[float, float, Any]:
) -> tuple[float, Any]:
"""
Creates fake data between lower and upper, and then returns a DP histogram from it.
>>> confidence, accuracy, histogram = make_confidence_accuracy_histogram(
>>> accuracy, histogram = make_accuracy_histogram(
... lower=0, upper=10, bin_count=5, contributions=1, weighted_epsilon=1)
>>> confidence
0.95
>>> accuracy
3.37...
>>> histogram
@@ -74,9 +75,8 @@ def make_confidence_accuracy_histogram(
)
query = context.query().group_by("bin").agg(pl.len().dp.noise()) # type: ignore

confidence = 0.95
accuracy = query.summarize(alpha=1 - confidence)["accuracy"].item() # type: ignore
# The sort is alphabetical. df_to_columns needs to be used
# downstream to parse interval and sort by numeric value.
histogram = query.release().collect().sort("bin")
return (confidence, accuracy, histogram)
return (accuracy, histogram)
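A minimal usage sketch of the new signature, with values taken from the doctest above; since summarize is called with alpha = 1 - confidence, the reported accuracy is tied to the module-level 95% confidence:

from dp_wizard.utils.dp_helper import confidence, make_accuracy_histogram

accuracy, histogram = make_accuracy_histogram(
    lower=0, upper=10, bin_count=5, contributions=1, weighted_epsilon=1
)
# With confidence = 0.95, each released bin count is within +/- accuracy
# (about 3.37 for these inputs, per the doctest) of the true count
# with 95% confidence.
print(accuracy)
print(histogram)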
