diff --git a/dp_wizard/app/analysis_panel.py b/dp_wizard/app/analysis_panel.py
index a0833e5..b1ca2da 100644
--- a/dp_wizard/app/analysis_panel.py
+++ b/dp_wizard/app/analysis_panel.py
@@ -6,6 +6,7 @@
 from dp_wizard.app.components.inputs import log_slider
 from dp_wizard.app.components.column_module import column_ui, column_server
 from dp_wizard.utils.csv_helper import read_csv_ids_labels, read_csv_ids_names
+from dp_wizard.utils.dp_helper import confidence
 from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip
 from dp_wizard.utils.code_generators import make_privacy_loss_block
 from dp_wizard.app.components.column_module import col_widths
@@ -113,6 +114,14 @@ def columns_ui():
                 weights=weights,
                 is_demo=is_demo,
             )
+        confidence_percent = f"{int(confidence * 100)}%"
+        note_md = f"""
+            This simulation assumes a normal distribution between the specified
+            lower and upper bounds. Your CSV has not been read except to
+            determine the columns.
+
+            The confidence interval is {confidence_percent}.
+            """
         return [
             [
                 [
@@ -125,17 +134,7 @@ def columns_ui():
             (
                 ui.layout_columns(
                     [],
-                    [
-                        ui.markdown(
-                            """
-                            This simulation assumes a normal
-                            distribution between the specified
-                            lower and upper bounds. Your data
-                            file has not been read except to
-                            determine the columns.
-                            """
-                        )
-                    ],
+                    [ui.markdown(note_md)],
                     col_widths=col_widths,  # type: ignore
                 )
                 if column_ids
diff --git a/dp_wizard/app/components/column_module.py b/dp_wizard/app/components/column_module.py
index 4578887..37ece2f 100644
--- a/dp_wizard/app/components/column_module.py
+++ b/dp_wizard/app/components/column_module.py
@@ -2,7 +2,7 @@
 from shiny import ui, render, module, reactive, Inputs, Outputs, Session
 
-from dp_wizard.utils.dp_helper import make_confidence_accuracy_histogram
+from dp_wizard.utils.dp_helper import make_accuracy_histogram
 from dp_wizard.utils.shared import plot_histogram
 from dp_wizard.utils.code_generators import make_column_config_block
 from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip
 
@@ -155,7 +155,7 @@ def column_plot():
             # This function is triggered when column is removed;
             # Exit early to avoid divide-by-zero.
             return None
-        _confidence, accuracy, histogram = make_confidence_accuracy_histogram(
+        accuracy, histogram = make_accuracy_histogram(
             lower=lower_x,
             upper=upper_x,
             bin_count=bin_count,
diff --git a/dp_wizard/utils/code_generators/__init__.py b/dp_wizard/utils/code_generators/__init__.py
index a38478e..9bf991a 100644
--- a/dp_wizard/utils/code_generators/__init__.py
+++ b/dp_wizard/utils/code_generators/__init__.py
@@ -4,6 +4,7 @@
 import re
 from dp_wizard.utils.csv_helper import name_to_identifier
 from dp_wizard.utils.code_generators._template import Template
+from dp_wizard.utils.dp_helper import confidence
 
 
 class AnalysisPlanColumn(NamedTuple):
@@ -77,7 +78,11 @@ def _make_columns(self, columns: dict[str, AnalysisPlanColumn]):
         )
 
     def _make_queries(self, column_names: Iterable[str]):
-        return "confidence = 0.95\n\n" + "\n".join(
+        confidence_note = (
+            "The actual value is within the shown range "
+            f"with {int(confidence * 100)}% confidence."
+        )
+        return f"confidence = {confidence}  # {confidence_note}\n\n" + "\n".join(
             _make_query(column_name) for column_name in column_names
         )
 
diff --git a/dp_wizard/utils/dp_helper.py b/dp_wizard/utils/dp_helper.py
index 25fae1e..31ca491 100644
--- a/dp_wizard/utils/dp_helper.py
+++ b/dp_wizard/utils/dp_helper.py
@@ -9,19 +9,20 @@
 dp.enable_features("contrib")
 
 
-def make_confidence_accuracy_histogram(
+confidence = 0.95
+
+
+def make_accuracy_histogram(
     lower: float,
     upper: float,
     bin_count: int,
     contributions: int,
     weighted_epsilon: float,
-) -> tuple[float, float, Any]:
+) -> tuple[float, Any]:
     """
     Creates fake data between lower and upper, and then returns a DP histogram from it.
-    >>> confidence, accuracy, histogram = make_confidence_accuracy_histogram(
+    >>> accuracy, histogram = make_accuracy_histogram(
     ...     lower=0, upper=10, bin_count=5, contributions=1, weighted_epsilon=1)
-    >>> confidence
-    0.95
     >>> accuracy
     3.37...
     >>> histogram
@@ -74,9 +75,8 @@
     )
 
     query = context.query().group_by("bin").agg(pl.len().dp.noise())  # type: ignore
-    confidence = 0.95
     accuracy = query.summarize(alpha=1 - confidence)["accuracy"].item()  # type: ignore
     # The sort is alphabetical. df_to_columns needs to be used
     # downstream to parse interval and sort by numeric value.
     histogram = query.release().collect().sort("bin")
-    return (confidence, accuracy, histogram)
+    return (accuracy, histogram)
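For reference, a minimal sketch of how callers consume the refactored API after this patch. It is illustrative only, not code from the diff: the `print` calls are mine, and the parameter values are copied from the doctest in `dp_helper.py`.

```python
# Illustrative sketch: exercises the API as refactored by this diff.
from dp_wizard.utils.dp_helper import confidence, make_accuracy_histogram

# `confidence` is now a single module-level constant (0.95), imported by
# analysis_panel.py and code_generators/__init__.py instead of being
# hard-coded separately in each place.
print(f"{int(confidence * 100)}%")  # "95%"

# The helper no longer returns confidence, so callers unpack two values.
accuracy, histogram = make_accuracy_histogram(
    lower=0, upper=10, bin_count=5, contributions=1, weighted_epsilon=1
)
print(accuracy)   # 3.37... for these parameters, per the doctest
print(histogram)  # DP histogram; sorted alphabetically by "bin"
```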