From f4a3413bd48b0eebc181d2a43b3ab3c7c6a0f7eb Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 4 Nov 2024 16:44:38 -0500 Subject: [PATCH 01/29] lower and upper more consistently --- dp_creator_ii/app/components/column_module.py | 22 +++++++++---------- dp_creator_ii/utils/dp_helper.py | 2 +- dp_creator_ii/utils/mock_data.py | 14 ++++++------ dp_creator_ii/utils/templates/__init__.py | 10 ++++----- .../templates/no-tests/_column_config.py | 2 +- tests/test_app.py | 4 ++-- 6 files changed, 27 insertions(+), 27 deletions(-) diff --git a/dp_creator_ii/app/components/column_module.py b/dp_creator_ii/app/components/column_module.py index f3f41b8..ebafdf2 100644 --- a/dp_creator_ii/app/components/column_module.py +++ b/dp_creator_ii/app/components/column_module.py @@ -11,8 +11,8 @@ @module.ui def column_ui(): # pragma: no cover return [ - ui.input_numeric("min", "Min", 0), - ui.input_numeric("max", "Max", 10), + ui.input_numeric("lower", "Lower", 0), + ui.input_numeric("upper", "Upper", 10), ui.input_numeric("bins", "Bins", 10), ui.input_select( "weight", @@ -27,7 +27,7 @@ def column_ui(): # pragma: no cover output_code_sample("Column Definition", "column_code"), ui.markdown( "This simulation assumes a normal distribution " - "between the specified min and max. " + "between the specified lower and upper bounds. " "Your data file has not been read except to determine the columns." 
), ui.output_plot("column_plot"), @@ -53,8 +53,8 @@ def _(): @reactive.calc def column_config(): return { - "min": input.min(), - "max": input.max(), + "lower": input.lower(), + "upper": input.upper(), "bins": input.bins(), "weight": float(input.weight()), } @@ -64,16 +64,16 @@ def column_code(): config = column_config() return make_column_config_block( name=name, - min_value=config["min"], - max_value=config["max"], + lower_bound=config["lower"], + upper_bound=config["upper"], bin_count=config["bins"], ) @render.plot() def column_plot(): config = column_config() - min_x = config["min"] - max_x = config["max"] + lower_x = config["lower"] + upper_x = config["upper"] bin_count = config["bins"] weight = config["weight"] weights_sum = get_weights_sum() @@ -83,8 +83,8 @@ def column_plot(): # Exit early to avoid divide-by-zero. return None _confidence, accuracy, histogram = make_confidence_accuracy_histogram( - lower=min_x, - upper=max_x, + lower=lower_x, + upper=upper_x, bin_count=bin_count, contributions=contributions, weighted_epsilon=epsilon * weight / weights_sum, diff --git a/dp_creator_ii/utils/dp_helper.py b/dp_creator_ii/utils/dp_helper.py index e05088a..c32a3dd 100644 --- a/dp_creator_ii/utils/dp_helper.py +++ b/dp_creator_ii/utils/dp_helper.py @@ -49,7 +49,7 @@ def make_confidence_accuracy_histogram( │ (8, 10] ┆ ... │ └─────────┴─────┘ """ - # Mock data only depends on min and max, so it could be cached, + # Mock data only depends on lower and upper bounds, so it could be cached, # but I'd guess this is dominated by the DP operations, # so not worth optimizing. 
row_count = 100 diff --git a/dp_creator_ii/utils/mock_data.py b/dp_creator_ii/utils/mock_data.py index 1cde06d..bfdd52c 100644 --- a/dp_creator_ii/utils/mock_data.py +++ b/dp_creator_ii/utils/mock_data.py @@ -2,7 +2,7 @@ import polars as pl from scipy.stats import norm # type: ignore -ColumnDef = namedtuple("ColumnDef", ["min", "max"]) +ColumnDef = namedtuple("ColumnDef", ["lower", "upper"]) def mock_data(column_defs, row_count=1000): @@ -34,12 +34,12 @@ def mock_data(column_defs, row_count=1000): quantile_width = 95 / 100 for column_name, column_def in column_defs.items(): - min_ppf = norm.ppf((1 - quantile_width) / 2) - max_ppf = norm.ppf(1 - (1 - quantile_width) / 2) - min_value = column_def.min - max_value = column_def.max - slope = (max_value - min_value) / (max_ppf - min_ppf) - intercept = min_value - slope * min_ppf + lower_ppf = norm.ppf((1 - quantile_width) / 2) + upper_ppf = norm.ppf(1 - (1 - quantile_width) / 2) + lower_bound = column_def.lower + upper_bound = column_def.upper + slope = (upper_bound - lower_bound) / (upper_ppf - lower_ppf) + intercept = lower_bound - slope * lower_ppf # Start from 1 instead of 0: # The polars bin intervals are closed at the top, # so if we include the zero, there is one value in the diff --git a/dp_creator_ii/utils/templates/__init__.py b/dp_creator_ii/utils/templates/__init__.py index 838979b..8251890 100644 --- a/dp_creator_ii/utils/templates/__init__.py +++ b/dp_creator_ii/utils/templates/__init__.py @@ -143,12 +143,12 @@ def make_privacy_loss_block(epsilon): return str(_Template("privacy_loss").fill_values(EPSILON=epsilon)) -def make_column_config_block(name, min_value, max_value, bin_count): +def make_column_config_block(name, lower_bound, upper_bound, bin_count): """ >>> print(make_column_config_block( ... name="HW GRADE", - ... min_value=0, - ... max_value=100, + ... lower_bound=0, + ... upper_bound=100, ... bin_count=10 ... 
)) # From the public information, determine the bins: @@ -171,8 +171,8 @@ def make_column_config_block(name, min_value, max_value, bin_count): POLARS_CONFIG_NAME=f"{snake_name}_config", ) .fill_values( - MIN=min_value, - MAX=max_value, + LOWER_BOUND=lower_bound, + UPPER_BOUND=upper_bound, BINS=bin_count, COLUMN_NAME=name, BIN_COLUMN_NAME=f"{snake_name}_bin", diff --git a/dp_creator_ii/utils/templates/no-tests/_column_config.py b/dp_creator_ii/utils/templates/no-tests/_column_config.py index 88b3132..ddb44bd 100644 --- a/dp_creator_ii/utils/templates/no-tests/_column_config.py +++ b/dp_creator_ii/utils/templates/no-tests/_column_config.py @@ -1,5 +1,5 @@ # From the public information, determine the bins: -CUT_LIST_NAME = make_cut_points(MIN, MAX, BINS) +CUT_LIST_NAME = make_cut_points(LOWER_BOUND, UPPER_BOUND, BINS) # Use these bins to define a Polars column: POLARS_CONFIG_NAME = ( diff --git a/tests/test_app.py b/tests/test_app.py index 9a75dd1..8f70985 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -74,8 +74,8 @@ def expect_no_error(): expect_visible("Epsilon: 0.158") # Set column details: page.get_by_label("grade").check() - page.get_by_label("Min").click() - page.get_by_label("Min").fill("0") + page.get_by_label("Lower").click() + page.get_by_label("Lower").fill("0") # TODO: All these recalculations cause timeouts: # It is still rerendering the graph after hitting "Download results". 
# page.get_by_label("Max").click() From 12c140274264e9a315f5834d545d0c9394f2c9db Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 4 Nov 2024 16:45:19 -0500 Subject: [PATCH 02/29] one more --- tests/test_app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_app.py b/tests/test_app.py index 8f70985..8fe04c7 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -78,8 +78,8 @@ def expect_no_error(): page.get_by_label("Lower").fill("0") # TODO: All these recalculations cause timeouts: # It is still rerendering the graph after hitting "Download results". - # page.get_by_label("Max").click() - # page.get_by_label("Max").fill("100") + # page.get_by_label("Upper").click() + # page.get_by_label("Upper").fill("100") # page.get_by_label("Bins").click() # page.get_by_label("Bins").fill("20") page.get_by_label("Weight").select_option("1") From 1274e75d0a45b5b85b4e394b21142cf4f80b3116 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Tue, 5 Nov 2024 12:54:34 -0500 Subject: [PATCH 03/29] handle bounds/bins/counts the same way --- dp_creator_ii/app/analysis_panel.py | 15 ++++++++++++++ dp_creator_ii/app/components/column_module.py | 20 ++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/dp_creator_ii/app/analysis_panel.py b/dp_creator_ii/app/analysis_panel.py index ab17b60..87888cd 100644 --- a/dp_creator_ii/app/analysis_panel.py +++ b/dp_creator_ii/app/analysis_panel.py @@ -46,8 +46,20 @@ def button_enabled(): column_ids_selected = input.columns_checkbox_group() return len(column_ids_selected) > 0 + lower_bounds = reactive.value({}) + upper_bounds = reactive.value({}) + bin_counts = reactive.value({}) weights = reactive.value({}) + def set_column_lower(column_id, lower): + lower_bounds.set({**lower_bounds(), column_id: lower}) + + def set_column_upper(column_id, upper): + upper_bounds.set({**upper_bounds(), column_id: upper}) + + def set_column_bins(column_id, bins): + bin_counts.set({**bin_counts(), 
column_id: bins}) + def set_column_weight(column_id, weight): weights.set({**weights(), column_id: weight}) @@ -84,6 +96,9 @@ def columns_ui(): name=column_id, contributions=contributions(), epsilon=epsilon_calc(), + set_column_lower=set_column_lower, + set_column_upper=set_column_upper, + set_column_bins=set_column_bins, set_column_weight=set_column_weight, get_weights_sum=get_weights_sum, ) diff --git a/dp_creator_ii/app/components/column_module.py b/dp_creator_ii/app/components/column_module.py index f3f41b8..da70610 100644 --- a/dp_creator_ii/app/components/column_module.py +++ b/dp_creator_ii/app/components/column_module.py @@ -42,12 +42,30 @@ def column_server( name, contributions, epsilon, + set_column_lower, + set_column_upper, + set_column_bins, set_column_weight, get_weights_sum, ): # pragma: no cover + @reactive.effect + @reactive.event(input.min) + def _set_lower(): + set_column_lower(name, float(input.min())) + + @reactive.effect + @reactive.event(input.max) + def _set_upper(): + set_column_upper(name, float(input.max())) + + @reactive.effect + @reactive.event(input.bins) + def _set_bins(): + set_column_bins(name, float(input.bins())) + @reactive.effect @reactive.event(input.weight) - def _(): + def _set_weight(): set_column_weight(name, float(input.weight())) @reactive.calc From 19c306524f1bf1bdd90595aa9f75406b83780779 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Tue, 5 Nov 2024 14:19:06 -0500 Subject: [PATCH 04/29] lots of reactive dicts, but the UI has not changed --- WHAT-WE-LEARNED.md | 14 +++- dp_creator_ii/app/__init__.py | 28 ++++++- dp_creator_ii/app/analysis_panel.py | 77 ++++++++----------- dp_creator_ii/app/components/column_module.py | 44 ++++------- dp_creator_ii/app/results_panel.py | 13 +++- 5 files changed, 98 insertions(+), 78 deletions(-) diff --git a/WHAT-WE-LEARNED.md b/WHAT-WE-LEARNED.md index 9045739..fbc1a82 100644 --- a/WHAT-WE-LEARNED.md +++ b/WHAT-WE-LEARNED.md @@ -2,15 +2,21 @@ Even if it seems obvious in retrospect, 
what have we learned about Python Shiny in this project? -## No warning if ID mismatch +## No warning if ID mismatch / type mismatch Unless I'm missing something, there doesn't seem to be any warning when there isn't a matching function name in the server for an ID in the UI. Either from typos, or fumbling some more complicated display logic, there have been times where this could have been helpful. +Related: I had +``` +ui.output_text("epsilon") +``` +but then changed `epsilon` from `render.text` to `reactive.value` and forgot to update the UI. No warning in the logs: Spinner in the browser window. + ## UI and Server functions don't really separate concerns My first impression was that the UI function would be something like a "view" and the server would be a "controller", but for any kind of conditional display I need a `render.ui`, so that distinction breaks down quickly. -## Refactoring: values vs. reactive values +## Values vs. reactive values A couple times I've started with something as a plain value, and then realized I needed a reactive value. This gets confusing if there are merge conflicts, or if some variables are reactive, and some aren't. @@ -62,3 +68,7 @@ I've had to tweak the CSS a few times: The different flavors of "Shiny" are a bit of nuissance when trying to find examples. The maturity of Shiny for R means that the vast majority of the examples are for R, even with Python in the search. It would be nice if the docs site remembered that I only want to look at docs for Core. + +## More validation / type casting on inputs + +If we imagine we have a field that is a required positive integer, it would be nice to be able to specify that in the input itself, with a default error message handled by the UI, instead of needing to set up a calc on our side. 
diff --git a/dp_creator_ii/app/__init__.py b/dp_creator_ii/app/__init__.py index b91456d..8031cf4 100644 --- a/dp_creator_ii/app/__init__.py +++ b/dp_creator_ii/app/__init__.py @@ -30,23 +30,45 @@ def server(input, output, session): # pragma: no cover csv_path = reactive.value(cli_info.csv_path) contributions = reactive.value(cli_info.contributions) + lower_bounds = reactive.value({}) + upper_bounds = reactive.value({}) + bin_counts = reactive.value({}) + weights = reactive.value({}) + epsilon = reactive.value(1) + dataset_panel.dataset_server( input, output, session, + is_demo=cli_info.is_demo, csv_path=csv_path, contributions=contributions, - is_demo=cli_info.is_demo, ) analysis_panel.analysis_server( input, output, session, + is_demo=cli_info.is_demo, csv_path=csv_path, contributions=contributions, - is_demo=cli_info.is_demo, + lower_bounds=lower_bounds, + upper_bounds=upper_bounds, + bin_counts=bin_counts, + weights=weights, + epsilon=epsilon, + ) + results_panel.results_server( + input, + output, + session, + csv_path=csv_path, + contributions=contributions, + lower_bounds=lower_bounds, + upper_bounds=upper_bounds, + bin_counts=bin_counts, + weights=weights, + epsilon=epsilon, ) - results_panel.results_server(input, output, session) session.on_ended(ctrl_c_reminder) return server diff --git a/dp_creator_ii/app/analysis_panel.py b/dp_creator_ii/app/analysis_panel.py index 87888cd..8809a88 100644 --- a/dp_creator_ii/app/analysis_panel.py +++ b/dp_creator_ii/app/analysis_panel.py @@ -26,53 +26,39 @@ def analysis_ui(): "but have a greater risk of revealing individual data." 
), log_slider("log_epsilon_slider", 0.1, 10.0), - ui.output_text("epsilon"), + ui.output_text("epsilon_text"), output_code_sample("Privacy Loss", "privacy_loss_python"), ui.output_ui("download_results_button_ui"), value="analysis_panel", ) +def _cleanup_reactive_dict(reactive_dict, keys_to_keep): + reactive_dict_copy = {**reactive_dict()} + keys_to_del = set(reactive_dict_copy.keys()) - set(keys_to_keep) + for key in keys_to_del: + del reactive_dict_copy[key] + reactive_dict.set(reactive_dict_copy) + + def analysis_server( input, output, session, - csv_path=None, - contributions=None, - is_demo=None, + csv_path, + contributions, + is_demo, + lower_bounds, + upper_bounds, + bin_counts, + weights, + epsilon, ): # pragma: no cover @reactive.calc def button_enabled(): column_ids_selected = input.columns_checkbox_group() return len(column_ids_selected) > 0 - lower_bounds = reactive.value({}) - upper_bounds = reactive.value({}) - bin_counts = reactive.value({}) - weights = reactive.value({}) - - def set_column_lower(column_id, lower): - lower_bounds.set({**lower_bounds(), column_id: lower}) - - def set_column_upper(column_id, upper): - upper_bounds.set({**upper_bounds(), column_id: upper}) - - def set_column_bins(column_id, bins): - bin_counts.set({**bin_counts(), column_id: bins}) - - def set_column_weight(column_id, weight): - weights.set({**weights(), column_id: weight}) - - def clear_column_weights(columns_ids_to_keep): - weights_copy = {**weights()} - column_ids_to_del = set(weights_copy.keys()) - set(columns_ids_to_keep) - for column_id in column_ids_to_del: - del weights_copy[column_id] - weights.set(weights_copy) - - def get_weights_sum(): - return sum(weights().values()) - @reactive.effect def _update_checkbox_group(): ui.update_checkbox_group( @@ -85,7 +71,10 @@ def _update_checkbox_group(): @reactive.event(input.columns_checkbox_group) def _on_column_set_change(): column_ids_selected = input.columns_checkbox_group() - clear_column_weights(column_ids_selected) 
+ _cleanup_reactive_dict(lower_bounds, column_ids_selected) + _cleanup_reactive_dict(upper_bounds, column_ids_selected) + _cleanup_reactive_dict(bin_counts, column_ids_selected) + _cleanup_reactive_dict(weights, column_ids_selected) @render.ui def columns_ui(): @@ -95,12 +84,11 @@ def columns_ui(): column_id, name=column_id, contributions=contributions(), - epsilon=epsilon_calc(), - set_column_lower=set_column_lower, - set_column_upper=set_column_upper, - set_column_bins=set_column_bins, - set_column_weight=set_column_weight, - get_weights_sum=get_weights_sum, + epsilon=epsilon(), + lower_bounds=lower_bounds, + upper_bounds=upper_bounds, + bin_counts=bin_counts, + weights=weights, ) return [ [ @@ -118,17 +106,18 @@ def csv_fields_calc(): def csv_fields(): return csv_fields_calc() - @reactive.calc - def epsilon_calc(): - return pow(10, input.log_epsilon_slider()) + @reactive.effect + @reactive.event(input.log_epsilon_slider) + def _set_epsilon(): + epsilon.set(pow(10, input.log_epsilon_slider())) @render.text - def epsilon(): - return f"Epsilon: {epsilon_calc():0.3}" + def epsilon_text(): + return f"Epsilon: {epsilon():0.3}" @render.code def privacy_loss_python(): - return make_privacy_loss_block(epsilon_calc()) + return make_privacy_loss_block(epsilon()) @reactive.effect @reactive.event(input.go_to_results) diff --git a/dp_creator_ii/app/components/column_module.py b/dp_creator_ii/app/components/column_module.py index da70610..f9ae1a4 100644 --- a/dp_creator_ii/app/components/column_module.py +++ b/dp_creator_ii/app/components/column_module.py @@ -42,59 +42,47 @@ def column_server( name, contributions, epsilon, - set_column_lower, - set_column_upper, - set_column_bins, - set_column_weight, - get_weights_sum, + lower_bounds, + upper_bounds, + bin_counts, + weights, ): # pragma: no cover @reactive.effect @reactive.event(input.min) def _set_lower(): - set_column_lower(name, float(input.min())) + lower_bounds.set({**lower_bounds(), name: float(input.min())}) 
@reactive.effect @reactive.event(input.max) def _set_upper(): - set_column_upper(name, float(input.max())) + upper_bounds.set({**upper_bounds(), name: float(input.max())}) @reactive.effect @reactive.event(input.bins) def _set_bins(): - set_column_bins(name, float(input.bins())) + bin_counts.set({**bin_counts(), name: float(input.bins())}) @reactive.effect @reactive.event(input.weight) def _set_weight(): - set_column_weight(name, float(input.weight())) - - @reactive.calc - def column_config(): - return { - "min": input.min(), - "max": input.max(), - "bins": input.bins(), - "weight": float(input.weight()), - } + weights.set({**weights(), name: float(input.weight())}) @render.code def column_code(): - config = column_config() return make_column_config_block( name=name, - min_value=config["min"], - max_value=config["max"], - bin_count=config["bins"], + min_value=float(input.min()), + max_value=float(input.max()), + bin_count=int(input.bins()), ) @render.plot() def column_plot(): - config = column_config() - min_x = config["min"] - max_x = config["max"] - bin_count = config["bins"] - weight = config["weight"] - weights_sum = get_weights_sum() + min_x = float(input.min()) + max_x = float(input.max()) + bin_count = int(input.bins()) + weight = float(input.weight()) + weights_sum = sum(weights().values()) info(f"Weight ratio for {name}: {weight}/{weights_sum}") if weights_sum == 0: # This function is triggered when column is removed; diff --git a/dp_creator_ii/app/results_panel.py b/dp_creator_ii/app/results_panel.py index 64d6621..c1a2198 100644 --- a/dp_creator_ii/app/results_panel.py +++ b/dp_creator_ii/app/results_panel.py @@ -29,7 +29,18 @@ def results_ui(): ) -def results_server(input, output, session): # pragma: no cover +def results_server( + input, + output, + session, + csv_path, + contributions, + lower_bounds, + upper_bounds, + bin_counts, + weights, + epsilon, +): # pragma: no cover @render.download( filename="dp-creator-script.py", media_type="text/x-python", 
From 603d0bcb9928967cb27bf647cff71c7c640f50db Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Tue, 5 Nov 2024 14:30:22 -0500 Subject: [PATCH 05/29] data dump on the results page --- dp_creator_ii/app/results_panel.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/dp_creator_ii/app/results_panel.py b/dp_creator_ii/app/results_panel.py index c1a2198..c760717 100644 --- a/dp_creator_ii/app/results_panel.py +++ b/dp_creator_ii/app/results_panel.py @@ -1,3 +1,5 @@ +from json import dumps + from shiny import ui, render from dp_creator_ii.utils.templates import make_notebook_py, make_script_py @@ -7,6 +9,8 @@ def results_ui(): return ui.nav_panel( "Download results", + ui.p("TODO: Use this information to fill in a template!"), + ui.output_code("data_dump"), ui.markdown( "You can now make a differentially private release of your data. " "This will lock the configuration you’ve provided on the previous pages." @@ -41,6 +45,22 @@ def results_server( weights, epsilon, ): # pragma: no cover + @render.code + def data_dump(): + # TODO: Use this information in a template! 
+ return dumps( + { + "csv_path": csv_path(), + "contributions": contributions(), + "lower_bounds": lower_bounds(), + "upper_bounds": upper_bounds(), + "bin_counts": bin_counts(), + "weights": weights(), + "epsilon": epsilon(), + }, + indent=2, + ) + @render.download( filename="dp-creator-script.py", media_type="text/x-python", From ad322178956e810cea1176bb0e2c97fec3445389 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Tue, 5 Nov 2024 14:43:11 -0500 Subject: [PATCH 06/29] add a pragma: no cover --- WHAT-WE-LEARNED.md | 2 ++ dp_creator_ii/app/analysis_panel.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/WHAT-WE-LEARNED.md b/WHAT-WE-LEARNED.md index fbc1a82..6d2aff5 100644 --- a/WHAT-WE-LEARNED.md +++ b/WHAT-WE-LEARNED.md @@ -50,6 +50,8 @@ Renderer.__call__() missing 1 required positional argument: '_fn' It feels like a gap in the library that there is no component testing. The only advice is to pull out testable logic from the server functions, and for the rest, use end-to-end tests: There's not a recommended way to test the ui+server interaction for just one component. +Short of full component testing, being able to write unit tests around reactive values would be nice. + ## Normal tooling doesn't work inside of app? There are several bits of tooling that don't seem to work inside end-to-end app tests. My guess is that this isn't related to Shiny per se, but rather the ASGI framework: It's not running in the same process as pytest, so it's not surprising that the pytest process can't instrument this. 
diff --git a/dp_creator_ii/app/analysis_panel.py b/dp_creator_ii/app/analysis_panel.py index 8809a88..7d6fdad 100644 --- a/dp_creator_ii/app/analysis_panel.py +++ b/dp_creator_ii/app/analysis_panel.py @@ -33,7 +33,7 @@ def analysis_ui(): ) -def _cleanup_reactive_dict(reactive_dict, keys_to_keep): +def _cleanup_reactive_dict(reactive_dict, keys_to_keep): # pragma: no cover reactive_dict_copy = {**reactive_dict()} keys_to_del = set(reactive_dict_copy.keys()) - set(keys_to_keep) for key in keys_to_del: From c8b4ddc281b1292249c8d64d993407436cd97119 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Thu, 7 Nov 2024 15:51:10 -0500 Subject: [PATCH 07/29] reset widget values after checkbox change --- dp_creator_ii/app/components/column_module.py | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/dp_creator_ii/app/components/column_module.py b/dp_creator_ii/app/components/column_module.py index f9ae1a4..72e4356 100644 --- a/dp_creator_ii/app/components/column_module.py +++ b/dp_creator_ii/app/components/column_module.py @@ -8,21 +8,26 @@ from dp_creator_ii.app.components.outputs import output_code_sample +default_weight = 2 + + @module.ui def column_ui(): # pragma: no cover return [ + # The default values on these inputs + # should be overridden by the reactive.effect. ui.input_numeric("min", "Min", 0), - ui.input_numeric("max", "Max", 10), - ui.input_numeric("bins", "Bins", 10), + ui.input_numeric("max", "Max", 0), + ui.input_numeric("bins", "Bins", 0), ui.input_select( "weight", "Weight", choices={ 1: "Less accurate", - 2: "Default", + default_weight: "Default", 4: "More accurate", }, - selected=2, + selected=1, ), output_code_sample("Column Definition", "column_code"), ui.markdown( @@ -47,6 +52,14 @@ def column_server( bin_counts, weights, ): # pragma: no cover + @reactive.effect + def _set_all_inputs(): + with reactive.isolate(): # Without isolate, there is an infinite loop. 
+ ui.update_numeric("min", value=lower_bounds().get(name, 0)) + ui.update_numeric("max", value=upper_bounds().get(name, 10)) + ui.update_numeric("bins", value=bin_counts().get(name, 10)) + ui.update_numeric("weight", value=weights().get(name, default_weight)) + @reactive.effect @reactive.event(input.min) def _set_lower(): From 1247392a3dfb9c87bdd6825af000f81cdd651792 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Thu, 7 Nov 2024 16:31:05 -0500 Subject: [PATCH 08/29] do not clean up values --- dp_creator_ii/app/analysis_panel.py | 6 +++--- tests/test_app.py | 24 ++++++++++++++++-------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/dp_creator_ii/app/analysis_panel.py b/dp_creator_ii/app/analysis_panel.py index 7d6fdad..b4f241d 100644 --- a/dp_creator_ii/app/analysis_panel.py +++ b/dp_creator_ii/app/analysis_panel.py @@ -71,9 +71,9 @@ def _update_checkbox_group(): @reactive.event(input.columns_checkbox_group) def _on_column_set_change(): column_ids_selected = input.columns_checkbox_group() - _cleanup_reactive_dict(lower_bounds, column_ids_selected) - _cleanup_reactive_dict(upper_bounds, column_ids_selected) - _cleanup_reactive_dict(bin_counts, column_ids_selected) + # We only clean up the weights, and everything else is left in place, + # so if you restore a column, you see the original values. + # (Except for weight, which goes back to the default.) _cleanup_reactive_dict(weights, column_ids_selected) @render.ui diff --git a/tests/test_app.py b/tests/test_app.py index 624b276..379941b 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -9,6 +9,7 @@ default_app = create_app_fixture(Path(__file__).parent / "fixtures/default_app.py") tooltip = "#choose_csv_demo_tooltip_ui svg" for_the_demo = "For the demo, we'll imagine" +simulation = "This simulation assumes a normal distribution" # TODO: Why is incomplete coverage reported here? 
@@ -84,16 +85,23 @@ def expect_no_error(): # Set column details: page.get_by_label("grade").check() - page.get_by_label("Min").click() - page.get_by_label("Min").fill("0") - # TODO: All these recalculations cause timeouts: + expect_visible(simulation) + # Check that default is set correctly: + assert page.get_by_label("Max").input_value() == "10" + # Reset, and confirm: + new_value = "20" + page.get_by_label("Max").fill(new_value) + # Uncheck the column: + page.get_by_label("grade").uncheck() + expect_not_visible(simulation) + # Recheck the column: + page.get_by_label("grade").check() + expect_visible(simulation) + assert page.get_by_label("Max").input_value() == new_value + # TODO: Setting more inputs without checking for updates + # causes recalculations to pile up, and these cause timeouts on CI: # It is still rerendering the graph after hitting "Download results". # https://github.com/opendp/dp-creator-ii/issues/116 - # page.get_by_label("Max").click() - # page.get_by_label("Max").fill("100") - # page.get_by_label("Bins").click() - # page.get_by_label("Bins").fill("20") - page.get_by_label("Weight").select_option("1") expect_no_error() # -- Download results -- From 16463b46456d174347016901c5ac47f8252c029d Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 8 Nov 2024 11:00:45 -0500 Subject: [PATCH 09/29] put tooltips in labels --- dp_creator_ii/app/analysis_panel.py | 7 +++++-- dp_creator_ii/app/components/column_module.py | 15 ++++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/dp_creator_ii/app/analysis_panel.py b/dp_creator_ii/app/analysis_panel.py index 48ea19c..b172e96 100644 --- a/dp_creator_ii/app/analysis_panel.py +++ b/dp_creator_ii/app/analysis_panel.py @@ -18,8 +18,11 @@ def analysis_ui(): "the number of bins for the histogram, " "and its relative share of the privacy budget." 
), - ui.output_ui("columns_checkbox_group_tooltip_ui"), - ui.input_checkbox_group("columns_checkbox_group", None, []), + ui.input_checkbox_group( + "columns_checkbox_group", + ["Columns", ui.output_ui("columns_checkbox_group_tooltip_ui")], + [], + ), ui.output_ui("columns_ui"), ui.markdown( "What is your privacy budget for this release? " diff --git a/dp_creator_ii/app/components/column_module.py b/dp_creator_ii/app/components/column_module.py index 719a1ff..d5e2499 100644 --- a/dp_creator_ii/app/components/column_module.py +++ b/dp_creator_ii/app/components/column_module.py @@ -16,17 +16,18 @@ def column_ui(): # pragma: no cover width = "10em" # Just wide enough so the text isn't trucated. return ui.layout_columns( [ - # The default values on these inputs + # The initial values on these inputs # should be overridden by the reactive.effect. - ui.output_ui("bounds_tooltip_ui"), - ui.input_numeric("lower", "Lower", 0, width=width), + ui.input_numeric( + "lower", ["Lower", ui.output_ui("bounds_tooltip_ui")], 0, width=width + ), ui.input_numeric("upper", "Upper", 0, width=width), - ui.output_ui("bins_tooltip_ui"), - ui.input_numeric("bins", "Bins", 0, width=width), - ui.output_ui("weight_tooltip_ui"), + ui.input_numeric( + "bins", ["Bins", ui.output_ui("bins_tooltip_ui")], 0, width=width + ), ui.input_select( "weight", - "Weight", + ["Weight", ui.output_ui("weight_tooltip_ui")], choices={ 1: "Less accurate", default_weight: "Default", From 9cd991d109742824afd54f77d61821942d10fbc3 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 8 Nov 2024 11:08:57 -0500 Subject: [PATCH 10/29] pull warning up to analysis panel. 
TODO: conditional --- dp_creator_ii/app/analysis_panel.py | 12 ++++++++++ dp_creator_ii/app/components/column_module.py | 23 ++++++++----------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/dp_creator_ii/app/analysis_panel.py b/dp_creator_ii/app/analysis_panel.py index b172e96..d63369f 100644 --- a/dp_creator_ii/app/analysis_panel.py +++ b/dp_creator_ii/app/analysis_panel.py @@ -7,6 +7,7 @@ from dp_creator_ii.utils.csv_helper import read_csv_ids_labels, read_csv_ids_names from dp_creator_ii.app.components.outputs import output_code_sample, demo_tooltip from dp_creator_ii.utils.templates import make_privacy_loss_block +from dp_creator_ii.app.components.column_module import col_widths def analysis_ui(): @@ -24,6 +25,17 @@ def analysis_ui(): [], ), ui.output_ui("columns_ui"), + ui.layout_columns( + [], + [ + ui.markdown( + "This simulation assumes a normal distribution " + "between the specified lower and upper bounds. " + "Your data file has not been read except to determine the columns." + ) + ], + col_widths=col_widths, + ), ui.markdown( "What is your privacy budget for this release? " "Values above 1 will add less noise to the data, " diff --git a/dp_creator_ii/app/components/column_module.py b/dp_creator_ii/app/components/column_module.py index d5e2499..c7c9ddf 100644 --- a/dp_creator_ii/app/components/column_module.py +++ b/dp_creator_ii/app/components/column_module.py @@ -10,6 +10,14 @@ default_weight = 2 +col_widths = { + # Controls stay roughly a constant width; + # Graph expands to fill space. + "sm": (4, 8), + "md": (3, 9), + "lg": (2, 10), +} + @module.ui def column_ui(): # pragma: no cover @@ -38,24 +46,11 @@ def column_ui(): # pragma: no cover ), ], [ - # TODO: This doesn't need to be repeated: could just go once at the top. - # https://github.com/opendp/dp-creator-ii/issues/138 - ui.markdown( - "This simulation assumes a normal distribution " - "between the specified lower and upper bounds. 
" - "Your data file has not been read except to determine the columns." - ), ui.output_plot("column_plot", height="300px"), # Make plot smaller than default: about the same size as the other column. output_code_sample("Column Definition", "column_code"), ], - col_widths={ - # Controls stay roughly a constant width; - # Graph expands to fill space. - "sm": (4, 8), - "md": (3, 9), - "lg": (2, 10), - }, + col_widths=col_widths, ) From f2b5192c5743283a4f28a41fcd697d4514505e84 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 8 Nov 2024 11:20:12 -0500 Subject: [PATCH 11/29] move warning to bottom of list --- dp_creator_ii/app/analysis_panel.py | 42 ++++++++++++++++++----------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/dp_creator_ii/app/analysis_panel.py b/dp_creator_ii/app/analysis_panel.py index d63369f..6142454 100644 --- a/dp_creator_ii/app/analysis_panel.py +++ b/dp_creator_ii/app/analysis_panel.py @@ -25,17 +25,6 @@ def analysis_ui(): [], ), ui.output_ui("columns_ui"), - ui.layout_columns( - [], - [ - ui.markdown( - "This simulation assumes a normal distribution " - "between the specified lower and upper bounds. " - "Your data file has not been read except to determine the columns." - ) - ], - col_widths=col_widths, - ), ui.markdown( "What is your privacy budget for this release? " "Values above 1 will add less noise to the data, " @@ -123,10 +112,33 @@ def columns_ui(): ) return [ [ - ui.h3(column_ids_to_labels[column_id]), - column_ui(column_id), - ] - for column_id in column_ids + [ + ui.h3(column_ids_to_labels[column_id]), + column_ui(column_id), + ] + for column_id in column_ids + ], + [ + ( + ui.layout_columns( + [], + [ + ui.markdown( + """ + This simulation assumes a normal + distribution between the specified + lower and upper bounds. Your data + file has not been read except to + determine the columns. 
+ """ + ) + ], + col_widths=col_widths, + ) + if column_ids + else [] + ) + ], ] @reactive.calc From da6a0cbf6a1d923ea20d73ba5840bbbd0535fc04 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 8 Nov 2024 12:21:36 -0500 Subject: [PATCH 12/29] analysis definition JSON --- WHAT-WE-LEARNED.md | 2 ++ dp_creator_ii/app/results_panel.py | 45 ++++++++++++++++++++---------- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/WHAT-WE-LEARNED.md b/WHAT-WE-LEARNED.md index 8d92f73..edf76b1 100644 --- a/WHAT-WE-LEARNED.md +++ b/WHAT-WE-LEARNED.md @@ -46,6 +46,8 @@ but that returns an error: Renderer.__call__() missing 1 required positional argument: '_fn' ``` +If I just refer to a reactive calc directly in the UI there is no error in the log, just a spinner in the UI. + ## No component testing It feels like a gap in the library that there is no component testing. The only advice is to pull out testable logic from the server functions, and for the rest, use end-to-end tests: There's not a recommended way to test the ui+server interaction for just one component. diff --git a/dp_creator_ii/app/results_panel.py b/dp_creator_ii/app/results_panel.py index c760717..8def728 100644 --- a/dp_creator_ii/app/results_panel.py +++ b/dp_creator_ii/app/results_panel.py @@ -1,16 +1,17 @@ from json import dumps -from shiny import ui, render +from shiny import ui, render, reactive from dp_creator_ii.utils.templates import make_notebook_py, make_script_py from dp_creator_ii.utils.converters import convert_py_to_nb +from dp_creator_ii.app.components.outputs import output_code_sample def results_ui(): return ui.nav_panel( "Download results", ui.p("TODO: Use this information to fill in a template!"), - ui.output_code("data_dump"), + output_code_sample("Analysis JSON", "analysis_json_text"), ui.markdown( "You can now make a differentially private release of your data. " "This will lock the configuration you’ve provided on the previous pages." 
@@ -45,22 +46,38 @@ def results_server( weights, epsilon, ): # pragma: no cover - @render.code - def data_dump(): - # TODO: Use this information in a template! + @reactive.calc + def analysis_dict(): + # weights().keys() will reflect the desired columns: + # The others retain inactive columns, so user + # inputs aren't lost when toggling checkboxes. + columns = { + col: { + "lower_bound": lower_bounds()[col], + "upper_bound": upper_bounds()[col], + "bin_count": bin_counts()[col], + "weight": weights()[col], + } + for col in weights().keys() + } + return { + "csv_path": csv_path(), + "contributions": contributions(), + "epsilon": epsilon(), + "columns": columns, + } + + @reactive.calc + def analysis_json(): return dumps( - { - "csv_path": csv_path(), - "contributions": contributions(), - "lower_bounds": lower_bounds(), - "upper_bounds": upper_bounds(), - "bin_counts": bin_counts(), - "weights": weights(), - "epsilon": epsilon(), - }, + analysis_dict(), indent=2, ) + @render.text + def analysis_json_text(): + return analysis_json() + @render.download( filename="dp-creator-script.py", media_type="text/x-python", From 0dbdd7bb5086940f163bc7026c4ad321915bfd3f Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 8 Nov 2024 12:37:07 -0500 Subject: [PATCH 13/29] stubs for python --- dp_creator_ii/app/results_panel.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/dp_creator_ii/app/results_panel.py b/dp_creator_ii/app/results_panel.py index 8def728..84be9e3 100644 --- a/dp_creator_ii/app/results_panel.py +++ b/dp_creator_ii/app/results_panel.py @@ -10,14 +10,13 @@ def results_ui(): return ui.nav_panel( "Download results", - ui.p("TODO: Use this information to fill in a template!"), - output_code_sample("Analysis JSON", "analysis_json_text"), + ui.p("These code snippets describe how to make a DP release of your data:"), + output_code_sample("Analysis JSON", "analysis_json_text"), # TODO: Drop this? 
+ output_code_sample("Analysis Python", "analysis_python_text"), ui.markdown( "You can now make a differentially private release of your data. " "This will lock the configuration you’ve provided on the previous pages." ), - ui.markdown("TODO: Button: “Download Report (.txt)” (implemented as yaml?)"), - ui.markdown("TODO: Button: “Download Report (.csv)"), ui.markdown( "You can also download code that can be executed to produce a DP release. " "Downloaded code does not lock the configuration." @@ -78,6 +77,14 @@ def analysis_json(): def analysis_json_text(): return analysis_json() + @reactive.calc + def analysis_python(): + pass # TODO + + @render.text + def analysis_python_text(): + return analysis_python() + @render.download( filename="dp-creator-script.py", media_type="text/x-python", From 610404c7540002d592588ba92e359fdf4a1df937 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 8 Nov 2024 13:37:47 -0500 Subject: [PATCH 14/29] stub a script on results page --- dp_creator_ii/app/results_panel.py | 8 +++++++- dp_creator_ii/utils/templates/no-tests/_imports.py | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/dp_creator_ii/app/results_panel.py b/dp_creator_ii/app/results_panel.py index 84be9e3..a8c1d6d 100644 --- a/dp_creator_ii/app/results_panel.py +++ b/dp_creator_ii/app/results_panel.py @@ -79,7 +79,13 @@ def analysis_json_text(): @reactive.calc def analysis_python(): - pass # TODO + analysis = analysis_dict() + return make_notebook_py( + csv_path=analysis["csv_path"], + contributions=analysis["contributions"], + epsilon=analysis["epsilon"], + weights=[1], + ) @render.text def analysis_python_text(): diff --git a/dp_creator_ii/utils/templates/no-tests/_imports.py b/dp_creator_ii/utils/templates/no-tests/_imports.py index 5df8d79..a5fdc8c 100644 --- a/dp_creator_ii/utils/templates/no-tests/_imports.py +++ b/dp_creator_ii/utils/templates/no-tests/_imports.py @@ -1,4 +1,6 @@ import polars as pl import opendp.prelude as dp +# The OpenDP team 
is working to vet the core algorithms. +# Until that is complete we need to opt-in to use these features. dp.enable_features("contrib") From c45585bb8d5f0fdbe289eaa8a3e0c5c9229a56bb Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 8 Nov 2024 15:58:46 -0500 Subject: [PATCH 15/29] include column info in generated script --- dp_creator_ii/app/results_panel.py | 18 +++--- dp_creator_ii/utils/templates/__init__.py | 30 ++++++++-- .../templates/no-tests/_column_config.py | 6 +- .../utils/templates/no-tests/_imports.py | 11 ++++ .../utils/templates/no-tests/_notebook.py | 2 + .../utils/templates/no-tests/_script.py | 2 + tests/fixtures/expected-script.py | 31 ---------- tests/fixtures/expected-script.py.txt | 58 +++++++++++++++++++ tests/utils/test_templates.py | 20 ++++++- 9 files changed, 128 insertions(+), 50 deletions(-) delete mode 100644 tests/fixtures/expected-script.py create mode 100644 tests/fixtures/expected-script.py.txt diff --git a/dp_creator_ii/app/results_panel.py b/dp_creator_ii/app/results_panel.py index a8c1d6d..cfee3ee 100644 --- a/dp_creator_ii/app/results_panel.py +++ b/dp_creator_ii/app/results_panel.py @@ -84,7 +84,7 @@ def analysis_python(): csv_path=analysis["csv_path"], contributions=analysis["contributions"], epsilon=analysis["epsilon"], - weights=[1], + columns=analysis["columns"], ) @render.text @@ -96,11 +96,11 @@ def analysis_python_text(): media_type="text/x-python", ) async def download_script(): - contributions = input.contributions() + analysis = analysis_dict() script_py = make_script_py( - contributions=contributions, - epsilon=1, - weights=[1], + contributions=analysis["contributions"], + epsilon=analysis["epsilon"], + columns=analysis["columns"], ) yield script_py @@ -109,12 +109,12 @@ async def download_script(): media_type="application/x-ipynb+json", ) async def download_notebook_unexecuted(): - contributions = input.contributions() + analysis = analysis_dict() notebook_py = make_notebook_py( csv_path="todo.csv", - 
contributions=contributions, - epsilon=1, - weights=[1], + contributions=analysis["contributions"], + epsilon=analysis["epsilon"], + columns=analysis["columns"], ) notebook_nb = convert_py_to_nb(notebook_py) yield notebook_nb diff --git a/dp_creator_ii/utils/templates/__init__.py b/dp_creator_ii/utils/templates/__init__.py index 8251890..5a9a035 100644 --- a/dp_creator_ii/utils/templates/__init__.py +++ b/dp_creator_ii/utils/templates/__init__.py @@ -108,28 +108,42 @@ def _make_imports(): return str(_Template("imports").fill_values()) -def make_notebook_py(csv_path, contributions, epsilon, weights): +def _make_columns(columns): + return "\n".join( + make_column_config_block( + name=name, + lower_bound=col["lower_bound"], + upper_bound=col["upper_bound"], + bin_count=col["bin_count"], + ) + for name, col in columns.items() + ) + + +def make_notebook_py(csv_path, contributions, epsilon, columns): return str( _Template("notebook").fill_blocks( IMPORTS_BLOCK=_make_imports(), + COLUMNS_BLOCK=_make_columns(columns), CONTEXT_BLOCK=_make_context_for_notebook( csv_path=csv_path, contributions=contributions, epsilon=epsilon, - weights=weights, + weights=[column["weight"] for column in columns.values()], ), ) ) -def make_script_py(contributions, epsilon, weights): +def make_script_py(contributions, epsilon, columns): return str( _Template("script").fill_blocks( IMPORTS_BLOCK=_make_imports(), + COLUMNS_BLOCK=_make_columns(columns), CONTEXT_BLOCK=_make_context_for_script( contributions=contributions, epsilon=epsilon, - weights=weights, + weights=[column["weight"] for column in columns.values()], ), ) ) @@ -152,7 +166,11 @@ def make_column_config_block(name, lower_bound, upper_bound, bin_count): ... bin_count=10 ... 
)) # From the public information, determine the bins: - hw_grade_cut_points = make_cut_points(0, 100, 10) + hw_grade_cut_points = make_cut_points( + lower_bound=0, + upper_bound=100, + bin_count=10, + ) # Use these bins to define a Polars column: hw_grade_config = ( @@ -173,7 +191,7 @@ def make_column_config_block(name, lower_bound, upper_bound, bin_count): .fill_values( LOWER_BOUND=lower_bound, UPPER_BOUND=upper_bound, - BINS=bin_count, + BIN_COUNT=bin_count, COLUMN_NAME=name, BIN_COLUMN_NAME=f"{snake_name}_bin", ) diff --git a/dp_creator_ii/utils/templates/no-tests/_column_config.py b/dp_creator_ii/utils/templates/no-tests/_column_config.py index ddb44bd..19caf96 100644 --- a/dp_creator_ii/utils/templates/no-tests/_column_config.py +++ b/dp_creator_ii/utils/templates/no-tests/_column_config.py @@ -1,5 +1,9 @@ # From the public information, determine the bins: -CUT_LIST_NAME = make_cut_points(LOWER_BOUND, UPPER_BOUND, BINS) +CUT_LIST_NAME = make_cut_points( + lower_bound=LOWER_BOUND, + upper_bound=UPPER_BOUND, + bin_count=BIN_COUNT, +) # Use these bins to define a Polars column: POLARS_CONFIG_NAME = ( diff --git a/dp_creator_ii/utils/templates/no-tests/_imports.py b/dp_creator_ii/utils/templates/no-tests/_imports.py index a5fdc8c..ecf3628 100644 --- a/dp_creator_ii/utils/templates/no-tests/_imports.py +++ b/dp_creator_ii/utils/templates/no-tests/_imports.py @@ -4,3 +4,14 @@ # The OpenDP team is working to vet the core algorithms. # Until that is complete we need to opt-in to use these features. dp.enable_features("contrib") + + +def make_cut_points(lower_bound, upper_bound, bin_count): + """ + Returns one more cut point than the bin_count. + (There are actually two more bins, extending to + -inf and +inf, but we'll ignore those.) + Cut points are evenly spaced from lower_bound to upper_bound. 
+ """ + bin_width = (upper_bound - lower_bound) / bin_count + return [round(lower_bound + i * bin_width, 2) for i in range(bin_count + 1)] diff --git a/dp_creator_ii/utils/templates/no-tests/_notebook.py b/dp_creator_ii/utils/templates/no-tests/_notebook.py index c6aeed3..5be6fb0 100644 --- a/dp_creator_ii/utils/templates/no-tests/_notebook.py +++ b/dp_creator_ii/utils/templates/no-tests/_notebook.py @@ -5,5 +5,7 @@ # + IMPORTS_BLOCK +COLUMNS_BLOCK + CONTEXT_BLOCK print(context) diff --git a/dp_creator_ii/utils/templates/no-tests/_script.py b/dp_creator_ii/utils/templates/no-tests/_script.py index 37be6c5..956c5ae 100644 --- a/dp_creator_ii/utils/templates/no-tests/_script.py +++ b/dp_creator_ii/utils/templates/no-tests/_script.py @@ -2,6 +2,8 @@ IMPORTS_BLOCK +COLUMNS_BLOCK + def get_context(csv_path): CONTEXT_BLOCK diff --git a/tests/fixtures/expected-script.py b/tests/fixtures/expected-script.py deleted file mode 100644 index 2f83281..0000000 --- a/tests/fixtures/expected-script.py +++ /dev/null @@ -1,31 +0,0 @@ -from argparse import ArgumentParser - -import polars as pl -import opendp.prelude as dp - -dp.enable_features("contrib") - - -def get_context(csv_path): - privacy_unit = dp.unit_of(contributions=1) - - privacy_loss = dp.loss_of(epsilon=1, delta=1e-7) - - context = dp.Context.compositor( - data=pl.scan_csv(csv_path, encoding="utf8-lossy"), - privacy_unit=privacy_unit, - privacy_loss=privacy_loss, - split_by_weights=[1], - ) - - return context - - -if __name__ == "__main__": - parser = ArgumentParser( - description="Creates a differentially private release from a csv" - ) - parser.add_argument("--csv", help="Path to csv containing private data") - args = parser.parse_args() - context = get_context(csv_path=args.csv) - print(context) diff --git a/tests/fixtures/expected-script.py.txt b/tests/fixtures/expected-script.py.txt new file mode 100644 index 0000000..2f723b5 --- /dev/null +++ b/tests/fixtures/expected-script.py.txt @@ -0,0 +1,58 @@ +from argparse 
import ArgumentParser + +import polars as pl +import opendp.prelude as dp + +# The OpenDP team is working to vet the core algorithms. +# Until that is complete we need to opt-in to use these features. +dp.enable_features("contrib") + +def make_cut_points(lower_bound, upper_bound, bin_count): + """ + Returns one more cut point than the bin_count. + (There are actually two more bins, extending to + -inf and +inf, but we'll ignore those.) + Cut points are evenly spaced from lower_bound to upper_bound. + """ + bin_width = (upper_bound - lower_bound) / bin_count + return [round(lower_bound + i * bin_width, 2) for i in range(bin_count + 1)] + +# From the public information, determine the bins: +fake_column_cut_points = make_cut_points( + lower_bound=5, + upper_bound=15, + bin_count=20, +) + +# Use these bins to define a Polars column: +fake_column_config = ( + pl.col('fake column') + .cut(fake_column_cut_points) + .alias('fake_column_bin') # Give the new column a name. + .cast(pl.String) +) + + +def get_context(csv_path): + privacy_unit = dp.unit_of(contributions=1) + + privacy_loss = dp.loss_of(epsilon=1, delta=1e-7) + + context = dp.Context.compositor( + data=pl.scan_csv(csv_path, encoding="utf8-lossy"), + privacy_unit=privacy_unit, + privacy_loss=privacy_loss, + split_by_weights=[4], + ) + + return context + + +if __name__ == "__main__": + parser = ArgumentParser( + description="Creates a differentially private release from a csv" + ) + parser.add_argument("--csv", help="Path to csv containing private data") + args = parser.parse_args() + context = get_context(csv_path=args.csv) + print(context) diff --git a/tests/utils/test_templates.py b/tests/utils/test_templates.py index b65c898..c988f9c 100644 --- a/tests/utils/test_templates.py +++ b/tests/utils/test_templates.py @@ -96,7 +96,14 @@ def test_make_notebook(): csv_path=fake_csv, contributions=1, epsilon=1, - weights=[1], + columns={ + "fake column": { + "lower_bound": 5, + "upper_bound": 15, + "bin_count": 20, + 
"weight": 4, + } + }, ) globals = {} exec(notebook, globals) @@ -107,14 +114,21 @@ def test_make_script(): script = make_script_py( contributions=1, epsilon=1, - weights=[1], + columns={ + "fake column": { + "lower_bound": 5, + "upper_bound": 15, + "bin_count": 20, + "weight": 4, + } + }, ) def clear_empty_lines(text): # Cleanup whitespace after indenting blocks return re.sub(r"^\s+$", "", text, flags=re.MULTILINE).strip() - expected_script = (fixtures_path / "expected-script.py").read_text() + expected_script = (fixtures_path / "expected-script.py.txt").read_text() assert clear_empty_lines(script) == clear_empty_lines(expected_script) with NamedTemporaryFile(mode="w") as fp: From 5803715fde130ce8c03c90fa1ece4b947feebb78 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 8 Nov 2024 16:59:44 -0500 Subject: [PATCH 16/29] closer to a runable notebook --- dp_creator_ii/app/results_panel.py | 10 +++---- dp_creator_ii/utils/templates/__init__.py | 21 ++++++++++++++ .../utils/templates/no-tests/_imports.py | 28 +++++++++++++++++++ .../utils/templates/no-tests/_notebook.py | 5 ++-- .../utils/templates/no-tests/_query.py | 6 ++++ .../utils/templates/no-tests/_script.py | 3 +- 6 files changed, 65 insertions(+), 8 deletions(-) create mode 100644 dp_creator_ii/utils/templates/no-tests/_query.py diff --git a/dp_creator_ii/app/results_panel.py b/dp_creator_ii/app/results_panel.py index cfee3ee..c8d55d9 100644 --- a/dp_creator_ii/app/results_panel.py +++ b/dp_creator_ii/app/results_panel.py @@ -54,7 +54,7 @@ def analysis_dict(): col: { "lower_bound": lower_bounds()[col], "upper_bound": upper_bounds()[col], - "bin_count": bin_counts()[col], + "bin_count": int(bin_counts()[col]), "weight": weights()[col], } for col in weights().keys() @@ -124,12 +124,12 @@ async def download_notebook_unexecuted(): media_type="application/x-ipynb+json", ) async def download_notebook_executed(): - contributions = input.contributions() + analysis = analysis_dict() notebook_py = make_notebook_py( 
csv_path="todo.csv", - contributions=contributions, - epsilon=1, - weights=[1], + contributions=analysis["contributions"], + epsilon=analysis["epsilon"], + columns=analysis["columns"], ) notebook_nb = convert_py_to_nb(notebook_py, execute=True) yield notebook_nb diff --git a/dp_creator_ii/utils/templates/__init__.py b/dp_creator_ii/utils/templates/__init__.py index 5a9a035..92e1abd 100644 --- a/dp_creator_ii/utils/templates/__init__.py +++ b/dp_creator_ii/utils/templates/__init__.py @@ -120,6 +120,24 @@ def _make_columns(columns): ) +def _make_query(column_name): + return str( + _Template("query") + .fill_values( + BIN_NAME=f"{column_name}_bin", + ) + .fill_expressions( + QUERY_NAME=f"{column_name}_query", + ACCURACY_NAME=f"{column_name}_accuracy", + HISTOGRAM_NAME=f"{column_name}_histogram", + ) + ) + + +def _make_queries(column_names): + return "\n".join(_make_query(column_name) for column_name in column_names) + + def make_notebook_py(csv_path, contributions, epsilon, columns): return str( _Template("notebook").fill_blocks( @@ -131,6 +149,7 @@ def make_notebook_py(csv_path, contributions, epsilon, columns): epsilon=epsilon, weights=[column["weight"] for column in columns.values()], ), + QUERIES_BLOCK=_make_queries(columns.keys()), ) ) @@ -141,10 +160,12 @@ def make_script_py(contributions, epsilon, columns): IMPORTS_BLOCK=_make_imports(), COLUMNS_BLOCK=_make_columns(columns), CONTEXT_BLOCK=_make_context_for_script( + # csv_path is a CLI parameter in the script contributions=contributions, epsilon=epsilon, weights=[column["weight"] for column in columns.values()], ), + QUERIES_BLOCK=_make_queries(columns.keys()), ) ) diff --git a/dp_creator_ii/utils/templates/no-tests/_imports.py b/dp_creator_ii/utils/templates/no-tests/_imports.py index ecf3628..870d807 100644 --- a/dp_creator_ii/utils/templates/no-tests/_imports.py +++ b/dp_creator_ii/utils/templates/no-tests/_imports.py @@ -1,5 +1,6 @@ import polars as pl import opendp.prelude as dp +import matplotlib.pyplot 
as plt # The OpenDP team is working to vet the core algorithms. # Until that is complete we need to opt-in to use these features. @@ -15,3 +16,30 @@ def make_cut_points(lower_bound, upper_bound, bin_count): """ bin_width = (upper_bound - lower_bound) / bin_count return [round(lower_bound + i * bin_width, 2) for i in range(bin_count + 1)] + + +def df_to_columns(df): + """ + >>> import polars as pl + >>> df = pl.DataFrame({ + ... "bin": ["A", "B", "C"], + ... "len": [0, 10, 20], + ... }) + >>> _df_to_columns(df) + (['A', 'B', 'C'], [0, 10, 20]) + """ + return tuple(list(df[col]) for col in df.columns) + + +def plot_histogram(histogram_df, error, cutoff): # pragma: no cover + bins, values = df_to_columns(histogram_df) + mod = (len(bins) // 12) + 1 + majors = [label for i, label in enumerate(bins) if i % mod == 0] + minors = [label for i, label in enumerate(bins) if i % mod != 0] + _figure, axes = plt.subplots() + bar_colors = ["blue" if v > cutoff else "lightblue" for v in values] + axes.bar(bins, values, color=bar_colors, yerr=error) + axes.set_xticks(majors, majors) + axes.set_xticks(minors, ["" for _ in minors], minor=True) + axes.axhline(cutoff, color="lightgrey", zorder=-1) + axes.set_ylim(bottom=0) diff --git a/dp_creator_ii/utils/templates/no-tests/_notebook.py b/dp_creator_ii/utils/templates/no-tests/_notebook.py index 5be6fb0..ef72c3b 100644 --- a/dp_creator_ii/utils/templates/no-tests/_notebook.py +++ b/dp_creator_ii/utils/templates/no-tests/_notebook.py @@ -1,4 +1,4 @@ -# This is a demonstration how OpenDP can be used to create +# This is a demonstration of how OpenDP can be used to create # a differentially private release. 
To customize this, # see the documentation for OpenDP: https://docs.opendp.org/ @@ -8,4 +8,5 @@ COLUMNS_BLOCK CONTEXT_BLOCK -print(context) + +QUERIES_BLOCK diff --git a/dp_creator_ii/utils/templates/no-tests/_query.py b/dp_creator_ii/utils/templates/no-tests/_query.py new file mode 100644 index 0000000..aea0b3c --- /dev/null +++ b/dp_creator_ii/utils/templates/no-tests/_query.py @@ -0,0 +1,6 @@ +confidence = 0.95 + +QUERY_NAME = context.query().group_by(BIN_NAME).agg(pl.len().dp.noise()) +ACCURACY_NAME = QUERY_NAME.summarize(alpha=1 - confidence)["accuracy"].item() +HISTOGRAM_NAME = QUERY_NAME.release().collect().sort(BIN_NAME) +plot_histogram(HISTOGRAM_NAME, ACCURACY_NAME, 0) diff --git a/dp_creator_ii/utils/templates/no-tests/_script.py b/dp_creator_ii/utils/templates/no-tests/_script.py index 956c5ae..ab43f5c 100644 --- a/dp_creator_ii/utils/templates/no-tests/_script.py +++ b/dp_creator_ii/utils/templates/no-tests/_script.py @@ -17,4 +17,5 @@ def get_context(csv_path): parser.add_argument("--csv", help="Path to csv containing private data") args = parser.parse_args() context = get_context(csv_path=args.csv) - print(context) + + QUERIES_BLOCK From 93c9543959f297ff68cca71e3cf21c19cf702344 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 8 Nov 2024 17:05:50 -0500 Subject: [PATCH 17/29] stuck on split_by_weight... maybe a library bug? 
--- dp_creator_ii/app/results_panel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dp_creator_ii/app/results_panel.py b/dp_creator_ii/app/results_panel.py index c8d55d9..deeaea3 100644 --- a/dp_creator_ii/app/results_panel.py +++ b/dp_creator_ii/app/results_panel.py @@ -111,7 +111,7 @@ async def download_script(): async def download_notebook_unexecuted(): analysis = analysis_dict() notebook_py = make_notebook_py( - csv_path="todo.csv", + csv_path=analysis["csv_path"], contributions=analysis["contributions"], epsilon=analysis["epsilon"], columns=analysis["columns"], @@ -126,7 +126,7 @@ async def download_notebook_unexecuted(): async def download_notebook_executed(): analysis = analysis_dict() notebook_py = make_notebook_py( - csv_path="todo.csv", + csv_path=analysis["csv_path"], contributions=analysis["contributions"], epsilon=analysis["epsilon"], columns=analysis["columns"], From f27a1756771d5e3aa0c5f199813ac75fd0098df7 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Tue, 12 Nov 2024 23:55:34 -0500 Subject: [PATCH 18/29] margin stubs --- dp_creator_ii/app/results_panel.py | 4 ++- dp_creator_ii/utils/templates/__init__.py | 36 +++++++++++++++++-- .../utils/templates/no-tests/_context.py | 1 + 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/dp_creator_ii/app/results_panel.py b/dp_creator_ii/app/results_panel.py index deeaea3..66eb57d 100644 --- a/dp_creator_ii/app/results_panel.py +++ b/dp_creator_ii/app/results_panel.py @@ -55,7 +55,9 @@ def analysis_dict(): "lower_bound": lower_bounds()[col], "upper_bound": upper_bounds()[col], "bin_count": int(bin_counts()[col]), - "weight": weights()[col], + # TODO: Floats should work for weight, but they don't: + # https://github.com/opendp/opendp/issues/2140 + "weight": int(weights()[col]), } for col in weights().keys() } diff --git a/dp_creator_ii/utils/templates/__init__.py b/dp_creator_ii/utils/templates/__init__.py index 92e1abd..0d06495 100644 --- 
a/dp_creator_ii/utils/templates/__init__.py +++ b/dp_creator_ii/utils/templates/__init__.py @@ -70,11 +70,38 @@ def __str__(self): return self._template -def _make_context_for_notebook(csv_path, contributions, epsilon, weights): +def _make_margins_block(bin_names): + margins = ( + [ + """ + (): dp.polars.Margin( + public_info="lengths", + ), +""" + ] + + [ + f""" + ("{bin_name}",): dp.polars.Margin( + public_info="keys", + ), +""" + for bin_name in bin_names + ] + ) + + margins_block = "{" + "".join(margins) + "}" + return margins_block + + +def _make_context_for_notebook(csv_path, contributions, epsilon, weights, bin_names): privacy_unit_block = make_privacy_unit_block(contributions) privacy_loss_block = make_privacy_loss_block(epsilon) + margins_block = _make_margins_block(bin_names) return str( _Template("context") + .fill_expressions( + MARGINS_BLOCK=margins_block, + ) .fill_values( CSV_PATH=csv_path, WEIGHTS=weights, @@ -86,13 +113,15 @@ def _make_context_for_notebook(csv_path, contributions, epsilon, weights): ) -def _make_context_for_script(contributions, epsilon, weights): +def _make_context_for_script(contributions, epsilon, weights, bin_names): privacy_unit_block = make_privacy_unit_block(contributions) privacy_loss_block = make_privacy_loss_block(epsilon) + margins_block = _make_margins_block(bin_names) return str( _Template("context") .fill_expressions( CSV_PATH="csv_path", + MARGINS_BLOCK=margins_block, ) .fill_values( WEIGHTS=weights, @@ -100,6 +129,7 @@ def _make_context_for_script(contributions, epsilon, weights): .fill_blocks( PRIVACY_UNIT_BLOCK=privacy_unit_block, PRIVACY_LOSS_BLOCK=privacy_loss_block, + MARGINS_BLOCK=margins_block, ) ) @@ -148,6 +178,7 @@ def make_notebook_py(csv_path, contributions, epsilon, columns): contributions=contributions, epsilon=epsilon, weights=[column["weight"] for column in columns.values()], + bin_names=[f"{name}_bin" for name in columns.keys()], ), QUERIES_BLOCK=_make_queries(columns.keys()), ) @@ -164,6 +195,7 @@ 
def make_script_py(contributions, epsilon, columns): contributions=contributions, epsilon=epsilon, weights=[column["weight"] for column in columns.values()], + bin_names=[f"{name}_bin" for name in columns.keys()], ), QUERIES_BLOCK=_make_queries(columns.keys()), ) diff --git a/dp_creator_ii/utils/templates/no-tests/_context.py b/dp_creator_ii/utils/templates/no-tests/_context.py index cdd8194..f5c300b 100644 --- a/dp_creator_ii/utils/templates/no-tests/_context.py +++ b/dp_creator_ii/utils/templates/no-tests/_context.py @@ -5,4 +5,5 @@ privacy_unit=privacy_unit, privacy_loss=privacy_loss, split_by_weights=WEIGHTS, + margins=MARGINS_BLOCK, ) From 6b8a38f49ba25dd5104dfcb40bc7bbf1c6f0d721 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Wed, 13 Nov 2024 17:24:29 -0500 Subject: [PATCH 19/29] format python identifiers correctly --- dp_creator_ii/utils/csv_helper.py | 8 +++- dp_creator_ii/utils/templates/__init__.py | 42 +++++++++------- .../utils/templates/no-tests/_context.py | 2 +- tests/fixtures/expected-script.py.txt | 48 ++++++++++++++++++- tests/utils/test_templates.py | 2 + 5 files changed, 81 insertions(+), 21 deletions(-) diff --git a/dp_creator_ii/utils/csv_helper.py b/dp_creator_ii/utils/csv_helper.py index 4c279ee..b7b92dd 100644 --- a/dp_creator_ii/utils/csv_helper.py +++ b/dp_creator_ii/utils/csv_helper.py @@ -2,9 +2,11 @@ We'll use the following terms consistently throughout the application: - name: This is the exact column header in the CSV. - label: This is the string we'll display. -- id: This is the string we'll pass as a module ID. +- id: This is the opaque string we'll pass as a module ID. +- identifier: This is a form that can be used as a Python identifier. """ +import re import polars as pl @@ -34,3 +36,7 @@ def name_to_id(name): # Shiny is fussy about module IDs, # but we don't need them to be human readable. 
return str(hash(name)).replace("-", "_") + + +def name_to_identifier(name): + return re.sub(r"\W+", "_", name).lower() diff --git a/dp_creator_ii/utils/templates/__init__.py b/dp_creator_ii/utils/templates/__init__.py index 0d06495..985423f 100644 --- a/dp_creator_ii/utils/templates/__init__.py +++ b/dp_creator_ii/utils/templates/__init__.py @@ -7,6 +7,7 @@ from pathlib import Path import re +from dp_creator_ii.utils.csv_helper import name_to_identifier class _Template: @@ -70,37 +71,38 @@ def __str__(self): return self._template -def _make_margins_block(bin_names): +def _make_margins_dict(bin_names): + # TODO: Don't worry too much about the formatting here. + # Plan to run the output through black for consistency. + # https://github.com/opendp/dp-creator-ii/issues/50 margins = ( [ """ - (): dp.polars.Margin( - public_info="lengths", - ), -""" + (): dp.polars.Margin( + public_info="lengths", + ),""" ] + [ f""" - ("{bin_name}",): dp.polars.Margin( - public_info="keys", - ), -""" + ("{bin_name}",): dp.polars.Margin( + public_info="keys", + ),""" for bin_name in bin_names ] ) - margins_block = "{" + "".join(margins) + "}" - return margins_block + margins_dict = "{" + "".join(margins) + "\n }" + return margins_dict def _make_context_for_notebook(csv_path, contributions, epsilon, weights, bin_names): privacy_unit_block = make_privacy_unit_block(contributions) privacy_loss_block = make_privacy_loss_block(epsilon) - margins_block = _make_margins_block(bin_names) + margins_dict = _make_margins_dict(bin_names) return str( _Template("context") .fill_expressions( - MARGINS_BLOCK=margins_block, + MARGINS_DICT=margins_dict, ) .fill_values( CSV_PATH=csv_path, @@ -116,12 +118,12 @@ def _make_context_for_notebook(csv_path, contributions, epsilon, weights, bin_na def _make_context_for_script(contributions, epsilon, weights, bin_names): privacy_unit_block = make_privacy_unit_block(contributions) privacy_loss_block = make_privacy_loss_block(epsilon) - margins_block = 
_make_margins_block(bin_names) + margins_dict = _make_margins_dict(bin_names) return str( _Template("context") .fill_expressions( CSV_PATH="csv_path", - MARGINS_BLOCK=margins_block, + MARGINS_DICT=margins_dict, ) .fill_values( WEIGHTS=weights, @@ -129,7 +131,7 @@ def _make_context_for_script(contributions, epsilon, weights, bin_names): .fill_blocks( PRIVACY_UNIT_BLOCK=privacy_unit_block, PRIVACY_LOSS_BLOCK=privacy_loss_block, - MARGINS_BLOCK=margins_block, + MARGINS_DICT=margins_dict, ) ) @@ -178,7 +180,9 @@ def make_notebook_py(csv_path, contributions, epsilon, columns): contributions=contributions, epsilon=epsilon, weights=[column["weight"] for column in columns.values()], - bin_names=[f"{name}_bin" for name in columns.keys()], + bin_names=[ + name_to_identifier(name) + "_bin" for name in columns.keys() + ], ), QUERIES_BLOCK=_make_queries(columns.keys()), ) @@ -195,7 +199,9 @@ def make_script_py(contributions, epsilon, columns): contributions=contributions, epsilon=epsilon, weights=[column["weight"] for column in columns.values()], - bin_names=[f"{name}_bin" for name in columns.keys()], + bin_names=[ + name_to_identifier(name) + "_bin" for name in columns.keys() + ], ), QUERIES_BLOCK=_make_queries(columns.keys()), ) diff --git a/dp_creator_ii/utils/templates/no-tests/_context.py b/dp_creator_ii/utils/templates/no-tests/_context.py index f5c300b..4555c5c 100644 --- a/dp_creator_ii/utils/templates/no-tests/_context.py +++ b/dp_creator_ii/utils/templates/no-tests/_context.py @@ -5,5 +5,5 @@ privacy_unit=privacy_unit, privacy_loss=privacy_loss, split_by_weights=WEIGHTS, - margins=MARGINS_BLOCK, + margins=MARGINS_DICT, ) diff --git a/tests/fixtures/expected-script.py.txt b/tests/fixtures/expected-script.py.txt index 2f723b5..4db0457 100644 --- a/tests/fixtures/expected-script.py.txt +++ b/tests/fixtures/expected-script.py.txt @@ -2,11 +2,13 @@ from argparse import ArgumentParser import polars as pl import opendp.prelude as dp +import matplotlib.pyplot as plt # The 
OpenDP team is working to vet the core algorithms. # Until that is complete we need to opt-in to use these features. dp.enable_features("contrib") + def make_cut_points(lower_bound, upper_bound, bin_count): """ Returns one more cut point than the bin_count. @@ -17,6 +19,34 @@ def make_cut_points(lower_bound, upper_bound, bin_count): bin_width = (upper_bound - lower_bound) / bin_count return [round(lower_bound + i * bin_width, 2) for i in range(bin_count + 1)] + +def df_to_columns(df): + """ + >>> import polars as pl + >>> df = pl.DataFrame({ + ... "bin": ["A", "B", "C"], + ... "len": [0, 10, 20], + ... }) + >>> _df_to_columns(df) + (['A', 'B', 'C'], [0, 10, 20]) + """ + return tuple(list(df[col]) for col in df.columns) + + +def plot_histogram(histogram_df, error, cutoff): # pragma: no cover + bins, values = df_to_columns(histogram_df) + mod = (len(bins) // 12) + 1 + majors = [label for i, label in enumerate(bins) if i % mod == 0] + minors = [label for i, label in enumerate(bins) if i % mod != 0] + _figure, axes = plt.subplots() + bar_colors = ["blue" if v > cutoff else "lightblue" for v in values] + axes.bar(bins, values, color=bar_colors, yerr=error) + axes.set_xticks(majors, majors) + axes.set_xticks(minors, ["" for _ in minors], minor=True) + axes.axhline(cutoff, color="lightgrey", zorder=-1) + axes.set_ylim(bottom=0) + + # From the public information, determine the bins: fake_column_cut_points = make_cut_points( lower_bound=5, @@ -33,6 +63,7 @@ fake_column_config = ( ) + def get_context(csv_path): privacy_unit = dp.unit_of(contributions=1) @@ -43,6 +74,15 @@ def get_context(csv_path): privacy_unit=privacy_unit, privacy_loss=privacy_loss, split_by_weights=[4], + margins={ + (): dp.polars.Margin( + public_info="lengths", + ), + + ("fake column_bin",): dp.polars.Margin( + public_info="keys", + ), + }, ) return context @@ -55,4 +95,10 @@ if __name__ == "__main__": parser.add_argument("--csv", help="Path to csv containing private data") args = parser.parse_args() 
context = get_context(csv_path=args.csv) - print(context) + + confidence = 0.95 + + fake column_query = context.query().group_by('fake column_bin').agg(pl.len().dp.noise()) + fake column_accuracy = fake column_query.summarize(alpha=1 - confidence)["accuracy"].item() + fake column_histogram = fake column_query.release().collect().sort('fake column_bin') + plot_histogram(fake column_histogram, fake column_accuracy, 0) diff --git a/tests/utils/test_templates.py b/tests/utils/test_templates.py index c988f9c..a68c5b0 100644 --- a/tests/utils/test_templates.py +++ b/tests/utils/test_templates.py @@ -129,6 +129,8 @@ def clear_empty_lines(text): return re.sub(r"^\s+$", "", text, flags=re.MULTILINE).strip() expected_script = (fixtures_path / "expected-script.py.txt").read_text() + print("Templated script:") + print(script) assert clear_empty_lines(script) == clear_empty_lines(expected_script) with NamedTemporaryFile(mode="w") as fp: From 1a9a2a77ccf761e9983d6b44a092030bd693bfcb Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Wed, 13 Nov 2024 17:30:41 -0500 Subject: [PATCH 20/29] script has gotten longer: does not make sense to check for exact equality --- tests/fixtures/expected-script.py.txt | 104 -------------------------- tests/utils/test_templates.py | 9 --- 2 files changed, 113 deletions(-) delete mode 100644 tests/fixtures/expected-script.py.txt diff --git a/tests/fixtures/expected-script.py.txt b/tests/fixtures/expected-script.py.txt deleted file mode 100644 index 4db0457..0000000 --- a/tests/fixtures/expected-script.py.txt +++ /dev/null @@ -1,104 +0,0 @@ -from argparse import ArgumentParser - -import polars as pl -import opendp.prelude as dp -import matplotlib.pyplot as plt - -# The OpenDP team is working to vet the core algorithms. -# Until that is complete we need to opt-in to use these features. -dp.enable_features("contrib") - - -def make_cut_points(lower_bound, upper_bound, bin_count): - """ - Returns one more cut point than the bin_count. 
- (There are actually two more bins, extending to - -inf and +inf, but we'll ignore those.) - Cut points are evenly spaced from lower_bound to upper_bound. - """ - bin_width = (upper_bound - lower_bound) / bin_count - return [round(lower_bound + i * bin_width, 2) for i in range(bin_count + 1)] - - -def df_to_columns(df): - """ - >>> import polars as pl - >>> df = pl.DataFrame({ - ... "bin": ["A", "B", "C"], - ... "len": [0, 10, 20], - ... }) - >>> _df_to_columns(df) - (['A', 'B', 'C'], [0, 10, 20]) - """ - return tuple(list(df[col]) for col in df.columns) - - -def plot_histogram(histogram_df, error, cutoff): # pragma: no cover - bins, values = df_to_columns(histogram_df) - mod = (len(bins) // 12) + 1 - majors = [label for i, label in enumerate(bins) if i % mod == 0] - minors = [label for i, label in enumerate(bins) if i % mod != 0] - _figure, axes = plt.subplots() - bar_colors = ["blue" if v > cutoff else "lightblue" for v in values] - axes.bar(bins, values, color=bar_colors, yerr=error) - axes.set_xticks(majors, majors) - axes.set_xticks(minors, ["" for _ in minors], minor=True) - axes.axhline(cutoff, color="lightgrey", zorder=-1) - axes.set_ylim(bottom=0) - - -# From the public information, determine the bins: -fake_column_cut_points = make_cut_points( - lower_bound=5, - upper_bound=15, - bin_count=20, -) - -# Use these bins to define a Polars column: -fake_column_config = ( - pl.col('fake column') - .cut(fake_column_cut_points) - .alias('fake_column_bin') # Give the new column a name. 
- .cast(pl.String) -) - - - -def get_context(csv_path): - privacy_unit = dp.unit_of(contributions=1) - - privacy_loss = dp.loss_of(epsilon=1, delta=1e-7) - - context = dp.Context.compositor( - data=pl.scan_csv(csv_path, encoding="utf8-lossy"), - privacy_unit=privacy_unit, - privacy_loss=privacy_loss, - split_by_weights=[4], - margins={ - (): dp.polars.Margin( - public_info="lengths", - ), - - ("fake column_bin",): dp.polars.Margin( - public_info="keys", - ), - }, - ) - - return context - - -if __name__ == "__main__": - parser = ArgumentParser( - description="Creates a differentially private release from a csv" - ) - parser.add_argument("--csv", help="Path to csv containing private data") - args = parser.parse_args() - context = get_context(csv_path=args.csv) - - confidence = 0.95 - - fake column_query = context.query().group_by('fake column_bin').agg(pl.len().dp.noise()) - fake column_accuracy = fake column_query.summarize(alpha=1 - confidence)["accuracy"].item() - fake column_histogram = fake column_query.release().collect().sort('fake column_bin') - plot_histogram(fake column_histogram, fake column_accuracy, 0) diff --git a/tests/utils/test_templates.py b/tests/utils/test_templates.py index a68c5b0..14449d6 100644 --- a/tests/utils/test_templates.py +++ b/tests/utils/test_templates.py @@ -124,15 +124,6 @@ def test_make_script(): }, ) - def clear_empty_lines(text): - # Cleanup whitespace after indenting blocks - return re.sub(r"^\s+$", "", text, flags=re.MULTILINE).strip() - - expected_script = (fixtures_path / "expected-script.py.txt").read_text() - print("Templated script:") - print(script) - assert clear_empty_lines(script) == clear_empty_lines(expected_script) - with NamedTemporaryFile(mode="w") as fp: fp.write(script) fp.flush() From d3be33df0b59e7e50cdad573def0f54d8c0046b4 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Wed, 13 Nov 2024 17:40:53 -0500 Subject: [PATCH 21/29] fix syntactic problems in generated code --- 
dp_creator_ii/utils/templates/__init__.py | 13 +++++++------ tests/utils/test_templates.py | 4 +++- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/dp_creator_ii/utils/templates/__init__.py b/dp_creator_ii/utils/templates/__init__.py index 985423f..bd7256e 100644 --- a/dp_creator_ii/utils/templates/__init__.py +++ b/dp_creator_ii/utils/templates/__init__.py @@ -91,7 +91,7 @@ def _make_margins_dict(bin_names): ] ) - margins_dict = "{" + "".join(margins) + "\n }" + margins_dict = "{" + "".join(margins) + "\n }" return margins_dict @@ -153,15 +153,16 @@ def _make_columns(columns): def _make_query(column_name): + indentifier = name_to_identifier(column_name) return str( _Template("query") .fill_values( - BIN_NAME=f"{column_name}_bin", + BIN_NAME=f"{indentifier}_bin", ) .fill_expressions( - QUERY_NAME=f"{column_name}_query", - ACCURACY_NAME=f"{column_name}_accuracy", - HISTOGRAM_NAME=f"{column_name}_histogram", + QUERY_NAME=f"{indentifier}_query", + ACCURACY_NAME=f"{indentifier}_accuracy", + HISTOGRAM_NAME=f"{indentifier}_histogram", ) ) @@ -181,7 +182,7 @@ def make_notebook_py(csv_path, contributions, epsilon, columns): epsilon=epsilon, weights=[column["weight"] for column in columns.values()], bin_names=[ - name_to_identifier(name) + "_bin" for name in columns.keys() + f"{name_to_identifier(name)}_bin" for name in columns.keys() ], ), QUERIES_BLOCK=_make_queries(columns.keys()), diff --git a/tests/utils/test_templates.py b/tests/utils/test_templates.py index 14449d6..02d06ec 100644 --- a/tests/utils/test_templates.py +++ b/tests/utils/test_templates.py @@ -85,7 +85,7 @@ def test_fill_template_unfilled_slots(): Exception, match=re.escape( "context.py has unfilled slots: " - "CSV_PATH, PRIVACY_LOSS_BLOCK, PRIVACY_UNIT_BLOCK, WEIGHTS" + "CSV_PATH, MARGINS_DICT, PRIVACY_LOSS_BLOCK, PRIVACY_UNIT_BLOCK, WEIGHTS" ), ): str(context_template.fill_values()) @@ -105,6 +105,7 @@ def test_make_notebook(): } }, ) + print(notebook) globals = {} exec(notebook, globals) 
assert isinstance(globals["context"], dp.Context) @@ -123,6 +124,7 @@ def test_make_script(): } }, ) + print(script) with NamedTemporaryFile(mode="w") as fp: fp.write(script) From 03c6dfa97e0f5a9062564fcff4902f73ff11978b Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Thu, 14 Nov 2024 09:53:18 -0500 Subject: [PATCH 22/29] fill in columns, but still WIP --- dp_creator_ii/utils/templates/__init__.py | 4 ++++ dp_creator_ii/utils/templates/no-tests/_context.py | 2 +- dp_creator_ii/utils/templates/no-tests/_imports.py | 2 +- tests/utils/test_templates.py | 6 ++++-- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/dp_creator_ii/utils/templates/__init__.py b/dp_creator_ii/utils/templates/__init__.py index bd7256e..538fc98 100644 --- a/dp_creator_ii/utils/templates/__init__.py +++ b/dp_creator_ii/utils/templates/__init__.py @@ -99,10 +99,12 @@ def _make_context_for_notebook(csv_path, contributions, epsilon, weights, bin_na privacy_unit_block = make_privacy_unit_block(contributions) privacy_loss_block = make_privacy_loss_block(epsilon) margins_dict = _make_margins_dict(bin_names) + columns = ",".join(f"{bin_name}_config" for bin_name in bin_names) return str( _Template("context") .fill_expressions( MARGINS_DICT=margins_dict, + COLUMNS=columns, ) .fill_values( CSV_PATH=csv_path, @@ -119,11 +121,13 @@ def _make_context_for_script(contributions, epsilon, weights, bin_names): privacy_unit_block = make_privacy_unit_block(contributions) privacy_loss_block = make_privacy_loss_block(epsilon) margins_dict = _make_margins_dict(bin_names) + columns = ",".join(f"{bin_name}_config" for bin_name in bin_names) return str( _Template("context") .fill_expressions( CSV_PATH="csv_path", MARGINS_DICT=margins_dict, + COLUMNS=columns, ) .fill_values( WEIGHTS=weights, diff --git a/dp_creator_ii/utils/templates/no-tests/_context.py b/dp_creator_ii/utils/templates/no-tests/_context.py index 4555c5c..32ca440 100644 --- a/dp_creator_ii/utils/templates/no-tests/_context.py +++ 
b/dp_creator_ii/utils/templates/no-tests/_context.py @@ -1,7 +1,7 @@ PRIVACY_UNIT_BLOCK PRIVACY_LOSS_BLOCK context = dp.Context.compositor( - data=pl.scan_csv(CSV_PATH, encoding="utf8-lossy"), + data=pl.scan_csv(CSV_PATH, encoding="utf8-lossy").with_columns(COLUMNS), privacy_unit=privacy_unit, privacy_loss=privacy_loss, split_by_weights=WEIGHTS, diff --git a/dp_creator_ii/utils/templates/no-tests/_imports.py b/dp_creator_ii/utils/templates/no-tests/_imports.py index 870d807..b3ad0d1 100644 --- a/dp_creator_ii/utils/templates/no-tests/_imports.py +++ b/dp_creator_ii/utils/templates/no-tests/_imports.py @@ -25,7 +25,7 @@ def df_to_columns(df): ... "bin": ["A", "B", "C"], ... "len": [0, 10, 20], ... }) - >>> _df_to_columns(df) + >>> df_to_columns(df) (['A', 'B', 'C'], [0, 10, 20]) """ return tuple(list(df[col]) for col in df.columns) diff --git a/tests/utils/test_templates.py b/tests/utils/test_templates.py index 02d06ec..b2f72a3 100644 --- a/tests/utils/test_templates.py +++ b/tests/utils/test_templates.py @@ -97,7 +97,9 @@ def test_make_notebook(): contributions=1, epsilon=1, columns={ - "fake column": { + # For a strong test, use a column whose name + # doesn't work as a python identifier. 
+ "hw-number": { "lower_bound": 5, "upper_bound": 15, "bin_count": 20, @@ -116,7 +118,7 @@ def test_make_script(): contributions=1, epsilon=1, columns={ - "fake column": { + "hw-number": { "lower_bound": 5, "upper_bound": 15, "bin_count": 20, From a3abd8d8914bbed04afecee8e5895648b640aaee Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Thu, 14 Nov 2024 20:39:45 -0500 Subject: [PATCH 23/29] fix column names; tests pass --- dp_wizard/utils/templates/__init__.py | 20 ++++++++------------ tests/utils/test_templates.py | 5 +---- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/dp_wizard/utils/templates/__init__.py b/dp_wizard/utils/templates/__init__.py index 3a0cffc..6e1ccc6 100644 --- a/dp_wizard/utils/templates/__init__.py +++ b/dp_wizard/utils/templates/__init__.py @@ -95,11 +95,11 @@ def _make_margins_dict(bin_names): return margins_dict -def _make_context_for_notebook(csv_path, contributions, epsilon, weights, bin_names): +def _make_context_for_notebook(csv_path, contributions, epsilon, weights, column_names): privacy_unit_block = make_privacy_unit_block(contributions) privacy_loss_block = make_privacy_loss_block(epsilon) - margins_dict = _make_margins_dict(bin_names) - columns = ",".join(f"{bin_name}_config" for bin_name in bin_names) + margins_dict = _make_margins_dict([f"{name}_bin" for name in column_names]) + columns = ", ".join([f"{name}_config" for name in column_names]) return str( _Template("context") .fill_expressions( @@ -117,11 +117,11 @@ def _make_context_for_notebook(csv_path, contributions, epsilon, weights, bin_na ) -def _make_context_for_script(contributions, epsilon, weights, bin_names): +def _make_context_for_script(contributions, epsilon, weights, column_names): privacy_unit_block = make_privacy_unit_block(contributions) privacy_loss_block = make_privacy_loss_block(epsilon) - margins_dict = _make_margins_dict(bin_names) - columns = ",".join(f"{bin_name}_config" for bin_name in bin_names) + margins_dict = 
_make_margins_dict([f"{name}_bin" for name in column_names]) + columns = ",".join([f"{name}_config" for name in column_names]) return str( _Template("context") .fill_expressions( @@ -185,9 +185,7 @@ def make_notebook_py(csv_path, contributions, epsilon, columns): contributions=contributions, epsilon=epsilon, weights=[column["weight"] for column in columns.values()], - bin_names=[ - f"{name_to_identifier(name)}_bin" for name in columns.keys() - ], + column_names=[name_to_identifier(name) for name in columns.keys()], ), QUERIES_BLOCK=_make_queries(columns.keys()), ) @@ -204,9 +202,7 @@ def make_script_py(contributions, epsilon, columns): contributions=contributions, epsilon=epsilon, weights=[column["weight"] for column in columns.values()], - bin_names=[ - name_to_identifier(name) + "_bin" for name in columns.keys() - ], + column_names=[name_to_identifier(name) for name in columns.keys()], ), QUERIES_BLOCK=_make_queries(columns.keys()), ) diff --git a/tests/utils/test_templates.py b/tests/utils/test_templates.py index fde79be..7bcdfc3 100644 --- a/tests/utils/test_templates.py +++ b/tests/utils/test_templates.py @@ -83,10 +83,7 @@ def test_fill_template_unfilled_slots(): context_template = _Template("context") with pytest.raises( Exception, - match=re.escape( - "context.py has unfilled slots: " - "CSV_PATH, MARGINS_DICT, PRIVACY_LOSS_BLOCK, PRIVACY_UNIT_BLOCK, WEIGHTS" - ), + match=re.escape("context.py has unfilled slots"), ): str(context_template.fill_values()) From a79dbdce62db54f3566e05883a004451f5caf6ec Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Thu, 14 Nov 2024 21:24:00 -0500 Subject: [PATCH 24/29] move confidence --- dp_wizard/utils/templates/__init__.py | 4 +++- dp_wizard/utils/templates/no-tests/_query.py | 2 -- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dp_wizard/utils/templates/__init__.py b/dp_wizard/utils/templates/__init__.py index 6e1ccc6..15becb4 100644 --- a/dp_wizard/utils/templates/__init__.py +++ 
b/dp_wizard/utils/templates/__init__.py @@ -172,7 +172,9 @@ def _make_query(column_name): def _make_queries(column_names): - return "\n".join(_make_query(column_name) for column_name in column_names) + return "confidence = 0.95\n" + "\n".join( + _make_query(column_name) for column_name in column_names + ) def make_notebook_py(csv_path, contributions, epsilon, columns): diff --git a/dp_wizard/utils/templates/no-tests/_query.py b/dp_wizard/utils/templates/no-tests/_query.py index aea0b3c..0d78f0d 100644 --- a/dp_wizard/utils/templates/no-tests/_query.py +++ b/dp_wizard/utils/templates/no-tests/_query.py @@ -1,5 +1,3 @@ -confidence = 0.95 - QUERY_NAME = context.query().group_by(BIN_NAME).agg(pl.len().dp.noise()) ACCURACY_NAME = QUERY_NAME.summarize(alpha=1 - confidence)["accuracy"].item() HISTOGRAM_NAME = QUERY_NAME.release().collect().sort(BIN_NAME) From 8ff945a94d2439ad89be20d7dc0d183924e1c1ee Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Thu, 14 Nov 2024 21:35:32 -0500 Subject: [PATCH 25/29] simplify download panel --- dp_wizard/app/results_panel.py | 31 +++------------------------ dp_wizard/utils/templates/__init__.py | 2 +- 2 files changed, 4 insertions(+), 29 deletions(-) diff --git a/dp_wizard/app/results_panel.py b/dp_wizard/app/results_panel.py index 761c88c..e68fe5a 100644 --- a/dp_wizard/app/results_panel.py +++ b/dp_wizard/app/results_panel.py @@ -10,23 +10,13 @@ def results_ui(): return ui.nav_panel( "Download results", - ui.p("These code snippets describe how to make a DP release of your data:"), - output_code_sample("Analysis JSON", "analysis_json_text"), # TODO: Drop this? - output_code_sample("Analysis Python", "analysis_python_text"), - ui.markdown( - "You can now make a differentially private release of your data. " - "This will lock the configuration you’ve provided on the previous pages." - ), - ui.markdown( - "You can also download code that can be executed to produce a DP release. " - "Downloaded code does not lock the configuration." 
- ), + ui.markdown("You can now make a differentially private release of your data."), ui.download_button( "download_script", "Download Script (.py)", ), ui.download_button( - "download_notebook_unexecuted", + "download_notebook", "Download Notebook (.ipynb)", ), value="results_panel", @@ -110,22 +100,7 @@ async def download_script(): filename="dp-wizard-notebook.ipynb", media_type="application/x-ipynb+json", ) - async def download_notebook_unexecuted(): - analysis = analysis_dict() - notebook_py = make_notebook_py( - csv_path=analysis["csv_path"], - contributions=analysis["contributions"], - epsilon=analysis["epsilon"], - columns=analysis["columns"], - ) - notebook_nb = convert_py_to_nb(notebook_py) - yield notebook_nb - - @render.download( - filename="dp-wizard-notebook-executed.ipynb", - media_type="application/x-ipynb+json", - ) - async def download_notebook_executed(): + async def download_notebook(): analysis = analysis_dict() notebook_py = make_notebook_py( csv_path=analysis["csv_path"], diff --git a/dp_wizard/utils/templates/__init__.py b/dp_wizard/utils/templates/__init__.py index 15becb4..e130f8b 100644 --- a/dp_wizard/utils/templates/__init__.py +++ b/dp_wizard/utils/templates/__init__.py @@ -172,7 +172,7 @@ def _make_query(column_name): def _make_queries(column_names): - return "confidence = 0.95\n" + "\n".join( + return "confidence = 0.95\n\n" + "\n".join( _make_query(column_name) for column_name in column_names ) From d635dd25baf1df12423d5dc512665e84cc23a489 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Thu, 14 Nov 2024 22:23:08 -0500 Subject: [PATCH 26/29] add markdown cells --- .../templates/no-tests/_column_config.py | 2 +- .../utils/templates/no-tests/_imports.py | 4 +++ .../utils/templates/no-tests/_notebook.py | 28 +++++++++++++++++-- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/dp_wizard/utils/templates/no-tests/_column_config.py b/dp_wizard/utils/templates/no-tests/_column_config.py index 19caf96..2cb3686 100644 --- 
a/dp_wizard/utils/templates/no-tests/_column_config.py +++ b/dp_wizard/utils/templates/no-tests/_column_config.py @@ -1,4 +1,4 @@ -# From the public information, determine the bins: +# From the public information, determine the bins for "COLUMN_NAME": CUT_LIST_NAME = make_cut_points( lower_bound=LOWER_BOUND, upper_bound=UPPER_BOUND, diff --git a/dp_wizard/utils/templates/no-tests/_imports.py b/dp_wizard/utils/templates/no-tests/_imports.py index b3ad0d1..8b1370a 100644 --- a/dp_wizard/utils/templates/no-tests/_imports.py +++ b/dp_wizard/utils/templates/no-tests/_imports.py @@ -20,6 +20,7 @@ def make_cut_points(lower_bound, upper_bound, bin_count): def df_to_columns(df): """ + Transform a Dataframe into a format that is easier to plot. >>> import polars as pl >>> df = pl.DataFrame({ ... "bin": ["A", "B", "C"], @@ -32,6 +33,9 @@ def df_to_columns(df): def plot_histogram(histogram_df, error, cutoff): # pragma: no cover + """ + Given a Dataframe for a histogram, plot the data. + """ bins, values = df_to_columns(histogram_df) mod = (len(bins) // 12) + 1 majors = [label for i, label in enumerate(bins) if i % mod == 0] diff --git a/dp_wizard/utils/templates/no-tests/_notebook.py b/dp_wizard/utils/templates/no-tests/_notebook.py index ef72c3b..16862ba 100644 --- a/dp_wizard/utils/templates/no-tests/_notebook.py +++ b/dp_wizard/utils/templates/no-tests/_notebook.py @@ -1,12 +1,36 @@ # This is a demonstration of how OpenDP can be used to create -# a differentially private release. To customize this, -# see the documentation for OpenDP: https://docs.opendp.org/ +# a differentially private release. To learn more about what's +# going on here, see the documentation for OpenDP: https://docs.opendp.org/ # + IMPORTS_BLOCK +# - +# Based on the input you provided, for each column we'll create a set of cut points, +# and a Polars expression that describes how we want to summarize that column. + +# + COLUMNS_BLOCK +# - + +# Next, we'll define our Context. 
This is where we set the privacy budget, +# and set the weight for each query under that overall budget. +# If we try to run more one more query than we have weights, it will error. +# Once the privacy budget is consumed, you shouldn't run more queries. +# + CONTEXT_BLOCK +# - + +# A note on `utf8-lossy`: CSVs can use different "character encodings" to +# represent characters outside the plain ascii character set, but out of the box +# the Polars library only supports UTF8. Specifying `utf8-lossy` preserves as +# much information as possible, and any unrecognized characters will be replaced +# by "�". If this is not sufficient, you will need to preprocess your data to +# reencode it as UTF8. +# +# Finally, we run the queries and plot the results. +# + QUERIES_BLOCK +# - From 0362d385bc18c3b97a963b57b97cbf99a63627cc Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Thu, 14 Nov 2024 22:30:54 -0500 Subject: [PATCH 27/29] tidy up --- dp_wizard/app/results_panel.py | 1 - dp_wizard/utils/templates/__init__.py | 2 +- dp_wizard/utils/templates/no-tests/_column_config.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/dp_wizard/app/results_panel.py b/dp_wizard/app/results_panel.py index e68fe5a..58a23a6 100644 --- a/dp_wizard/app/results_panel.py +++ b/dp_wizard/app/results_panel.py @@ -4,7 +4,6 @@ from dp_wizard.utils.templates import make_notebook_py, make_script_py from dp_wizard.utils.converters import convert_py_to_nb -from dp_wizard.app.components.outputs import output_code_sample def results_ui(): diff --git a/dp_wizard/utils/templates/__init__.py b/dp_wizard/utils/templates/__init__.py index e130f8b..400b964 100644 --- a/dp_wizard/utils/templates/__init__.py +++ b/dp_wizard/utils/templates/__init__.py @@ -227,7 +227,7 @@ def make_column_config_block(name, lower_bound, upper_bound, bin_count): ... upper_bound=100, ... bin_count=10 ... 
)) - # From the public information, determine the bins: + # From the public information, determine the bins for 'HW GRADE': hw_grade_cut_points = make_cut_points( lower_bound=0, upper_bound=100, diff --git a/dp_wizard/utils/templates/no-tests/_column_config.py b/dp_wizard/utils/templates/no-tests/_column_config.py index 2cb3686..bb367e6 100644 --- a/dp_wizard/utils/templates/no-tests/_column_config.py +++ b/dp_wizard/utils/templates/no-tests/_column_config.py @@ -1,4 +1,4 @@ -# From the public information, determine the bins for "COLUMN_NAME": +# From the public information, determine the bins for COLUMN_NAME: CUT_LIST_NAME = make_cut_points( lower_bound=LOWER_BOUND, upper_bound=UPPER_BOUND, From 5ef222e6bbbabf6cba52ce2fc2e5b818a2aca537 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 15 Nov 2024 13:40:12 -0500 Subject: [PATCH 28/29] fix copy-paste of util functions --- dp_wizard/app/components/column_module.py | 2 +- dp_wizard/utils/dp_helper.py | 21 +--------- .../components/plots.py => utils/shared.py} | 28 ++++++++++--- dp_wizard/utils/templates/__init__.py | 5 ++- .../utils/templates/no-tests/_imports.py | 42 ------------------- 5 files changed, 30 insertions(+), 68 deletions(-) rename dp_wizard/{app/components/plots.py => utils/shared.py} (50%) diff --git a/dp_wizard/app/components/column_module.py b/dp_wizard/app/components/column_module.py index a84a13d..3e152bf 100644 --- a/dp_wizard/app/components/column_module.py +++ b/dp_wizard/app/components/column_module.py @@ -3,7 +3,7 @@ from shiny import ui, render, module, reactive from dp_wizard.utils.dp_helper import make_confidence_accuracy_histogram -from dp_wizard.app.components.plots import plot_histogram +from dp_wizard.utils.shared import plot_histogram from dp_wizard.utils.templates import make_column_config_block from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip diff --git a/dp_wizard/utils/dp_helper.py b/dp_wizard/utils/dp_helper.py index b7949bd..f7ec1d9 100644 --- 
a/dp_wizard/utils/dp_helper.py +++ b/dp_wizard/utils/dp_helper.py @@ -2,28 +2,11 @@ import opendp.prelude as dp from dp_wizard.utils.mock_data import mock_data, ColumnDef +from dp_wizard.utils.shared import make_cut_points dp.enable_features("contrib") -def _make_cut_points(lower, upper, bin_count): - """ - Returns one more cut point than the bin_count. - (There are actually two more bins, extending to - -inf and +inf, but we'll ignore those.) - Cut points are evenly spaced from lower to upper. - - >>> _make_cut_points(0, 10, 1) - [0.0, 10.0] - >>> _make_cut_points(0, 10, 2) - [0.0, 5.0, 10.0] - >>> _make_cut_points(0, 10, 3) - [0.0, 3.33, 6.67, 10.0] - """ - bin_width = (upper - lower) / bin_count - return [round(lower + i * bin_width, 2) for i in range(bin_count + 1)] - - def make_confidence_accuracy_histogram( lower=None, upper=None, bin_count=None, contributions=None, weighted_epsilon=None ): @@ -58,7 +41,7 @@ def make_confidence_accuracy_histogram( # TODO: When this is stable, merge it to templates, so we can be # sure that we're using the same code in the preview that we # use in the generated notebook. - cut_points = _make_cut_points(lower, upper, bin_count) + cut_points = make_cut_points(lower, upper, bin_count) context = dp.Context.compositor( data=pl.LazyFrame(df).with_columns( # The cut() method returns a Polars categorical type. diff --git a/dp_wizard/app/components/plots.py b/dp_wizard/utils/shared.py similarity index 50% rename from dp_wizard/app/components/plots.py rename to dp_wizard/utils/shared.py index bd017b6..093ff1d 100644 --- a/dp_wizard/app/components/plots.py +++ b/dp_wizard/utils/shared.py @@ -1,21 +1,40 @@ -import matplotlib.pyplot as plt +# These functions are used both in the application and in generated notebooks. -def _df_to_columns(df): +def make_cut_points(lower_bound, upper_bound, bin_count): """ + Returns one more cut point than the bin_count. 
+ (There are actually two more bins, extending to + -inf and +inf, but we'll ignore those.) + Cut points are evenly spaced from lower_bound to upper_bound. + >>> make_cut_points(0, 10, 2) + [0.0, 5.0, 10.0] + """ + bin_width = (upper_bound - lower_bound) / bin_count + return [round(lower_bound + i * bin_width, 2) for i in range(bin_count + 1)] + + +def df_to_columns(df): + """ + Transform a Dataframe into a format that is easier to plot. >>> import polars as pl >>> df = pl.DataFrame({ ... "bin": ["A", "B", "C"], ... "len": [0, 10, 20], ... }) - >>> _df_to_columns(df) + >>> df_to_columns(df) (['A', 'B', 'C'], [0, 10, 20]) """ return tuple(list(df[col]) for col in df.columns) def plot_histogram(histogram_df, error, cutoff): # pragma: no cover - bins, values = _df_to_columns(histogram_df) + """ + Given a Dataframe for a histogram, plot the data. + """ + import matplotlib.pyplot as plt + + bins, values = df_to_columns(histogram_df) mod = (len(bins) // 12) + 1 majors = [label for i, label in enumerate(bins) if i % mod == 0] minors = [label for i, label in enumerate(bins) if i % mod != 0] @@ -26,4 +45,3 @@ def plot_histogram(histogram_df, error, cutoff): # pragma: no cover axes.set_xticks(minors, ["" for _ in minors], minor=True) axes.axhline(cutoff, color="lightgrey", zorder=-1) axes.set_ylim(bottom=0) - # TODO: Since this seems to return None, how does the information flow? 
diff --git a/dp_wizard/utils/templates/__init__.py b/dp_wizard/utils/templates/__init__.py index 400b964..591e1eb 100644 --- a/dp_wizard/utils/templates/__init__.py +++ b/dp_wizard/utils/templates/__init__.py @@ -141,7 +141,10 @@ def _make_context_for_script(contributions, epsilon, weights, column_names): def _make_imports(): - return str(_Template("imports").fill_values()) + return ( + str(_Template("imports").fill_values()) + + (Path(__file__).parent.parent / "shared.py").read_text() + ) def _make_columns(columns): diff --git a/dp_wizard/utils/templates/no-tests/_imports.py b/dp_wizard/utils/templates/no-tests/_imports.py index 8b1370a..9418f72 100644 --- a/dp_wizard/utils/templates/no-tests/_imports.py +++ b/dp_wizard/utils/templates/no-tests/_imports.py @@ -5,45 +5,3 @@ # The OpenDP team is working to vet the core algorithms. # Until that is complete we need to opt-in to use these features. dp.enable_features("contrib") - - -def make_cut_points(lower_bound, upper_bound, bin_count): - """ - Returns one more cut point than the bin_count. - (There are actually two more bins, extending to - -inf and +inf, but we'll ignore those.) - Cut points are evenly spaced from lower_bound to upper_bound. - """ - bin_width = (upper_bound - lower_bound) / bin_count - return [round(lower_bound + i * bin_width, 2) for i in range(bin_count + 1)] - - -def df_to_columns(df): - """ - Transform a Dataframe into a format that is easier to plot. - >>> import polars as pl - >>> df = pl.DataFrame({ - ... "bin": ["A", "B", "C"], - ... "len": [0, 10, 20], - ... }) - >>> df_to_columns(df) - (['A', 'B', 'C'], [0, 10, 20]) - """ - return tuple(list(df[col]) for col in df.columns) - - -def plot_histogram(histogram_df, error, cutoff): # pragma: no cover - """ - Given a Dataframe for a histogram, plot the data. 
- """ - bins, values = df_to_columns(histogram_df) - mod = (len(bins) // 12) + 1 - majors = [label for i, label in enumerate(bins) if i % mod == 0] - minors = [label for i, label in enumerate(bins) if i % mod != 0] - _figure, axes = plt.subplots() - bar_colors = ["blue" if v > cutoff else "lightblue" for v in values] - axes.bar(bins, values, color=bar_colors, yerr=error) - axes.set_xticks(majors, majors) - axes.set_xticks(minors, ["" for _ in minors], minor=True) - axes.axhline(cutoff, color="lightgrey", zorder=-1) - axes.set_ylim(bottom=0) From 70d34945035ed0254266797b96bfd1f083af737e Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Fri, 15 Nov 2024 14:36:31 -0500 Subject: [PATCH 29/29] sort the intervals --- dp_wizard/utils/shared.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/dp_wizard/utils/shared.py b/dp_wizard/utils/shared.py index 093ff1d..75719ff 100644 --- a/dp_wizard/utils/shared.py +++ b/dp_wizard/utils/shared.py @@ -14,18 +14,28 @@ def make_cut_points(lower_bound, upper_bound, bin_count): return [round(lower_bound + i * bin_width, 2) for i in range(bin_count + 1)] +def interval_bottom(interval): + """ + >>> interval_bottom("(10, 20]") + 10.0 + """ + return float(interval.split(",")[0][1:]) + + def df_to_columns(df): """ - Transform a Dataframe into a format that is easier to plot. + Transform a Dataframe into a format that is easier to plot, + parsing the interval strings to sort them as numbers. >>> import polars as pl >>> df = pl.DataFrame({ - ... "bin": ["A", "B", "C"], - ... "len": [0, 10, 20], + ... "bin": ["(-inf, 5]", "(10, 20]", "(5, 10]"], + ... "len": [0, 20, 10], ... 
}) >>> df_to_columns(df) - (['A', 'B', 'C'], [0, 10, 20]) + (('(-inf, 5]', '(5, 10]', '(10, 20]'), (0, 10, 20)) """ - return tuple(list(df[col]) for col in df.columns) + sorted_rows = sorted(df.rows(), key=lambda pair: interval_bottom(pair[0])) + return tuple(zip(*sorted_rows)) def plot_histogram(histogram_df, error, cutoff): # pragma: no cover