diff --git a/WHAT-WE-LEARNED.md b/WHAT-WE-LEARNED.md
index dff73d5..812b320 100644
--- a/WHAT-WE-LEARNED.md
+++ b/WHAT-WE-LEARNED.md
@@ -46,6 +46,8 @@ but that returns an error:
 Renderer.__call__() missing 1 required positional argument: '_fn'
 ```
 
+If I just refer to a reactive calc directly in the UI, there is no error in the log, just a spinner in the UI.
+
 ## No component testing
 
 It feels like a gap in the library that there is no component testing. The only advice is to pull out testable logic from the server functions, and for the rest, use end-to-end tests: There's not a recommended way to test the ui+server interaction for just one component.
diff --git a/dp_wizard/app/components/column_module.py b/dp_wizard/app/components/column_module.py
index a84a13d..3e152bf 100644
--- a/dp_wizard/app/components/column_module.py
+++ b/dp_wizard/app/components/column_module.py
@@ -3,7 +3,7 @@
 from shiny import ui, render, module, reactive
 
 from dp_wizard.utils.dp_helper import make_confidence_accuracy_histogram
-from dp_wizard.app.components.plots import plot_histogram
+from dp_wizard.utils.shared import plot_histogram
 from dp_wizard.utils.templates import make_column_config_block
 from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip
diff --git a/dp_wizard/app/components/plots.py b/dp_wizard/app/components/plots.py
deleted file mode 100644
index bd017b6..0000000
--- a/dp_wizard/app/components/plots.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import matplotlib.pyplot as plt
-
-
-def _df_to_columns(df):
-    """
-    >>> import polars as pl
-    >>> df = pl.DataFrame({
-    ...     "bin": ["A", "B", "C"],
-    ...     "len": [0, 10, 20],
-    ... })
-    >>> _df_to_columns(df)
-    (['A', 'B', 'C'], [0, 10, 20])
-    """
-    return tuple(list(df[col]) for col in df.columns)
-
-
-def plot_histogram(histogram_df, error, cutoff):  # pragma: no cover
-    bins, values = _df_to_columns(histogram_df)
-    mod = (len(bins) // 12) + 1
-    majors = [label for i, label in enumerate(bins) if i % mod == 0]
-    minors = [label for i, label in enumerate(bins) if i % mod != 0]
-    _figure, axes = plt.subplots()
-    bar_colors = ["blue" if v > cutoff else "lightblue" for v in values]
-    axes.bar(bins, values, color=bar_colors, yerr=error)
-    axes.set_xticks(majors, majors)
-    axes.set_xticks(minors, ["" for _ in minors], minor=True)
-    axes.axhline(cutoff, color="lightgrey", zorder=-1)
-    axes.set_ylim(bottom=0)
-    # TODO: Since this seems to return None, how does the information flow?
diff --git a/dp_wizard/app/results_panel.py b/dp_wizard/app/results_panel.py
index b50aca1..58a23a6 100644
--- a/dp_wizard/app/results_panel.py
+++ b/dp_wizard/app/results_panel.py
@@ -1,6 +1,6 @@
 from json import dumps
 
-from shiny import ui, render
+from shiny import ui, render, reactive
 
 from dp_wizard.utils.templates import make_notebook_py, make_script_py
 from dp_wizard.utils.converters import convert_py_to_nb
@@ -9,24 +9,13 @@ def results_ui():
     return ui.nav_panel(
         "Download results",
-        ui.p("TODO: Use this information to fill in a template!"),
-        ui.output_code("data_dump"),
-        ui.markdown(
-            "You can now make a differentially private release of your data. "
-            "This will lock the configuration you’ve provided on the previous pages."
-        ),
-        ui.markdown("TODO: Button: “Download Report (.txt)” (implemented as yaml?)"),
-        ui.markdown("TODO: Button: “Download Report (.csv)"),
-        ui.markdown(
-            "You can also download code that can be executed to produce a DP release. "
-            "Downloaded code does not lock the configuration."
-        ),
+        ui.markdown("You can now make a differentially private release of your data."),
         ui.download_button(
             "download_script",
             "Download Script (.py)",
         ),
         ui.download_button(
-            "download_notebook_unexecuted",
+            "download_notebook",
             "Download Notebook (.ipynb)",
         ),
         value="results_panel",
@@ -45,32 +34,64 @@ def results_server(
     weights,
     epsilon,
 ):  # pragma: no cover
-    @render.code
-    def data_dump():
-        # TODO: Use this information in a template!
+    @reactive.calc
+    def analysis_dict():
+        # weights().keys() reflects just the desired columns;
+        # the other reactive dicts retain inactive columns, so user
+        # inputs aren't lost when toggling checkboxes.
+        columns = {
+            col: {
+                "lower_bound": lower_bounds()[col],
+                "upper_bound": upper_bounds()[col],
+                "bin_count": int(bin_counts()[col]),
+                # TODO: Floats should work for weight, but they don't:
+                # https://github.com/opendp/opendp/issues/2140
+                "weight": int(weights()[col]),
+            }
+            for col in weights().keys()
+        }
+        return {
+            "csv_path": csv_path(),
+            "contributions": contributions(),
+            "epsilon": epsilon(),
+            "columns": columns,
+        }
+
+    @reactive.calc
+    def analysis_json():
         return dumps(
-            {
-                "csv_path": csv_path(),
-                "contributions": contributions(),
-                "lower_bounds": lower_bounds(),
-                "upper_bounds": upper_bounds(),
-                "bin_counts": bin_counts(),
-                "weights": weights(),
-                "epsilon": epsilon(),
-            },
+            analysis_dict(),
             indent=2,
         )
 
+    @render.text
+    def analysis_json_text():
+        return analysis_json()
+
+    @reactive.calc
+    def analysis_python():
+        analysis = analysis_dict()
+        return make_notebook_py(
+            csv_path=analysis["csv_path"],
+            contributions=analysis["contributions"],
+            epsilon=analysis["epsilon"],
+            columns=analysis["columns"],
+        )
+
+    @render.text
+    def analysis_python_text():
+        return analysis_python()
+
     @render.download(
         filename="dp-wizard-script.py",
         media_type="text/x-python",
     )
     async def download_script():
-        contributions = input.contributions()
+        analysis = analysis_dict()
         script_py = make_script_py(
-            contributions=contributions,
-            epsilon=1,
-            weights=[1],
+            contributions=analysis["contributions"],
+            epsilon=analysis["epsilon"],
+            columns=analysis["columns"],
         )
         yield script_py
 
@@ -78,28 +99,13 @@ async def download_script():
         filename="dp-wizard-notebook.ipynb",
         media_type="application/x-ipynb+json",
     )
-    async def download_notebook_unexecuted():
-        contributions = input.contributions()
-        notebook_py = make_notebook_py(
-            csv_path="todo.csv",
-            contributions=contributions,
-            epsilon=1,
-            weights=[1],
-        )
-        notebook_nb = convert_py_to_nb(notebook_py)
-        yield notebook_nb
-
-    @render.download(
-        filename="dp-wizard-notebook-executed.ipynb",
-        media_type="application/x-ipynb+json",
-    )
-    async def download_notebook_executed():
-        contributions = input.contributions()
+    async def download_notebook():
+        analysis = analysis_dict()
         notebook_py = make_notebook_py(
-            csv_path="todo.csv",
-            contributions=contributions,
-            epsilon=1,
-            weights=[1],
+            csv_path=analysis["csv_path"],
+            contributions=analysis["contributions"],
+            epsilon=analysis["epsilon"],
+            columns=analysis["columns"],
         )
         notebook_nb = convert_py_to_nb(notebook_py, execute=True)
         yield notebook_nb
diff --git a/dp_wizard/utils/csv_helper.py b/dp_wizard/utils/csv_helper.py
index 4c279ee..b7b92dd 100644
--- a/dp_wizard/utils/csv_helper.py
+++ b/dp_wizard/utils/csv_helper.py
@@ -2,9 +2,11 @@
 We'll use the following terms consistently throughout the application:
 - name: This is the exact column header in the CSV.
 - label: This is the string we'll display.
-- id: This is the string we'll pass as a module ID.
+- id: This is the opaque string we'll pass as a module ID.
+- identifier: This is a form of the name that can be used as a Python identifier.
 """
 
+import re
+
 import polars as pl
 
@@ -34,3 +36,7 @@ def name_to_id(name):
     # Shiny is fussy about module IDs,
     # but we don't need them to be human readable.
     return str(hash(name)).replace("-", "_")
+
+
+def name_to_identifier(name):
+    return re.sub(r"\W+", "_", name).lower()
diff --git a/dp_wizard/utils/dp_helper.py b/dp_wizard/utils/dp_helper.py
index b7949bd..f7ec1d9 100644
--- a/dp_wizard/utils/dp_helper.py
+++ b/dp_wizard/utils/dp_helper.py
@@ -2,28 +2,11 @@
 import opendp.prelude as dp
 
 from dp_wizard.utils.mock_data import mock_data, ColumnDef
+from dp_wizard.utils.shared import make_cut_points
 
 dp.enable_features("contrib")
 
 
-def _make_cut_points(lower, upper, bin_count):
-    """
-    Returns one more cut point than the bin_count.
-    (There are actually two more bins, extending to
-    -inf and +inf, but we'll ignore those.)
-    Cut points are evenly spaced from lower to upper.
-
-    >>> _make_cut_points(0, 10, 1)
-    [0.0, 10.0]
-    >>> _make_cut_points(0, 10, 2)
-    [0.0, 5.0, 10.0]
-    >>> _make_cut_points(0, 10, 3)
-    [0.0, 3.33, 6.67, 10.0]
-    """
-    bin_width = (upper - lower) / bin_count
-    return [round(lower + i * bin_width, 2) for i in range(bin_count + 1)]
-
-
 def make_confidence_accuracy_histogram(
     lower=None, upper=None, bin_count=None, contributions=None, weighted_epsilon=None
 ):
@@ -58,7 +41,7 @@
     # TODO: When this is stable, merge it to templates, so we can be
     # sure that we're using the same code in the preview that we
     # use in the generated notebook.
-    cut_points = _make_cut_points(lower, upper, bin_count)
+    cut_points = make_cut_points(lower, upper, bin_count)
     context = dp.Context.compositor(
         data=pl.LazyFrame(df).with_columns(
             # The cut() method returns a Polars categorical type.
diff --git a/dp_wizard/utils/shared.py b/dp_wizard/utils/shared.py
new file mode 100644
index 0000000..75719ff
--- /dev/null
+++ b/dp_wizard/utils/shared.py
@@ -0,0 +1,57 @@
+# These functions are used both in the application and in generated notebooks.
+
+
+def make_cut_points(lower_bound, upper_bound, bin_count):
+    """
+    Returns one more cut point than the bin_count.
+    (There are actually two more bins, extending to
+    -inf and +inf, but we'll ignore those.)
+    Cut points are evenly spaced from lower_bound to upper_bound.
+    >>> make_cut_points(0, 10, 2)
+    [0.0, 5.0, 10.0]
+    """
+    bin_width = (upper_bound - lower_bound) / bin_count
+    return [round(lower_bound + i * bin_width, 2) for i in range(bin_count + 1)]
+
+
+def interval_bottom(interval):
+    """
+    >>> interval_bottom("(10, 20]")
+    10.0
+    """
+    return float(interval.split(",")[0][1:])
+
+
+def df_to_columns(df):
+    """
+    Transform a DataFrame into a format that is easier to plot,
+    parsing the interval strings to sort them as numbers.
+    >>> import polars as pl
+    >>> df = pl.DataFrame({
+    ...     "bin": ["(-inf, 5]", "(10, 20]", "(5, 10]"],
+    ...     "len": [0, 20, 10],
+    ... })
+    >>> df_to_columns(df)
+    (('(-inf, 5]', '(5, 10]', '(10, 20]'), (0, 10, 20))
+    """
+    sorted_rows = sorted(df.rows(), key=lambda pair: interval_bottom(pair[0]))
+    return tuple(zip(*sorted_rows))
+
+
+def plot_histogram(histogram_df, error, cutoff):  # pragma: no cover
+    """
+    Given a DataFrame for a histogram, plot the data.
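+    Bars taller than the cutoff are drawn in a darker blue,
+    and the given error is shown as error bars on each bar.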
+    """
+    import matplotlib.pyplot as plt
+
+    bins, values = df_to_columns(histogram_df)
+    mod = (len(bins) // 12) + 1
+    majors = [label for i, label in enumerate(bins) if i % mod == 0]
+    minors = [label for i, label in enumerate(bins) if i % mod != 0]
+    _figure, axes = plt.subplots()
+    bar_colors = ["blue" if v > cutoff else "lightblue" for v in values]
+    axes.bar(bins, values, color=bar_colors, yerr=error)
+    axes.set_xticks(majors, majors)
+    axes.set_xticks(minors, ["" for _ in minors], minor=True)
+    axes.axhline(cutoff, color="lightgrey", zorder=-1)
+    axes.set_ylim(bottom=0)
diff --git a/dp_wizard/utils/templates/__init__.py b/dp_wizard/utils/templates/__init__.py
index 8251890..591e1eb 100644
--- a/dp_wizard/utils/templates/__init__.py
+++ b/dp_wizard/utils/templates/__init__.py
@@ -7,6 +7,7 @@
 from pathlib import Path
 import re
 
+from dp_wizard.utils.csv_helper import name_to_identifier
 
 class _Template:
@@ -70,11 +71,41 @@ def __str__(self):
         return self._template
 
 
-def _make_context_for_notebook(csv_path, contributions, epsilon, weights):
+def _make_margins_dict(bin_names):
+    # TODO: Don't worry too much about the formatting here.
+    # Plan to run the output through black for consistency.
+    # https://github.com/opendp/dp-creator-ii/issues/50
+    margins = (
+        [
+            """
+        (): dp.polars.Margin(
+            public_info="lengths",
+        ),"""
+        ]
+        + [
+            f"""
+        ("{bin_name}",): dp.polars.Margin(
+            public_info="keys",
+        ),"""
+            for bin_name in bin_names
+        ]
+    )
+
+    margins_dict = "{" + "".join(margins) + "\n    }"
+    return margins_dict
+
+
+def _make_context_for_notebook(csv_path, contributions, epsilon, weights, column_names):
     privacy_unit_block = make_privacy_unit_block(contributions)
     privacy_loss_block = make_privacy_loss_block(epsilon)
+    margins_dict = _make_margins_dict([f"{name}_bin" for name in column_names])
+    columns = ", ".join([f"{name}_config" for name in column_names])
     return str(
         _Template("context")
+        .fill_expressions(
+            MARGINS_DICT=margins_dict,
+            COLUMNS=columns,
+        )
         .fill_values(
             CSV_PATH=csv_path,
             WEIGHTS=weights,
@@ -86,13 +117,17 @@
     )
 
 
-def _make_context_for_script(contributions, epsilon, weights):
+def _make_context_for_script(contributions, epsilon, weights, column_names):
     privacy_unit_block = make_privacy_unit_block(contributions)
     privacy_loss_block = make_privacy_loss_block(epsilon)
+    margins_dict = _make_margins_dict([f"{name}_bin" for name in column_names])
+    columns = ",".join([f"{name}_config" for name in column_names])
     return str(
         _Template("context")
         .fill_expressions(
             CSV_PATH="csv_path",
+            MARGINS_DICT=margins_dict,
+            COLUMNS=columns,
         )
         .fill_values(
             WEIGHTS=weights,
@@ -100,37 +135,81 @@
         .fill_blocks(
             PRIVACY_UNIT_BLOCK=privacy_unit_block,
             PRIVACY_LOSS_BLOCK=privacy_loss_block,
+            MARGINS_DICT=margins_dict,
         )
     )
 
 
 def _make_imports():
-    return str(_Template("imports").fill_values())
+    return (
+        str(_Template("imports").fill_values())
+        + (Path(__file__).parent.parent / "shared.py").read_text()
+    )
+
+
+def _make_columns(columns):
+    return "\n".join(
+        make_column_config_block(
+            name=name,
+            lower_bound=col["lower_bound"],
+            upper_bound=col["upper_bound"],
+            bin_count=col["bin_count"],
+        )
+        for name, col in columns.items()
+    )
 
 
-def make_notebook_py(csv_path, contributions, epsilon, weights):
+def _make_query(column_name):
+    identifier = name_to_identifier(column_name)
+    return str(
+        _Template("query")
+        .fill_values(
+            BIN_NAME=f"{identifier}_bin",
+        )
+        .fill_expressions(
+            QUERY_NAME=f"{identifier}_query",
+            ACCURACY_NAME=f"{identifier}_accuracy",
+            HISTOGRAM_NAME=f"{identifier}_histogram",
+        )
+    )
+
+
+def _make_queries(column_names):
+    return "confidence = 0.95\n\n" + "\n".join(
+        _make_query(column_name) for column_name in column_names
+    )
+
+
+def make_notebook_py(csv_path, contributions, epsilon, columns):
     return str(
         _Template("notebook").fill_blocks(
             IMPORTS_BLOCK=_make_imports(),
+            COLUMNS_BLOCK=_make_columns(columns),
             CONTEXT_BLOCK=_make_context_for_notebook(
                 csv_path=csv_path,
                 contributions=contributions,
                 epsilon=epsilon,
-                weights=weights,
+                weights=[column["weight"] for column in columns.values()],
+                column_names=[name_to_identifier(name) for name in columns.keys()],
             ),
+            QUERIES_BLOCK=_make_queries(columns.keys()),
         )
     )
 
 
-def make_script_py(contributions, epsilon, weights):
+def make_script_py(contributions, epsilon, columns):
     return str(
         _Template("script").fill_blocks(
             IMPORTS_BLOCK=_make_imports(),
+            COLUMNS_BLOCK=_make_columns(columns),
             CONTEXT_BLOCK=_make_context_for_script(
+                # csv_path is a CLI parameter in the script
                 contributions=contributions,
                 epsilon=epsilon,
-                weights=weights,
+                weights=[column["weight"] for column in columns.values()],
+                column_names=[name_to_identifier(name) for name in columns.keys()],
             ),
+            QUERIES_BLOCK=_make_queries(columns.keys()),
         )
     )
 
@@ -151,8 +230,12 @@ def make_column_config_block(name, lower_bound, upper_bound, bin_count):
     ...     upper_bound=100,
     ...     bin_count=10
     ... ))
-    # From the public information, determine the bins:
-    hw_grade_cut_points = make_cut_points(0, 100, 10)
+    # From the public information, determine the bins for 'HW GRADE':
+    hw_grade_cut_points = make_cut_points(
+        lower_bound=0,
+        upper_bound=100,
+        bin_count=10,
+    )
 
     # Use these bins to define a Polars column:
     hw_grade_config = (
@@ -173,7 +256,7 @@
         .fill_values(
             LOWER_BOUND=lower_bound,
             UPPER_BOUND=upper_bound,
-            BINS=bin_count,
+            BIN_COUNT=bin_count,
             COLUMN_NAME=name,
             BIN_COLUMN_NAME=f"{snake_name}_bin",
         )
diff --git a/dp_wizard/utils/templates/no-tests/_column_config.py b/dp_wizard/utils/templates/no-tests/_column_config.py
index ddb44bd..bb367e6 100644
--- a/dp_wizard/utils/templates/no-tests/_column_config.py
+++ b/dp_wizard/utils/templates/no-tests/_column_config.py
@@ -1,5 +1,9 @@
-# From the public information, determine the bins:
-CUT_LIST_NAME = make_cut_points(LOWER_BOUND, UPPER_BOUND, BINS)
+# From the public information, determine the bins for COLUMN_NAME:
+CUT_LIST_NAME = make_cut_points(
+    lower_bound=LOWER_BOUND,
+    upper_bound=UPPER_BOUND,
+    bin_count=BIN_COUNT,
+)
 
 # Use these bins to define a Polars column:
 POLARS_CONFIG_NAME = (
diff --git a/dp_wizard/utils/templates/no-tests/_context.py b/dp_wizard/utils/templates/no-tests/_context.py
index cdd8194..32ca440 100644
--- a/dp_wizard/utils/templates/no-tests/_context.py
+++ b/dp_wizard/utils/templates/no-tests/_context.py
@@ -1,8 +1,9 @@
 PRIVACY_UNIT_BLOCK
 PRIVACY_LOSS_BLOCK
 context = dp.Context.compositor(
-    data=pl.scan_csv(CSV_PATH, encoding="utf8-lossy"),
+    data=pl.scan_csv(CSV_PATH, encoding="utf8-lossy").with_columns(COLUMNS),
     privacy_unit=privacy_unit,
     privacy_loss=privacy_loss,
     split_by_weights=WEIGHTS,
+    margins=MARGINS_DICT,
 )
diff --git a/dp_wizard/utils/templates/no-tests/_imports.py b/dp_wizard/utils/templates/no-tests/_imports.py
index 5df8d79..9418f72 100644
--- a/dp_wizard/utils/templates/no-tests/_imports.py
+++ b/dp_wizard/utils/templates/no-tests/_imports.py
@@ -1,4 +1,7 @@
 import polars as pl
 import opendp.prelude as dp
+import matplotlib.pyplot as plt
 
+# The OpenDP team is working to vet the core algorithms.
+# Until that is complete, we need to opt in to use these features.
 dp.enable_features("contrib")
diff --git a/dp_wizard/utils/templates/no-tests/_notebook.py b/dp_wizard/utils/templates/no-tests/_notebook.py
index c6aeed3..16862ba 100644
--- a/dp_wizard/utils/templates/no-tests/_notebook.py
+++ b/dp_wizard/utils/templates/no-tests/_notebook.py
@@ -1,9 +1,36 @@
-# This is a demonstration how OpenDP can be used to create
-# a differentially private release. To customize this,
-# see the documentation for OpenDP: https://docs.opendp.org/
+# This is a demonstration of how OpenDP can be used to create
+# a differentially private release. To learn more about what's
+# going on here, see the documentation for OpenDP: https://docs.opendp.org/
 #
 # +
 IMPORTS_BLOCK
+# -
+
+# Based on the input you provided, for each column we'll create a set of cut points,
+# and a Polars expression that describes how we want to summarize that column.
+
+# +
+COLUMNS_BLOCK
+# -
+
+# Next, we'll define our Context. This is where we set the privacy budget,
+# and set the weight for each query under that overall budget.
+# If we try to run more queries than we have weights for, it will error.
+# Once the privacy budget is consumed, you shouldn't run more queries.
+
+# +
 CONTEXT_BLOCK
-print(context)
+# -
+
+# A note on `utf8-lossy`: CSVs can use different "character encodings" to
+# represent characters outside the plain ASCII character set, but out of the box
+# the Polars library only supports UTF-8. Specifying `utf8-lossy` preserves as
+# much information as possible, and any unrecognized characters will be replaced
+# by "�". If this is not sufficient, you will need to preprocess your data to
+# re-encode it as UTF-8.
+#
+# Finally, we run the queries and plot the results.
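+#
+# Each query also reports an "accuracy" at the chosen confidence level:
+# with that probability, the noisy count is within ± accuracy of the
+# true count. The plots show this accuracy as error bars.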
+
+# +
+QUERIES_BLOCK
+# -
diff --git a/dp_wizard/utils/templates/no-tests/_query.py b/dp_wizard/utils/templates/no-tests/_query.py
new file mode 100644
index 0000000..0d78f0d
--- /dev/null
+++ b/dp_wizard/utils/templates/no-tests/_query.py
@@ -0,0 +1,4 @@
+QUERY_NAME = context.query().group_by(BIN_NAME).agg(pl.len().dp.noise())
+ACCURACY_NAME = QUERY_NAME.summarize(alpha=1 - confidence)["accuracy"].item()
+HISTOGRAM_NAME = QUERY_NAME.release().collect().sort(BIN_NAME)
+plot_histogram(HISTOGRAM_NAME, ACCURACY_NAME, 0)
diff --git a/dp_wizard/utils/templates/no-tests/_script.py b/dp_wizard/utils/templates/no-tests/_script.py
index 37be6c5..ab43f5c 100644
--- a/dp_wizard/utils/templates/no-tests/_script.py
+++ b/dp_wizard/utils/templates/no-tests/_script.py
@@ -2,6 +2,8 @@
 
 IMPORTS_BLOCK
 
+COLUMNS_BLOCK
+
 
 def get_context(csv_path):
     CONTEXT_BLOCK
@@ -15,4 +17,5 @@ def get_context(csv_path):
     parser.add_argument("--csv", help="Path to csv containing private data")
     args = parser.parse_args()
     context = get_context(csv_path=args.csv)
-    print(context)
+
+    QUERIES_BLOCK
diff --git a/tests/fixtures/expected-script.py b/tests/fixtures/expected-script.py
deleted file mode 100644
index 2f83281..0000000
--- a/tests/fixtures/expected-script.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from argparse import ArgumentParser
-
-import polars as pl
-import opendp.prelude as dp
-
-dp.enable_features("contrib")
-
-
-def get_context(csv_path):
-    privacy_unit = dp.unit_of(contributions=1)
-
-    privacy_loss = dp.loss_of(epsilon=1, delta=1e-7)
-
-    context = dp.Context.compositor(
-        data=pl.scan_csv(csv_path, encoding="utf8-lossy"),
-        privacy_unit=privacy_unit,
-        privacy_loss=privacy_loss,
-        split_by_weights=[1],
-    )
-
-    return context
-
-
-if __name__ == "__main__":
-    parser = ArgumentParser(
-        description="Creates a differentially private release from a csv"
-    )
-    parser.add_argument("--csv", help="Path to csv containing private data")
-    args = parser.parse_args()
-    context = get_context(csv_path=args.csv)
-    print(context)
diff --git a/tests/utils/test_templates.py b/tests/utils/test_templates.py
index a5970a2..7bcdfc3 100644
--- a/tests/utils/test_templates.py
+++ b/tests/utils/test_templates.py
@@ -83,10 +83,7 @@ def test_fill_template_unfilled_slots():
     context_template = _Template("context")
     with pytest.raises(
         Exception,
-        match=re.escape(
-            "context.py has unfilled slots: "
-            "CSV_PATH, PRIVACY_LOSS_BLOCK, PRIVACY_UNIT_BLOCK, WEIGHTS"
-        ),
+        match=re.escape("context.py has unfilled slots"),
     ):
         str(context_template.fill_values())
 
@@ -96,8 +93,18 @@ def test_make_notebook():
         csv_path=fake_csv,
         contributions=1,
         epsilon=1,
-        weights=[1],
+        columns={
+            # For a strong test, use a column whose name
+            # doesn't work as a python identifier.
+            "hw-number": {
+                "lower_bound": 5,
+                "upper_bound": 15,
+                "bin_count": 20,
+                "weight": 4,
+            }
+        },
     )
+    print(notebook)
     globals = {}
     exec(notebook, globals)
     assert isinstance(globals["context"], dp.Context)
@@ -107,15 +114,16 @@ def test_make_script():
     script = make_script_py(
         contributions=1,
         epsilon=1,
-        weights=[1],
+        columns={
+            "hw-number": {
+                "lower_bound": 5,
+                "upper_bound": 15,
+                "bin_count": 20,
+                "weight": 4,
+            }
+        },
     )
-
-    def clear_empty_lines(text):
-        # Cleanup whitespace after indenting blocks
-        return re.sub(r"^\s+$", "", text, flags=re.MULTILINE).strip()
-
-    expected_script = (fixtures_path / "expected-script.py").read_text()
-    assert clear_empty_lines(script) == clear_empty_lines(expected_script)
+    print(script)
 
     with NamedTemporaryFile(mode="w") as fp:
         fp.write(script)
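
For reference, here is roughly what the filled-in _query.py template should expand to for the "hw-number" test column. This expansion is inferred from the template code above, not copied from generated output: it assumes fill_values substitutes quoted values, fill_expressions substitutes bare names, and name_to_identifier("hw-number") yields "hw_number".

    confidence = 0.95

    hw_number_query = context.query().group_by("hw_number_bin").agg(pl.len().dp.noise())
    hw_number_accuracy = hw_number_query.summarize(alpha=1 - confidence)["accuracy"].item()
    hw_number_histogram = hw_number_query.release().collect().sort("hw_number_bin")
    plot_histogram(hw_number_histogram, hw_number_accuracy, 0)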