Skip to content

Commit

Permalink
Make notebook with plots for columns (#152)
Browse files Browse the repository at this point in the history
* lower and upper more consistently

* one more

* handle bounds/bins/counts the same way

* lots of reactive dicts, but the UI has not changed

* data dump on the results page

* add a pragma: no cover

* reset widget values after checkbox change

* do not clean up values

* put tooltips in labels

* pull warning up to analysis panel. TODO: conditional

* move warning to bottom of list

* analysis definition JSON

* stubs for python

* stub a script on results page

* include column info in generated script

* closer to a runnable notebook

* stuck on split_by_weight... maybe a library bug?

* margin stubs

* format python identifiers correctly

* script has gotten longer: does not make sense to check for exact equality

* fix syntactic problems in generated code

* fill in columns, but still WIP

* fix column names; tests pass

* move confidence

* simplify download panel

* add markdown cells

* tidy up

* fix copy-paste of util functions

* sort the intervals
  • Loading branch information
mccalluc authored Nov 18, 2024
1 parent 4f5b03d commit 3b89001
Show file tree
Hide file tree
Showing 16 changed files with 290 additions and 163 deletions.
2 changes: 2 additions & 0 deletions WHAT-WE-LEARNED.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ but that returns an error:
Renderer.__call__() missing 1 required positional argument: '_fn'
```

If I just refer to a reactive calc directly in the UI, there is no error in the log, just a spinner in the UI.

## No component testing

It feels like a gap in the library that there is no component testing. The only advice is to pull out testable logic from the server functions and, for the rest, to use end-to-end tests: there is no recommended way to test the ui+server interaction for just one component.
Expand Down
2 changes: 1 addition & 1 deletion dp_wizard/app/components/column_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from shiny import ui, render, module, reactive

from dp_wizard.utils.dp_helper import make_confidence_accuracy_histogram
from dp_wizard.app.components.plots import plot_histogram
from dp_wizard.utils.shared import plot_histogram
from dp_wizard.utils.templates import make_column_config_block
from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip

Expand Down
29 changes: 0 additions & 29 deletions dp_wizard/app/components/plots.py

This file was deleted.

108 changes: 57 additions & 51 deletions dp_wizard/app/results_panel.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from json import dumps

from shiny import ui, render
from shiny import ui, render, reactive

from dp_wizard.utils.templates import make_notebook_py, make_script_py
from dp_wizard.utils.converters import convert_py_to_nb
Expand All @@ -9,24 +9,13 @@
def results_ui():
return ui.nav_panel(
"Download results",
ui.p("TODO: Use this information to fill in a template!"),
ui.output_code("data_dump"),
ui.markdown(
"You can now make a differentially private release of your data. "
"This will lock the configuration you’ve provided on the previous pages."
),
ui.markdown("TODO: Button: “Download Report (.txt)” (implemented as yaml?)"),
ui.markdown("TODO: Button: “Download Report (.csv)"),
ui.markdown(
"You can also download code that can be executed to produce a DP release. "
"Downloaded code does not lock the configuration."
),
ui.markdown("You can now make a differentially private release of your data."),
ui.download_button(
"download_script",
"Download Script (.py)",
),
ui.download_button(
"download_notebook_unexecuted",
"download_notebook",
"Download Notebook (.ipynb)",
),
value="results_panel",
Expand All @@ -45,61 +34,78 @@ def results_server(
weights,
epsilon,
): # pragma: no cover
@render.code
def data_dump():
# TODO: Use this information in a template!
@reactive.calc
def analysis_dict():
# weights().keys() will reflect the desired columns:
# The others retain inactive columns, so user
# inputs aren't lost when toggling checkboxes.
columns = {
col: {
"lower_bound": lower_bounds()[col],
"upper_bound": upper_bounds()[col],
"bin_count": int(bin_counts()[col]),
# TODO: Floats should work for weight, but they don't:
# https://github.com/opendp/opendp/issues/2140
"weight": int(weights()[col]),
}
for col in weights().keys()
}
return {
"csv_path": csv_path(),
"contributions": contributions(),
"epsilon": epsilon(),
"columns": columns,
}

@reactive.calc
def analysis_json():
return dumps(
{
"csv_path": csv_path(),
"contributions": contributions(),
"lower_bounds": lower_bounds(),
"upper_bounds": upper_bounds(),
"bin_counts": bin_counts(),
"weights": weights(),
"epsilon": epsilon(),
},
analysis_dict(),
indent=2,
)

@render.text
def analysis_json_text():
return analysis_json()

@reactive.calc
def analysis_python():
analysis = analysis_dict()
return make_notebook_py(
csv_path=analysis["csv_path"],
contributions=analysis["contributions"],
epsilon=analysis["epsilon"],
columns=analysis["columns"],
)

@render.text
def analysis_python_text():
return analysis_python()

@render.download(
filename="dp-wizard-script.py",
media_type="text/x-python",
)
async def download_script():
contributions = input.contributions()
analysis = analysis_dict()
script_py = make_script_py(
contributions=contributions,
epsilon=1,
weights=[1],
contributions=analysis["contributions"],
epsilon=analysis["epsilon"],
columns=analysis["columns"],
)
yield script_py

@render.download(
filename="dp-wizard-notebook.ipynb",
media_type="application/x-ipynb+json",
)
async def download_notebook_unexecuted():
contributions = input.contributions()
notebook_py = make_notebook_py(
csv_path="todo.csv",
contributions=contributions,
epsilon=1,
weights=[1],
)
notebook_nb = convert_py_to_nb(notebook_py)
yield notebook_nb

@render.download(
filename="dp-wizard-notebook-executed.ipynb",
media_type="application/x-ipynb+json",
)
async def download_notebook_executed():
contributions = input.contributions()
async def download_notebook():
analysis = analysis_dict()
notebook_py = make_notebook_py(
csv_path="todo.csv",
contributions=contributions,
epsilon=1,
weights=[1],
csv_path=analysis["csv_path"],
contributions=analysis["contributions"],
epsilon=analysis["epsilon"],
columns=analysis["columns"],
)
notebook_nb = convert_py_to_nb(notebook_py, execute=True)
yield notebook_nb
8 changes: 7 additions & 1 deletion dp_wizard/utils/csv_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
We'll use the following terms consistently throughout the application:
- name: This is the exact column header in the CSV.
- label: This is the string we'll display.
- id: This is the string we'll pass as a module ID.
- id: This is the opaque string we'll pass as a module ID.
- identifier: This is a form that can be used as a Python identifier.
"""

import re
import polars as pl


Expand Down Expand Up @@ -34,3 +36,7 @@ def name_to_id(name):
# Shiny is fussy about module IDs,
# but we don't need them to be human readable.
return str(hash(name)).replace("-", "_")


def name_to_identifier(name):
    """
    Convert a CSV column name into a lowercase string
    that can be used as a Python identifier.

    Each run of non-word characters is replaced by a single underscore.
    A Python identifier cannot be empty or begin with a digit,
    so an underscore is prepended in those cases.

    >>> name_to_identifier("Hello World!")
    'hello_world_'
    >>> name_to_identifier("2024 Sales")
    '_2024_sales'
    >>> name_to_identifier("")
    '_'
    """
    identifier = re.sub(r"\W+", "_", name).lower()
    if not identifier or identifier[0].isdigit():
        # Without the prefix, generated code like "2024_sales = ..."
        # would be a syntax error.
        identifier = "_" + identifier
    return identifier
21 changes: 2 additions & 19 deletions dp_wizard/utils/dp_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,11 @@
import opendp.prelude as dp

from dp_wizard.utils.mock_data import mock_data, ColumnDef
from dp_wizard.utils.shared import make_cut_points

dp.enable_features("contrib")


def _make_cut_points(lower, upper, bin_count):
"""
Returns one more cut point than the bin_count.
(There are actually two more bins, extending to
-inf and +inf, but we'll ignore those.)
Cut points are evenly spaced from lower to upper.
>>> _make_cut_points(0, 10, 1)
[0.0, 10.0]
>>> _make_cut_points(0, 10, 2)
[0.0, 5.0, 10.0]
>>> _make_cut_points(0, 10, 3)
[0.0, 3.33, 6.67, 10.0]
"""
bin_width = (upper - lower) / bin_count
return [round(lower + i * bin_width, 2) for i in range(bin_count + 1)]


def make_confidence_accuracy_histogram(
lower=None, upper=None, bin_count=None, contributions=None, weighted_epsilon=None
):
Expand Down Expand Up @@ -58,7 +41,7 @@ def make_confidence_accuracy_histogram(
# TODO: When this is stable, merge it to templates, so we can be
# sure that we're using the same code in the preview that we
# use in the generated notebook.
cut_points = _make_cut_points(lower, upper, bin_count)
cut_points = make_cut_points(lower, upper, bin_count)
context = dp.Context.compositor(
data=pl.LazyFrame(df).with_columns(
# The cut() method returns a Polars categorical type.
Expand Down
57 changes: 57 additions & 0 deletions dp_wizard/utils/shared.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# These functions are used both in the application and in generated notebooks.


def make_cut_points(lower_bound, upper_bound, bin_count):
    """
    Compute evenly spaced histogram cut points between the two bounds.

    The result has bin_count + 1 entries, starting at lower_bound,
    each rounded to two decimal places.
    (The two extra bins extending to -inf and +inf
    are not represented here.)
    >>> make_cut_points(0, 10, 2)
    [0.0, 5.0, 10.0]
    """
    step = (upper_bound - lower_bound) / bin_count
    cut_points = []
    for index in range(bin_count + 1):
        cut_points.append(round(lower_bound + index * step, 2))
    return cut_points


def interval_bottom(interval):
    """
    Parse the lower edge out of an interval label and return it as a float.

    >>> interval_bottom("(10, 20]")
    10.0
    """
    lower_text, _, _ = interval.partition(",")
    # Drop the first character (the opening bracket in a label
    # like "(10, 20]") before converting.
    return float(lower_text[1:])


def df_to_columns(df):
    """
    Return the rows of a Dataframe as a tuple of columns for plotting.

    Rows are ordered by the numeric lower edge of the interval string
    in their first position, rather than by plain string comparison.
    >>> import polars as pl
    >>> df = pl.DataFrame({
    ...     "bin": ["(-inf, 5]", "(10, 20]", "(5, 10]"],
    ...     "len": [0, 20, 10],
    ... })
    >>> df_to_columns(df)
    (('(-inf, 5]', '(5, 10]', '(10, 20]'), (0, 10, 20))
    """

    def lower_edge(row):
        return interval_bottom(row[0])

    ordered_rows = list(df.rows())
    ordered_rows.sort(key=lower_edge)
    # Transpose the sorted rows into one tuple per column.
    return tuple(zip(*ordered_rows))


def plot_histogram(histogram_df, error, cutoff):  # pragma: no cover
    """
    Draw a bar chart for the histogram in the given Dataframe.

    Bars with a value above cutoff are blue and the rest are light blue.
    error is passed to matplotlib as the y error bars, and a light grey
    horizontal line is drawn at cutoff.
    """
    import matplotlib.pyplot as plt

    bins, values = df_to_columns(histogram_df)

    # Only every stride-th bin gets a labeled major tick; the others get
    # unlabeled minor ticks (presumably to keep the x axis readable).
    stride = (len(bins) // 12) + 1
    major_labels = []
    minor_labels = []
    for position, label in enumerate(bins):
        if position % stride == 0:
            major_labels.append(label)
        else:
            minor_labels.append(label)

    bar_colors = []
    for value in values:
        bar_colors.append("blue" if value > cutoff else "lightblue")

    _figure, axes = plt.subplots()
    axes.bar(bins, values, color=bar_colors, yerr=error)
    axes.set_xticks(major_labels, major_labels)
    axes.set_xticks(minor_labels, [""] * len(minor_labels), minor=True)
    axes.axhline(cutoff, color="lightgrey", zorder=-1)
    axes.set_ylim(bottom=0)
Loading

0 comments on commit 3b89001

Please sign in to comment.