Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make notebook with plots for columns #152

Merged
merged 35 commits into from
Nov 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
f4a3413
lower and upper more consistently
mccalluc Nov 4, 2024
12c1402
one more
mccalluc Nov 4, 2024
1274e75
handle bounds/bins/counts the same way
mccalluc Nov 5, 2024
19c3065
lots of reactive dicts, but the UI has not changed
mccalluc Nov 5, 2024
603d0bc
data dump on the results page
mccalluc Nov 5, 2024
ad32217
add a pragma: no cover
mccalluc Nov 5, 2024
c8b4ddc
reset widget values after checkbox change
mccalluc Nov 7, 2024
1247392
do not clean up values
mccalluc Nov 7, 2024
34e1170
resolve conflicts
mccalluc Nov 7, 2024
f35ed4f
resolve conflicts
mccalluc Nov 7, 2024
a11c9fa
resolve conflicts
mccalluc Nov 7, 2024
d84c4cd
use "upper" and "lower"
mccalluc Nov 7, 2024
16463b4
put tooltips in labels
mccalluc Nov 8, 2024
9cd991d
pull warning up to analysis panel. TODO: conditional
mccalluc Nov 8, 2024
f2b5192
move warning to bottom of list
mccalluc Nov 8, 2024
da6a0cb
analysis definition JSON
mccalluc Nov 8, 2024
0dbdd7b
stubs for python
mccalluc Nov 8, 2024
610404c
stub a script on results page
mccalluc Nov 8, 2024
c45585b
include column info in generated script
mccalluc Nov 8, 2024
5803715
closer to a runable notebook
mccalluc Nov 8, 2024
93c9543
stuck on split_by_weight... maybe a library bug?
mccalluc Nov 8, 2024
f27a175
margin stubs
mccalluc Nov 13, 2024
6093cfb
resolve conflicts
mccalluc Nov 13, 2024
6b8a38f
format python identifiers correctly
mccalluc Nov 13, 2024
1a9a2a7
script has gotten longer: does not make sense to check for exact equa…
mccalluc Nov 13, 2024
d3be33d
fix syntactic problems in generated code
mccalluc Nov 13, 2024
03c6dfa
fill in columns, but still WIP
mccalluc Nov 14, 2024
d560195
Merge dp_creator_ii -> dp_wizard
mccalluc Nov 15, 2024
a3abd8d
fix column names; tests pass
mccalluc Nov 15, 2024
a79dbdc
move confidence
mccalluc Nov 15, 2024
8ff945a
simplify download panel
mccalluc Nov 15, 2024
d635dd2
add markdown cells
mccalluc Nov 15, 2024
0362d38
tidy up
mccalluc Nov 15, 2024
5ef222e
fix copy-paste of util functions
mccalluc Nov 15, 2024
70d3494
sort the intervals
mccalluc Nov 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions WHAT-WE-LEARNED.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ but that returns an error:
Renderer.__call__() missing 1 required positional argument: '_fn'
```

If I just refer to a reactive calc directly in the UI, there is no error in the log, just a spinner in the UI.

## No component testing

It feels like a gap in the library that there is no component testing. The only advice is to pull testable logic out of the server functions and, for the rest, to use end-to-end tests; there is no recommended way to test the UI+server interaction for just one component.
Expand Down
2 changes: 1 addition & 1 deletion dp_wizard/app/components/column_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from shiny import ui, render, module, reactive

from dp_wizard.utils.dp_helper import make_confidence_accuracy_histogram
from dp_wizard.app.components.plots import plot_histogram
from dp_wizard.utils.shared import plot_histogram
from dp_wizard.utils.templates import make_column_config_block
from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip

Expand Down
29 changes: 0 additions & 29 deletions dp_wizard/app/components/plots.py

This file was deleted.

108 changes: 57 additions & 51 deletions dp_wizard/app/results_panel.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from json import dumps

from shiny import ui, render
from shiny import ui, render, reactive

from dp_wizard.utils.templates import make_notebook_py, make_script_py
from dp_wizard.utils.converters import convert_py_to_nb
Expand All @@ -9,24 +9,13 @@
def results_ui():
return ui.nav_panel(
"Download results",
ui.p("TODO: Use this information to fill in a template!"),
ui.output_code("data_dump"),
ui.markdown(
"You can now make a differentially private release of your data. "
"This will lock the configuration you’ve provided on the previous pages."
),
ui.markdown("TODO: Button: “Download Report (.txt)” (implemented as yaml?)"),
ui.markdown("TODO: Button: “Download Report (.csv)"),
ui.markdown(
"You can also download code that can be executed to produce a DP release. "
"Downloaded code does not lock the configuration."
),
ui.markdown("You can now make a differentially private release of your data."),
ui.download_button(
"download_script",
"Download Script (.py)",
),
ui.download_button(
"download_notebook_unexecuted",
"download_notebook",
"Download Notebook (.ipynb)",
),
value="results_panel",
Expand All @@ -45,61 +34,78 @@ def results_server(
weights,
epsilon,
): # pragma: no cover
@render.code
def data_dump():
# TODO: Use this information in a template!
@reactive.calc
def analysis_dict():
# weights().keys() will reflect the desired columns:
# The others retain inactive columns, so user
# inputs aren't lost when toggling checkboxes.
columns = {
col: {
"lower_bound": lower_bounds()[col],
"upper_bound": upper_bounds()[col],
"bin_count": int(bin_counts()[col]),
# TODO: Floats should work for weight, but they don't:
# https://github.com/opendp/opendp/issues/2140
"weight": int(weights()[col]),
}
for col in weights().keys()
}
return {
"csv_path": csv_path(),
"contributions": contributions(),
"epsilon": epsilon(),
"columns": columns,
}

@reactive.calc
def analysis_json():
return dumps(
{
"csv_path": csv_path(),
"contributions": contributions(),
"lower_bounds": lower_bounds(),
"upper_bounds": upper_bounds(),
"bin_counts": bin_counts(),
"weights": weights(),
"epsilon": epsilon(),
},
analysis_dict(),
indent=2,
)

@render.text
def analysis_json_text():
return analysis_json()

@reactive.calc
def analysis_python():
analysis = analysis_dict()
return make_notebook_py(
csv_path=analysis["csv_path"],
contributions=analysis["contributions"],
epsilon=analysis["epsilon"],
columns=analysis["columns"],
)

@render.text
def analysis_python_text():
return analysis_python()

@render.download(
filename="dp-wizard-script.py",
media_type="text/x-python",
)
async def download_script():
contributions = input.contributions()
analysis = analysis_dict()
script_py = make_script_py(
contributions=contributions,
epsilon=1,
weights=[1],
contributions=analysis["contributions"],
epsilon=analysis["epsilon"],
columns=analysis["columns"],
)
yield script_py

@render.download(
filename="dp-wizard-notebook.ipynb",
media_type="application/x-ipynb+json",
)
async def download_notebook_unexecuted():
contributions = input.contributions()
notebook_py = make_notebook_py(
csv_path="todo.csv",
contributions=contributions,
epsilon=1,
weights=[1],
)
notebook_nb = convert_py_to_nb(notebook_py)
yield notebook_nb

@render.download(
filename="dp-wizard-notebook-executed.ipynb",
media_type="application/x-ipynb+json",
)
async def download_notebook_executed():
contributions = input.contributions()
async def download_notebook():
analysis = analysis_dict()
notebook_py = make_notebook_py(
csv_path="todo.csv",
contributions=contributions,
epsilon=1,
weights=[1],
csv_path=analysis["csv_path"],
contributions=analysis["contributions"],
epsilon=analysis["epsilon"],
columns=analysis["columns"],
)
notebook_nb = convert_py_to_nb(notebook_py, execute=True)
yield notebook_nb
8 changes: 7 additions & 1 deletion dp_wizard/utils/csv_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
We'll use the following terms consistently throughout the application:
- name: This is the exact column header in the CSV.
- label: This is the string we'll display.
- id: This is the string we'll pass as a module ID.
- id: This is the opaque string we'll pass as a module ID.
- identifier: This is a form that can be used as a Python identifier.
"""

import re
import polars as pl


Expand Down Expand Up @@ -34,3 +36,7 @@ def name_to_id(name):
# Shiny is fussy about module IDs,
# but we don't need them to be human readable.
return str(hash(name)).replace("-", "_")


def name_to_identifier(name):
    """
    Convert a CSV column name into a form usable as a Python identifier.

    Each run of non-word characters is replaced by a single underscore
    and the result is lower-cased. If that result is still not a valid
    identifier (for example, it is empty or starts with a digit),
    an underscore is prepended.

    >>> name_to_identifier("Hours Worked")
    'hours_worked'
    >>> name_to_identifier("2nd place!")
    '_2nd_place_'
    >>> name_to_identifier("")
    '_'
    """
    identifier = re.sub(r"\W+", "_", name).lower()
    if not identifier.isidentifier():
        # Identifiers may not be empty or begin with a digit;
        # a leading underscore repairs both cases.
        identifier = "_" + identifier
    return identifier
21 changes: 2 additions & 19 deletions dp_wizard/utils/dp_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,11 @@
import opendp.prelude as dp

from dp_wizard.utils.mock_data import mock_data, ColumnDef
from dp_wizard.utils.shared import make_cut_points

dp.enable_features("contrib")


def _make_cut_points(lower, upper, bin_count):
"""
Returns one more cut point than the bin_count.
(There are actually two more bins, extending to
-inf and +inf, but we'll ignore those.)
Cut points are evenly spaced from lower to upper.

>>> _make_cut_points(0, 10, 1)
[0.0, 10.0]
>>> _make_cut_points(0, 10, 2)
[0.0, 5.0, 10.0]
>>> _make_cut_points(0, 10, 3)
[0.0, 3.33, 6.67, 10.0]
"""
bin_width = (upper - lower) / bin_count
return [round(lower + i * bin_width, 2) for i in range(bin_count + 1)]


def make_confidence_accuracy_histogram(
lower=None, upper=None, bin_count=None, contributions=None, weighted_epsilon=None
):
Expand Down Expand Up @@ -58,7 +41,7 @@ def make_confidence_accuracy_histogram(
# TODO: When this is stable, merge it to templates, so we can be
# sure that we're using the same code in the preview that we
# use in the generated notebook.
cut_points = _make_cut_points(lower, upper, bin_count)
cut_points = make_cut_points(lower, upper, bin_count)
context = dp.Context.compositor(
data=pl.LazyFrame(df).with_columns(
# The cut() method returns a Polars categorical type.
Expand Down
57 changes: 57 additions & 0 deletions dp_wizard/utils/shared.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# These functions are used both in the application and in generated notebooks.


def make_cut_points(lower_bound, upper_bound, bin_count):
    """
    Build the evenly spaced edges of a histogram.

    The list holds bin_count + 1 values, starting at lower_bound
    and ending at upper_bound, each rounded to two decimal places.
    (The two outer bins that reach to -inf and +inf are not
    represented here.)

    >>> make_cut_points(0, 10, 1)
    [0.0, 10.0]
    >>> make_cut_points(0, 10, 2)
    [0.0, 5.0, 10.0]
    >>> make_cut_points(0, 10, 3)
    [0.0, 3.33, 6.67, 10.0]
    """
    span = upper_bound - lower_bound
    step = span / bin_count
    cut_points = []
    for index in range(bin_count + 1):
        edge = lower_bound + index * step
        cut_points.append(round(edge, 2))
    return cut_points


def interval_bottom(interval):
    """
    Parse the lower edge out of an interval label and return it as a float.

    The text before the first comma is taken, its opening bracket
    is dropped, and the remainder is converted with float().

    >>> interval_bottom("(10, 20]")
    10.0
    >>> interval_bottom("(-inf, 5]")
    -inf
    """
    lower_text, _comma, _upper_text = interval.partition(",")
    # Skip the leading bracket character.
    return float(lower_text[1:])


def df_to_columns(df):
    """
    Transform a Dataframe into a format that is easier to plot,
    parsing the interval strings to sort them as numbers.

    Returns one tuple per column, with the rows ordered by the numeric
    lower edge of the interval label in the first column. An empty
    Dataframe yields one empty tuple per column, so the result can
    always be unpacked column by column.

    >>> import polars as pl
    >>> df = pl.DataFrame({
    ...     "bin": ["(-inf, 5]", "(10, 20]", "(5, 10]"],
    ...     "len": [0, 20, 10],
    ... })
    >>> df_to_columns(df)
    (('(-inf, 5]', '(5, 10]', '(10, 20]'), (0, 10, 20))
    >>> df_to_columns(df.head(0))
    ((), ())
    """
    rows = df.rows()
    if not rows:
        # zip(*[]) collapses to (), which cannot be unpacked per column.
        return tuple(() for _ in df.columns)
    sorted_rows = sorted(rows, key=lambda row: interval_bottom(row[0]))
    return tuple(zip(*sorted_rows))


def plot_histogram(histogram_df, error, cutoff):  # pragma: no cover
    """
    Draw a bar chart for a histogram Dataframe on a new matplotlib figure.

    Bars whose value exceeds cutoff are drawn in blue, the rest in
    light blue; error is passed through as the bars' y error, and a
    light grey horizontal line marks the cutoff. Only every n-th bin
    gets a text label, so that at most a dozen labels are shown.
    """
    import matplotlib.pyplot as plt

    labels, counts = df_to_columns(histogram_df)

    # Split the bin labels into those that are shown (major ticks)
    # and those that only get an unlabelled minor tick.
    stride = (len(labels) // 12) + 1
    shown_labels = []
    hidden_labels = []
    for position, label in enumerate(labels):
        if position % stride == 0:
            shown_labels.append(label)
        else:
            hidden_labels.append(label)

    _, ax = plt.subplots()
    colors = ["blue" if count > cutoff else "lightblue" for count in counts]
    ax.bar(labels, counts, color=colors, yerr=error)
    ax.set_xticks(shown_labels, shown_labels)
    ax.set_xticks(hidden_labels, ["" for _ in hidden_labels], minor=True)
    # zorder=-1 keeps the cutoff line behind the bars.
    ax.axhline(cutoff, color="lightgrey", zorder=-1)
    ax.set_ylim(bottom=0)
Loading
Loading