User-provided contributions flow all the way to the generated code (#41

) * input for unit-of-privacy, and show code * factor out code sample into helper * value is now in generated code * rename CLI param * add a test * use shiny code instead of pre * privacy unit template * renaming: unit -> contributions * use contributions on results page * test that user input shows up in generated notebook * better unit tests for template filling * flush instead of close so we will cleanup on exit * check that script is what we expect * finally style script correctly?
opendp · Oct 10, 2024 · 1f02fa9 · 1f02fa9
1 parent e0c2ab6
commit 1f02fa9
Show file tree

Hide file tree

Showing 12 changed files with 143 additions and 54 deletions.
diff --git a/.flake8 b/.flake8
@@ -8,5 +8,5 @@ extend-select = B950
 extend-ignore = E203,E501,E701
 
 per-file-ignores =
-    # Ignore undefined names
-    */templates/*:F821,F401
+    # Ignore undefined names in templates.
+    */templates/*:F821,F401,E302
diff --git a/README.md b/README.md
@@ -16,16 +16,14 @@ We plan to implement a [proof of concept](https://docs.google.com/document/d/1dt
 ## Usage
 
 ```
-usage: dp-creator-ii [-h] [--csv CSV_PATH] [--unit UNIT_OF_PRIVACY]
+usage: dp-creator-ii [-h] [--csv CSV_PATH] [--contrib CONTRIB]
 
 DP Creator II makes it easier to get started with Differential Privacy.
 
 options:
-  -h, --help            show this help message and exit
-  --csv CSV_PATH        Path to CSV containing private data
-  --unit UNIT_OF_PRIVACY
-                        Unit of privacy: How many rows can an individual
-                        contribute?
+  -h, --help         show this help message and exit
+  --csv CSV_PATH     Path to CSV containing private data
+  --contrib CONTRIB  How many rows can an individual contribute?
 ```
 
 

diff --git a/dp_creator_ii/__init__.py b/dp_creator_ii/__init__.py
@@ -27,10 +27,12 @@ def get_arg_parser():
         help="Path to CSV containing private data",
     )
     parser.add_argument(
-        "--unit",
-        dest="unit_of_privacy",
+        "--contrib",
+        dest="contributions",
+        metavar="CONTRIB",
         type=int,
-        help="Unit of privacy: How many rows can an individual contribute?",
+        default=1,
+        help="How many rows can an individual contribute?",
     )
     return parser
 

diff --git a/dp_creator_ii/app/dataset_panel.py b/dp_creator_ii/app/dataset_panel.py
@@ -4,9 +4,24 @@
 
 from dp_creator_ii import get_arg_parser
 from dp_creator_ii.csv_helper import read_field_names
+from dp_creator_ii.app.ui_helpers import output_code_sample
+from dp_creator_ii.template import make_privacy_unit_block
+
+
+def get_args():
+    arg_parser = get_arg_parser()
+    if argv[1:3] == ["run", "--port"]:
+        # We are running a Playwright test,
+        # and ARGV is polluted, so override:
+        return arg_parser.parse_args([])
+    else:
+        # Normal parsing:
+        return arg_parser.parse_args()
 
 
 def dataset_ui():
+    args = get_args()
+
     return ui.nav_panel(
         "Select Dataset",
         "TODO: Pick dataset",
@@ -15,25 +30,17 @@ def dataset_ui():
         ui.output_text("csv_path"),
         "CSV fields:",
         ui.output_text("csv_fields"),
-        "Unit of privacy:",
-        ui.output_text("unit_of_privacy_text"),
+        ui.input_numeric("contributions", "Contributions", args.contributions),
+        output_code_sample("unit_of_privacy_python"),
         ui.input_action_button("go_to_analysis", "Define analysis"),
         value="dataset_panel",
     )
 
 
 def dataset_server(input, output, session):
-    if argv[1:3] == ["run", "--port"]:
-        # Started by playwright
-        arg_csv_path = None
-        arg_unit_of_privacy = None
-    else:
-        args = get_arg_parser().parse_args()
-        arg_csv_path = args.csv_path
-        arg_unit_of_privacy = args.unit_of_privacy
+    args = get_args()
 
-    csv_path_from_cli_value = reactive.value(arg_csv_path)
-    unit_of_privacy = reactive.value(arg_unit_of_privacy)
+    csv_path_from_cli_value = reactive.value(args.csv_path)
 
     @reactive.calc
     def csv_path_calc():
@@ -57,9 +64,10 @@ def csv_fields_calc():
     def csv_fields():
         return csv_fields_calc()
 
-    @render.text
-    def unit_of_privacy_text():
-        return str(unit_of_privacy.get())
+    @render.code
+    def unit_of_privacy_python():
+        contributions = input.contributions()
+        return make_privacy_unit_block(contributions)
 
     @reactive.effect
     @reactive.event(input.go_to_analysis)

diff --git a/dp_creator_ii/app/results_panel.py b/dp_creator_ii/app/results_panel.py
@@ -20,8 +20,9 @@ def results_server(input, output, session):
         media_type="text/x-python",
     )
     async def download_script():
+        contributions = input.contributions()
         script_py = make_script_py(
-            unit=1,
+            contributions=contributions,
             loss=1,
             weights=[1],
         )
@@ -32,9 +33,10 @@ async def download_script():
         media_type="application/x-ipynb+json",
     )
     async def download_notebook_unexecuted():
+        contributions = input.contributions()
         notebook_py = make_notebook_py(
             csv_path="todo.csv",
-            unit=1,
+            contributions=contributions,
             loss=1,
             weights=[1],
         )
@@ -46,9 +48,10 @@ async def download_notebook_unexecuted():
         media_type="application/x-ipynb+json",
     )
     async def download_notebook_executed():
+        contributions = input.contributions()
         notebook_py = make_notebook_py(
             csv_path="todo.csv",
-            unit=1,
+            contributions=contributions,
             loss=1,
             weights=[1],
         )

diff --git a/dp_creator_ii/app/ui_helpers.py b/dp_creator_ii/app/ui_helpers.py
@@ -0,0 +1,9 @@
+from htmltools.tags import details, summary
+from shiny import ui
+
+
+def output_code_sample(name_of_render_function):
+    return details(
+        summary("Code sample"),
+        ui.output_code(name_of_render_function),
+    )
diff --git a/dp_creator_ii/template.py b/dp_creator_ii/template.py
@@ -57,57 +57,68 @@ def __str__(self):
         return self._template
 
 
-def _make_context_for_notebook(csv_path, unit, loss, weights):
+def _make_context_for_notebook(csv_path, contributions, loss, weights):
+    privacy_unit_block = make_privacy_unit_block(contributions)
     return str(
-        _Template("context.py").fill_values(
+        _Template("context.py")
+        .fill_values(
             CSV_PATH=csv_path,
-            UNIT=unit,
             LOSS=loss,
             WEIGHTS=weights,
         )
+        .fill_blocks(
+            PRIVACY_UNIT_BLOCK=privacy_unit_block,
+        )
     )
 
 
-def _make_context_for_script(unit, loss, weights):
+def _make_context_for_script(contributions, loss, weights):
+    privacy_unit_block = make_privacy_unit_block(contributions)
     return str(
         _Template("context.py")
         .fill_expressions(
             CSV_PATH="csv_path",
         )
         .fill_values(
-            UNIT=unit,
             LOSS=loss,
             WEIGHTS=weights,
         )
+        .fill_blocks(
+            PRIVACY_UNIT_BLOCK=privacy_unit_block,
+        )
     )
 
 
 def _make_imports():
     return str(_Template("imports.py").fill_values())
 
 
-def make_notebook_py(csv_path, unit, loss, weights):
+def make_notebook_py(csv_path, contributions, loss, weights):
     return str(
         _Template("notebook.py").fill_blocks(
             IMPORTS_BLOCK=_make_imports(),
             CONTEXT_BLOCK=_make_context_for_notebook(
                 csv_path=csv_path,
-                unit=unit,
+                contributions=contributions,
                 loss=loss,
                 weights=weights,
             ),
         )
     )
 
 
-def make_script_py(unit, loss, weights):
+def make_script_py(contributions, loss, weights):
     return str(
         _Template("script.py").fill_blocks(
             IMPORTS_BLOCK=_make_imports(),
             CONTEXT_BLOCK=_make_context_for_script(
-                unit=unit,
+                contributions=contributions,
                 loss=loss,
                 weights=weights,
             ),
         )
     )
+
+
+def make_privacy_unit_block(contributions):
+    return str(_Template("privacy_unit.py").fill_values(CONTRIBUTIONS=contributions))
diff --git a/dp_creator_ii/templates/context.py b/dp_creator_ii/templates/context.py
@@ -1,6 +1,7 @@
+PRIVACY_UNIT_BLOCK
 context = dp.Context.compositor(
     data=pl.scan_csv(CSV_PATH, encoding="utf8-lossy"),
-    privacy_unit=dp.unit_of(contributions=UNIT),
+    privacy_unit=privacy_unit,
     privacy_loss=dp.loss_of(epsilon=LOSS),
     split_by_weights=WEIGHTS,
 )
diff --git a/dp_creator_ii/templates/privacy_unit.py b/dp_creator_ii/templates/privacy_unit.py
@@ -0,0 +1 @@
+privacy_unit = dp.unit_of(contributions=CONTRIBUTIONS)
diff --git a/dp_creator_ii/tests/fixtures/expected-script.py b/dp_creator_ii/tests/fixtures/expected-script.py
@@ -0,0 +1,29 @@
+from argparse import ArgumentParser
+
+import polars as pl
+import opendp.prelude as dp
+
+dp.enable_features("contrib")
+
+
+def get_context(csv_path):
+    privacy_unit = dp.unit_of(contributions=1)
+
+    context = dp.Context.compositor(
+        data=pl.scan_csv(csv_path, encoding="utf8-lossy"),
+        privacy_unit=privacy_unit,
+        privacy_loss=dp.loss_of(epsilon=1),
+        split_by_weights=[1],
+    )
+
+    return context
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(
+        description="Creates a differentially private release from a csv"
+    )
+    parser.add_argument("--csv", help="Path to csv containing private data")
+    args = parser.parse_args()
+    context = get_context(csv_path=args.csv)
+    print(context)
diff --git a/dp_creator_ii/tests/test_app.py b/dp_creator_ii/tests/test_app.py
@@ -10,7 +10,7 @@
 
 # TODO: Why is incomplete coverage reported here?
 # https://github.com/opendp/dp-creator-ii/issues/18
-def test_app(page: Page, app: ShinyAppProc):  # pragma: no cover
+def test_navigation(page: Page, app: ShinyAppProc):  # pragma: no cover
     pick_dataset_text = "TODO: Pick dataset"
     perform_analysis_text = "TODO: Define analysis"
     download_results_text = "TODO: Download results"
@@ -29,6 +29,9 @@ def expect_no_error():
     expect_visible(pick_dataset_text)
     expect_not_visible(perform_analysis_text)
     expect_not_visible(download_results_text)
+    page.get_by_label("Contributions").fill("42")
+    page.get_by_text("Code sample").click()
+    expect_visible("dp.unit_of(contributions=42)")
     expect_no_error()
 
     csv_path = Path(__file__).parent / "fixtures" / "fake.csv"
@@ -54,4 +57,4 @@ def expect_no_error():
 
     download = download_info.value
     script = download.path().read_text()
-    assert "privacy_unit=dp.unit_of(contributions=1)" in script
+    assert "privacy_unit = dp.unit_of(contributions=42)" in script
diff --git a/dp_creator_ii/tests/test_template.py b/dp_creator_ii/tests/test_template.py
@@ -1,5 +1,6 @@
 from tempfile import NamedTemporaryFile
 import subprocess
+from pathlib import Path
 import re
 import pytest
 import opendp.prelude as dp
@@ -9,17 +10,28 @@
 fake_csv = "dp_creator_ii/tests/fixtures/fake.csv"
 
 
+def test_fill_expressions():
+    template = _Template(None, template="No one VERB the ADJ NOUN!")
+    filled = str(
+        template.fill_expressions(
+            VERB="expects",
+            ADJ="Spanish",
+            NOUN="Inquisition",
+        )
+    )
+    assert filled == "No one expects the Spanish Inquisition!"
+
+
 def test_fill_values():
-    context_template = _Template("context.py")
-    context_block = str(
-        context_template.fill_values(
-            CSV_PATH=fake_csv,
-            UNIT=1,
-            LOSS=1,
-            WEIGHTS=[1],
+    template = _Template(None, template="assert [STRING] * NUM == LIST")
+    filled = str(
+        template.fill_values(
+            STRING="🙂",
+            NUM=3,
+            LIST=["🙂", "🙂", "🙂"],
         )
     )
-    assert f"data=pl.scan_csv('{fake_csv}', encoding=\"utf8-lossy\")" in context_block
+    assert filled == "assert ['🙂'] * 3 == ['🙂', '🙂', '🙂']"
 
 
 def test_fill_blocks():
@@ -65,15 +77,18 @@ def test_fill_template_unfilled_slots():
     context_template = _Template("context.py")
     with pytest.raises(
         Exception,
-        match=re.escape("context.py has unfilled slots: CSV_PATH, LOSS, UNIT, WEIGHTS"),
+        match=re.escape(
+            "context.py has unfilled slots: "
+            "CSV_PATH, LOSS, PRIVACY_UNIT_BLOCK, WEIGHTS"
+        ),
     ):
         str(context_template.fill_values())
 
 
 def test_make_notebook():
     notebook = make_notebook_py(
         csv_path=fake_csv,
-        unit=1,
+        contributions=1,
         loss=1,
         weights=[1],
     )
@@ -84,14 +99,23 @@ def test_make_notebook():
 
 def test_make_script():
     script = make_script_py(
-        unit=1,
+        contributions=1,
         loss=1,
         weights=[1],
     )
 
-    with NamedTemporaryFile(mode="w", delete=False) as fp:
+    def clear_empty_lines(text):
+        # Cleanup whitespace after indenting blocks
+        return re.sub(r"^\s+$", "", text, flags=re.MULTILINE).strip()
+
+    expected_script = (
+        Path(__file__).parent / "fixtures" / "expected-script.py"
+    ).read_text()
+    assert clear_empty_lines(script) == clear_empty_lines(expected_script)
+
+    with NamedTemporaryFile(mode="w") as fp:
         fp.write(script)
-        fp.close()
+        fp.flush()
 
         result = subprocess.run(["python", fp.name, "--csv", fake_csv])
         assert result.returncode == 0
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		privacy_unit = dp.unit_of(contributions=CONTRIBUTIONS)