From 87b6f8e40288c7fb8efe0493994ddfface55c2b2 Mon Sep 17 00:00:00 2001
From: Chuck McCallum <mccalluc@users.noreply.github.com>
Date: Thu, 7 Nov 2024 17:13:56 -0500
Subject: [PATCH 1/2] More tooltips (#135)

---
 WHAT-WE-LEARNED.md                            |  6 ++-
 dp_creator_ii/app/analysis_panel.py           | 27 +++++++++++-
 dp_creator_ii/app/components/column_module.py | 44 ++++++++++++++++++-
 3 files changed, 74 insertions(+), 3 deletions(-)

diff --git a/WHAT-WE-LEARNED.md b/WHAT-WE-LEARNED.md
index cbddfaa..72a09ca 100644
--- a/WHAT-WE-LEARNED.md
+++ b/WHAT-WE-LEARNED.md
@@ -8,7 +8,7 @@ Unless I'm missing something, there doesn't seem to be any warning when there is
 
 ## UI and Server functions don't really separate concerns
 
-My first impression was that the UI function would be something like a "view" and the server would be a "controller", but for any kind of conditional display I need a `render.ui`, so that distinction breaks down quickly.
+My first impression was that the UI function would be something like a "view" and the server would be a "controller", but for any kind of conditional display I need a `render.ui`, so that distinction breaks down quickly. Just maintaining a naming convention for these little bits of UI in the server gets to be a chore. It would be kludgy, but what if we could suply lambdas instead of names?
 
 ## Refactoring: values vs. reactive values
 
@@ -63,6 +63,10 @@ I've had to tweak the CSS a few times:
 The different flavors of "Shiny" are a bit of nuissance when trying to find examples.
 The maturity of Shiny for R means that the vast majority of the examples are for R, even with Python in the search. It would be nice if the docs site remembered that I only want to look at docs for Core.
 
+## It's easy to forget `return`
+
+This is simple, but I was still scratching my head for a while. While there are some cases where returning `None` is intended, is it more more likely to be an error? What if it raised a warning, and an explicit empty string could be returned if that's really what you want?
+
 ## Shiny docs could have better formatting
 
 - https://shiny.posit.co/py/api/core/ui.layout_columns.html: bullet list not rendered correctly.
diff --git a/dp_creator_ii/app/analysis_panel.py b/dp_creator_ii/app/analysis_panel.py
index ab17b60..2a256da 100644
--- a/dp_creator_ii/app/analysis_panel.py
+++ b/dp_creator_ii/app/analysis_panel.py
@@ -5,7 +5,7 @@
 from dp_creator_ii.app.components.inputs import log_slider
 from dp_creator_ii.app.components.column_module import column_ui, column_server
 from dp_creator_ii.utils.csv_helper import read_field_names
-from dp_creator_ii.app.components.outputs import output_code_sample
+from dp_creator_ii.app.components.outputs import output_code_sample, demo_tooltip
 from dp_creator_ii.utils.templates import make_privacy_loss_block
 
 
@@ -18,6 +18,7 @@ def analysis_ui():
             "the number of bins for the histogram, "
             "and its relative share of the privacy budget."
         ),
+        ui.output_ui("columns_checkbox_group_tooltip_ui"),
         ui.input_checkbox_group("columns_checkbox_group", None, []),
         ui.output_ui("columns_ui"),
         ui.markdown(
@@ -25,6 +26,7 @@ def analysis_ui():
             "Values above 1 will add less noise to the data, "
             "but have a greater risk of revealing individual data."
         ),
+        ui.output_ui("epsilon_tooltip_ui"),
         log_slider("log_epsilon_slider", 0.1, 10.0),
         ui.output_text("epsilon"),
         output_code_sample("Privacy Loss", "privacy_loss_python"),
@@ -75,6 +77,17 @@ def _on_column_set_change():
         column_ids_selected = input.columns_checkbox_group()
         clear_column_weights(column_ids_selected)
 
+    @render.ui
+    def columns_checkbox_group_tooltip_ui():
+        return demo_tooltip(
+            is_demo,
+            """
+            Not all columns need analysis. For this demo, just check
+            "class_year" and "grade". With more columns selected,
+            each column has a smaller share of the privacy budget.
+            """,
+        )
+
     @render.ui
     def columns_ui():
         column_ids = input.columns_checkbox_group()
@@ -86,6 +99,7 @@ def columns_ui():
                 epsilon=epsilon_calc(),
                 set_column_weight=set_column_weight,
                 get_weights_sum=get_weights_sum,
+                is_demo=is_demo,
             )
         return [
             [
@@ -103,6 +117,17 @@ def csv_fields_calc():
     def csv_fields():
         return csv_fields_calc()
 
+    @render.ui
+    def epsilon_tooltip_ui():
+        return demo_tooltip(
+            is_demo,
+            """
+            If you set epsilon above one, you'll see that the distribution
+            becomes less noisy, and the confidence intervals become smaller...
+            but increased accuracy risks revealing personal information.
+            """,
+        )
+
     @reactive.calc
     def epsilon_calc():
         return pow(10, input.log_epsilon_slider())
diff --git a/dp_creator_ii/app/components/column_module.py b/dp_creator_ii/app/components/column_module.py
index 7bc95d8..c27e51d 100644
--- a/dp_creator_ii/app/components/column_module.py
+++ b/dp_creator_ii/app/components/column_module.py
@@ -5,7 +5,7 @@
 from dp_creator_ii.utils.dp_helper import make_confidence_accuracy_histogram
 from dp_creator_ii.app.components.plots import plot_histogram
 from dp_creator_ii.utils.templates import make_column_config_block
-from dp_creator_ii.app.components.outputs import output_code_sample
+from dp_creator_ii.app.components.outputs import output_code_sample, demo_tooltip
 
 
 @module.ui
@@ -13,9 +13,12 @@ def column_ui():  # pragma: no cover
     width = "10em"  # Just wide enough so the text isn't trucated.
     return ui.layout_columns(
         [
+            ui.output_ui("bounds_tooltip_ui"),
             ui.input_numeric("min", "Min", 0, width=width),
             ui.input_numeric("max", "Max", 10, width=width),
+            ui.output_ui("bins_tooltip_ui"),
             ui.input_numeric("bins", "Bins", 10, width=width),
+            ui.output_ui("weight_tooltip_ui"),
             ui.input_select(
                 "weight",
                 "Weight",
@@ -60,6 +63,7 @@ def column_server(
     epsilon,
     set_column_weight,
     get_weights_sum,
+    is_demo,
 ):  # pragma: no cover
     @reactive.effect
     @reactive.event(input.weight)
@@ -75,6 +79,44 @@ def column_config():
             "weight": float(input.weight()),
         }
 
+    @render.ui
+    def bounds_tooltip_ui():
+        return demo_tooltip(
+            is_demo,
+            """
+            DP requires that we limit the sensitivity to the contributions
+            of any individual. To do this, we need an estimate of the lower
+            and upper bounds for each variable. We should not look at the
+            data when estimating the bounds! In this case, we could imagine
+            that "class year" would vary between 1 and 4, and we could limit
+            "grade" to values between 50 and 100.
+            """,
+        )
+
+    @render.ui
+    def bins_tooltip_ui():
+        return demo_tooltip(
+            is_demo,
+            """
+            Different statistics can be measured with DP.
+            This tool provides a histogram. If you increase the number of bins,
+            you'll see that each individual bin becomes noisier to provide
+            the same overall privacy guarantee. For this example, give
+            "class_year" 4 bins and "grade" 5 bins.
+            """,
+        )
+
+    @render.ui
+    def weight_tooltip_ui():
+        return demo_tooltip(
+            is_demo,
+            """
+            You have a finite privacy budget, but you can choose
+            how to allocate it. For simplicity, we limit the options here,
+            but when using the library you can fine tune this.
+            """,
+        )
+
     @render.code
     def column_code():
         config = column_config()

From ec70686455437c57b26caaa5766cddc0425de14b Mon Sep 17 00:00:00 2001
From: Chuck McCallum <mccalluc@users.noreply.github.com>
Date: Thu, 7 Nov 2024 17:23:35 -0500
Subject: [PATCH 2/2] Handle empty column headers (#147)

* tests pass: do not really like ui

* demo that we can handle spaces

* [blank] for empty headers

* Show labels in UI instead of names or IDs

* What if two columns are only distinguished by a non-word character?

* duplicated columns: another CSV edge case
---
 WHAT-WE-LEARNED.md                  | 26 ++++++++++++++++++++------
 dp_creator_ii/app/analysis_panel.py | 20 +++++++++++---------
 dp_creator_ii/utils/csv_helper.py   | 26 +++++++++++++++++++++++++-
 tests/fixtures/fake.csv             | 14 +++++++-------
 tests/test_app.py                   | 10 ++++++----
 tests/utils/test_csv_helper.py      | 14 ++++++++++----
 6 files changed, 79 insertions(+), 31 deletions(-)

diff --git a/WHAT-WE-LEARNED.md b/WHAT-WE-LEARNED.md
index 72a09ca..72ab7ba 100644
--- a/WHAT-WE-LEARNED.md
+++ b/WHAT-WE-LEARNED.md
@@ -21,19 +21,19 @@ Typing might help here. I've also wondered about naming conventions, but I haven
 It seems like the returned value would be the same, so I would like to compress something like this:
 ```python
 @reactive.calc
-def csv_fields_calc():
-    return read_field_names(req(csv_path()))
+def csv_labels_calc():
+    return read_labels(req(csv_path()))
 
 @render.text
-def csv_fields():
-    return csv_fields_calc()
+def csv_labels():
+    return csv_labels_calc()
 ```
 into:
 ```python
 @reactive.calc
 @render.text
-def csv_fields():
-    return read_field_names(req(csv_path()))
+def csv_labels():
+    return read_labels(req(csv_path()))
 ```
 but that returns an error:
 ```
@@ -44,6 +44,20 @@ Renderer.__call__() missing 1 required positional argument: '_fn'
 
 It feels like a gap in the library that there is no component testing. The only advice is to pull out testable logic from the server functions, and for the rest, use end-to-end tests: There's not a recommended way to test the ui+server interaction for just one component.
 
+## Unstated requirements for module IDs
+
+The [docs](https://shiny.posit.co/py/docs/modules.html#how-to-use-modules) only say:
+
+> This id has two requirements. First, it must be unique in a single scope, and can’t be duplicated in a given application or module definition. ... Second, the UI and server ids must match.
+
+But it's fussier than that:
+
+```
+ValueError: The string 'Will this work?' is not a valid id; only letters, numbers, and underscore are permitted
+```
+
+Was planning to just use the CSV column headers as IDs, but that's not going to work. Could Shiny just hash whatever we provide, so we wouldn't have to worry about this?
+
 ## Normal tooling doesn't work inside of app?
 
 There are several bits of tooling that don't seem to work inside end-to-end app tests. My guess is that this isn't related to Shiny per se, but rather the ASGI framework: It's not running in the same process as pytest, so it's not surprising that the pytest process can't instrument this.
diff --git a/dp_creator_ii/app/analysis_panel.py b/dp_creator_ii/app/analysis_panel.py
index 2a256da..8f50eca 100644
--- a/dp_creator_ii/app/analysis_panel.py
+++ b/dp_creator_ii/app/analysis_panel.py
@@ -4,7 +4,7 @@
 
 from dp_creator_ii.app.components.inputs import log_slider
 from dp_creator_ii.app.components.column_module import column_ui, column_server
-from dp_creator_ii.utils.csv_helper import read_field_names
+from dp_creator_ii.utils.csv_helper import read_csv_ids_labels, read_csv_ids_names
 from dp_creator_ii.app.components.outputs import output_code_sample, demo_tooltip
 from dp_creator_ii.utils.templates import make_privacy_loss_block
 
@@ -68,7 +68,7 @@ def _update_checkbox_group():
         ui.update_checkbox_group(
             "columns_checkbox_group",
             label=None,
-            choices=csv_fields_calc(),
+            choices=csv_ids_labels_calc(),
         )
 
     @reactive.effect
@@ -91,10 +91,12 @@ def columns_checkbox_group_tooltip_ui():
     @render.ui
     def columns_ui():
         column_ids = input.columns_checkbox_group()
+        column_ids_to_names = csv_ids_names_calc()
+        column_ids_to_labels = csv_ids_labels_calc()
         for column_id in column_ids:
             column_server(
                 column_id,
-                name=column_id,
+                name=column_ids_to_names[column_id],
                 contributions=contributions(),
                 epsilon=epsilon_calc(),
                 set_column_weight=set_column_weight,
@@ -103,19 +105,19 @@ def columns_ui():
             )
         return [
             [
-                ui.h3(column_id),
+                ui.h3(column_ids_to_labels[column_id]),
                 column_ui(column_id),
             ]
             for column_id in column_ids
         ]
 
     @reactive.calc
-    def csv_fields_calc():
-        return read_field_names(req(csv_path()))
+    def csv_ids_names_calc():
+        return read_csv_ids_names(req(csv_path()))
 
-    @render.text
-    def csv_fields():
-        return csv_fields_calc()
+    @reactive.calc
+    def csv_ids_labels_calc():
+        return read_csv_ids_labels(req(csv_path()))
 
     @render.ui
     def epsilon_tooltip_ui():
diff --git a/dp_creator_ii/utils/csv_helper.py b/dp_creator_ii/utils/csv_helper.py
index 147551b..4c279ee 100644
--- a/dp_creator_ii/utils/csv_helper.py
+++ b/dp_creator_ii/utils/csv_helper.py
@@ -1,7 +1,14 @@
+"""
+We'll use the following terms consistently throughout the application:
+- name: This is the exact column header in the CSV.
+- label: This is the string we'll display.
+- id: This is the string we'll pass as a module ID.
+"""
+
 import polars as pl
 
 
-def read_field_names(csv_path):
+def read_csv_names(csv_path):
     # Polars is overkill, but it is more robust against
     # variations in encoding than Python stdlib csv.
     # However, it could be slow:
@@ -10,3 +17,20 @@ def read_field_names(csv_path):
     # > resolving its schema, which is a potentially expensive operation.
     lf = pl.scan_csv(csv_path)
     return lf.collect_schema().names()
+
+
+def read_csv_ids_labels(csv_path):
+    return {
+        name_to_id(name): f"{i+1}: {name or '[blank]'}"
+        for i, name in enumerate(read_csv_names(csv_path))
+    }
+
+
+def read_csv_ids_names(csv_path):
+    return {name_to_id(name): name for name in read_csv_names(csv_path)}
+
+
+def name_to_id(name):
+    # Shiny is fussy about module IDs,
+    # but we don't need them to be human readable.
+    return str(hash(name)).replace("-", "_")
diff --git a/tests/fixtures/fake.csv b/tests/fixtures/fake.csv
index baabea7..e11908a 100644
--- a/tests/fixtures/fake.csv
+++ b/tests/fixtures/fake.csv
@@ -1,7 +1,7 @@
-student_id,class_year,hw_number,grade
-1234,1,1,90
-1234,1,2,95
-1234,1,3,85
-6789,2,1,70
-6789,2,2,100
-6789,2,3,90
+,class year,class year,hw number,hw-number,grade
+1234,1,1,1,4,90
+1234,1,1,2,5,95
+1234,1,1,3,6,85
+6789,2,2,1,4,70
+6789,2,2,2,5,100
+6789,2,2,3,6,90
diff --git a/tests/test_app.py b/tests/test_app.py
index 624b276..638fb90 100644
--- a/tests/test_app.py
+++ b/tests/test_app.py
@@ -62,10 +62,12 @@ def expect_no_error():
     expect_visible(perform_analysis_text)
     expect_not_visible(download_results_text)
     # Columns:
-    expect_visible("student_id")
-    expect_visible("class_year")
-    expect_visible("hw_number")
-    expect_visible("grade")
+    expect_visible("1: [blank]")  # Empty column!
+    expect_visible("2: class year")
+    expect_visible("3: class year_duplicated_0")  # Duplicated column!
+    expect_visible("4: hw number")
+    expect_visible("5: hw-number")  # Distinguished by non-word character!
+    expect_visible("6: grade")
     # Epsilon slider:
     # (Note: Slider tests failed on CI when run after column details,
     # although it worked locally. This works in either environment.
diff --git a/tests/utils/test_csv_helper.py b/tests/utils/test_csv_helper.py
index c35c39a..de6f219 100644
--- a/tests/utils/test_csv_helper.py
+++ b/tests/utils/test_csv_helper.py
@@ -4,7 +4,7 @@
 import tempfile
 import pytest
 
-from dp_creator_ii.utils.csv_helper import read_field_names
+from dp_creator_ii.utils.csv_helper import read_csv_ids_labels, read_csv_ids_names
 
 
 # We will not reference the encoding when reading:
@@ -48,6 +48,12 @@ def test_csv_loading(write_encoding):
         else:
             pl_testing.assert_frame_equal(write_lf, read_lf)
 
-        # Preceding lines are reading the whole DF via Polars.
-        field_names_read = read_field_names(fp.name)
-        assert field_names_read == list(data.keys())
+        # Preceding lines are reading the whole DF via Polars:
+        # Now test how we read just the headers.
+        # Keys are hashes, and won't be stable across platforms,
+        # so let's just look at the values.
+        ids_labels = read_csv_ids_labels(fp.name)
+        assert set(ids_labels.values()) == {"2: AGE", "1: NAME"}
+
+        ids_names = read_csv_ids_names(fp.name)
+        assert set(ids_names.values()) == {"AGE", "NAME"}