OO-ify the code generation code (#168)

* stub classes, but not really OO; remove unused app code
* named tuple for analysis plan
* move functions which depend on analysis plan into base class
* function params -> constructor params
* define "root_template" and dedup
* fix copy-paste in make_context
* make base class abstract
* rename to code_generators: templates are just a means
* move Template class to its own file
* create a README
* AnalysisPlanColumn
* pass analysis plan to code generators
* analysis plan in tests
* check for expression and value slots
* more systematic tests of bad template filling
* more tests of template filling
opendp · Nov 20, 2024 · fc2aa18 · fc2aa18
1 parent 7de7bb5
commit fc2aa18
Show file tree

Hide file tree

Showing 20 changed files with 521 additions and 464 deletions.
diff --git a/.flake8 b/.flake8
@@ -9,4 +9,4 @@ extend-ignore = E203,E501,E701
 
 per-file-ignores =
     # Ignore undefined names in templates.
-    */templates/no-tests/*.py:F821,F401,E302
+    */code_generators/no-tests/*.py:F821,F401,E302
diff --git a/.pytest.ini b/.pytest.ini
@@ -4,7 +4,7 @@
 filterwarnings =
 	error
 
-addopts = --doctest-glob '*.md' --doctest-modules --ignore dp_wizard/utils/templates/no-tests --ignore dp_wizard/tests/fixtures/ --tracing=retain-on-failure
+addopts = --doctest-glob '*.md' --doctest-modules --ignore dp_wizard/utils/code_generators/no-tests --ignore dp_wizard/tests/fixtures/ --tracing=retain-on-failure
 
 # If an xfail starts passing unexpectedly, that should count as a failure:
 xfail_strict=true
diff --git a/dp_wizard/app/analysis_panel.py b/dp_wizard/app/analysis_panel.py
@@ -6,7 +6,7 @@
 from dp_wizard.app.components.column_module import column_ui, column_server
 from dp_wizard.utils.csv_helper import read_csv_ids_labels, read_csv_ids_names
 from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip
-from dp_wizard.utils.templates import make_privacy_loss_block
+from dp_wizard.utils.code_generators import make_privacy_loss_block
 from dp_wizard.app.components.column_module import col_widths
 
 

diff --git a/dp_wizard/app/components/column_module.py b/dp_wizard/app/components/column_module.py
@@ -4,7 +4,7 @@
 
 from dp_wizard.utils.dp_helper import make_confidence_accuracy_histogram
 from dp_wizard.utils.shared import plot_histogram
-from dp_wizard.utils.templates import make_column_config_block
+from dp_wizard.utils.code_generators import make_column_config_block
 from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip
 
 

diff --git a/dp_wizard/app/dataset_panel.py b/dp_wizard/app/dataset_panel.py
@@ -4,7 +4,7 @@
 
 from dp_wizard.utils.argparse_helpers import get_cli_info
 from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip
-from dp_wizard.utils.templates import make_privacy_unit_block
+from dp_wizard.utils.code_generators import make_privacy_unit_block
 
 
 def dataset_ui():

diff --git a/dp_wizard/app/results_panel.py b/dp_wizard/app/results_panel.py
@@ -1,8 +1,11 @@
-from json import dumps
-
 from shiny import ui, render, reactive
 
-from dp_wizard.utils.templates import make_notebook_py, make_script_py
+from dp_wizard.utils.code_generators import (
+    NotebookGenerator,
+    ScriptGenerator,
+    AnalysisPlan,
+    AnalysisPlanColumn,
+)
 from dp_wizard.utils.converters import convert_py_to_nb
 
 
@@ -35,77 +38,39 @@ def results_server(
     epsilon,
 ):  # pragma: no cover
     @reactive.calc
-    def analysis_dict():
+    def analysis_plan() -> AnalysisPlan:
         # weights().keys() will reflect the desired columns:
         # The others retain inactive columns, so user
         # inputs aren't lost when toggling checkboxes.
         columns = {
-            col: {
-                "lower_bound": lower_bounds()[col],
-                "upper_bound": upper_bounds()[col],
-                "bin_count": int(bin_counts()[col]),
-                # TODO: Floats should work for weight, but they don't:
-                # https://github.com/opendp/opendp/issues/2140
-                "weight": int(weights()[col]),
-            }
+            col: AnalysisPlanColumn(
+                lower_bound=lower_bounds()[col],
+                upper_bound=upper_bounds()[col],
+                bin_count=int(bin_counts()[col]),
+                weight=int(weights()[col]),
+            )
             for col in weights().keys()
         }
-        return {
-            "csv_path": csv_path(),
-            "contributions": contributions(),
-            "epsilon": epsilon(),
-            "columns": columns,
-        }
-
-    @reactive.calc
-    def analysis_json():
-        return dumps(
-            analysis_dict(),
-            indent=2,
+        return AnalysisPlan(
+            csv_path=csv_path(),
+            contributions=contributions(),
+            epsilon=epsilon(),
+            columns=columns,
         )
 
-    @render.text
-    def analysis_json_text():
-        return analysis_json()
-
-    @reactive.calc
-    def analysis_python():
-        analysis = analysis_dict()
-        return make_notebook_py(
-            csv_path=analysis["csv_path"],
-            contributions=analysis["contributions"],
-            epsilon=analysis["epsilon"],
-            columns=analysis["columns"],
-        )
-
-    @render.text
-    def analysis_python_text():
-        return analysis_python()
-
     @render.download(
         filename="dp-wizard-script.py",
         media_type="text/x-python",
     )
     async def download_script():
-        analysis = analysis_dict()
-        script_py = make_script_py(
-            contributions=analysis["contributions"],
-            epsilon=analysis["epsilon"],
-            columns=analysis["columns"],
-        )
+        script_py = ScriptGenerator(analysis_plan()).make_py()
         yield script_py
 
     @render.download(
         filename="dp-wizard-notebook.ipynb",
         media_type="application/x-ipynb+json",
     )
     async def download_notebook():
-        analysis = analysis_dict()
-        notebook_py = make_notebook_py(
-            csv_path=analysis["csv_path"],
-            contributions=analysis["contributions"],
-            epsilon=analysis["epsilon"],
-            columns=analysis["columns"],
-        )
+        notebook_py = NotebookGenerator(analysis_plan()).make_py()
         notebook_nb = convert_py_to_nb(notebook_py, execute=True)
         yield notebook_nb
diff --git a/dp_wizard/utils/code_generators/__init__.py b/dp_wizard/utils/code_generators/__init__.py
@@ -0,0 +1,204 @@
+from typing import NamedTuple
+from abc import ABC, abstractmethod
+from pathlib import Path
+import re
+from dp_wizard.utils.csv_helper import name_to_identifier
+from dp_wizard.utils.code_generators._template import Template
+
+
+class AnalysisPlanColumn(NamedTuple):
+    lower_bound: float
+    upper_bound: float
+    bin_count: int
+    weight: int
+
+
+class AnalysisPlan(NamedTuple):
+    csv_path: str
+    contributions: int
+    epsilon: float
+    columns: dict[str, AnalysisPlanColumn]
+
+
+class _CodeGenerator(ABC):
+    def __init__(self, analysis_plan):
+        self.csv_path = analysis_plan.csv_path
+        self.contributions = analysis_plan.contributions
+        self.epsilon = analysis_plan.epsilon
+        self.columns = analysis_plan.columns
+
+    @abstractmethod
+    def _make_context(self): ...  # pragma: no cover
+
+    def make_py(self):
+        return str(
+            Template(self.root_template).fill_blocks(
+                IMPORTS_BLOCK=_make_imports(),
+                COLUMNS_BLOCK=self._make_columns(self.columns),
+                CONTEXT_BLOCK=self._make_context(),
+                QUERIES_BLOCK=self._make_queries(self.columns.keys()),
+            )
+        )
+
+    def _make_margins_dict(self, bin_names):
+        # TODO: Don't worry too much about the formatting here.
+        # Plan to run the output through black for consistency.
+        # https://github.com/opendp/dp-creator-ii/issues/50
+        margins = (
+            [
+                """
+            (): dp.polars.Margin(
+                public_info="lengths",
+            ),"""
+            ]
+            + [
+                f"""
+            ("{bin_name}",): dp.polars.Margin(
+                public_info="keys",
+            ),"""
+                for bin_name in bin_names
+            ]
+        )
+
+        margins_dict = "{" + "".join(margins) + "\n    }"
+        return margins_dict
+
+    def _make_columns(self, columns):
+        return "\n".join(
+            make_column_config_block(
+                name=name,
+                lower_bound=col.lower_bound,
+                upper_bound=col.upper_bound,
+                bin_count=col.bin_count,
+            )
+            for name, col in columns.items()
+        )
+
+    def _make_queries(self, column_names):
+        return "confidence = 0.95\n\n" + "\n".join(
+            _make_query(column_name) for column_name in column_names
+        )
+
+    def _make_partial_context(self):
+        weights = [column.weight for column in self.columns.values()]
+        column_names = [name_to_identifier(name) for name in self.columns.keys()]
+        privacy_unit_block = make_privacy_unit_block(self.contributions)
+        privacy_loss_block = make_privacy_loss_block(self.epsilon)
+        margins_dict = self._make_margins_dict([f"{name}_bin" for name in column_names])
+        columns = ", ".join([f"{name}_config" for name in column_names])
+        return (
+            Template("context")
+            .fill_expressions(
+                MARGINS_DICT=margins_dict,
+                COLUMNS=columns,
+            )
+            .fill_values(
+                WEIGHTS=weights,
+            )
+            .fill_blocks(
+                PRIVACY_UNIT_BLOCK=privacy_unit_block,
+                PRIVACY_LOSS_BLOCK=privacy_loss_block,
+            )
+        )
+
+
+class NotebookGenerator(_CodeGenerator):
+    root_template = "notebook"
+
+    def _make_context(self):
+        return str(self._make_partial_context().fill_values(CSV_PATH=self.csv_path))
+
+
+class ScriptGenerator(_CodeGenerator):
+    root_template = "script"
+
+    def _make_context(self):
+        return str(self._make_partial_context().fill_expressions(CSV_PATH="csv_path"))
+
+
+# Public functions used to generate code snippets in the UI.
+# These do not require an entire analysis plan, so they stand on their own.
+
+
+def make_privacy_unit_block(contributions):
+    return str(Template("privacy_unit").fill_values(CONTRIBUTIONS=contributions))
+
+
+def make_privacy_loss_block(epsilon):
+    return str(Template("privacy_loss").fill_values(EPSILON=epsilon))
+
+
+def make_column_config_block(name, lower_bound, upper_bound, bin_count):
+    """
+    >>> print(make_column_config_block(
+    ...     name="HW GRADE",
+    ...     lower_bound=0,
+    ...     upper_bound=100,
+    ...     bin_count=10
+    ... ))
+    # From the public information, determine the bins for 'HW GRADE':
+    hw_grade_cut_points = make_cut_points(
+        lower_bound=0,
+        upper_bound=100,
+        bin_count=10,
+    )
+    <BLANKLINE>
+    # Use these bins to define a Polars column:
+    hw_grade_config = (
+        pl.col('HW GRADE')
+        .cut(hw_grade_cut_points)
+        .alias('hw_grade_bin')  # Give the new column a name.
+        .cast(pl.String)
+    )
+    <BLANKLINE>
+    """
+    snake_name = _snake_case(name)
+    return str(
+        Template("column_config")
+        .fill_expressions(
+            CUT_LIST_NAME=f"{snake_name}_cut_points",
+            POLARS_CONFIG_NAME=f"{snake_name}_config",
+        )
+        .fill_values(
+            LOWER_BOUND=lower_bound,
+            UPPER_BOUND=upper_bound,
+            BIN_COUNT=bin_count,
+            COLUMN_NAME=name,
+            BIN_COLUMN_NAME=f"{snake_name}_bin",
+        )
+    )
+
+
+# Private helper functions:
+# These do not depend on the AnalysisPlan,
+# so it's better to keep them out of the class.
+
+
+def _make_query(column_name):
+    indentifier = name_to_identifier(column_name)
+    return str(
+        Template("query")
+        .fill_values(
+            BIN_NAME=f"{indentifier}_bin",
+        )
+        .fill_expressions(
+            QUERY_NAME=f"{indentifier}_query",
+            ACCURACY_NAME=f"{indentifier}_accuracy",
+            HISTOGRAM_NAME=f"{indentifier}_histogram",
+        )
+    )
+
+
+def _snake_case(name: str):
+    """
+    >>> _snake_case("HW GRADE")
+    'hw_grade'
+    """
+    return re.sub(r"\W+", "_", name.lower())
+
+
+def _make_imports():
+    return (
+        str(Template("imports").fill_values())
+        + (Path(__file__).parent.parent / "shared.py").read_text()
+    )