From fc2aa18212747fbb93e2087acf91c53ce7fd6be4 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Wed, 20 Nov 2024 09:49:52 -0500 Subject: [PATCH] OO-ify the code generation code (#168) * stub classes, but not really oo. Remove unused app code * named tuple for analysis plan * Move functions which depend on analysis plan into base class * function params -> constructor params * define "root_template" and dedup * fix copy paste in make_context * make base class abstract * rename to code_generators: templates are just a means * Template class to own file * create a README * AnalysisPlanColumn * pass analysis plan to code generators * analysis plan in tests * check for expression and value slots * more systematic tests of bad template filling * more tests of template filling --- .flake8 | 2 +- .pytest.ini | 2 +- dp_wizard/app/analysis_panel.py | 2 +- dp_wizard/app/components/column_module.py | 2 +- dp_wizard/app/dataset_panel.py | 2 +- dp_wizard/app/results_panel.py | 75 ++--- dp_wizard/utils/code_generators/__init__.py | 204 +++++++++++++ dp_wizard/utils/code_generators/_template.py | 86 ++++++ .../utils/code_generators/no-tests/README.md | 4 + .../no-tests/_column_config.py | 0 .../no-tests/_context.py | 0 .../no-tests/_imports.py | 0 .../no-tests/_notebook.py | 0 .../no-tests/_privacy_loss.py | 0 .../no-tests/_privacy_unit.py | 0 .../no-tests/_query.py | 0 .../no-tests/_script.py | 0 dp_wizard/utils/templates/__init__.py | 271 ------------------ tests/utils/test_code_generators.py | 202 +++++++++++++ tests/utils/test_templates.py | 133 --------- 20 files changed, 521 insertions(+), 464 deletions(-) create mode 100644 dp_wizard/utils/code_generators/__init__.py create mode 100644 dp_wizard/utils/code_generators/_template.py create mode 100644 dp_wizard/utils/code_generators/no-tests/README.md rename dp_wizard/utils/{templates => code_generators}/no-tests/_column_config.py (100%) rename dp_wizard/utils/{templates => code_generators}/no-tests/_context.py (100%) rename dp_wizard/utils/{templates => code_generators}/no-tests/_imports.py (100%) rename dp_wizard/utils/{templates => code_generators}/no-tests/_notebook.py (100%) rename dp_wizard/utils/{templates => code_generators}/no-tests/_privacy_loss.py (100%) rename dp_wizard/utils/{templates => code_generators}/no-tests/_privacy_unit.py (100%) rename dp_wizard/utils/{templates => code_generators}/no-tests/_query.py (100%) rename dp_wizard/utils/{templates => code_generators}/no-tests/_script.py (100%) delete mode 100644 dp_wizard/utils/templates/__init__.py create mode 100644 tests/utils/test_code_generators.py delete mode 100644 tests/utils/test_templates.py diff --git a/.flake8 b/.flake8 index 05c4fa7..9873a77 100644 --- a/.flake8 +++ b/.flake8 @@ -9,4 +9,4 @@ extend-ignore = E203,E501,E701 per-file-ignores = # Ignore undefined names in templates. - */templates/no-tests/*.py:F821,F401,E302 + */code_generators/no-tests/*.py:F821,F401,E302 diff --git a/.pytest.ini b/.pytest.ini index 25385dd..5aa7a5d 100644 --- a/.pytest.ini +++ b/.pytest.ini @@ -4,7 +4,7 @@ filterwarnings = error -addopts = --doctest-glob '*.md' --doctest-modules --ignore dp_wizard/utils/templates/no-tests --ignore dp_wizard/tests/fixtures/ --tracing=retain-on-failure +addopts = --doctest-glob '*.md' --doctest-modules --ignore dp_wizard/utils/code_generators/no-tests --ignore dp_wizard/tests/fixtures/ --tracing=retain-on-failure # If an xfail starts passing unexpectedly, that should count as a failure: xfail_strict=true diff --git a/dp_wizard/app/analysis_panel.py b/dp_wizard/app/analysis_panel.py index a9171a6..8465c1b 100644 --- a/dp_wizard/app/analysis_panel.py +++ b/dp_wizard/app/analysis_panel.py @@ -6,7 +6,7 @@ from dp_wizard.app.components.column_module import column_ui, column_server from dp_wizard.utils.csv_helper import read_csv_ids_labels, read_csv_ids_names from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip -from dp_wizard.utils.templates import make_privacy_loss_block +from dp_wizard.utils.code_generators import make_privacy_loss_block from dp_wizard.app.components.column_module import col_widths diff --git a/dp_wizard/app/components/column_module.py b/dp_wizard/app/components/column_module.py index 3e152bf..7296943 100644 --- a/dp_wizard/app/components/column_module.py +++ b/dp_wizard/app/components/column_module.py @@ -4,7 +4,7 @@ from dp_wizard.utils.dp_helper import make_confidence_accuracy_histogram from dp_wizard.utils.shared import plot_histogram -from dp_wizard.utils.templates import make_column_config_block +from dp_wizard.utils.code_generators import make_column_config_block from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip diff --git a/dp_wizard/app/dataset_panel.py b/dp_wizard/app/dataset_panel.py index 52ed4f4..9cd9836 100644 --- a/dp_wizard/app/dataset_panel.py +++ b/dp_wizard/app/dataset_panel.py @@ -4,7 +4,7 @@ from dp_wizard.utils.argparse_helpers import get_cli_info from dp_wizard.app.components.outputs import output_code_sample, demo_tooltip -from dp_wizard.utils.templates import make_privacy_unit_block +from dp_wizard.utils.code_generators import make_privacy_unit_block def dataset_ui(): diff --git a/dp_wizard/app/results_panel.py b/dp_wizard/app/results_panel.py index 58a23a6..8d448c5 100644 --- a/dp_wizard/app/results_panel.py +++ b/dp_wizard/app/results_panel.py @@ -1,8 +1,11 @@ -from json import dumps - from shiny import ui, render, reactive -from dp_wizard.utils.templates import make_notebook_py, make_script_py +from dp_wizard.utils.code_generators import ( + NotebookGenerator, + ScriptGenerator, + AnalysisPlan, + AnalysisPlanColumn, +) from dp_wizard.utils.converters import convert_py_to_nb @@ -35,64 +38,32 @@ def results_server( epsilon, ): # pragma: no cover @reactive.calc - def analysis_dict(): + def analysis_plan() -> AnalysisPlan: # weights().keys() will reflect the desired columns: # The others retain inactive columns, so user # inputs aren't lost when toggling checkboxes. columns = { - col: { - "lower_bound": lower_bounds()[col], - "upper_bound": upper_bounds()[col], - "bin_count": int(bin_counts()[col]), - # TODO: Floats should work for weight, but they don't: - # https://github.com/opendp/opendp/issues/2140 - "weight": int(weights()[col]), - } + col: AnalysisPlanColumn( + lower_bound=lower_bounds()[col], + upper_bound=upper_bounds()[col], + bin_count=int(bin_counts()[col]), + weight=int(weights()[col]), + ) for col in weights().keys() } - return { - "csv_path": csv_path(), - "contributions": contributions(), - "epsilon": epsilon(), - "columns": columns, - } - - @reactive.calc - def analysis_json(): - return dumps( - analysis_dict(), - indent=2, + return AnalysisPlan( + csv_path=csv_path(), + contributions=contributions(), + epsilon=epsilon(), + columns=columns, ) - @render.text - def analysis_json_text(): - return analysis_json() - - @reactive.calc - def analysis_python(): - analysis = analysis_dict() - return make_notebook_py( - csv_path=analysis["csv_path"], - contributions=analysis["contributions"], - epsilon=analysis["epsilon"], - columns=analysis["columns"], - ) - - @render.text - def analysis_python_text(): - return analysis_python() - @render.download( filename="dp-wizard-script.py", media_type="text/x-python", ) async def download_script(): - analysis = analysis_dict() - script_py = make_script_py( - contributions=analysis["contributions"], - epsilon=analysis["epsilon"], - columns=analysis["columns"], - ) + script_py = ScriptGenerator(analysis_plan()).make_py() yield script_py @render.download( @@ -100,12 +71,6 @@ async def download_script(): media_type="application/x-ipynb+json", ) async def download_notebook(): - analysis = analysis_dict() - notebook_py = make_notebook_py( - csv_path=analysis["csv_path"], - contributions=analysis["contributions"], - epsilon=analysis["epsilon"], - columns=analysis["columns"], - ) + notebook_py = NotebookGenerator(analysis_plan()).make_py() notebook_nb = convert_py_to_nb(notebook_py, execute=True) yield notebook_nb diff --git a/dp_wizard/utils/code_generators/__init__.py b/dp_wizard/utils/code_generators/__init__.py new file mode 100644 index 0000000..521c4a4 --- /dev/null +++ b/dp_wizard/utils/code_generators/__init__.py @@ -0,0 +1,204 @@ +from typing import NamedTuple +from abc import ABC, abstractmethod +from pathlib import Path +import re +from dp_wizard.utils.csv_helper import name_to_identifier +from dp_wizard.utils.code_generators._template import Template + + +class AnalysisPlanColumn(NamedTuple): + lower_bound: float + upper_bound: float + bin_count: int + weight: int + + +class AnalysisPlan(NamedTuple): + csv_path: str + contributions: int + epsilon: float + columns: dict[str, AnalysisPlanColumn] + + +class _CodeGenerator(ABC): + def __init__(self, analysis_plan): + self.csv_path = analysis_plan.csv_path + self.contributions = analysis_plan.contributions + self.epsilon = analysis_plan.epsilon + self.columns = analysis_plan.columns + + @abstractmethod + def _make_context(self): ... # pragma: no cover + + def make_py(self): + return str( + Template(self.root_template).fill_blocks( + IMPORTS_BLOCK=_make_imports(), + COLUMNS_BLOCK=self._make_columns(self.columns), + CONTEXT_BLOCK=self._make_context(), + QUERIES_BLOCK=self._make_queries(self.columns.keys()), + ) + ) + + def _make_margins_dict(self, bin_names): + # TODO: Don't worry too much about the formatting here. + # Plan to run the output through black for consistency. + # https://github.com/opendp/dp-creator-ii/issues/50 + margins = ( + [ + """ + (): dp.polars.Margin( + public_info="lengths", + ),""" + ] + + [ + f""" + ("{bin_name}",): dp.polars.Margin( + public_info="keys", + ),""" + for bin_name in bin_names + ] + ) + + margins_dict = "{" + "".join(margins) + "\n }" + return margins_dict + + def _make_columns(self, columns): + return "\n".join( + make_column_config_block( + name=name, + lower_bound=col.lower_bound, + upper_bound=col.upper_bound, + bin_count=col.bin_count, + ) + for name, col in columns.items() + ) + + def _make_queries(self, column_names): + return "confidence = 0.95\n\n" + "\n".join( + _make_query(column_name) for column_name in column_names + ) + + def _make_partial_context(self): + weights = [column.weight for column in self.columns.values()] + column_names = [name_to_identifier(name) for name in self.columns.keys()] + privacy_unit_block = make_privacy_unit_block(self.contributions) + privacy_loss_block = make_privacy_loss_block(self.epsilon) + margins_dict = self._make_margins_dict([f"{name}_bin" for name in column_names]) + columns = ", ".join([f"{name}_config" for name in column_names]) + return ( + Template("context") + .fill_expressions( + MARGINS_DICT=margins_dict, + COLUMNS=columns, + ) + .fill_values( + WEIGHTS=weights, + ) + .fill_blocks( + PRIVACY_UNIT_BLOCK=privacy_unit_block, + PRIVACY_LOSS_BLOCK=privacy_loss_block, + ) + ) + + +class NotebookGenerator(_CodeGenerator): + root_template = "notebook" + + def _make_context(self): + return str(self._make_partial_context().fill_values(CSV_PATH=self.csv_path)) + + +class ScriptGenerator(_CodeGenerator): + root_template = "script" + + def _make_context(self): + return str(self._make_partial_context().fill_expressions(CSV_PATH="csv_path")) + + +# Public functions used to generate code snippets in the UI; +# These do not require an entire analysis plan, so they stand on their own. + + +def make_privacy_unit_block(contributions): + return str(Template("privacy_unit").fill_values(CONTRIBUTIONS=contributions)) + + +def make_privacy_loss_block(epsilon): + return str(Template("privacy_loss").fill_values(EPSILON=epsilon)) + + +def make_column_config_block(name, lower_bound, upper_bound, bin_count): + """ + >>> print(make_column_config_block( + ... name="HW GRADE", + ... lower_bound=0, + ... upper_bound=100, + ... bin_count=10 + ... )) + # From the public information, determine the bins for 'HW GRADE': + hw_grade_cut_points = make_cut_points( + lower_bound=0, + upper_bound=100, + bin_count=10, + ) + + # Use these bins to define a Polars column: + hw_grade_config = ( + pl.col('HW GRADE') + .cut(hw_grade_cut_points) + .alias('hw_grade_bin') # Give the new column a name. + .cast(pl.String) + ) + + """ + snake_name = _snake_case(name) + return str( + Template("column_config") + .fill_expressions( + CUT_LIST_NAME=f"{snake_name}_cut_points", + POLARS_CONFIG_NAME=f"{snake_name}_config", + ) + .fill_values( + LOWER_BOUND=lower_bound, + UPPER_BOUND=upper_bound, + BIN_COUNT=bin_count, + COLUMN_NAME=name, + BIN_COLUMN_NAME=f"{snake_name}_bin", + ) + ) + + +# Private helper functions: +# These do not depend on the AnalysisPlan, +# so it's better to keep them out of the class. + + +def _make_query(column_name): + indentifier = name_to_identifier(column_name) + return str( + Template("query") + .fill_values( + BIN_NAME=f"{indentifier}_bin", + ) + .fill_expressions( + QUERY_NAME=f"{indentifier}_query", + ACCURACY_NAME=f"{indentifier}_accuracy", + HISTOGRAM_NAME=f"{indentifier}_histogram", + ) + ) + + +def _snake_case(name: str): + """ + >>> _snake_case("HW GRADE") + 'hw_grade' + """ + return re.sub(r"\W+", "_", name.lower()) + + +def _make_imports(): + return ( + str(Template("imports").fill_values()) + + (Path(__file__).parent.parent / "shared.py").read_text() + ) diff --git a/dp_wizard/utils/code_generators/_template.py b/dp_wizard/utils/code_generators/_template.py new file mode 100644 index 0000000..35d2d97 --- /dev/null +++ b/dp_wizard/utils/code_generators/_template.py @@ -0,0 +1,86 @@ +import re +from pathlib import Path + + +class Template: + def __init__(self, path, template=None): + if path is not None: + self._path = f"_{path}.py" + template_path = Path(__file__).parent / "no-tests" / self._path + self._template = template_path.read_text() + if template is not None: + if path is not None: + raise Exception('"path" and "template" are mutually exclusive') + self._path = "template-instead-of-path" + self._template = template + # We want a list of the initial slots, because substitutions + # can produce sequences of upper case letters that could be mistaken for slots. + self._initial_slots = self._find_slots() + + def _find_slots(self): + # Slots: + # - are all caps or underscores + # - have word boundary on either side + # - are at least three characters + slot_re = r"\b[A-Z][A-Z_]{2,}\b" + return set(re.findall(slot_re, self._template)) + + def fill_expressions(self, **kwargs): + for k, v in kwargs.items(): + k_re = re.escape(k) + self._template, count = re.subn(rf"\b{k_re}\b", str(v), self._template) + if count == 0: + raise Exception( + f"No '{k}' slot to fill with '{v}' in " + f"'{self._path}':\n\n{self._template}" + ) + return self + + def fill_values(self, **kwargs): + for k, v in kwargs.items(): + k_re = re.escape(k) + self._template, count = re.subn(rf"\b{k_re}\b", repr(v), self._template) + if count == 0: + raise Exception( + f"No '{k}' slot to fill with '{v}' in " + f"'{self._path}':\n\n{self._template}" + ) + return self + + def fill_blocks(self, **kwargs): + for k, v in kwargs.items(): + + def match_indent(match): + # This does what we want, but binding is confusing. + return "\n".join( + match.group(1) + line for line in v.split("\n") # noqa: B023 + ) + + k_re = re.escape(k) + self._template, count = re.subn( + rf"^([ \t]*){k_re}$", + match_indent, + self._template, + flags=re.MULTILINE, + ) + if count == 0: + base_message = ( + f"No '{k}' slot to fill with '{v}' in " + f"'{self._path}':\n\n{self._template}" + ) + if k in self._template: + raise Exception( + f"Block slots must be alone on line; {base_message}" + ) + else: + raise Exception(base_message) + return self + + def __str__(self): + unfilled_slots = self._initial_slots & self._find_slots() + if unfilled_slots: + slots_str = ", ".join(sorted(f"'{slot}'" for slot in unfilled_slots)) + raise Exception( + f"{slots_str} slot not filled in '{self._path}':\n\n{self._template}" + ) + return self._template diff --git a/dp_wizard/utils/code_generators/no-tests/README.md b/dp_wizard/utils/code_generators/no-tests/README.md new file mode 100644 index 0000000..fbdcd5e --- /dev/null +++ b/dp_wizard/utils/code_generators/no-tests/README.md @@ -0,0 +1,4 @@ +Strings of ALL CAPS are replaced in these templates. +Keeping them in a format which can actually be parsed as python +makes some things easier, but it is also reinventing the wheel. +We may revisit this. diff --git a/dp_wizard/utils/templates/no-tests/_column_config.py b/dp_wizard/utils/code_generators/no-tests/_column_config.py similarity index 100% rename from dp_wizard/utils/templates/no-tests/_column_config.py rename to dp_wizard/utils/code_generators/no-tests/_column_config.py diff --git a/dp_wizard/utils/templates/no-tests/_context.py b/dp_wizard/utils/code_generators/no-tests/_context.py similarity index 100% rename from dp_wizard/utils/templates/no-tests/_context.py rename to dp_wizard/utils/code_generators/no-tests/_context.py diff --git a/dp_wizard/utils/templates/no-tests/_imports.py b/dp_wizard/utils/code_generators/no-tests/_imports.py similarity index 100% rename from dp_wizard/utils/templates/no-tests/_imports.py rename to dp_wizard/utils/code_generators/no-tests/_imports.py diff --git a/dp_wizard/utils/templates/no-tests/_notebook.py b/dp_wizard/utils/code_generators/no-tests/_notebook.py similarity index 100% rename from dp_wizard/utils/templates/no-tests/_notebook.py rename to dp_wizard/utils/code_generators/no-tests/_notebook.py diff --git a/dp_wizard/utils/templates/no-tests/_privacy_loss.py b/dp_wizard/utils/code_generators/no-tests/_privacy_loss.py similarity index 100% rename from dp_wizard/utils/templates/no-tests/_privacy_loss.py rename to dp_wizard/utils/code_generators/no-tests/_privacy_loss.py diff --git a/dp_wizard/utils/templates/no-tests/_privacy_unit.py b/dp_wizard/utils/code_generators/no-tests/_privacy_unit.py similarity index 100% rename from dp_wizard/utils/templates/no-tests/_privacy_unit.py rename to dp_wizard/utils/code_generators/no-tests/_privacy_unit.py diff --git a/dp_wizard/utils/templates/no-tests/_query.py b/dp_wizard/utils/code_generators/no-tests/_query.py similarity index 100% rename from dp_wizard/utils/templates/no-tests/_query.py rename to dp_wizard/utils/code_generators/no-tests/_query.py diff --git a/dp_wizard/utils/templates/no-tests/_script.py b/dp_wizard/utils/code_generators/no-tests/_script.py similarity index 100% rename from dp_wizard/utils/templates/no-tests/_script.py rename to dp_wizard/utils/code_generators/no-tests/_script.py diff --git a/dp_wizard/utils/templates/__init__.py b/dp_wizard/utils/templates/__init__.py deleted file mode 100644 index 591e1eb..0000000 --- a/dp_wizard/utils/templates/__init__.py +++ /dev/null @@ -1,271 +0,0 @@ -""" -Strings of ALL CAPS are replaced in these templates. -Keeping them in a format which can actually be parsed as python -makes some things easier, but it is also reinventing the wheel. -We may revisit this. -""" - -from pathlib import Path -import re -from dp_wizard.utils.csv_helper import name_to_identifier - - -class _Template: - def __init__(self, path, template=None): - if path is not None: - self._path = f"_{path}.py" - template_path = Path(__file__).parent / "no-tests" / self._path - self._template = template_path.read_text() - if template is not None: - if path is not None: - raise Exception('"path" and "template" are mutually exclusive') - self._path = "template-instead-of-path" - self._template = template - self._initial_slots = self._find_slots() - - def _find_slots(self): - # Slots: - # - are all caps or underscores - # - have word boundary on either side - # - are at least three characters - slot_re = r"\b[A-Z][A-Z_]{2,}\b" - return set(re.findall(slot_re, self._template)) - - def fill_expressions(self, **kwargs): - for k, v in kwargs.items(): - k_re = re.escape(k) - self._template = re.sub(rf"\b{k_re}\b", str(v), self._template) - return self - - def fill_values(self, **kwargs): - for k, v in kwargs.items(): - k_re = re.escape(k) - self._template = re.sub(rf"\b{k_re}\b", repr(v), self._template) - return self - - def fill_blocks(self, **kwargs): - for k, v in kwargs.items(): - - def match_indent(match): - # This does what we want, but binding is confusing. - return "\n".join( - match.group(1) + line for line in v.split("\n") # noqa: B023 - ) - - k_re = re.escape(k) - self._template = re.sub( - rf"^([ \t]*){k_re}$", - match_indent, - self._template, - flags=re.MULTILINE, - ) - return self - - def __str__(self): - unfilled_slots = self._initial_slots & self._find_slots() - if unfilled_slots: - raise Exception( - f"Template {self._path} has unfilled slots: " - f'{", ".join(sorted(unfilled_slots))}\n\n{self._template}' - ) - return self._template - - -def _make_margins_dict(bin_names): - # TODO: Don't worry too much about the formatting here. - # Plan to run the output through black for consistency. - # https://github.com/opendp/dp-creator-ii/issues/50 - margins = ( - [ - """ - (): dp.polars.Margin( - public_info="lengths", - ),""" - ] - + [ - f""" - ("{bin_name}",): dp.polars.Margin( - public_info="keys", - ),""" - for bin_name in bin_names - ] - ) - - margins_dict = "{" + "".join(margins) + "\n }" - return margins_dict - - -def _make_context_for_notebook(csv_path, contributions, epsilon, weights, column_names): - privacy_unit_block = make_privacy_unit_block(contributions) - privacy_loss_block = make_privacy_loss_block(epsilon) - margins_dict = _make_margins_dict([f"{name}_bin" for name in column_names]) - columns = ", ".join([f"{name}_config" for name in column_names]) - return str( - _Template("context") - .fill_expressions( - MARGINS_DICT=margins_dict, - COLUMNS=columns, - ) - .fill_values( - CSV_PATH=csv_path, - WEIGHTS=weights, - ) - .fill_blocks( - PRIVACY_UNIT_BLOCK=privacy_unit_block, - PRIVACY_LOSS_BLOCK=privacy_loss_block, - ) - ) - - -def _make_context_for_script(contributions, epsilon, weights, column_names): - privacy_unit_block = make_privacy_unit_block(contributions) - privacy_loss_block = make_privacy_loss_block(epsilon) - margins_dict = _make_margins_dict([f"{name}_bin" for name in column_names]) - columns = ",".join([f"{name}_config" for name in column_names]) - return str( - _Template("context") - .fill_expressions( - CSV_PATH="csv_path", - MARGINS_DICT=margins_dict, - COLUMNS=columns, - ) - .fill_values( - WEIGHTS=weights, - ) - .fill_blocks( - PRIVACY_UNIT_BLOCK=privacy_unit_block, - PRIVACY_LOSS_BLOCK=privacy_loss_block, - MARGINS_DICT=margins_dict, - ) - ) - - -def _make_imports(): - return ( - str(_Template("imports").fill_values()) - + (Path(__file__).parent.parent / "shared.py").read_text() - ) - - -def _make_columns(columns): - return "\n".join( - make_column_config_block( - name=name, - lower_bound=col["lower_bound"], - upper_bound=col["upper_bound"], - bin_count=col["bin_count"], - ) - for name, col in columns.items() - ) - - -def _make_query(column_name): - indentifier = name_to_identifier(column_name) - return str( - _Template("query") - .fill_values( - BIN_NAME=f"{indentifier}_bin", - ) - .fill_expressions( - QUERY_NAME=f"{indentifier}_query", - ACCURACY_NAME=f"{indentifier}_accuracy", - HISTOGRAM_NAME=f"{indentifier}_histogram", - ) - ) - - -def _make_queries(column_names): - return "confidence = 0.95\n\n" + "\n".join( - _make_query(column_name) for column_name in column_names - ) - - -def make_notebook_py(csv_path, contributions, epsilon, columns): - return str( - _Template("notebook").fill_blocks( - IMPORTS_BLOCK=_make_imports(), - COLUMNS_BLOCK=_make_columns(columns), - CONTEXT_BLOCK=_make_context_for_notebook( - csv_path=csv_path, - contributions=contributions, - epsilon=epsilon, - weights=[column["weight"] for column in columns.values()], - column_names=[name_to_identifier(name) for name in columns.keys()], - ), - QUERIES_BLOCK=_make_queries(columns.keys()), - ) - ) - - -def make_script_py(contributions, epsilon, columns): - return str( - _Template("script").fill_blocks( - IMPORTS_BLOCK=_make_imports(), - COLUMNS_BLOCK=_make_columns(columns), - CONTEXT_BLOCK=_make_context_for_script( - # csv_path is a CLI parameter in the script - contributions=contributions, - epsilon=epsilon, - weights=[column["weight"] for column in columns.values()], - column_names=[name_to_identifier(name) for name in columns.keys()], - ), - QUERIES_BLOCK=_make_queries(columns.keys()), - ) - ) - - -def make_privacy_unit_block(contributions): - return str(_Template("privacy_unit").fill_values(CONTRIBUTIONS=contributions)) - - -def make_privacy_loss_block(epsilon): - return str(_Template("privacy_loss").fill_values(EPSILON=epsilon)) - - -def make_column_config_block(name, lower_bound, upper_bound, bin_count): - """ - >>> print(make_column_config_block( - ... name="HW GRADE", - ... lower_bound=0, - ... upper_bound=100, - ... bin_count=10 - ... )) - # From the public information, determine the bins for 'HW GRADE': - hw_grade_cut_points = make_cut_points( - lower_bound=0, - upper_bound=100, - bin_count=10, - ) - - # Use these bins to define a Polars column: - hw_grade_config = ( - pl.col('HW GRADE') - .cut(hw_grade_cut_points) - .alias('hw_grade_bin') # Give the new column a name. - .cast(pl.String) - ) - - """ - snake_name = _snake_case(name) - return str( - _Template("column_config") - .fill_expressions( - CUT_LIST_NAME=f"{snake_name}_cut_points", - POLARS_CONFIG_NAME=f"{snake_name}_config", - ) - .fill_values( - LOWER_BOUND=lower_bound, - UPPER_BOUND=upper_bound, - BIN_COUNT=bin_count, - COLUMN_NAME=name, - BIN_COLUMN_NAME=f"{snake_name}_bin", - ) - ) - - -def _snake_case(name: str): - """ - >>> _snake_case("HW GRADE") - 'hw_grade' - """ - return re.sub(r"\W+", "_", name.lower()) diff --git a/tests/utils/test_code_generators.py b/tests/utils/test_code_generators.py new file mode 100644 index 0000000..aad08d4 --- /dev/null +++ b/tests/utils/test_code_generators.py @@ -0,0 +1,202 @@ +from tempfile import NamedTemporaryFile +import subprocess +from pathlib import Path +import pytest +import opendp.prelude as dp +from dp_wizard.utils.code_generators import ( + Template, + ScriptGenerator, + NotebookGenerator, + AnalysisPlan, + AnalysisPlanColumn, +) + + +fixtures_path = Path(__file__).parent.parent / "fixtures" +fake_csv = "tests/fixtures/fake.csv" + + +def test_param_conflict(): + with pytest.raises(Exception, match=r"mutually exclusive"): + Template("context", template="Not allowed if path present") + + +def test_fill_expressions(): + template = Template(None, template="No one VERB the ADJ NOUN!") + filled = str( + template.fill_expressions( + VERB="expects", + ADJ="Spanish", + NOUN="Inquisition", + ) + ) + assert filled == "No one expects the Spanish Inquisition!" + + +def test_fill_expressions_missing_slot_in_template(): + template = Template(None, template="No one ... the ADJ NOUN!") + with pytest.raises(Exception, match=r"No 'VERB' slot to fill with 'expects'"): + str( + template.fill_expressions( + VERB="expects", + ADJ="Spanish", + NOUN="Inquisition", + ) + ) + + +def test_fill_expressions_extra_slot_in_template(): + template = Template(None, template="No one VERB ARTICLE ADJ NOUN!") + with pytest.raises(Exception, match=r"'ARTICLE' slot not filled"): + str( + template.fill_expressions( + VERB="expects", + ADJ="Spanish", + NOUN="Inquisition", + ) + ) + + +def test_fill_values(): + template = Template(None, template="assert [STRING] * NUM == LIST") + filled = str( + template.fill_values( + STRING="🙂", + NUM=3, + LIST=["🙂", "🙂", "🙂"], + ) + ) + assert filled == "assert ['🙂'] * 3 == ['🙂', '🙂', '🙂']" + + +def test_fill_values_missing_slot_in_template(): + template = Template(None, template="assert [STRING] * ... == LIST") + with pytest.raises(Exception, match=r"No 'NUM' slot to fill with '3'"): + str( + template.fill_values( + STRING="🙂", + NUM=3, + LIST=["🙂", "🙂", "🙂"], + ) + ) + + +def test_fill_values_extra_slot_in_template(): + template = Template(None, template="CMD [STRING] * NUM == LIST") + with pytest.raises(Exception, match=r"'CMD' slot not filled"): + str( + template.fill_values( + STRING="🙂", + NUM=3, + LIST=["🙂", "🙂", "🙂"], + ) + ) + + +def test_fill_blocks(): + # "OK" is less than three characters, so it is not a slot. + template = Template( + None, + template="""# MixedCase is OK + +FIRST + +with fake: + SECOND + if True: + THIRD +""", + ) + template.fill_blocks( + FIRST="\n".join(f"import {i}" for i in "abc"), + SECOND="\n".join(f"f({i})" for i in "123"), + THIRD="\n".join(f"{i}()" for i in "xyz"), + ) + assert ( + str(template) + == """# MixedCase is OK + +import a +import b +import c + +with fake: + f(1) + f(2) + f(3) + if True: + x() + y() + z() +""" + ) + + +def test_fill_blocks_missing_slot_in_template_alone(): + template = Template(None, template="No block slot") + with pytest.raises(Exception, match=r"No 'SLOT' slot"): + str(template.fill_blocks(SLOT="placeholder")) + + +def test_fill_blocks_missing_slot_in_template_not_alone(): + template = Template(None, template="No block SLOT") + with pytest.raises( + Exception, match=r"Block slots must be alone on line; No 'SLOT' slot" + ): + str(template.fill_blocks(SLOT="placeholder")) + + +def test_fill_blocks_extra_slot_in_template(): + template = Template(None, template="EXTRA\nSLOT") + with pytest.raises(Exception, match=r"'EXTRA' slot not filled"): + str(template.fill_blocks(SLOT="placeholder")) + + +def test_make_notebook(): + notebook = NotebookGenerator( + AnalysisPlan( + csv_path=fake_csv, + contributions=1, + epsilon=1, + columns={ + # For a strong test, use a column whose name + # doesn't work as a python identifier. + "hw-number": AnalysisPlanColumn( + lower_bound=5, + upper_bound=15, + bin_count=20, + weight=4, + ) + }, + ) + ).make_py() + print(notebook) + globals = {} + exec(notebook, globals) + assert isinstance(globals["context"], dp.Context) + + +def test_make_script(): + script = ScriptGenerator( + AnalysisPlan( + csv_path=None, + contributions=1, + epsilon=1, + columns={ + "hw-number": AnalysisPlanColumn( + lower_bound=5, + upper_bound=15, + bin_count=20, + weight=4, + ) + }, + ) + ).make_py() + print(script) + + with NamedTemporaryFile(mode="w") as fp: + fp.write(script) + fp.flush() + + result = subprocess.run(["python", fp.name, "--csv", fake_csv]) + assert result.returncode == 0 diff --git a/tests/utils/test_templates.py b/tests/utils/test_templates.py deleted file mode 100644 index 7bcdfc3..0000000 --- a/tests/utils/test_templates.py +++ /dev/null @@ -1,133 +0,0 @@ -from tempfile import NamedTemporaryFile -import subprocess -from pathlib import Path -import re -import pytest -import opendp.prelude as dp -from dp_wizard.utils.templates import _Template, make_notebook_py, make_script_py - - -fixtures_path = Path(__file__).parent.parent / "fixtures" -fake_csv = "tests/fixtures/fake.csv" - - -def test_param_conflict(): - with pytest.raises(Exception, match=r"mutually exclusive"): - _Template("context", template="Not allowed if path present") - - -def test_fill_expressions(): - template = _Template(None, template="No one VERB the ADJ NOUN!") - filled = str( - template.fill_expressions( - VERB="expects", - ADJ="Spanish", - NOUN="Inquisition", - ) - ) - assert filled == "No one expects the Spanish Inquisition!" - - -def test_fill_values(): - template = _Template(None, template="assert [STRING] * NUM == LIST") - filled = str( - template.fill_values( - STRING="🙂", - NUM=3, - LIST=["🙂", "🙂", "🙂"], - ) - ) - assert filled == "assert ['🙂'] * 3 == ['🙂', '🙂', '🙂']" - - -def test_fill_blocks(): - # "OK" is less than three characters, so it is not a slot. - template = _Template( - None, - template="""# MixedCase is OK - -FIRST - -with fake: - SECOND - if True: - THIRD -""", - ) - template.fill_blocks( - FIRST="\n".join(f"import {i}" for i in "abc"), - SECOND="\n".join(f"f({i})" for i in "123"), - THIRD="\n".join(f"{i}()" for i in "xyz"), - ) - assert ( - str(template) - == """# MixedCase is OK - -import a -import b -import c - -with fake: - f(1) - f(2) - f(3) - if True: - x() - y() - z() -""" - ) - - -def test_fill_template_unfilled_slots(): - context_template = _Template("context") - with pytest.raises( - Exception, - match=re.escape("context.py has unfilled slots"), - ): - str(context_template.fill_values()) - - -def test_make_notebook(): - notebook = make_notebook_py( - csv_path=fake_csv, - contributions=1, - epsilon=1, - columns={ - # For a strong test, use a column whose name - # doesn't work as a python identifier. - "hw-number": { - "lower_bound": 5, - "upper_bound": 15, - "bin_count": 20, - "weight": 4, - } - }, - ) - print(notebook) - globals = {} - exec(notebook, globals) - assert isinstance(globals["context"], dp.Context) - - -def test_make_script(): - script = make_script_py( - contributions=1, - epsilon=1, - columns={ - "hw-number": { - "lower_bound": 5, - "upper_bound": 15, - "bin_count": 20, - "weight": 4, - } - }, - ) - print(script) - - with NamedTemporaryFile(mode="w") as fp: - fp.write(script) - fp.flush() - - result = subprocess.run(["python", fp.name, "--csv", fake_csv]) - assert result.returncode == 0