From 236be93a371ad4d08175f17f15ee6110d372b8b8 Mon Sep 17 00:00:00 2001
From: Chuck McCallum
Date: Fri, 18 Oct 2024 15:18:10 -0400
Subject: [PATCH] Add `--demo` CLI option (#61)

* move argparse details to new file
* add underscores to distinguish private functions
* Copy CSV generation from notebook in other branch
* pytest.ini. Add doctests
* tweak ranges on fake csv
* a little more coverage
---

Review notes (not part of the commit; ignored by `git am`):

    * get_csv_contrib(): `args.contributions is not None` is always true,
      because `--contrib` is declared with `default=1`. The
      '"--demo" overrides ...' warning is therefore emitted on every
      `--demo` run, even when neither `--csv` nor `--contrib` was given.
    * _get_demo_csv_contrib(): the output path "/tmp/demo.csv" is
      hard-coded; `tempfile.gettempdir()` would avoid assuming a POSIX /tmp.
    * _get_demo_csv_contrib(): `random.seed(0)` reseeds the process-wide
      generator; a private `random.Random(0)` would avoid that side effect.
    * _get_arg_parser(): `description=__doc__` now refers to
      argparse_helpers.py, which has no module docstring, so the description
      line drops out of `--help` (consistent with the README hunk below).

 .pytest.ini                            |  10 +++
 README.md                              |   5 +-
 dp_creator_ii/__init__.py              |  39 +--------
 dp_creator_ii/app/dataset_panel.py     |  23 ++---
 dp_creator_ii/argparse_helpers.py      | 113 +++++++++++++++++++++++++
 dp_creator_ii/tests/test_arg_parser.py |  10 +--
 6 files changed, 139 insertions(+), 61 deletions(-)
 create mode 100644 .pytest.ini
 create mode 100644 dp_creator_ii/argparse_helpers.py

diff --git a/.pytest.ini b/.pytest.ini
new file mode 100644
index 0000000..2dc4d6d
--- /dev/null
+++ b/.pytest.ini
@@ -0,0 +1,10 @@
+[pytest]
+
+# Treat warnings as errors:
+filterwarnings =
+    error
+
+addopts = --doctest-glob '*.md' --doctest-modules --ignore dp_creator_ii/templates/ --ignore dp_creator_ii/tests/fixtures/
+
+# If an xfail starts passing unexpectedly, that should count as a failure:
+xfail_strict=true
diff --git a/README.md b/README.md
index ff89a30..89101c8 100644
--- a/README.md
+++ b/README.md
@@ -15,14 +15,13 @@ We plan to implement a [proof of concept](https://docs.google.com/document/d/1dt
 ## Usage
 
 ```
-usage: dp-creator-ii [-h] [--csv CSV_PATH] [--contrib CONTRIB]
-
-DP Creator II makes it easier to get started with Differential Privacy.
+usage: dp-creator-ii [-h] [--csv CSV_PATH] [--contrib CONTRIB] [--demo]
 
 options:
   -h, --help         show this help message and exit
   --csv CSV_PATH     Path to CSV containing private data
   --contrib CONTRIB  How many rows can an individual contribute?
+  --demo             Use generated fake CSV for a quick demo
 ```
 
 
diff --git a/dp_creator_ii/__init__.py b/dp_creator_ii/__init__.py
index 7fffc12..337946c 100644
--- a/dp_creator_ii/__init__.py
+++ b/dp_creator_ii/__init__.py
@@ -1,47 +1,16 @@
 """DP Creator II makes it easier to get started with Differential Privacy."""
 
-from pathlib import Path
-from argparse import ArgumentParser, ArgumentTypeError
-
 import shiny
+from dp_creator_ii.argparse_helpers import get_csv_contrib
 
 
 __version__ = "0.0.1"
 
 
-def existing_csv(arg):
-    path = Path(arg)
-    if not path.exists():
-        raise ArgumentTypeError(f"No such file: {arg}")
-    if path.suffix != ".csv":
-        raise ArgumentTypeError(f'Must have ".csv" extension: {arg}')
-    return path
-
-
-def get_arg_parser():
-    parser = ArgumentParser(description=__doc__)
-    parser.add_argument(
-        "--csv",
-        dest="csv_path",
-        type=existing_csv,
-        help="Path to CSV containing private data",
-    )
-    parser.add_argument(
-        "--contrib",
-        dest="contributions",
-        metavar="CONTRIB",
-        type=int,
-        default=1,
-        help="How many rows can an individual contribute?",
-    )
-    return parser
-
-
 def main():  # pragma: no cover
-    # We call parse_args() again inside the app.
-    # We only call it here so "--help" is handled,
-    # and to validate inputs.
-    get_arg_parser().parse_args()
+    # We only call this here so "--help" is handled,
+    # and to validate inputs before starting the server.
+    get_csv_contrib()
 
     shiny.run_app(
         app="dp_creator_ii.app",
diff --git a/dp_creator_ii/app/dataset_panel.py b/dp_creator_ii/app/dataset_panel.py
index 58f43a9..352c8e8 100644
--- a/dp_creator_ii/app/dataset_panel.py
+++ b/dp_creator_ii/app/dataset_panel.py
@@ -1,26 +1,13 @@
-from sys import argv
-
 from shiny import ui, reactive, render
 
-from dp_creator_ii import get_arg_parser
+from dp_creator_ii.argparse_helpers import get_csv_contrib
 from dp_creator_ii.csv_helper import read_field_names
 from dp_creator_ii.app.ui_helpers import output_code_sample
 from dp_creator_ii.template import make_privacy_unit_block
 
 
-def get_args():
-    arg_parser = get_arg_parser()
-    if argv[1:3] == ["run", "--port"]:
-        # We are running a Playwright test,
-        # and ARGV is polluted, so override:
-        return arg_parser.parse_args([])
-    else:
-        # Normal parsing:
-        return arg_parser.parse_args()
-
-
 def dataset_ui():
-    args = get_args()
+    (_csv_path, contributions) = get_csv_contrib()
 
     return ui.nav_panel(
         "Select Dataset",
@@ -34,7 +21,7 @@ def dataset_ui():
             'This is the "unit of privacy" which will be protected.'
         ),
         ui.output_text("csv_fields"),
-        ui.input_numeric("contributions", "Contributions", args.contributions),
+        ui.input_numeric("contributions", "Contributions", contributions),
         output_code_sample("unit_of_privacy_python"),
         ui.input_action_button("go_to_analysis", "Define analysis"),
         value="dataset_panel",
@@ -42,9 +29,9 @@ def dataset_ui():
 
 
 def dataset_server(input, output, session):
-    args = get_args()
+    (csv_path, _contributions) = get_csv_contrib()
 
-    csv_path_from_cli_value = reactive.value(args.csv_path)
+    csv_path_from_cli_value = reactive.value(csv_path)
 
     @reactive.calc
     def csv_path_calc():
diff --git a/dp_creator_ii/argparse_helpers.py b/dp_creator_ii/argparse_helpers.py
new file mode 100644
index 0000000..b39bd8d
--- /dev/null
+++ b/dp_creator_ii/argparse_helpers.py
@@ -0,0 +1,113 @@
+from sys import argv
+from pathlib import Path
+from argparse import ArgumentParser, ArgumentTypeError
+import csv
+import random
+from warnings import warn
+
+
+def _existing_csv_type(arg):
+    path = Path(arg)
+    if not path.exists():
+        raise ArgumentTypeError(f"No such file: {arg}")
+    if path.suffix != ".csv":
+        raise ArgumentTypeError(f'Must have ".csv" extension: {arg}')
+    return path
+
+
+def _get_arg_parser():
+    parser = ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--csv",
+        dest="csv_path",
+        type=_existing_csv_type,
+        help="Path to CSV containing private data",
+    )
+    parser.add_argument(
+        "--contrib",
+        dest="contributions",
+        metavar="CONTRIB",
+        type=int,
+        default=1,
+        help="How many rows can an individual contribute?",
+    )
+    parser.add_argument(
+        "--demo", action="store_true", help="Use generated fake CSV for a quick demo"
+    )
+    return parser
+
+
+def _get_args():
+    """
+    >>> _get_args()
+    Namespace(csv_path=None, contributions=1, demo=False)
+    """
+    arg_parser = _get_arg_parser()
+    if "--port" in argv or "-v" in argv or "-k" in argv:
+        # We are running a test,
+        # and ARGV is polluted, so override:
+        return arg_parser.parse_args([])
+    else:
+        # Normal parsing:
+        return arg_parser.parse_args()  # pragma: no cover
+
+
+def _clip(n, lower, upper):
+    """
+    >>> _clip(-5, 0, 10)
+    0
+    >>> _clip(5, 0, 10)
+    5
+    >>> _clip(15, 0, 10)
+    10
+    """
+    return max(min(n, upper), lower)
+
+
+def _get_demo_csv_contrib():
+    """
+    >>> csv_path, contributions = _get_demo_csv_contrib()
+    >>> with open(csv_path, newline="") as csv_handle:
+    ...     reader = csv.DictReader(csv_handle)
+    ...     reader.fieldnames
+    ...     rows = list(reader)
+    ...     rows[0]
+    ...     rows[-1]
+    ['student_id', 'class_year', 'hw_number', 'grade']
+    {'student_id': '1', 'class_year': '2', 'hw_number': '1', 'grade': '73'}
+    {'student_id': '100', 'class_year': '1', 'hw_number': '10', 'grade': '78'}
+    """
+    random.seed(0)  # So the mock data will be stable across runs.
+
+    csv_path = "/tmp/demo.csv"
+    contributions = 10
+
+    with open(csv_path, "w", newline="") as demo_handle:
+        fields = ["student_id", "class_year", "hw_number", "grade"]
+        writer = csv.DictWriter(demo_handle, fieldnames=fields)
+        writer.writeheader()
+        for student_id in range(1, 101):
+            class_year = int(_clip(random.gauss(2, 1), 1, 4))
+            # Older students do slightly better in the class:
+            mean_grade = random.gauss(80, 5) + class_year * 2
+            for hw_number in range(1, contributions + 1):
+                grade = int(_clip(random.gauss(mean_grade, 5), 0, 100))
+                writer.writerow(
+                    {
+                        "student_id": student_id,
+                        "class_year": class_year,
+                        "hw_number": hw_number,
+                        "grade": grade,
+                    }
+                )
+
+    return csv_path, contributions
+
+
+def get_csv_contrib():  # pragma: no cover
+    args = _get_args()
+    if args.demo:
+        if args.csv_path is not None or args.contributions is not None:
+            warn('"--demo" overrides "--csv" and "--contrib"')
+        return _get_demo_csv_contrib()
+    return (args.csv_path, args.contributions)
diff --git a/dp_creator_ii/tests/test_arg_parser.py b/dp_creator_ii/tests/test_arg_parser.py
index f57d45e..737bcf7 100644
--- a/dp_creator_ii/tests/test_arg_parser.py
+++ b/dp_creator_ii/tests/test_arg_parser.py
@@ -3,12 +3,12 @@
 
 import pytest
 
-from dp_creator_ii import get_arg_parser, existing_csv
+from dp_creator_ii.argparse_helpers import _get_arg_parser, _existing_csv_type
 
 
 def test_help():
     help = (
-        get_arg_parser()
+        _get_arg_parser()
         .format_help()
         # argparse doesn't actually know the name of the script
         # and inserts the name of the running program instead.
@@ -25,14 +25,14 @@ def test_help():
 
 def test_arg_validation_no_file():
     with pytest.raises(ArgumentTypeError, match="No such file: no-such-file"):
-        existing_csv("no-such-file")
+        _existing_csv_type("no-such-file")
 
 
 def test_arg_validation_not_csv():
     with pytest.raises(ArgumentTypeError, match='Must have ".csv" extension:'):
-        existing_csv(Path(__file__).parent / "fixtures" / "fake.ipynb")
+        _existing_csv_type(Path(__file__).parent / "fixtures" / "fake.ipynb")
 
 
 def test_arg_validation_works():
-    path = existing_csv(Path(__file__).parent / "fixtures" / "fake.csv")
+    path = _existing_csv_type(Path(__file__).parent / "fixtures" / "fake.csv")
     assert path.name == "fake.csv"