Fix & temporarily disable raytune tutorial, print tutorial execution errors & memory usage #1974

Closed
wants to merge 14 commits into from
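As context for the diff below: the new execution path wraps each tutorial run with `memory_usage` from `memory_profiler`, so peak memory can be reported alongside the runtime. A minimal, self-contained sketch of that pattern (the `heavy_work` function and its argument are hypothetical, used only for illustration):

```python
from memory_profiler import memory_usage


def heavy_work(n: int) -> int:
    # Hypothetical workload standing in for a tutorial run.
    data = [0] * n
    return sum(data)


# memory_usage accepts a (callable, args, kwargs) tuple; retval=True also
# returns the callable's return value, and include_children=True folds in
# memory used by child processes (e.g. a notebook executed in a subprocess).
mem_samples, result = memory_usage(
    (heavy_work, (10_000_000,), {}),
    retval=True,
    include_children=True,
)
print(
    f"start: {mem_samples[0]:.1f} MiB, "
    f"peak: {max(mem_samples):.1f} MiB, result: {result}"
)
```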
56 changes: 37 additions & 19 deletions scripts/make_tutorials.py
@@ -15,8 +15,13 @@

import nbformat
from bs4 import BeautifulSoup
from memory_profiler import memory_usage
from nbconvert import HTMLExporter, ScriptExporter

TUTORIALS_TO_SKIP = [
"raytune_pytorch_cnn", # TODO: Times out CI but passes locally. Investigate.
]


TEMPLATE = """const CWD = process.cwd();

@@ -147,8 +152,7 @@ def gen_tutorials(
# prepare paths for converted tutorials & files
os.makedirs(os.path.join(repo_dir, "website", "_tutorials"), exist_ok=True)
os.makedirs(os.path.join(repo_dir, "website", "static", "files"), exist_ok=True)
if smoke_test:
os.environ["SMOKE_TEST"] = str(smoke_test)
env = {"SMOKE_TEST": "True"} if smoke_test else None

for config in tutorial_configs:
tid = config["id"]
@@ -162,32 +166,46 @@
nb_str = infile.read()
nb = nbformat.reads(nb_str, nbformat.NO_CONVERT)

total_time = None
if exec_tutorials and exec_on_build:
if tid in TUTORIALS_TO_SKIP:
print(f"Skipping {tid}")
continue
tutorial_path = Path(paths["tutorial_path"])
print("Executing tutorial {}".format(tid))
start_time = time.time()
start_time = time.monotonic()

# try / catch failures for now
# will re-raise at the end
# Try / catch failures for now. We will re-raise at the end.
# TODO: [T163244135] Speed up tutorials and reduce timeout limits.
timeout_minutes = 15 if smoke_test else 150
try:
# Execute notebook.
# TODO: [T163244135] Speed up tutorials and reduce timeout limits.
timeout_minutes = 15 if smoke_test else 150
run_script(tutorial=tutorial_path, timeout_minutes=timeout_minutes)
total_time = time.time() - start_time
mem_usage, run_out = memory_usage(
(run_script, (tutorial_path, timeout_minutes), {"env": env}),
retval=True,
include_children=True,
)
total_time = time.monotonic() - start_time
print(
"Done executing tutorial {}. Took {:.2f} seconds.".format(
tid, total_time
)
f"Finished executing tutorial {tid} in {total_time:.2f} seconds. "
f"Starting memory usage was {mem_usage[0]} MB & "
f"the peak memory usage was {max(mem_usage)} MB."
)
except Exception as exc:
except subprocess.TimeoutExpired:
has_errors = True
print("Couldn't execute tutorial {}!".format(tid))
print(exc)
total_time = None
else:
total_time = None

print(
f"Tutorial {tid} exceeded the maximum runtime of "
f"{timeout_minutes} minutes."
)
try:
run_out.check_returncode()
except subprocess.CalledProcessError:
has_errors = True
print(
f"Encountered error running tutorial {tid}: \n"
f"stdout: \n {run_out.stdout} \n"
f"stderr: \n {run_out.stderr} \n"
)
# convert notebook to HTML
exporter = HTMLExporter(template_name="classic")
html, _ = exporter.from_notebook_node(nb)
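The new error handling above assumes that `run_script` executes the converted tutorial in a subprocess: it catches `subprocess.TimeoutExpired` and inspects `run_out.check_returncode()`, `.stdout`, and `.stderr`, which are `subprocess.CompletedProcess` members. The real helper is not part of this diff; a hypothetical sketch of what it might look like under that assumption:

```python
import os
import subprocess
from pathlib import Path
from typing import Dict, Optional


def run_script(
    tutorial: Path, timeout_minutes: int, env: Optional[Dict[str, str]] = None
) -> subprocess.CompletedProcess:
    # Hypothetical stand-in for the helper used by make_tutorials.py.
    # Merge any extra variables (e.g. SMOKE_TEST) into the parent environment
    # rather than replacing it, so PATH and friends survive.
    full_env = {**os.environ, **(env or {})}
    return subprocess.run(
        ["python", str(tutorial)],
        capture_output=True,
        text=True,
        timeout=timeout_minutes * 60,  # raises subprocess.TimeoutExpired
        env=full_env,
    )
```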
1 change: 1 addition & 0 deletions setup.py
@@ -73,6 +73,7 @@
"pyro-ppl", # Required for to call run_inference.
"pytorch-lightning", # For the early stopping tutorial.
"papermill", # For executing the tutorials.
"memory_profiler", # For measuring memory usage of the tutorials.
]


21 changes: 12 additions & 9 deletions tutorials/raytune_pytorch_cnn.ipynb
@@ -27,7 +27,7 @@
"import logging\n",
"\n",
"from ray import tune\n",
"from ray.tune import report\n",
"from ray.train import report\n",
"from ray.tune.search.ax import AxSearch\n",
"\n",
"logger = logging.getLogger(tune.__name__)\n",
@@ -174,12 +174,14 @@
" device=device,\n",
" )\n",
" report(\n",
" mean_accuracy=evaluate(\n",
" net=net,\n",
" data_loader=valid_loader,\n",
" dtype=torch.float,\n",
" device=device,\n",
" )\n",
" {\n",
" \"mean_accuracy\": evaluate(\n",
" net=net,\n",
" data_loader=valid_loader,\n",
" dtype=torch.float,\n",
" device=device,\n",
" )\n",
" }\n",
" )"
]
},
@@ -191,7 +193,8 @@
},
"source": [
"## 4. Run optimization\n",
"Execute the Ax optimization and trial evaluation in RayTune using [AxSearch algorithm](https://ray.readthedocs.io/en/latest/tune-searchalg.html#ax-search):"
"Execute the Ax optimization and trial evaluation in RayTune using [AxSearch algorithm](https://ray.readthedocs.io/en/latest/tune-searchalg.html#ax-search). \n",
"We only run 10 trials for demonstration. It is generally recommended to run more trials for best results."
]
},
{
@@ -211,7 +214,7 @@
"algo = tune.search.ConcurrencyLimiter(algo, max_concurrent=3)\n",
"tune.run(\n",
" train_evaluate,\n",
" num_samples=30,\n",
" num_samples=10,\n",
" search_alg=algo,\n",
" verbose=0, # Set this level to 1 to see status updates and to 2 to also see trial results.\n",
" # To use GPU, specify: resources_per_trial={\"gpu\": 1}.\n",
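The notebook changes above track Ray's newer reporting API: `report` now lives in `ray.train` and takes a metrics dict instead of keyword arguments. A small sketch of that pattern with a toy objective (`trainable` and its `x` parameter are illustrative, not part of the tutorial):

```python
from ray import train, tune


def trainable(config):
    # Toy objective standing in for the CNN train/evaluate loop.
    score = 1.0 - (config["x"] - 0.5) ** 2
    # Newer Ray versions expect a metrics dict rather than keyword arguments.
    train.report({"mean_accuracy": score})


tune.run(
    trainable,
    config={"x": tune.uniform(0.0, 1.0)},
    num_samples=5,
    verbose=0,
)
```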