Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(WIP) Tests: Make flaky instance life cycle test more robust. #97

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
a552ed2
(WIP) Tests: Make flaky instance life cycle test more robust.
csadorf Feb 15, 2022
3cbb521
fixup! (WIP) Tests: Make flaky instance life cycle test more robust.
csadorf Feb 15, 2022
89368f9
DEBUG: Increase output verbosity.
csadorf Feb 15, 2022
7fc19a9
Capture logs.
csadorf Feb 15, 2022
584a62d
DEBUG: disable container logging
csadorf Feb 15, 2022
5acecf2
Consistently use application logger.
csadorf Feb 16, 2022
46707d1
Increase logging verbosity.
csadorf Feb 16, 2022
5622379
DEBUG: inspect the result error state.
csadorf Feb 16, 2022
8bc27db
Increase logging verbosity.
csadorf Feb 16, 2022
953adda
Revert "DEBUG: disable container logging"
csadorf Feb 16, 2022
20a443f
fixup! DEBUG: inspect the result error state.
csadorf Feb 16, 2022
585d107
Slight refactor of wait_for_services().
csadorf Feb 16, 2022
65d8c0d
DEBUG: Do not curl in thread.
csadorf Feb 16, 2022
ecf739c
Revert "DEBUG: Do not curl in thread."
csadorf Feb 16, 2022
cd35227
DBEUG: Try to mitigate phantom curl error.
csadorf Feb 16, 2022
c284e19
fixup! DBEUG: Try to mitigate phantom curl error.
csadorf Feb 16, 2022
db762c2
fixup! DBEUG: Try to mitigate phantom curl error.
csadorf Feb 16, 2022
d36b98a
fixup! DBEUG: Try to mitigate phantom curl error.
csadorf Feb 16, 2022
2ab5360
fixup! DBEUG: Try to mitigate phantom curl error.
csadorf Feb 16, 2022
a182a36
fixup! DBEUG: Try to mitigate phantom curl error.
csadorf Feb 16, 2022
b0bf20e
fixup! DBEUG: Try to mitigate phantom curl error.
csadorf Feb 16, 2022
cb10901
fixup! DBEUG: Try to mitigate phantom curl error.
csadorf Feb 16, 2022
649e740
Debug: Manually restart instance.
csadorf Feb 16, 2022
edfe69d
DEBUG: use class scope for docker_client fixture.
csadorf Feb 17, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions aiidalab_launch/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,8 +353,11 @@ async def _async_start(
elif status is InstanceStatus.UP and restart:
with spinner("Restarting container..."):
if configuration_changed:
print("stopping...")
instance.stop()
print("recreating...")
instance.recreate()
print("starting...")
instance.start()
else:
instance.restart()
Expand Down Expand Up @@ -394,6 +397,7 @@ async def _async_start(
echo_logs = asyncio.create_task(instance.echo_logs())
await asyncio.wait_for(instance.wait_for_services(), timeout=wait)
echo_logs.cancel()
LOGGER.debug("AiiDAlab instance ready.")
except asyncio.TimeoutError:
raise click.ClickException(
f"AiiDAlab instance did not start up within the provided wait period ({wait})."
Expand All @@ -404,14 +408,19 @@ async def _async_start(
"the container output logs by increasing the output "
"verbosity with 'aiidalab-launch -vvv start'."
)
LOGGER.debug("Preparing startup message.")
msg_startup = (
MSG_STARTUP_SSH
if (show_ssh_help or not webbrowser_available())
else MSG_STARTUP
)
print("generate url...")
url = instance.url()
print("grabbing host ports...")
host_ports = instance.host_ports()
print("asserting host ports...")
assert len(host_ports) > 0
print("presenting startup message...")
click.secho(
msg_startup.format(
url=url,
Expand Down
94 changes: 55 additions & 39 deletions aiidalab_launch/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@
from __future__ import annotations

import asyncio
import logging
from contextlib import contextmanager
from dataclasses import dataclass
from enum import Enum, auto
from pathlib import Path, PurePosixPath
from secrets import token_hex
from shutil import rmtree
from time import time
from typing import Any, AsyncGenerator, Generator

import docker
Expand Down Expand Up @@ -92,9 +92,11 @@ def container(self) -> Container | None:
self._container = self._get_container()
return self._container

def _requires_container(self) -> None:
if self.container is None:
def _requires_container(self) -> Container:
container = self.container
if container is None:
raise RequiresContainerInstance
return container

def _conda_mount(self) -> docker.types.Mount:
return docker.types.Mount(
Expand Down Expand Up @@ -181,22 +183,25 @@ def start(self) -> None:
def restart(self) -> None:
self._requires_container()
assert self.container is not None
self.container.restart()
self._run_post_start()
# self.container.restart()
# self._run_post_start()
self.stop(timeout=20)
self.start()

def _run_post_start(self) -> None:
assert self.container is not None
logging.debug("Run post-start commands.")
LOGGER.debug("Run post-start commands.")

logging.debug("Ensure ~/.conda directory is owned by the system user.")
LOGGER.debug("Ensure ~/.conda directory is owned by the system user.")
exit_code, _ = self.container.exec_run(
f"chown -R 1000:1000 /home/{self.profile.system_user}/.conda",
privileged=True,
)
if exit_code != 0:
logging.warn(
LOGGER.warn(
"Failed to ensure ~/.conda directory is owned by the system user."
)
LOGGER.debug("The ~/.conda directory is owned by the system user.")

def stop(self, timeout: float | None = None) -> None:
self._requires_container()
Expand All @@ -217,7 +222,7 @@ def remove(self, conda: bool = False, data: bool = False) -> None:
try:
self.client.volumes.get(self.profile.conda_volume_name()).remove()
except docker.errors.NotFound: # already removed
logging.debug(
LOGGER.debug(
f"Failed to remove conda volume '{self.profile.conda_volume_name()}', likely already removed."
)
except Exception as error: # unexpected error
Expand Down Expand Up @@ -266,54 +271,65 @@ async def echo_logs(self) -> None:
async for chunk in logs:
LOGGER.debug(f"{self.container.id}: {chunk.decode('utf-8').strip()}")

async def _init_scripts_finished(self) -> None:
assert self.container is not None
@staticmethod
async def _init_scripts_finished(container: Container) -> None:
loop = asyncio.get_event_loop()
logging.info("Waiting for init services to finish...")
LOGGER.debug("Waiting for init services to finish...")
result = await loop.run_in_executor(
None, self.container.exec_run, "wait-for-services"
None, container.exec_run, "wait-for-services"
)
if result.exit_code != 0:
raise FailedToWaitForServices(
"Failed to check for init processes to complete."
)
LOGGER.debug("Init services finished.")

async def _notebook_service_online(self) -> None:
assert self.container is not None
async def _notebook_service_online(self, container: Container) -> None:
loop = asyncio.get_event_loop()
logging.info("Waiting for notebook service to become reachable...")
LOGGER.debug("Waiting for notebook service to become reachable...")
while True:
result = await loop.run_in_executor(
None,
self.container.exec_run,
"curl --fail-early --fail --silent --max-time 1.0 http://localhost:8888",
)
if result.exit_code == 0:
return # jupyter is online
elif result.exit_code in (7, 28):
await asyncio.sleep(1) # jupyter not yet reachable
continue
else:
raise FailedToWaitForServices("Failed to reach notebook service.")

async def _host_port_assigned(self) -> None:
container = self.container
assert container is not None
try:
LOGGER.debug("Curl notebook...")
result = await loop.run_in_executor(
None,
container.exec_run,
"curl --fail-early --fail --silent --max-time 1.0 http://localhost:8888",
)
if result.exit_code == 0:
LOGGER.debug("Notebook service reachable.")
return # jupyter is online
elif result.exit_code in (7, 28):
await asyncio.sleep(2) # jupyter not yet reachable
continue
else:
raise FailedToWaitForServices("Failed to reach notebook service.")
except docker.errors.APIError:
LOGGER.error("Failed to reach notebook service. Aborting.")
raise FailedToWaitForServices(
"Failed to reach notebook service (unable to reach container."
)

@staticmethod
async def _host_port_assigned(container: Container) -> None:
LOGGER.debug("Waiting for host port to be assigned...")
while True:
container.reload()
if any(_get_host_ports(container)):
LOGGER.debug("Host port assigned.")
break
asyncio.sleep(1)

async def wait_for_services(self) -> None:
if self.container is None:
raise RuntimeError("Instance was not created.")

LOGGER.info(f"Waiting for services to come up ({self.container.id})...")
container = self._requires_container()
LOGGER.info(f"Waiting for services to come up ({container.id})...")
start = time()
await asyncio.gather(
self._init_scripts_finished(),
self._notebook_service_online(),
self._host_port_assigned(),
self._init_scripts_finished(container),
self._notebook_service_online(container),
self._host_port_assigned(container),
)
LOGGER.info(
f"Services came up after {time() - start:.1f} seconds ({container.id})."
)

async def status(self, timeout: float | None = 5.0) -> AiidaLabInstanceStatus:
Expand Down
7 changes: 4 additions & 3 deletions aiidalab_launch/util.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import asyncio
import logging
import re
import webbrowser
from contextlib import contextmanager
Expand All @@ -15,6 +14,8 @@
from packaging.version import Version, parse
from requests_cache import CachedSession

from .core import LOGGER

MSG_UNABLE_TO_COMMUNICATE_WITH_CLIENT = (
"Unable to communicate with docker on this host. This error usually indicates "
"that Docker is either not installed on this system, that the docker service is "
Expand Down Expand Up @@ -143,10 +144,10 @@ def get_latest_version(timeout: float = 0.1) -> Optional[Version]:
)
return parse(releases[-1]) if releases else None
except (requests.exceptions.Timeout, requests.exceptions.ReadTimeout):
logging.debug("Timed out while requesting latest version.")
LOGGER.debug("Timed out while requesting latest version.")
return None
except OSError as error:
logging.debug(f"Error while requesting latest version: {error}")
LOGGER.debug(f"Error while requesting latest version: {error}")
return None


Expand Down
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def monkeypatch_session():
m.undo()


@pytest.fixture(scope="session")
@pytest.fixture(scope="class")
def docker_client():
try:
yield docker.from_env()
Expand Down
25 changes: 17 additions & 8 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
This is the test module for the project's command-line interface (CLI)
module.
"""
import logging
from time import sleep

import docker
import pytest
from click.testing import CliRunner, Result
Expand Down Expand Up @@ -163,7 +166,9 @@ def test_remove_running_profile(self):
@pytest.mark.slow
@pytest.mark.trylast
class TestInstanceLifecycle:
def test_start_stop_reset(self, instance, docker_client):
def test_start_stop_reset(self, instance, docker_client, caplog):
caplog.set_level(logging.DEBUG)

def get_volume(volume_name):
try:
return docker_client.volumes.get(volume_name)
Expand All @@ -186,27 +191,31 @@ def assert_status_down():

# Start instance.
runner: CliRunner = CliRunner()
result: Result = runner.invoke(
cli.cli, ["-vvv", "start", "--no-browser", "--wait=300"]
)
result: Result = runner.invoke(cli.cli, ["start", "--no-browser", "--wait=300"])
assert result.exit_code == 0

assert_status_up()
assert get_volume(instance.profile.home_mount)
assert get_volume(instance.profile.conda_volume_name())

# Start instance again – should be noop.
result: Result = runner.invoke(
cli.cli, ["-vvv", "start", "--no-browser", "--wait=300"]
)
result: Result = runner.invoke(cli.cli, ["start", "--no-browser", "--wait=300"])
assert "Container was already running" in result.output.strip()
assert result.exit_code == 0
assert_status_up()

# Restart instance.
sleep(5) # Do not try to restart immediately.
result: Result = runner.invoke(
cli.cli, ["-vvv", "start", "--no-browser", "--wait=600", "--restart"]
cli.cli, ["start", "--no-browser", "--wait=120", "--restart"]
)
print(result.output)
print(result.exception)
print(result.exc_info)
print("Traceback:")
import traceback

traceback.print_tb(result.exc_info[2])
assert result.exit_code == 0
assert_status_up()

Expand Down