diff --git a/aiidalab_launch/__main__.py b/aiidalab_launch/__main__.py index 8ec4e0b..ec21d0e 100644 --- a/aiidalab_launch/__main__.py +++ b/aiidalab_launch/__main__.py @@ -353,8 +353,11 @@ async def _async_start( elif status is InstanceStatus.UP and restart: with spinner("Restarting container..."): if configuration_changed: + print("stopping...") instance.stop() + print("recreating...") instance.recreate() + print("starting...") instance.start() else: instance.restart() @@ -394,6 +397,7 @@ async def _async_start( echo_logs = asyncio.create_task(instance.echo_logs()) await asyncio.wait_for(instance.wait_for_services(), timeout=wait) echo_logs.cancel() + LOGGER.debug("AiiDAlab instance ready.") except asyncio.TimeoutError: raise click.ClickException( f"AiiDAlab instance did not start up within the provided wait period ({wait})." @@ -404,14 +408,19 @@ async def _async_start( "the container output logs by increasing the output " "verbosity with 'aiidalab-launch -vvv start'." ) + LOGGER.debug("Preparing startup message.") msg_startup = ( MSG_STARTUP_SSH if (show_ssh_help or not webbrowser_available()) else MSG_STARTUP ) + print("generate url...") url = instance.url() + print("grabbing host ports...") host_ports = instance.host_ports() + print("asserting host ports...") assert len(host_ports) > 0 + print("presenting startup message...") click.secho( msg_startup.format( url=url, diff --git a/aiidalab_launch/instance.py b/aiidalab_launch/instance.py index 05ee57b..7712613 100644 --- a/aiidalab_launch/instance.py +++ b/aiidalab_launch/instance.py @@ -3,13 +3,13 @@ from __future__ import annotations import asyncio -import logging from contextlib import contextmanager from dataclasses import dataclass from enum import Enum, auto from pathlib import Path, PurePosixPath from secrets import token_hex from shutil import rmtree +from time import time from typing import Any, AsyncGenerator, Generator import docker @@ -92,9 +92,11 @@ def container(self) -> Container | None: self._container = self._get_container() return self._container - def _requires_container(self) -> None: - if self.container is None: + def _requires_container(self) -> Container: + container = self.container + if container is None: raise RequiresContainerInstance + return container def _conda_mount(self) -> docker.types.Mount: return docker.types.Mount( @@ -181,22 +183,25 @@ def start(self) -> None: def restart(self) -> None: self._requires_container() assert self.container is not None - self.container.restart() - self._run_post_start() + # self.container.restart() + # self._run_post_start() + self.stop(timeout=20) + self.start() def _run_post_start(self) -> None: assert self.container is not None - logging.debug("Run post-start commands.") + LOGGER.debug("Run post-start commands.") - logging.debug("Ensure ~/.conda directory is owned by the system user.") + LOGGER.debug("Ensure ~/.conda directory is owned by the system user.") exit_code, _ = self.container.exec_run( f"chown -R 1000:1000 /home/{self.profile.system_user}/.conda", privileged=True, ) if exit_code != 0: - logging.warn( + LOGGER.warn( "Failed to ensure ~/.conda directory is owned by the system user." ) + LOGGER.debug("The ~/.conda directory is owned by the system user.") def stop(self, timeout: float | None = None) -> None: self._requires_container() @@ -217,7 +222,7 @@ def remove(self, conda: bool = False, data: bool = False) -> None: try: self.client.volumes.get(self.profile.conda_volume_name()).remove() except docker.errors.NotFound: # already removed - logging.debug( + LOGGER.debug( f"Failed to remove conda volume '{self.profile.conda_volume_name()}', likely already removed." ) except Exception as error: # unexpected error @@ -266,54 +271,65 @@ async def echo_logs(self) -> None: async for chunk in logs: LOGGER.debug(f"{self.container.id}: {chunk.decode('utf-8').strip()}") - async def _init_scripts_finished(self) -> None: - assert self.container is not None + @staticmethod + async def _init_scripts_finished(container: Container) -> None: loop = asyncio.get_event_loop() - logging.info("Waiting for init services to finish...") + LOGGER.debug("Waiting for init services to finish...") result = await loop.run_in_executor( - None, self.container.exec_run, "wait-for-services" + None, container.exec_run, "wait-for-services" ) if result.exit_code != 0: raise FailedToWaitForServices( "Failed to check for init processes to complete." ) + LOGGER.debug("Init services finished.") - async def _notebook_service_online(self) -> None: - assert self.container is not None + async def _notebook_service_online(self, container: Container) -> None: loop = asyncio.get_event_loop() - logging.info("Waiting for notebook service to become reachable...") + LOGGER.debug("Waiting for notebook service to become reachable...") while True: - result = await loop.run_in_executor( - None, - self.container.exec_run, - "curl --fail-early --fail --silent --max-time 1.0 http://localhost:8888", - ) - if result.exit_code == 0: - return # jupyter is online - elif result.exit_code in (7, 28): - await asyncio.sleep(1) # jupyter not yet reachable - continue - else: - raise FailedToWaitForServices("Failed to reach notebook service.") - - async def _host_port_assigned(self) -> None: - container = self.container - assert container is not None + try: + LOGGER.debug("Curl notebook...") + result = await loop.run_in_executor( + None, + container.exec_run, + "curl --fail-early --fail --silent --max-time 1.0 http://localhost:8888", + ) + if result.exit_code == 0: + LOGGER.debug("Notebook service reachable.") + return # jupyter is online + elif result.exit_code in (7, 28): + await asyncio.sleep(2) # jupyter not yet reachable + continue + else: + raise FailedToWaitForServices("Failed to reach notebook service.") + except docker.errors.APIError: + LOGGER.error("Failed to reach notebook service. Aborting.") + raise FailedToWaitForServices( + "Failed to reach notebook service (unable to reach container." + ) + + @staticmethod + async def _host_port_assigned(container: Container) -> None: + LOGGER.debug("Waiting for host port to be assigned...") while True: container.reload() if any(_get_host_ports(container)): + LOGGER.debug("Host port assigned.") break asyncio.sleep(1) async def wait_for_services(self) -> None: - if self.container is None: - raise RuntimeError("Instance was not created.") - - LOGGER.info(f"Waiting for services to come up ({self.container.id})...") + container = self._requires_container() + LOGGER.info(f"Waiting for services to come up ({container.id})...") + start = time() await asyncio.gather( - self._init_scripts_finished(), - self._notebook_service_online(), - self._host_port_assigned(), + self._init_scripts_finished(container), + self._notebook_service_online(container), + self._host_port_assigned(container), + ) + LOGGER.info( + f"Services came up after {time() - start:.1f} seconds ({container.id})." ) async def status(self, timeout: float | None = 5.0) -> AiidaLabInstanceStatus: diff --git a/aiidalab_launch/util.py b/aiidalab_launch/util.py index 97530ac..ec5e1d8 100644 --- a/aiidalab_launch/util.py +++ b/aiidalab_launch/util.py @@ -1,5 +1,4 @@ import asyncio -import logging import re import webbrowser from contextlib import contextmanager @@ -15,6 +14,8 @@ from packaging.version import Version, parse from requests_cache import CachedSession +from .core import LOGGER + MSG_UNABLE_TO_COMMUNICATE_WITH_CLIENT = ( "Unable to communicate with docker on this host. This error usually indicates " "that Docker is either not installed on this system, that the docker service is " @@ -143,10 +144,10 @@ def get_latest_version(timeout: float = 0.1) -> Optional[Version]: ) return parse(releases[-1]) if releases else None except (requests.exceptions.Timeout, requests.exceptions.ReadTimeout): - logging.debug("Timed out while requesting latest version.") + LOGGER.debug("Timed out while requesting latest version.") return None except OSError as error: - logging.debug(f"Error while requesting latest version: {error}") + LOGGER.debug(f"Error while requesting latest version: {error}") return None diff --git a/tests/conftest.py b/tests/conftest.py index bf32617..9bba7c6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -44,7 +44,7 @@ def monkeypatch_session(): m.undo() -@pytest.fixture(scope="session") +@pytest.fixture(scope="class") def docker_client(): try: yield docker.from_env() diff --git a/tests/test_cli.py b/tests/test_cli.py index 72f488b..f006494 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -7,6 +7,9 @@ This is the test module for the project's command-line interface (CLI) module. """ +import logging +from time import sleep + import docker import pytest from click.testing import CliRunner, Result @@ -163,7 +166,9 @@ def test_remove_running_profile(self): @pytest.mark.slow @pytest.mark.trylast class TestInstanceLifecycle: - def test_start_stop_reset(self, instance, docker_client): + def test_start_stop_reset(self, instance, docker_client, caplog): + caplog.set_level(logging.DEBUG) + def get_volume(volume_name): try: return docker_client.volumes.get(volume_name) @@ -186,9 +191,7 @@ def assert_status_down(): # Start instance. runner: CliRunner = CliRunner() - result: Result = runner.invoke( - cli.cli, ["-vvv", "start", "--no-browser", "--wait=300"] - ) + result: Result = runner.invoke(cli.cli, ["start", "--no-browser", "--wait=300"]) assert result.exit_code == 0 assert_status_up() @@ -196,17 +199,23 @@ def assert_status_down(): assert get_volume(instance.profile.conda_volume_name()) # Start instance again – should be noop. - result: Result = runner.invoke( - cli.cli, ["-vvv", "start", "--no-browser", "--wait=300"] - ) + result: Result = runner.invoke(cli.cli, ["start", "--no-browser", "--wait=300"]) assert "Container was already running" in result.output.strip() assert result.exit_code == 0 assert_status_up() # Restart instance. + sleep(5) # Do not try to restart immediately. result: Result = runner.invoke( - cli.cli, ["-vvv", "start", "--no-browser", "--wait=600", "--restart"] + cli.cli, ["start", "--no-browser", "--wait=120", "--restart"] ) + print(result.output) + print(result.exception) + print(result.exc_info) + print("Traceback:") + import traceback + + traceback.print_tb(result.exc_info[2]) assert result.exit_code == 0 assert_status_up()