diff --git a/src/aap_eda/services/activation/engine/common.py b/src/aap_eda/services/activation/engine/common.py index 70522f3f7..d624d3c88 100644 --- a/src/aap_eda/services/activation/engine/common.py +++ b/src/aap_eda/services/activation/engine/common.py @@ -129,7 +129,7 @@ def stop(self, container_id: str, logger: LogHandler) -> None: raise exceptions.ContainerStopError(e) from e @abstractmethod - def update_logs(self, container_id: str, logger: LogHandler) -> None: + def update_logs(self, container_id: str, log_handler: LogHandler) -> None: try: # Implementation ... diff --git a/src/aap_eda/services/activation/engine/kubernetes.py b/src/aap_eda/services/activation/engine/kubernetes.py index a0ecc23a1..32d441a03 100644 --- a/src/aap_eda/services/activation/engine/kubernetes.py +++ b/src/aap_eda/services/activation/engine/kubernetes.py @@ -138,9 +138,9 @@ def _cleanup(self, job_name: str, log_handler: LogHandler): self._delete_services() self._delete_job() - def update_logs(self, job_name: str, log_handler: LogHandler) -> None: + def update_logs(self, container_id: str, log_handler: LogHandler) -> None: try: - pod = self._get_job_pod(job_name) + pod = self._get_job_pod(container_id) container_status = pod.status.container_statuses[0] if ( container_status.state.running @@ -172,13 +172,13 @@ def update_logs(self, job_name: str, log_handler: LogHandler) -> None: log_handler.flush() log_handler.set_log_read_at(dt) else: - LOGGER.warning(f"Pod with label {job_name} not found.") + LOGGER.warning(f"Pod with label {container_id} not found.") log_handler.write( - f"Pod with label {job_name} not found.", True + f"Pod with label {container_id} not found.", True ) except ApiException as e: LOGGER.exception( - "Failed to fetch pod logs: " f"{job_name}; error: {str(e)}" + "Failed to fetch pod logs: " f"{container_id}; error: {str(e)}" ) raise diff --git a/src/aap_eda/services/activation/exceptions.py b/src/aap_eda/services/activation/exceptions.py index c7588330c..e423e3830 100644 --- a/src/aap_eda/services/activation/exceptions.py +++ b/src/aap_eda/services/activation/exceptions.py @@ -25,6 +25,10 @@ class ActivationStopError(Exception): pass +class ActivationMonitorError(Exception): + pass + + class ActivationInstanceNotFound(ActivationException): pass diff --git a/src/aap_eda/services/activation/manager.py b/src/aap_eda/services/activation/manager.py index 76714671b..fe384face 100644 --- a/src/aap_eda/services/activation/manager.py +++ b/src/aap_eda/services/activation/manager.py @@ -354,32 +354,67 @@ def restart(self): self.start() def monitor(self): + # TODO: we should check if the db_instance is good + LOGGER.info(f"Monitoring activation id: {self.db_instance.id}") try: - self._set_activation_instance() - status = self.container_engine.get_status( + self._check_latest_instance() + except ( + exceptions.ActivationInstanceNotFound, + exceptions.ActivationInstancePodIdNotFound, + ) as e: + LOGGER.error(f"Monitor operation Failed: {e}") + raise exceptions.ActivationMonitorError(f"{e}") + + log_handler = DBLogger(self.latest_instance.id) + # TODO: long try block, we should be more specific + try: + container_status = self.container_engine.get_status( self.latest_instance.activation_pod_id ) - LOGGER.info(f"Current status is {status}") - if status in [ActivationStatus.COMPLETED, ActivationStatus.FAILED]: - self.update_logs() - log_handler = DBLogger(self.latest_instance.id) - self.container_engine.cleanup( - self.latest_instance.activation_pod_id, log_handler + LOGGER.info( + f"Current status of instance {self.latest_instance.id} " + f"is {container_status}", + ) + # TODO: implement restart policy logic + if container_status in [ + ActivationStatus.COMPLETED, + ActivationStatus.FAILED, + ]: + # TODO: it should be the cleanup method + # stop is implicit in the cleanup method + # stop is not clear that it performs a cleanup + # but there is not any stop without cleanup + self.container_engine.stop( + self.latest_instance.activation_pod_id, + log_handler, + ) + self._set_activation_status(container_status) + self._set_activation_instance_status(container_status) + self._set_activation_pod_id(pod_id=None) + elif container_status == ActivationStatus.RUNNING: + LOGGER.info( + "Updating logs of activation instance " + f"{self.latest_instance.id}", + ) + # TODO: catch exceptions + self.container_engine.update_logs( + self.latest_instance.activation_pod_id, + log_handler, ) - self._set_status(status, None) - elif status == ActivationStatus.RUNNING: - LOGGER.info("Updating logs") - self.update_logs() - except exceptions.ActivationException as e: + except engine_exceptions.ContainerEngineError as e: + # TODO: ensure we handle all the exceptions + # and we set the status correctly self._set_status(ActivationStatus.FAILED, None, "f{e}") - LOGGER.error(f"Monitor Failed {e}") + LOGGER.error(f"Monitor operation Failed {e}") def update_logs(self): - # TODO: Get the Activation Instance from Activation - self._set_activation_instance() + """Update the logs of the latest instance of the activation.""" log_handler = DBLogger(self.latest_instance.id) + # TODO: check latest instance + # TODO: catch exceptions from the engine self.container_engine.update_logs( - self.latest_instance.activation_pod_id, log_handler + container_id=self.latest_instance.activation_pod_id, + log_handler=log_handler, ) def _create_activation_instance(self):