From 8c7ce2385557de2860a8fbd813d586259a773c8f Mon Sep 17 00:00:00 2001 From: Jacob Callahan Date: Tue, 19 Nov 2024 16:21:37 -0500 Subject: [PATCH] Add initial attempts to automatically clean up dangling hosts Now, when workflows fail, Broker will attempt to find a handgling host and check it in if found. --- broker/config_manager.py | 2 +- broker/config_migrations/example_migration.py | 25 ++++++++ broker/config_migrations/v0_6_3.py | 22 +++++++ broker/providers/ansible_tower.py | 64 +++++++++++++++++-- 4 files changed, 107 insertions(+), 6 deletions(-) create mode 100644 broker/config_migrations/example_migration.py create mode 100644 broker/config_migrations/v0_6_3.py diff --git a/broker/config_manager.py b/broker/config_manager.py index f5c21d7..d58adda 100644 --- a/broker/config_manager.py +++ b/broker/config_manager.py @@ -99,7 +99,7 @@ def _get_migrations(self, force_version=None): migrations = [] for _, name, _ in pkgutil.iter_modules(config_migrations.__path__): module = importlib.import_module(f"broker.config_migrations.{name}") - if hasattr(module, "run_migrations"): + if hasattr(module, "run_migrations") and "example" not in module.__name__: if force_version and force_version == file_name_to_ver(name): migrations.append(module) break diff --git a/broker/config_migrations/example_migration.py b/broker/config_migrations/example_migration.py new file mode 100644 index 0000000..fb82976 --- /dev/null +++ b/broker/config_migrations/example_migration.py @@ -0,0 +1,25 @@ +"""Config migrations for versions older than 0.6.1 to 0.6.1. + +Copy this file to a new file in the same directory and modify it to create a new migration. +The new file must be named `vX_Y_Z.py` where X_Y_Z is the version you are migrating to. + +e.g. cp example_migration.py v0_6_1.py +""" + +from logzero import logger + +TO_VERSION = "0.6.1" + + +def example_migration(config_dict): + """Migrations should modify the config_dict in place and return it.""" + config_dict["example_key"] = "example_value" + return config_dict + + +def run_migrations(config_dict): + """Run all migrations.""" + logger.info(f"Running config migrations for {TO_VERSION}.") + config_dict = example_migration(config_dict) + config_dict["_version"] = TO_VERSION + return config_dict diff --git a/broker/config_migrations/v0_6_3.py b/broker/config_migrations/v0_6_3.py new file mode 100644 index 0000000..997d23e --- /dev/null +++ b/broker/config_migrations/v0_6_3.py @@ -0,0 +1,22 @@ +"""Config migrations for versions older than 0.6.3 to 0.6.3.""" + +from logzero import logger + +TO_VERSION = "0.6.3" + + +def add_dangling_behavior(config_dict): + """Add the dangling_behavior config to AnsibleTower.""" + if "AnsibleTower" in config_dict: + if "dangling_behavior" not in config_dict["AnsibleTower"]: + logger.debug("Adding dangling_behavior to AnsibleTower.") + config_dict["AnsibleTower"]["dangling_behavior"] = "checkin" + return config_dict + + +def run_migrations(config_dict): + """Run all migrations.""" + logger.info(f"Running config migrations for {TO_VERSION}.") + config_dict = add_dangling_behavior(config_dict) + config_dict["_version"] = TO_VERSION + return config_dict diff --git a/broker/providers/ansible_tower.py b/broker/providers/ansible_tower.py index 6684a2e..11f5311 100644 --- a/broker/providers/ansible_tower.py +++ b/broker/providers/ansible_tower.py @@ -8,9 +8,10 @@ import click from dynaconf import Validator from logzero import logger +from rich.prompt import Prompt from broker import exceptions -from broker.helpers import eval_filter, find_origin +from broker.helpers import eval_filter, find_origin, update_inventory from broker.settings import settings try: @@ -23,7 +24,7 @@ def convert_pseudonamespaces(attr_dict): - """Recursively convert PsuedoNamespace objects into dictionaries.""" + """Recursively convert PseudoNamespace objects into dictionaries.""" out_dict = {} for key, value in attr_dict.items(): if isinstance(value, awxkit.utils.PseudoNamespace): @@ -121,6 +122,7 @@ class AnsibleTower(Provider): | Validator("ANSIBLETOWER.token", must_exist=True) ), Validator("ANSIBLETOWER.inventory", default=None), + Validator("ANSIBLETOWER.dangling_behavior", default="checkin"), ] _checkout_options = [ @@ -164,11 +166,12 @@ def __init__(self, **kwargs): self.uname = settings.ANSIBLETOWER.get("username") self.pword = settings.ANSIBLETOWER.get("password") self.token = settings.ANSIBLETOWER.get("token") + self.dangling_behavior = settings.ANSIBLETOWER.get("dangling_behavior") self._inventory = kwargs.get("tower_inventory") or settings.ANSIBLETOWER.inventory # Init the class itself config = kwargs.get("config") root = kwargs.get("root") - self._v2, self.username = get_awxkit_and_uname( + self._v2, self.uname = get_awxkit_and_uname( config=config, root=root, url=self.url, @@ -374,6 +377,54 @@ def _get_failure_messages(self, workflow): else: return failure_messages + def _try_get_dangling_hosts(self, failed_workflow): + """Get one or more hosts that may have been left behind by a failed workflow.""" + hosts = [] + for node in failed_workflow.get_related("workflow_nodes").results: + if not (job_fields := node.summary_fields.get("job", {})) or job_fields.get( + "failed" + ): # skip jobs with no summary fields and failed jobs + continue + if jobs := self._v2.jobs.get(id=job_fields["id"]).results: + if vm_name := jobs[0].artifacts.get("vm_name"): + hosts.append(vm_name) + return list(set(hosts)) + + def handle_dangling_hosts(self, job): + """Attempt to check in dangling hosts associated with the given job.""" + dangling_hosts = self._try_get_dangling_hosts(job) + if not dangling_hosts: + logger.debug("No dangling hosts found for the failed job.") + return + dangling_behavior = self.dangling_behavior + for dangling_host in dangling_hosts: + logger.info(f"Found dangling host: {dangling_host}") + if dangling_behavior == "prompt": + choice = Prompt.ask( + "What would you like to do with this host? [c/s/cA/sA]\n", + "Checkin (c), Store (s), Checkin All (cA), Store All (sA)", + choices=["c", "s", "cA", "sA"], + ) + if choice == "cA": + dangling_behavior = "checkin" + elif choice == "sA": + dangling_behavior = "store" + else: + choice = None + # handle checkins + if choice == "c" or dangling_behavior == "checkin": + try: + self.release(dangling_host) + logger.debug(f"Successfully checked in dangling host: {dangling_host}") + except exceptions.BrokerError: + logger.warning(f"Failed to check in dangling host: {dangling_host}") + elif choice == "s" or dangling_behavior == "store": + logger.debug(f"Storing dangling host: {dangling_host}") + host = self._v2.hosts.get(name=dangling_host).results[0] + host = self._compile_host_info(host) + host["failed"] = True + update_inventory(add=host) + def _compile_host_info(self, host): try: host_facts = host.related.ansible_facts.get() @@ -601,12 +652,15 @@ def execute(self, **kwargs): # noqa: PLR0912,PLR0915 - Possible TODO refactor logger.info(f"Waiting for job: \nAPI: {job_api_url}\nUI: {job_ui_url}") job.wait_until_completed(timeout=settings.ANSIBLETOWER.workflow_timeout) if job.status != "successful": + failure_message = self._get_failure_messages(job) message_data = { f"{subject.capitalize()} Status": job.status, - "Reason(s)": self._get_failure_messages(job), + "Reason(s)": failure_message, "URL": job_ui_url, } helpers.emit(message_data) + if "was automatically checked-in" not in failure_message: + self.handle_dangling_hosts(job) raise JobExecutionError(message_data=message_data["Reason(s)"]) if strategy := kwargs.pop("artifacts", None): return self._merge_artifacts(job, strategy=strategy) @@ -614,7 +668,7 @@ def execute(self, **kwargs): # noqa: PLR0912,PLR0915 - Possible TODO refactor def get_inventory(self, user=None): """Compile a list of hosts based on any inventory a user's name is mentioned.""" - user = user or self.username + user = user or self.uname invs = [ inv for inv in self._v2.inventory.get(page_size=200).results