diff --git a/lib/charms/mongodb/v0/config_server_interface.py b/lib/charms/mongodb/v0/config_server_interface.py
index dadf4199f..cd3cb6034 100644
--- a/lib/charms/mongodb/v0/config_server_interface.py
+++ b/lib/charms/mongodb/v0/config_server_interface.py
@@ -42,7 +42,7 @@
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 12
+LIBPATCH = 6
 
 
 class ClusterProvider(Object):
diff --git a/lib/charms/mongodb/v0/helpers.py b/lib/charms/mongodb/v0/helpers.py
new file mode 100644
index 000000000..38dda403c
--- /dev/null
+++ b/lib/charms/mongodb/v0/helpers.py
@@ -0,0 +1,274 @@
+"""Simple functions, which can be used in both K8s and VM charms."""
+
+# Copyright 2023 Canonical Ltd.
+# See LICENSE file for licensing details.
+import json
+import logging
+import os
+import secrets
+import string
+import subprocess
+from typing import List, Optional, Union
+
+from charms.mongodb.v0.mongodb import MongoDBConfiguration, MongoDBConnection
+from ops.model import (
+    ActiveStatus,
+    BlockedStatus,
+    MaintenanceStatus,
+    StatusBase,
+    WaitingStatus,
+)
+from pymongo.errors import AutoReconnect, ServerSelectionTimeoutError
+
+from config import Config
+
+# The unique Charmhub library identifier, never change it
+LIBID = "b9a7fe0c38d8486a9d1ce94c27d4758e"
+
+# Increment this major API version when introducing breaking changes
+LIBAPI = 0
+
+# Increment this PATCH version before using `charmcraft publish-lib` or reset
+# to 0 if you are raising the major API version
+LIBPATCH = 8
+
+
+# path to store the mongodb keyFile
+KEY_FILE = "keyFile"
+TLS_EXT_PEM_FILE = "external-cert.pem"
+TLS_EXT_CA_FILE = "external-ca.crt"
+TLS_INT_PEM_FILE = "internal-cert.pem"
+TLS_INT_CA_FILE = "internal-ca.crt"
+
+MONGODB_COMMON_DIR = "/var/snap/charmed-mongodb/common"
+MONGODB_SNAP_DATA_DIR = "/var/snap/charmed-mongodb/current"
+
+
+DATA_DIR = "/var/lib/mongodb"
+CONF_DIR = "/etc/mongod"
+LOG_DIR = "/var/lib/mongodb"
+MONGODB_LOG_FILENAME = "mongodb.log"
+logger = logging.getLogger(__name__)
+
+
+# noinspection GrazieInspection
+def get_create_user_cmd(
+    config: MongoDBConfiguration, mongo_path="charmed-mongodb.mongosh"
+) -> List[str]:
+    """Creates the initial admin user for MongoDB.
+
+    The initial admin user can only be created through a localhost connection;
+    see https://www.mongodb.com/docs/manual/core/localhost-exception/
+    Unfortunately, pymongo is not able to create a connection that MongoDB
+    considers local, even when a socket connection is used. As a result, only
+    hackish ways remain to create the initial user.
+    It is needed to install mongodb-clients inside the charm container to make
+    this function work correctly.
+    """
+    return [
+        mongo_path,
+        "mongodb://localhost/admin",
+        "--quiet",
+        "--eval",
+        "db.createUser({"
+        f" user: '{config.username}',"
+        " pwd: passwordPrompt(),"
+        " roles:["
+        " {'role': 'userAdminAnyDatabase', 'db': 'admin'}, "
+        " {'role': 'readWriteAnyDatabase', 'db': 'admin'}, "
+        " {'role': 'clusterAdmin', 'db': 'admin'}, "
+        " ],"
+        " mechanisms: ['SCRAM-SHA-256'],"
+        " passwordDigestor: 'server',"
+        "})",
+    ]
+
+
+def get_mongod_args(
+    config: MongoDBConfiguration,
+    auth: bool = True,
+    snap_install: bool = False,
+) -> str:
+    """Construct the MongoDB startup command line.
+
+    Returns:
+        A string representing the command used to start MongoDB.
+ """ + full_data_dir = f"{MONGODB_COMMON_DIR}{DATA_DIR}" if snap_install else DATA_DIR + full_conf_dir = f"{MONGODB_SNAP_DATA_DIR}{CONF_DIR}" if snap_install else CONF_DIR + full_log_dir = f"{MONGODB_SNAP_DATA_DIR}{LOG_DIR}" if snap_install else LOG_DIR + # in k8s the default logging options that are used for the vm charm are ignored and logs are + # the output of the container. To enable logging to a file it must be set explicitly + logging_options = "" if snap_install else f"--logpath={full_log_dir}/{MONGODB_LOG_FILENAME}" + cmd = [ + # bind to localhost and external interfaces + "--bind_ip_all", + # part of replicaset + f"--replSet={config.replset}", + # db must be located within the snap common directory since the snap is strictly confined + f"--dbpath={full_data_dir}", + "--auditDestination=file", + f"--auditFormat={Config.AuditLog.FORMAT}", + f"--auditPath={full_log_dir}/{Config.AuditLog.FILE_NAME}", + logging_options, + ] + if auth: + cmd.extend(["--auth"]) + + if auth and not config.tls_internal: + # keyFile cannot be used without auth and cannot be used in tandem with internal TLS + cmd.extend( + [ + "--clusterAuthMode=keyFile", + f"--keyFile={full_conf_dir}/{KEY_FILE}", + ] + ) + + if config.tls_external: + cmd.extend( + [ + f"--tlsCAFile={full_conf_dir}/{TLS_EXT_CA_FILE}", + f"--tlsCertificateKeyFile={full_conf_dir}/{TLS_EXT_PEM_FILE}", + # allow non-TLS connections + "--tlsMode=preferTLS", + ] + ) + + # internal TLS can be enabled only in external is enabled + if config.tls_internal and config.tls_external: + cmd.extend( + [ + "--clusterAuthMode=x509", + "--tlsAllowInvalidCertificates", + f"--tlsClusterCAFile={full_conf_dir}/{TLS_INT_CA_FILE}", + f"--tlsClusterFile={full_conf_dir}/{TLS_INT_PEM_FILE}", + ] + ) + + cmd.append("\n") + return " ".join(cmd) + + +def generate_password() -> str: + """Generate a random password string. + + Returns: + A random password string. + """ + choices = string.ascii_letters + string.digits + return "".join([secrets.choice(choices) for _ in range(32)]) + + +def generate_keyfile() -> str: + """Key file used for authentication between replica set peers. + + Returns: + A maximum allowed random string. + """ + choices = string.ascii_letters + string.digits + return "".join([secrets.choice(choices) for _ in range(1024)]) + + +def build_unit_status(mongodb_config: MongoDBConfiguration, unit_ip: str) -> StatusBase: + """Generates the status of a unit based on its status reported by mongod.""" + try: + with MongoDBConnection(mongodb_config) as mongo: + replset_status = mongo.get_replset_status() + + if unit_ip not in replset_status: + return WaitingStatus("Member being added..") + + replica_status = replset_status[unit_ip] + + if replica_status == "PRIMARY": + return ActiveStatus("Primary") + elif replica_status == "SECONDARY": + return ActiveStatus("") + elif replica_status in ["STARTUP", "STARTUP2", "ROLLBACK", "RECOVERING"]: + return WaitingStatus("Member is syncing...") + elif replica_status == "REMOVED": + return WaitingStatus("Member is removing...") + else: + return BlockedStatus(replica_status) + except ServerSelectionTimeoutError as e: + # ServerSelectionTimeoutError is commonly due to ReplicaSetNoPrimary + logger.debug("Got error: %s, while checking replica set status", str(e)) + return WaitingStatus("Waiting for primary re-election..") + except AutoReconnect as e: + # AutoReconnect is raised when a connection to the database is lost and an attempt to + # auto-reconnect will be made by pymongo. 
+ logger.debug("Got error: %s, while checking replica set status", str(e)) + return WaitingStatus("Waiting to reconnect to unit..") + + +def copy_licenses_to_unit(): + """Copies licenses packaged in the snap to the charm's licenses directory.""" + os.makedirs("src/licenses", exist_ok=True) + subprocess.check_output("cp LICENSE src/licenses/LICENSE-charm", shell=True) + subprocess.check_output( + "cp -r /snap/charmed-mongodb/current/licenses/* src/licenses", shell=True + ) + + +_StrOrBytes = Union[str, bytes] + + +def process_pbm_error_k8s(status_str: str, unit_name: str) -> Optional[str]: + """Processes the pbm error for the k8s charm. + + Unlike the VM charm, the K8s pbm command does not cause an exception when it fails and it is + necessary to process the errors manually + """ + try: + status_str = json.loads(status_str) + for node_info in status_str["cluster"][0]["nodes"]: + if unit_name.replace("/", "-") not in node_info["host"]: + continue + + return process_pbm_error(node_info["errors"][0]) + except KeyError: + # if the keys for parsing errors are not present, proceed as normal + pass + + +def process_pbm_error(error_string: Optional[_StrOrBytes]) -> str: + """Parses pbm error string and returns a user friendly message.""" + message = "couldn't configure s3 backup option" + if not error_string: + return message + if isinstance(error_string, bytes): + error_string = error_string.decode("utf-8") + if "status code: 403" in error_string: # type: ignore + message = "s3 credentials are incorrect." + elif "status code: 404" in error_string: # type: ignore + message = "s3 configurations are incompatible." + elif "status code: 301" in error_string: # type: ignore + message = "s3 configurations are incompatible." + return message + + +def current_pbm_op(pbm_status: str) -> str: + """Parses pbm status for the operation that pbm is running.""" + pbm_status = json.loads(pbm_status) + return pbm_status["running"] if "running" in pbm_status else "" + + +def process_pbm_status(pbm_status: str) -> StatusBase: + """Parses current pbm operation and returns unit status.""" + current_op = current_pbm_op(pbm_status) + # no operations are currently running with pbm + if current_op == {}: + return ActiveStatus("") + + if current_op["type"] == "backup": + backup_id = current_op["name"] + return MaintenanceStatus(f"backup started/running, backup id:'{backup_id}'") + + if current_op["type"] == "restore": + backup_id = current_op["name"] + return MaintenanceStatus(f"restore started/running, backup id:'{backup_id}'") + + if current_op["type"] == "resync": + return WaitingStatus("waiting to sync s3 configurations.") + + return ActiveStatus() diff --git a/lib/charms/mongodb/v0/mongodb_backups.py b/lib/charms/mongodb/v0/mongodb_backups.py new file mode 100644 index 000000000..c7fca807d --- /dev/null +++ b/lib/charms/mongodb/v0/mongodb_backups.py @@ -0,0 +1,677 @@ +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. + +"""In this class, we manage backup configurations and actions. + +Specifically backups are handled with Percona Backup MongoDB (pbm). +A user for PBM is created when MongoDB is first started during the start phase. +This user is named "backup". 
+""" + +import json +import logging +import re +import subprocess +import time +from typing import Dict, List, Optional + +from charms.data_platform_libs.v0.s3 import CredentialsChangedEvent, S3Requirer +from charms.mongodb.v0.helpers import ( + current_pbm_op, + process_pbm_error, + process_pbm_error_k8s, + process_pbm_status, +) +from charms.operator_libs_linux.v2 import snap +from ops.framework import Object +from ops.model import ( + ActiveStatus, + BlockedStatus, + MaintenanceStatus, + StatusBase, + WaitingStatus, +) +from ops.pebble import ExecError +from tenacity import ( + Retrying, + before_log, + retry, + retry_if_exception_type, + stop_after_attempt, + wait_fixed, +) + +# The unique Charmhub library identifier, never change it +LIBID = "18c461132b824ace91af0d7abe85f40e" + +# Increment this major API version when introducing breaking changes +LIBAPI = 0 + +# Increment this PATCH version before using `charmcraft publish-lib` or reset +# to 0 if you are raising the major API version +LIBPATCH = 7 + +logger = logging.getLogger(__name__) + +S3_PBM_OPTION_MAP = { + "region": "storage.s3.region", + "bucket": "storage.s3.bucket", + "path": "storage.s3.prefix", + "access-key": "storage.s3.credentials.access-key-id", + "secret-key": "storage.s3.credentials.secret-access-key", + "endpoint": "storage.s3.endpointUrl", + "storage-class": "storage.s3.storageClass", +} +S3_RELATION = "s3-credentials" +REMAPPING_PATTERN = r"\ABackup doesn't match current cluster topology - it has different replica set names. Extra shards in the backup will cause this, for a simple example. The extra/unknown replica set names found in the backup are: ([^,\s]+)([.] Backup has no data for the config server or sole replicaset)?\Z" +PBM_STATUS_CMD = ["status", "-o", "json"] +MONGODB_SNAP_DATA_DIR = "/var/snap/charmed-mongodb/current" +BACKUP_RESTORE_MAX_ATTEMPTS = 5 +BACKUP_RESTORE_ATTEMPT_COOLDOWN = 15 + + +class ResyncError(Exception): + """Raised when pbm is resyncing configurations and is not ready to be used.""" + + +class SetPBMConfigError(Exception): + """Raised when pbm cannot configure a given option.""" + + +class PBMBusyError(Exception): + """Raised when PBM is busy and cannot run another operation.""" + + +class RestoreError(Exception): + """Raised when backup operation is failed.""" + + +class BackupError(Exception): + """Raised when backup operation is failed.""" + + +def _backup_restore_retry_before_sleep(retry_state) -> None: + logger.error( + f"Attempt {retry_state.attempt_number} failed. {BACKUP_RESTORE_MAX_ATTEMPTS - retry_state.attempt_number} attempts left. Retrying after {BACKUP_RESTORE_ATTEMPT_COOLDOWN} seconds." 
+    )
+
+
+def _backup_retry_stop_condition(retry_state) -> bool:
+    if isinstance(retry_state.outcome.exception(), BackupError):
+        return True
+    return retry_state.attempt_number >= BACKUP_RESTORE_MAX_ATTEMPTS
+
+
+def _restore_retry_stop_condition(retry_state) -> bool:
+    if isinstance(retry_state.outcome.exception(), RestoreError):
+        return True
+    return retry_state.attempt_number >= BACKUP_RESTORE_MAX_ATTEMPTS
+
+
+class MongoDBBackups(Object):
+    """Manages MongoDB backups."""
+
+    def __init__(self, charm, substrate="vm"):
+        """Manager of MongoDB client relations."""
+        super().__init__(charm, "client-relations")
+        self.charm = charm
+        self.substrate = substrate
+
+        # s3 relation handles the config options for s3 backups
+        self.s3_client = S3Requirer(self.charm, S3_RELATION)
+        self.framework.observe(
+            self.s3_client.on.credentials_changed, self._on_s3_credential_changed
+        )
+        self.framework.observe(self.charm.on.create_backup_action, self._on_create_backup_action)
+        self.framework.observe(self.charm.on.list_backups_action, self._on_list_backups_action)
+        self.framework.observe(self.charm.on.restore_action, self._on_restore_action)
+
+    def _on_s3_credential_changed(self, event: CredentialsChangedEvent):
+        """Sets pbm credentials, resyncs if necessary and reports config errors."""
+        # handling PBM configurations requires that MongoDB is running and the pbm snap is
+        # installed.
+        action = "configure-pbm"
+        if not self.charm.db_initialised:
+            self._defer_action_with_info_log(
+                event, action, "Set PBM credentials, MongoDB not ready."
+            )
+            return
+
+        if not self.charm.has_backup_service():
+            self._defer_action_with_info_log(
+                event, action, "Set PBM configurations, pbm-agent service not found."
+            )
+            return
+
+        self._configure_pbm_options(event)
+
+    def _on_create_backup_action(self, event) -> None:
+        action = "backup"
+        if self.model.get_relation(S3_RELATION) is None:
+            self._fail_action_with_error_log(
+                event,
+                action,
+                "Relation with s3-integrator charm missing, cannot create backup.",
+            )
+            return
+
+        # only leader can create backups. This prevents multiple backups from being attempted at
+        # once.
+        if not self.charm.unit.is_leader():
+            self._fail_action_with_error_log(
+                event, action, "The action can be run only on leader unit."
+            )
+            return
+
+        # cannot create backup if pbm is not ready. This could be due to: resyncing, incompatible
+        # options, incorrect credentials, or already creating a backup
+        pbm_status = self._get_pbm_status()
+        self.charm.unit.status = pbm_status
+
+        if isinstance(pbm_status, MaintenanceStatus):
+            self._fail_action_with_error_log(
+                event,
+                action,
+                "Can only create one backup at a time, please wait for current backup to finish.",
+            )
+            return
+
+        if isinstance(pbm_status, WaitingStatus):
+            self._defer_action_with_info_log(
+                event,
+                action,
+                "Sync-ing configurations needs more time, must wait before creating a backup.",
+            )
+            return
+
+        if isinstance(pbm_status, BlockedStatus):
+            self._fail_action_with_error_log(event, action, pbm_status.message)
+            return
+
+        try:
+            backup_id = self._try_to_backup()
+            self.charm.unit.status = MaintenanceStatus(
+                f"backup started/running, backup id:'{backup_id}'"
+            )
+            self._success_action_with_info_log(
+                event, action, {"backup-status": f"backup started. backup id: {backup_id}"}
+            )
+        except (subprocess.CalledProcessError, ExecError, Exception) as e:
+            self._fail_action_with_error_log(event, action, str(e))
+            return
+
+    def _on_list_backups_action(self, event) -> None:
+        action = "list-backups"
+        if self.model.get_relation(S3_RELATION) is None:
+            self._fail_action_with_error_log(
+                event,
+                action,
+                "Relation with s3-integrator charm missing, cannot list backups.",
+            )
+            return
+
+        # cannot list backups if pbm is resyncing, or has incompatible options or incorrect
+        # credentials
+        pbm_status = self._get_pbm_status()
+        self.charm.unit.status = pbm_status
+
+        if isinstance(pbm_status, WaitingStatus):
+            self._defer_action_with_info_log(
+                event,
+                action,
+                "Sync-ing configurations needs more time, must wait before listing backups.",
+            )
+            return
+
+        if isinstance(pbm_status, BlockedStatus):
+            self._fail_action_with_error_log(event, action, pbm_status.message)
+            return
+
+        try:
+            formatted_list = self._generate_backup_list_output()
+            self._success_action_with_info_log(event, action, {"backups": formatted_list})
+        except (subprocess.CalledProcessError, ExecError) as e:
+            self._fail_action_with_error_log(event, action, str(e))
+            return
+
+    def _on_restore_action(self, event) -> None:
+        action = "restore"
+        if self.model.get_relation(S3_RELATION) is None:
+            self._fail_action_with_error_log(
+                event,
+                action,
+                "Relation with s3-integrator charm missing, cannot restore from a backup.",
+            )
+            return
+
+        backup_id = event.params.get("backup-id")
+        if not backup_id:
+            self._fail_action_with_error_log(event, action, "Missing backup-id to restore")
+            return
+
+        # only leader can restore backups. This prevents multiple restores from being attempted at
+        # once.
+        if not self.charm.unit.is_leader():
+            self._fail_action_with_error_log(
+                event, action, "The action can be run only on leader unit."
+            )
+            return
+
+        # cannot restore backup if pbm is not ready. This could be due to: resyncing, incompatible
+        # options, incorrect credentials, creating a backup, or already performing a restore.
+        pbm_status = self._get_pbm_status()
+        self.charm.unit.status = pbm_status
+        if isinstance(pbm_status, MaintenanceStatus):
+            self._fail_action_with_error_log(
+                event, action, "Please wait for current backup/restore to finish."
+            )
+            return
+
+        if isinstance(pbm_status, WaitingStatus):
+            self._defer_action_with_info_log(
+                event,
+                action,
+                "Sync-ing configurations needs more time, must wait before restoring.",
+            )
+            return
+
+        if isinstance(pbm_status, BlockedStatus):
+            self._fail_action_with_error_log(
+                event, action, f"Cannot restore backup {pbm_status.message}."
+            )
+            return
+
+        # sometimes when we are trying to restore, pbm can be resyncing, so we need to retry
+        try:
+            self._try_to_restore(backup_id)
+            self.charm.unit.status = MaintenanceStatus(
+                f"restore started/running, backup id:'{backup_id}'"
+            )
+            self._success_action_with_info_log(
+                event, action, {"restore-status": "restore started"}
+            )
+        except ResyncError:
+            raise
+        except RestoreError as restore_error:
+            self._fail_action_with_error_log(event, action, str(restore_error))
+
+    # BEGIN: helper functions
+
+    def _configure_pbm_options(self, event) -> None:
+        action = "configure-pbm"
+        try:
+            self._set_config_options()
+            self._resync_config_options()
+        except SetPBMConfigError:
+            self.charm.unit.status = BlockedStatus("couldn't configure s3 backup options.")
+            return
+        except snap.SnapError as e:
+            logger.error("An exception occurred when starting pbm agent, error: %s.", str(e))
+            self.charm.unit.status = BlockedStatus("couldn't start pbm")
+            return
+        except ResyncError:
+            self.charm.unit.status = WaitingStatus("waiting to sync s3 configurations.")
+            self._defer_action_with_info_log(
+                event, action, "Sync-ing configurations needs more time."
+            )
+            return
+        except PBMBusyError:
+            self.charm.unit.status = WaitingStatus("waiting to sync s3 configurations.")
+            self._defer_action_with_info_log(
+                event,
+                action,
+                "Cannot update configs while PBM is running, must wait for PBM action to finish.",
+            )
+            return
+        except ExecError as e:
+            self.charm.unit.status = BlockedStatus(process_pbm_error(e.stdout))
+            return
+        except subprocess.CalledProcessError as e:
+            logger.error("Syncing configurations failed: %s", str(e))
+
+        self.charm.unit.status = self._get_pbm_status()
+
+    def _set_config_options(self):
+        """Applies the given configurations with pbm."""
+        # clearing out configuration options before resetting them leads to a quicker resync
+        # process
+        self.charm.clear_pbm_config_file()
+
+        # the pbm tool can only set one configuration at a time.
+        for pbm_key, pbm_value in self._get_pbm_configs().items():
+            try:
+                config_cmd = ["config", "--set", f"{pbm_key}={pbm_value}"]
+                self.charm.run_pbm_command(config_cmd)
+            except (subprocess.CalledProcessError, ExecError):
+                logger.error(
+                    "Failed to configure the PBM snap option: %s",
+                    pbm_key,
+                )
+                raise SetPBMConfigError
+
+    def _get_pbm_configs(self) -> Dict:
+        """Returns a dictionary of desired PBM configurations."""
+        pbm_configs = {"storage.type": "s3"}
+        credentials = self.s3_client.get_s3_connection_info()
+        for s3_option, s3_value in credentials.items():
+            if s3_option not in S3_PBM_OPTION_MAP:
+                continue
+
+            pbm_configs[S3_PBM_OPTION_MAP[s3_option]] = s3_value
+        return pbm_configs
+
+    def _resync_config_options(self):
+        """Attempts to sync pbm config options and sets status in case of failure."""
+        self.charm.start_backup_service()
+
+        # pbm has a flaky resync and it is necessary to wait for no actions to be running before
+        # resync-ing. See: https://jira.percona.com/browse/PBM-1038
+        for attempt in Retrying(
+            stop=stop_after_attempt(20),
+            wait=wait_fixed(5),
+            reraise=True,
+        ):
+            with attempt:
+                pbm_status = self._get_pbm_status()
+                # wait for backup/restore to finish
+                if isinstance(pbm_status, (MaintenanceStatus)):
+                    raise PBMBusyError
+
+                # if a resync is running restart the service
+                if isinstance(pbm_status, (WaitingStatus)):
+                    self.charm.restart_backup_service()
+                    raise PBMBusyError
+
+        # wait for re-sync and update charm status based on pbm syncing status. Need to wait for
+        # 2 seconds for pbm_agent to receive the resync command before verifying.
+        self.charm.run_pbm_command(["config", "--force-resync"])
+        time.sleep(2)
+        self._wait_pbm_status()
+
+    @retry(
+        stop=stop_after_attempt(20),
+        reraise=True,
+        retry=retry_if_exception_type(ResyncError),
+        before=before_log(logger, logging.DEBUG),
+    )
+    def _wait_pbm_status(self) -> None:
+        """Wait for pbm_agent to resolve errors and return the status of pbm.
+
+        The pbm status is set by the pbm_agent daemon which needs time to both resync and resolve
+        errors in configurations. Resync-ing is a longer process and should take around 5 minutes.
+        Configuration errors generally occur when the configurations change and pbm_agent is
+        updating, this is generally quick and should take <15s. If errors are not resolved in 15s
+        it means there is an incorrect configuration which will require user intervention.
+
+        Retrying for resync is handled by decorator, retrying for configuration errors is handled
+        within this function.
+        """
+        # on occasion it takes the pbm_agent daemon time to update its configs, meaning that it
+        # will error for incorrect configurations for <15s before resolving itself.
+
+        for attempt in Retrying(
+            stop=stop_after_attempt(3),
+            wait=wait_fixed(5),
+            reraise=True,
+        ):
+            with attempt:
+                try:
+                    pbm_status = self.charm.run_pbm_command(PBM_STATUS_CMD)
+
+                    if "Resync" in current_pbm_op(pbm_status):
+                        # since this process takes several minutes we should let the user know
+                        # immediately.
+                        self.charm.unit.status = WaitingStatus(
+                            "waiting to sync s3 configurations."
+                        )
+                        raise ResyncError
+                except ExecError as e:
+                    self.charm.unit.status = BlockedStatus(process_pbm_error(e.stdout))
+
+    def _get_pbm_status(self) -> Optional[StatusBase]:
+        """Retrieve pbm status."""
+        if not self.charm.has_backup_service():
+            return WaitingStatus("waiting for pbm to start")
+
+        if not self.model.get_relation(S3_RELATION):
+            logger.info("No configurations for backups, no relation to s3-charm.")
+            return None
+
+        try:
+            previous_pbm_status = self.charm.unit.status
+            pbm_status = self.charm.run_pbm_command(PBM_STATUS_CMD)
+            self._log_backup_restore_result(pbm_status, previous_pbm_status)
+            unit_status_pbm = process_pbm_status(pbm_status)
+
+            # K8s charms require special processing for pbm errors
+            pbm_error = (
+                process_pbm_error_k8s(pbm_status, self.charm.unit.name)
+                if self.substrate == "k8s"
+                else None
+            )
+            if unit_status_pbm == ActiveStatus() and pbm_error:
+                return BlockedStatus(pbm_error)
+
+            return unit_status_pbm
+        except ExecError as e:
+            logger.error(f"Failed to get pbm status. {e}")
+            return BlockedStatus(process_pbm_error(e.stdout))
+        except subprocess.CalledProcessError as e:
+            # VM deployments only - pbm pipes a return code of 1, but its output shows the true
+            # error code so it is necessary to parse the output
+            return BlockedStatus(process_pbm_error(e.output))
+        except Exception as e:
+            # pbm pipes a return code of 1, but its output shows the true error code so it is
+            # necessary to parse the output
+            logger.error(f"Failed to get pbm status: {e}")
+            return BlockedStatus("PBM error")
+
+    def _generate_backup_list_output(self) -> str:
+        """Generates a list of backups in a formatted table.
+
+        List contains successful, failed, and in progress backups in order of ascending time.
+
+        Raises ExecError if pbm command fails.
+ """ + backup_list = [] + pbm_status = self.charm.run_pbm_command(["status", "--out=json"]) + # processes finished and failed backups + pbm_status = json.loads(pbm_status) + backups = pbm_status["backups"]["snapshot"] or [] + for backup in backups: + backup_status = "finished" + if backup["status"] == "error": + # backups from a different cluster have an error status, but they should show as + # finished + if self._backup_from_different_cluster(backup.get("error", "")): + backup_status = "finished" + else: + # display reason for failure if available + backup_status = "failed: " + backup.get("error", "N/A") + if backup["status"] not in ["error", "done"]: + backup_status = "in progress" + backup_list.append((backup["name"], backup["type"], backup_status)) + + # process in progress backups + running_backup = pbm_status["running"] + if running_backup.get("type", None) == "backup": + # backups are sorted in reverse order + last_reported_backup = backup_list[0] + # pbm will occasionally report backups that are currently running as failed, so it is + # necessary to correct the backup list in this case. + if last_reported_backup[0] == running_backup["name"]: + backup_list[0] = (last_reported_backup[0], last_reported_backup[1], "in progress") + else: + backup_list.append((running_backup["name"], "logical", "in progress")) + + # sort by time and return formatted output + return self._format_backup_list(sorted(backup_list, key=lambda pair: pair[0])) + + def _format_backup_list(self, backup_list: List[str]) -> str: + """Formats provided list of backups as a table.""" + backups = ["{:<21s} | {:<12s} | {:s}".format("backup-id", "backup-type", "backup-status")] + + backups.append("-" * len(backups[0])) + for backup_id, backup_type, backup_status in backup_list: + backups.append( + "{:<21s} | {:<12s} | {:s}".format(backup_id, backup_type, backup_status) + ) + + return "\n".join(backups) + + def _backup_from_different_cluster(self, backup_status: str) -> bool: + """Returns if a given backup was made on a different cluster.""" + return re.search(REMAPPING_PATTERN, backup_status) is not None + + def _try_to_restore(self, backup_id: str) -> None: + """Try to restore cluster a backup specified by backup id. + + If PBM is resyncing, the function will retry to create backup + (up to BACKUP_RESTORE_MAX_ATTEMPTS times) with BACKUP_RESTORE_ATTEMPT_COOLDOWN + time between attempts. + + If PMB returen any other error, the function will raise RestoreError. + """ + for attempt in Retrying( + stop=_restore_retry_stop_condition, + wait=wait_fixed(BACKUP_RESTORE_ATTEMPT_COOLDOWN), + reraise=True, + before_sleep=_backup_restore_retry_before_sleep, + ): + with attempt: + try: + remapping_args = self._remap_replicaset(backup_id) + restore_cmd = ["restore", backup_id] + if remapping_args: + restore_cmd = restore_cmd + remapping_args.split(" ") + self.charm.run_pbm_command(restore_cmd) + except (subprocess.CalledProcessError, ExecError) as e: + if isinstance(e, subprocess.CalledProcessError): + error_message = e.output.decode("utf-8") + else: + error_message = str(e.stderr) + if "Resync" in error_message: + raise ResyncError + + fail_message = f"Restore failed: {str(e)}" + if f"backup '{backup_id}' not found" in error_message: + fail_message = f"Restore failed: Backup id '{backup_id}' does not exist in list of backups, please check list-backups for the available backup_ids." + + raise RestoreError(fail_message) + + def _try_to_backup(self): + """Try to create a backup and return the backup id. 
+
+        If PBM is resyncing, the function will retry to create the backup
+        (up to BACKUP_RESTORE_MAX_ATTEMPTS times)
+        with BACKUP_RESTORE_ATTEMPT_COOLDOWN time between attempts.
+
+        If PBM returns any other error, the function will raise BackupError.
+        """
+        for attempt in Retrying(
+            stop=_backup_retry_stop_condition,
+            wait=wait_fixed(BACKUP_RESTORE_ATTEMPT_COOLDOWN),
+            reraise=True,
+            before_sleep=_backup_restore_retry_before_sleep,
+        ):
+            with attempt:
+                try:
+                    output = self.charm.run_pbm_command(["backup"])
+                    backup_id_match = re.search(
+                        r"Starting backup '(?P<backup_id>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)'",
+                        output,
+                    )
+                    return backup_id_match.group("backup_id") if backup_id_match else "N/A"
+                except (subprocess.CalledProcessError, ExecError) as e:
+                    if isinstance(e, subprocess.CalledProcessError):
+                        error_message = e.output.decode("utf-8")
+                    else:
+                        error_message = str(e.stderr)
+                    if "Resync" in error_message:
+                        raise ResyncError
+
+                    fail_message = f"Backup failed: {str(e)}"
+
+                    raise BackupError(fail_message)
+
+    def _remap_replicaset(self, backup_id: str) -> str:
+        """Returns options for remapping a replica set during a cluster migration restore.
+
+        Args:
+            backup_id: str of the backup to check for remapping
+
+        Raises: CalledProcessError
+        """
+        pbm_status = self.charm.run_pbm_command(PBM_STATUS_CMD)
+        pbm_status = json.loads(pbm_status)
+
+        # grab the error status from the backup if present
+        backups = pbm_status["backups"]["snapshot"] or []
+        backup_status = ""
+        for backup in backups:
+            if not backup_id == backup["name"]:
+                continue
+
+            backup_status = backup.get("error", "")
+            break
+
+        if not self._backup_from_different_cluster(backup_status):
+            return ""
+
+        # TODO in the future when we support conf servers and shards this will need to be more
+        # comprehensive.
+        old_cluster_name = re.search(REMAPPING_PATTERN, backup_status).group(1)
+        current_cluster_name = self.charm.app.name
+        logger.debug(
+            "Replica set remapping is necessary for restore, old cluster name: %s ; new cluster name: %s",
+            old_cluster_name,
+            current_cluster_name,
+        )
+        return f"--replset-remapping {current_cluster_name}={old_cluster_name}"
+
+    def _fail_action_with_error_log(self, event, action: str, message: str) -> None:
+        logger.error("%s failed: %s", action.capitalize(), message)
+        event.fail(message)
+
+    def _defer_action_with_info_log(self, event, action: str, message: str) -> None:
+        logger.info("Deferring %s: %s", action, message)
+        event.defer()
+
+    def _success_action_with_info_log(self, event, action: str, results: Dict[str, str]) -> None:
+        logger.info("%s completed successfully", action.capitalize())
+        event.set_results(results)
+
+    def _log_backup_restore_result(self, current_pbm_status, previous_pbm_status) -> None:
+        """Logs the result of the backup/restore operation.
+
+        Expected to be called only for operations that did not fail.
+        """
+        operation_result = self._get_backup_restore_operation_result(
+            current_pbm_status, previous_pbm_status
+        )
+        logger.info(operation_result)
+
+    def _get_backup_restore_operation_result(self, current_pbm_status, previous_pbm_status) -> str:
+        """Returns a string with the result of the backup/restore operation.
+
+        The function call is expected to be only for operations that did not fail.
+        The operation is taken from the previous status of the unit and is expected
+        to contain the operation type (backup/restore) and the backup id.
+ """ + if ( + isinstance(current_pbm_status, type(previous_pbm_status)) + and current_pbm_status.message == previous_pbm_status.message + ): + return f"Operation is still in progress: '{current_pbm_status.message}'" + + if ( + isinstance(previous_pbm_status, MaintenanceStatus) + and "backup id:" in previous_pbm_status.message + ): + backup_id = previous_pbm_status.message.split("backup id:")[-1].strip() + if "restore" in previous_pbm_status.message: + return f"Restore from backup {backup_id} completed successfully" + if "backup" in previous_pbm_status.message: + return f"Backup {backup_id} completed successfully" + + return "Unknown operation result" diff --git a/lib/charms/mongodb/v1/mongodb_backups.py b/lib/charms/mongodb/v1/mongodb_backups.py index 9828d7a6b..3908457f9 100644 --- a/lib/charms/mongodb/v1/mongodb_backups.py +++ b/lib/charms/mongodb/v1/mongodb_backups.py @@ -41,7 +41,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 6 +LIBPATCH = 3 logger = logging.getLogger(__name__) diff --git a/lib/charms/mongodb/v1/shards_interface.py b/lib/charms/mongodb/v1/shards_interface.py index fda0315bd..293c33b9e 100644 --- a/lib/charms/mongodb/v1/shards_interface.py +++ b/lib/charms/mongodb/v1/shards_interface.py @@ -62,7 +62,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 14 +LIBPATCH = 7 KEYFILE_KEY = "key-file" HOSTS_KEY = "host" OPERATOR_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username())