Point In Time Recovery #551
base: main
Changes from all commits
405d5a4
ae58701
2ce63f7
85ddea2
7872d7d
f57a83e
c730506
lib/charms/mysql/v0/backups.py

@@ -46,12 +46,18 @@ def is_unit_blocked(self) -> bool:
"""

import datetime
import io
import logging
import pathlib
import typing
from typing import Dict, List, Optional, Tuple

from charms.data_platform_libs.v0.s3 import S3Requirer
import yaml
from charms.data_platform_libs.v0.s3 import (
    CredentialsChangedEvent,
    CredentialsGoneEvent,
    S3Requirer,
)
from charms.mysql.v0.mysql import (
    MySQLConfigureInstanceError,
    MySQLCreateClusterError,
@@ -67,6 +73,7 @@ def is_unit_blocked(self) -> bool:
    MySQLPrepareBackupForRestoreError,
    MySQLRescanClusterError,
    MySQLRestoreBackupError,
    MySQLRestorePitrError,
    MySQLRetrieveBackupWithXBCloudError,
    MySQLServiceNotRunningError,
    MySQLSetInstanceOfflineModeError,
@@ -76,6 +83,7 @@ def is_unit_blocked(self) -> bool:
    MySQLUnableToGetMemberStateError,
)
from charms.mysql.v0.s3_helpers import (
    ensure_s3_compatible_group_replication_id,
    fetch_and_check_existence_of_s3_path,
    list_backups_in_s3_path,
    upload_content_to_s3,
@@ -85,7 +93,12 @@ def is_unit_blocked(self) -> bool:
from ops.jujuversion import JujuVersion
from ops.model import BlockedStatus, MaintenanceStatus

from constants import MYSQL_DATA_DIR
from constants import (
    MYSQL_BINLOGS_COLLECTOR_CONFIG_FILE,
    MYSQL_DATA_DIR,
    SERVER_CONFIG_PASSWORD_KEY,
    SERVER_CONFIG_USERNAME,
)

logger = logging.getLogger(__name__)
@@ -102,6 +115,10 @@ def is_unit_blocked(self) -> bool:
# to 0 if you are raising the major API version
LIBPATCH = 12

ANOTHER_S3_CLUSTER_REPOSITORY_ERROR_MESSAGE = "S3 repository claimed by another cluster"
MOVE_RESTORED_CLUSTER_TO_ANOTHER_S3_REPOSITORY_ERROR = (
    "Move restored cluster to another S3 repository"
)

if typing.TYPE_CHECKING:
    from charm import MySQLOperatorCharm
@@ -119,6 +136,13 @@ def __init__(self, charm: "MySQLOperatorCharm", s3_integrator: S3Requirer) -> None:
        self.framework.observe(self.charm.on.create_backup_action, self._on_create_backup)
        self.framework.observe(self.charm.on.list_backups_action, self._on_list_backups)
        self.framework.observe(self.charm.on.restore_action, self._on_restore)
        self.framework.observe(
            self.s3_integrator.on.credentials_changed, self._on_s3_credentials_changed
        )
        self.framework.observe(self.charm.on.leader_elected, self._on_s3_credentials_changed)
        self.framework.observe(
            self.s3_integrator.on.credentials_gone, self._on_s3_credentials_gone
        )

    # ------------------ Helpers ------------------
    @property
@@ -235,18 +259,33 @@ def _on_list_backups(self, event: ActionEvent) -> None:

    # ------------------ Create Backup ------------------

    def _on_create_backup(self, event: ActionEvent) -> None:
        """Handle the create backup action."""
        logger.info("A backup has been requested on unit")
    def _pre_create_backup_checks(self, event: ActionEvent) -> bool:
        """Run some checks before creating the backup.

        Returns: a boolean indicating whether operation should be run.
        """
        if not self._s3_integrator_relation_exists:
            logger.error("Backup failed: missing relation with S3 integrator charm")
            event.fail("Missing relation with S3 integrator charm")
            return
            return False

        if "s3-block-message" in self.charm.app_peer_data:
            logger.error("Backup failed: S3 relation is blocked for write")
            event.fail("S3 relation is blocked for write")
            return False

        if not self.charm._mysql.is_mysqld_running():
            logger.error(f"Backup failed: process mysqld is not running on {self.charm.unit.name}")
            event.fail("Process mysqld not running")
            return False

        return True

    def _on_create_backup(self, event: ActionEvent) -> None:
        """Handle the create backup action."""
        logger.info("A backup has been requested on unit")

        if not self._pre_create_backup_checks(event):
            return

        datetime_backup_requested = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
@@ -479,7 +518,7 @@ def _pre_restore_checks(self, event: ActionEvent) -> bool:

        return True

    def _on_restore(self, event: ActionEvent) -> None:
    def _on_restore(self, event: ActionEvent) -> None:  # noqa: C901
        """Handle the restore backup action event.

        Restore a backup from S3 (parameters for which can retrieved from the
@@ -489,7 +528,12 @@ def _on_restore(self, event: ActionEvent) -> None:
            return

        backup_id = event.params["backup-id"].strip().strip("/")
        logger.info(f"A restore with backup-id {backup_id} has been requested on unit")
        restore_to_time = event.params.get("restore-to-time")
        logger.info(
            f"A restore with backup-id {backup_id}"
            f"{f' to time point {restore_to_time}' if restore_to_time else ''}"
            f" has been requested on the unit"
        )

        # Retrieve and validate missing S3 parameters
        s3_parameters, missing_parameters = self._retrieve_s3_parameters()
@@ -519,14 +563,41 @@ def _on_restore(self, event: ActionEvent) -> None:
        if not success:
            logger.error(f"Restore failed: {error_message}")
            event.fail(error_message)

            if recoverable:
                self._clean_data_dir_and_start_mysqld()
            else:
                self.charm.app_peer_data.update({
                    "s3-block-message": MOVE_RESTORED_CLUSTER_TO_ANOTHER_S3_REPOSITORY_ERROR,
                    "binlogs-collecting": "",
                })
                if not self.charm._mysql.start_stop_binlogs_collecting():
                    logger.error("Failed to stop binlogs collecting after failed restore")
                self.charm.unit.status = BlockedStatus(error_message)
            return

        self.charm.app_peer_data.update({
            "s3-block-message": MOVE_RESTORED_CLUSTER_TO_ANOTHER_S3_REPOSITORY_ERROR,
            "binlogs-collecting": "",
        })
        if not self.charm._mysql.start_stop_binlogs_collecting():
            logger.error("Failed to stop binlogs collecting prior to restore")

        success, error_message = self._clean_data_dir_and_start_mysqld()
        if not success:
            logger.error(f"Restore failed: {error_message}")
            self.charm.unit.status = BlockedStatus(error_message)
            event.fail(error_message)
            return

        if restore_to_time is not None:
            self.charm.unit.status = MaintenanceStatus("Running point-in-time-recovery operations")
            success, error_message = self._pitr_restore(restore_to_time, s3_parameters)
            if not success:
                logger.error(f"Restore failed: {error_message}")
                event.fail(error_message)
                self.charm.unit.status = BlockedStatus(error_message)
                return

        # Run post-restore operations
suggest moving the logic (from lines 585 to 600) into the method referenced above; this will make it easier to trace the code since all the restore logic would then be in one method
        self.charm.unit.status = MaintenanceStatus("Running post-restore operations")
        success, error_message = self._post_restore()
@@ -636,15 +707,29 @@ def _clean_data_dir_and_start_mysqld(self) -> Tuple[bool, str]:

        return True, ""

    def _pitr_restore(
        self, restore_to_time: str, s3_parameters: Dict[str, str]
    ) -> Tuple[bool, str]:
        try:
            logger.info("Restoring point-in-time-recovery")
            stdout, stderr = self.charm._mysql.restore_pitr(
                host=self.charm.get_unit_address(self.charm.unit),
                mysql_user=SERVER_CONFIG_USERNAME,
                password=self.charm.get_secret("app", SERVER_CONFIG_PASSWORD_KEY),
                s3_parameters=s3_parameters,
                restore_to_time=restore_to_time,
            )
            logger.debug(f"Stdout of mysql-pitr-helper restore command: {stdout}")
            logger.debug(f"Stderr of mysql-pitr-helper restore command: {stderr}")
        except MySQLRestorePitrError:
            return False, f"Failed to restore point-in-time-recovery to the {restore_to_time}"
        return True, ""

    def _post_restore(self) -> Tuple[bool, str]:
        """Run operations required after restoring a backup.

        Returns: tuple of (success, error_message)
        """
        success, error_message = self._clean_data_dir_and_start_mysqld()
        if not success:
            return success, error_message

        try:
            logger.info("Configuring instance to be part of an InnoDB cluster")
            self.charm._mysql.configure_instance(create_cluster_admin=False)
@@ -674,3 +759,108 @@ def _post_restore(self) -> Tuple[bool, str]:
            return False, "Failed to rescan the cluster"

        return True, ""
    def _on_s3_credentials_changed(self, event: CredentialsChangedEvent) -> None:
        if not self.charm.unit.is_leader():
            logger.debug("Early exit on _on_s3_credentials_changed: unit is not a leader")
            return

        if not self._s3_integrator_relation_exists:
            logger.debug(
                "Early exit on _on_s3_credentials_changed: s3 integrator relation does not exist"
            )
            return

        if (
            not self.charm._mysql.is_mysqld_running()
            or not self.charm.unit_initialized
            or not self.charm.upgrade.idle
        ):
            logger.debug(
                "Deferring _on_s3_credentials_changed: mysql cluster is not started yet or upgrade is occurring"
            )
            event.defer()
            return

        logger.info("Retrieving s3 parameters from the s3-integrator relation")
        s3_parameters, missing_parameters = self._retrieve_s3_parameters()
        if missing_parameters:
            logger.error(f"Missing S3 parameters: {missing_parameters}")
            return

        logger.info("Ensuring compatibility with the provided S3 repository")
        if ensure_s3_compatible_group_replication_id(
            self.charm._mysql.get_current_group_replication_id(), s3_parameters
        ):
            self.charm.app_peer_data.update({
                "s3-block-message": "",
                "binlogs-collecting": "true",
            })
        else:
            self.charm.app_peer_data.update({
                "s3-block-message": ANOTHER_S3_CLUSTER_REPOSITORY_ERROR_MESSAGE,
                "binlogs-collecting": "",
            })

        if not self.charm._mysql.start_stop_binlogs_collecting(True):
            logger.error("Failed to restart binlogs collecting after S3 relation update")

    def _on_s3_credentials_gone(self, event: CredentialsGoneEvent) -> None:
        if not self.charm.unit.is_leader():
            logger.debug("Early exit on _on_s3_credentials_gone: unit is not a leader")
            return
        self.charm.app_peer_data.update({
            "s3-block-message": "",
            "binlogs-collecting": "",
        })
        if not self.charm._mysql.start_stop_binlogs_collecting():
            logger.error("Failed to stop binlogs collecting after S3 relation depart")

    def update_binlogs_collector_config(self) -> bool:
        """Update binlogs collector service config file.

        Returns: whether this operation was successful.
        """
        if not self._s3_integrator_relation_exists:
it might be useful to delete the config if the s3 integrator relation is removed, but deferring to @paulomach (a rough sketch of this idea follows after the diff)
            logger.error(
                "Cannot update binlogs collector config: s3 integrator relation does not exist"
            )
            return False

        logger.info("Retrieving s3 parameters from the s3-integrator relation")
        s3_parameters, missing_parameters = self._retrieve_s3_parameters()
        if missing_parameters:
            logger.error(
                f"Cannot update binlogs collector config: Missing S3 parameters: {missing_parameters}"
            )
            return False

        bucket_url = (
            f"{s3_parameters['bucket']}/{s3_parameters['path']}binlogs"
            if s3_parameters["path"][-1] == "/"
            else f"{s3_parameters['bucket']}/{s3_parameters['path']}/binlogs"
        )

        with io.StringIO() as string_io:
            yaml.dump(
                {
                    "endpoint": s3_parameters["endpoint"],
                    "hosts": self.charm._mysql.get_cluster_members(),
                    "user": SERVER_CONFIG_USERNAME,
                    "pass": self.charm.get_secret("app", SERVER_CONFIG_PASSWORD_KEY),
                    "storage_type": "s3",
                    "s3": {
                        "access_key_id": s3_parameters["access-key"],
                        "secret_access_key": s3_parameters["secret-key"],
                        "bucket_url": bucket_url,
                        "default_region": s3_parameters["region"],
                    },
                },
                string_io,
            )
            self.charm._mysql.write_content_to_file(
                path=MYSQL_BINLOGS_COLLECTOR_CONFIG_FILE,
                content=string_io.getvalue(),
            )

        return True
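Regarding the inline note above about dropping the config when the s3-integrator relation is removed, a rough sketch of what that could look like (not part of this PR; it reuses only helpers already shown in the diff and simply blanks the file, since no dedicated delete helper appears here):

    def _on_s3_credentials_gone(self, event: CredentialsGoneEvent) -> None:
        if not self.charm.unit.is_leader():
            return
        self.charm.app_peer_data.update({
            "s3-block-message": "",
            "binlogs-collecting": "",
        })
        if not self.charm._mysql.start_stop_binlogs_collecting():
            logger.error("Failed to stop binlogs collecting after S3 relation depart")
        # Illustrative cleanup only: overwrite the collector config with empty
        # content through the existing write helper.
        self.charm._mysql.write_content_to_file(
            path=MYSQL_BINLOGS_COLLECTOR_CONFIG_FILE,
            content="",
        )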
question: if the restore fails and is not recoverable, does this instruct the user to change the s3 repository? If so, is that the safest and most relevant action for the user to take?
I think it's the most convenient way to report a restore error with an event fail and a blocked unit status simultaneously, while keeping s3-block-message as the application status.

It's important to mention that while the unit is blocked, the application status can't transition to the s3-block-message value, because MySQLOperatorCharm._on_update_status -> if not self.cluster_initialized -> if self._mysql.cluster_metadata_exists(self.get_unit_address(unit)) will fail.
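For illustration, the chain being described would look roughly like this (a paraphrase assembled from the names in this comment, not a copy of the charm source):

    def _on_update_status(self, _event) -> None:
        if not self.cluster_initialized:
            # On a restored-but-blocked unit this metadata lookup is the step
            # that fails, so the handler bails out before the application
            # status could be rewritten to the s3-block-message value.
            if self._mysql.cluster_metadata_exists(self.get_unit_address(self.unit)):
                ...  # mark the cluster as initialized
            return
        # Only an initialized cluster gets far enough for the leader to surface
        # app_peer_data["s3-block-message"] as the application status.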
Summarizing, if this particular check fails, the application status stays unchanged while the unit is blocked.

Also, right below your code reference, there is the following code block:

mysql-operator/lib/charms/mysql/v0/backups.py
Lines 578 to 581 in c730506

^ it implies the same logic I am talking about if any of the later backup steps fail, such as _clean_data_dir_and_start_mysqld, _pitr_restore or _post_restore.
I don't think I understand. To clarify, my question was referring to lines 569-571: if the restore fails and is not recoverable, what should the user do? What happens if the user removes that unit?
Yes, I'm also referring to lines 569-571. I'm just pointing to lines 578-581 to note that the same logic (as when _restore fails and is not recoverable) will be applied to the later steps like _pitr_restore and _post_restore, assuming that these operations are not recoverable.

When it's not recoverable, the user will receive a failed event notification and face a single blocked unit. You can see this in the screenshot attached to the previous message.

The best course for the user in this situation will be to check the debug-log and resolve the issue (for example, the user can re-create the cluster; see the documentation section "Migrate a cluster").

Maybe it would be a great idea to put some notice about possible restore failures directly in the documentation.

Status combinations from the attached screenshots:
- unit = active / idle, application = active
- unit = blocked / idle, application = active
- unit = active / idle, application = S3 repository claimed by another cluster
- unit = active / idle, application = Move restored cluster to another S3 repository