Point In Time Recovery #551

Open · wants to merge 7 commits into base: main
3 changes: 3 additions & 0 deletions actions.yaml
@@ -55,6 +55,9 @@ restore:
backup-id:
type: string
description: A backup-id to identify the backup to restore (format = %Y-%m-%dT%H:%M:%SZ)
restore-to-time:
type: string
description: Point-in-time-recovery target in MySQL timestamp format.

pre-upgrade-check:
description: Run necessary pre-upgrade checks and preparations before executing a charm refresh.
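
Not part of the diff: a minimal Python sketch of the two timestamp formats involved. The backup-id format is taken from the description above, while the restore-to-time value is assumed to be a standard MySQL DATETIME literal (YYYY-MM-DD HH:MM:SS); the exact format the restore helper accepts is not spelled out in this hunk.

import datetime

now = datetime.datetime.now(datetime.timezone.utc)

# backup-id format, as documented above: %Y-%m-%dT%H:%M:%SZ
backup_id = now.strftime("%Y-%m-%dT%H:%M:%SZ")       # e.g. 2024-01-15T10:30:00Z

# restore-to-time: assumed MySQL timestamp literal (verify against the helper's docs)
restore_to_time = now.strftime("%Y-%m-%d %H:%M:%S")  # e.g. 2024-01-15 10:30:00

print(backup_id, restore_to_time)
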
216 changes: 203 additions & 13 deletions lib/charms/mysql/v0/backups.py
@@ -46,12 +46,18 @@ def is_unit_blocked(self) -> bool:
"""

import datetime
import io
import logging
import pathlib
import typing
from typing import Dict, List, Optional, Tuple

from charms.data_platform_libs.v0.s3 import S3Requirer
import yaml
from charms.data_platform_libs.v0.s3 import (
CredentialsChangedEvent,
CredentialsGoneEvent,
S3Requirer,
)
from charms.mysql.v0.mysql import (
MySQLConfigureInstanceError,
MySQLCreateClusterError,
@@ -67,6 +73,7 @@ def is_unit_blocked(self) -> bool:
MySQLPrepareBackupForRestoreError,
MySQLRescanClusterError,
MySQLRestoreBackupError,
MySQLRestorePitrError,
MySQLRetrieveBackupWithXBCloudError,
MySQLServiceNotRunningError,
MySQLSetInstanceOfflineModeError,
@@ -76,6 +83,7 @@ def is_unit_blocked(self) -> bool:
MySQLUnableToGetMemberStateError,
)
from charms.mysql.v0.s3_helpers import (
ensure_s3_compatible_group_replication_id,
fetch_and_check_existence_of_s3_path,
list_backups_in_s3_path,
upload_content_to_s3,
@@ -85,7 +93,12 @@ def is_unit_blocked(self) -> bool:
from ops.jujuversion import JujuVersion
from ops.model import BlockedStatus, MaintenanceStatus

from constants import MYSQL_DATA_DIR
from constants import (
MYSQL_BINLOGS_COLLECTOR_CONFIG_FILE,
MYSQL_DATA_DIR,
SERVER_CONFIG_PASSWORD_KEY,
SERVER_CONFIG_USERNAME,
)

logger = logging.getLogger(__name__)

@@ -102,6 +115,10 @@ def is_unit_blocked(self) -> bool:
# to 0 if you are raising the major API version
LIBPATCH = 12

ANOTHER_S3_CLUSTER_REPOSITORY_ERROR_MESSAGE = "S3 repository claimed by another cluster"
MOVE_RESTORED_CLUSTER_TO_ANOTHER_S3_REPOSITORY_ERROR = (
"Move restored cluster to another S3 repository"
)

if typing.TYPE_CHECKING:
from charm import MySQLOperatorCharm
@@ -119,6 +136,13 @@ def __init__(self, charm: "MySQLOperatorCharm", s3_integrator: S3Requirer) -> No
self.framework.observe(self.charm.on.create_backup_action, self._on_create_backup)
self.framework.observe(self.charm.on.list_backups_action, self._on_list_backups)
self.framework.observe(self.charm.on.restore_action, self._on_restore)
self.framework.observe(
self.s3_integrator.on.credentials_changed, self._on_s3_credentials_changed
)
self.framework.observe(self.charm.on.leader_elected, self._on_s3_credentials_changed)
self.framework.observe(
self.s3_integrator.on.credentials_gone, self._on_s3_credentials_gone
)

# ------------------ Helpers ------------------
@property
@@ -235,18 +259,33 @@ def _on_list_backups(self, event: ActionEvent) -> None:

# ------------------ Create Backup ------------------

def _on_create_backup(self, event: ActionEvent) -> None:
"""Handle the create backup action."""
logger.info("A backup has been requested on unit")
def _pre_create_backup_checks(self, event: ActionEvent) -> bool:
"""Run some checks before creating the backup.

Returns: a boolean indicating whether operation should be run.
"""
if not self._s3_integrator_relation_exists:
logger.error("Backup failed: missing relation with S3 integrator charm")
event.fail("Missing relation with S3 integrator charm")
return
return False

if "s3-block-message" in self.charm.app_peer_data:
logger.error("Backup failed: S3 relation is blocked for write")
event.fail("S3 relation is blocked for write")
return False

if not self.charm._mysql.is_mysqld_running():
logger.error(f"Backup failed: process mysqld is not running on {self.charm.unit.name}")
event.fail("Process mysqld not running")
return False

return True

def _on_create_backup(self, event: ActionEvent) -> None:
"""Handle the create backup action."""
logger.info("A backup has been requested on unit")

if not self._pre_create_backup_checks(event):
return

datetime_backup_requested = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
@@ -479,7 +518,7 @@ def _pre_restore_checks(self, event: ActionEvent) -> bool:

return True

def _on_restore(self, event: ActionEvent) -> None:
def _on_restore(self, event: ActionEvent) -> None: # noqa: C901
"""Handle the restore backup action event.

Restore a backup from S3 (parameters for which can retrieved from the
Expand All @@ -489,7 +528,12 @@ def _on_restore(self, event: ActionEvent) -> None:
return

backup_id = event.params["backup-id"].strip().strip("/")
logger.info(f"A restore with backup-id {backup_id} has been requested on unit")
restore_to_time = event.params.get("restore-to-time")
logger.info(
f"A restore with backup-id {backup_id}"
f"{f' to time point {restore_to_time}' if restore_to_time else ''}"
f" has been requested on the unit"
)

# Retrieve and validate missing S3 parameters
s3_parameters, missing_parameters = self._retrieve_s3_parameters()
@@ -519,14 +563,41 @@ def _on_restore(self, event: ActionEvent) -> None:
if not success:
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)

if recoverable:
self._clean_data_dir_and_start_mysqld()
else:
self.charm.app_peer_data.update({
"s3-block-message": MOVE_RESTORED_CLUSTER_TO_ANOTHER_S3_REPOSITORY_ERROR,
"binlogs-collecting": "",
Comment on lines 563 to +571
Contributor

question: if the restore fails and is not recoverable, does this instruct the user to change the s3 repository?

if so, is that the safest & most relevant action for the user to take?

Author

I think it's the most convenient way to report a restore error with an event fail and a blocked unit status simultaneously, while keeping s3-block-message as the application status.
It's important to mention that while the unit is blocked, the application status can't transition to the s3-block-message, because MySQLOperatorCharm._on_update_status -> if not self.cluster_initialized -> if self._mysql.cluster_metadata_exists(self.get_unit_address(unit)) will fail like:

unit-mysql-0: 01:50:14 WARNING unit.mysql/0.juju-log Failed to check if cluster metadata exists from_instance='10.88.216.67'
unit-mysql-0: 01:50:14 INFO unit.mysql/0.juju-log skip status update when not initialized

To summarize, if this particular check fails, the application status will be the following:
[screenshot: application status after the failed restore]
Also, right below the code you referenced, there is the following code block:

self.charm.app_peer_data.update({
"s3-block-message": MOVE_RESTORED_CLUSTER_TO_ANOTHER_S3_REPOSITORY_ERROR,
"binlogs-collecting": "",
})

^ it applies the same logic I'm talking about if any of the later restore steps fail, such as _clean_data_dir_and_start_mysqld, _pitr_restore or _post_restore.

Contributor

I don't think I understand. To clarify, my question was referring to lines 569-571

while keeping s3-block-message as the application status

if the restore fails and is not recoverable, what should the user do?

while the unit is blocked, the application status can't transition to the s3-block-message

what happens if the user removes that unit?

Author

I don't think I understand. To clarify, my question was referring to lines 569-571

Yes, I'm also referring to lines 569-571. I'm just pointing to lines 578-581 to show that the same logic (as when _restore fails and is not recoverable) will be applied to the later steps like _pitr_restore and _post_restore, assuming that these operations are not recoverable.

if the restore fails and is not recoverable, what should the user do?

When it's not recoverable, the user will receive a failed event notification and see a single blocked unit. You can see this in the screenshot attached to the previous message.

The best course of action for the user in this situation is to:

  • investigate the underlying issue in the debug-log and resolve it (for example, the user can re-create the cluster; see the documentation section "Migrate a cluster")
  • run the same restore again / run another restore

Maybe it would be a good idea to add a note about possible restore failures directly in the documentation.

what happens if the user removes that unit?

  1. Before restore:
    unit = active / idle
    application = active
  2. Restore failed:
    unit = blocked / idle
    application = active
  3. User removes unit:
    application = active
  4. User adds unit:
    unit = active / idle
    application = S3 repository claimed by another cluster
  5. Restore succeeded:
    unit = active / idle
    application = Move restored cluster to another S3 repository

})
if not self.charm._mysql.start_stop_binlogs_collecting():
logger.error("Failed to stop binlogs collecting after failed restore")
self.charm.unit.status = BlockedStatus(error_message)
return

self.charm.app_peer_data.update({
"s3-block-message": MOVE_RESTORED_CLUSTER_TO_ANOTHER_S3_REPOSITORY_ERROR,
"binlogs-collecting": "",
})
if not self.charm._mysql.start_stop_binlogs_collecting():
logger.error("Failed to stop binlogs collecting prior to restore")

success, error_message = self._clean_data_dir_and_start_mysqld()
if not success:
logger.error(f"Restore failed: {error_message}")
self.charm.unit.status = BlockedStatus(error_message)
event.fail(error_message)
return

if restore_to_time is not None:
self.charm.unit.status = MaintenanceStatus("Running point-in-time-recovery operations")
success, error_message = self._pitr_restore(restore_to_time, s3_parameters)
if not success:
logger.error(f"Restore failed: {error_message}")
event.fail(error_message)
self.charm.unit.status = BlockedStatus(error_message)
return

# Run post-restore operations
Contributor

suggest moving logic (from lines 585 to 600) into the _restore() method as that is where all the restore logic is located. the _on_restore action handler is just coordinating the restore

this will make it easier to trace the code since all the restore logic is in one method (_restore())
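
Not part of the diff: a self-contained toy sketch of the control flow that suggestion implies, with each step stubbed out. The helper names come from this PR; the simplified return contract and the omission of recoverable/status handling are assumptions for illustration, not the charm's actual API.

from typing import Optional, Tuple

class RestoreFlowSketch:
    """Toy stand-in for the backups handler; only the coordination logic matters here."""

    def _restore(self, backup_id: str) -> Tuple[bool, bool, str]:        # stub
        return True, True, ""

    def _clean_data_dir_and_start_mysqld(self) -> Tuple[bool, str]:      # stub
        return True, ""

    def _pitr_restore(self, restore_to_time: str) -> Tuple[bool, str]:   # stub
        return True, ""

    def _post_restore(self) -> Tuple[bool, str]:                         # stub
        return True, ""

    def restore(self, backup_id: str, restore_to_time: Optional[str] = None) -> Tuple[bool, str]:
        """All restore steps in one place; the action handler only reports the outcome."""
        success, _recoverable, error = self._restore(backup_id)
        if not success:
            return False, error
        success, error = self._clean_data_dir_and_start_mysqld()
        if not success:
            return False, error
        if restore_to_time is not None:
            success, error = self._pitr_restore(restore_to_time)
            if not success:
                return False, error
        return self._post_restore()
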

self.charm.unit.status = MaintenanceStatus("Running post-restore operations")
success, error_message = self._post_restore()
@@ -636,15 +707,29 @@ def _clean_data_dir_and_start_mysqld(self) -> Tuple[bool, str]:

return True, ""

def _pitr_restore(
self, restore_to_time: str, s3_parameters: Dict[str, str]
) -> Tuple[bool, str]:
try:
logger.info("Restoring point-in-time-recovery")
stdout, stderr = self.charm._mysql.restore_pitr(
host=self.charm.get_unit_address(self.charm.unit),
mysql_user=SERVER_CONFIG_USERNAME,
password=self.charm.get_secret("app", SERVER_CONFIG_PASSWORD_KEY),
s3_parameters=s3_parameters,
restore_to_time=restore_to_time,
)
logger.debug(f"Stdout of mysql-pitr-helper restore command: {stdout}")
logger.debug(f"Stderr of mysql-pitr-helper restore command: {stderr}")
except MySQLRestorePitrError:
return False, f"Failed to restore point-in-time-recovery to the {restore_to_time}"
return True, ""

def _post_restore(self) -> Tuple[bool, str]:
"""Run operations required after restoring a backup.

Returns: tuple of (success, error_message)
"""
success, error_message = self._clean_data_dir_and_start_mysqld()
if not success:
return success, error_message

try:
logger.info("Configuring instance to be part of an InnoDB cluster")
self.charm._mysql.configure_instance(create_cluster_admin=False)
@@ -674,3 +759,108 @@ def _post_restore(self) -> Tuple[bool, str]:
return False, "Failed to rescan the cluster"

return True, ""

def _on_s3_credentials_changed(self, event: CredentialsChangedEvent) -> None:
if not self.charm.unit.is_leader():
logger.debug("Early exit on _on_s3_credentials_changed: unit is not a leader")
return

if not self._s3_integrator_relation_exists:
logger.debug(
"Early exit on _on_s3_credentials_changed: s3 integrator relation does not exist"
)
return

if (
not self.charm._mysql.is_mysqld_running()
or not self.charm.unit_initialized
or not self.charm.upgrade.idle
):
logger.debug(
"Deferring _on_s3_credentials_changed: mysql cluster is not started yet or upgrade is occurring"
)
event.defer()
return

logger.info("Retrieving s3 parameters from the s3-integrator relation")
s3_parameters, missing_parameters = self._retrieve_s3_parameters()
if missing_parameters:
logger.error(f"Missing S3 parameters: {missing_parameters}")
return

logger.info("Ensuring compatibility with the provided S3 repository")
if ensure_s3_compatible_group_replication_id(
self.charm._mysql.get_current_group_replication_id(), s3_parameters
):
self.charm.app_peer_data.update({
"s3-block-message": "",
"binlogs-collecting": "true",
})
else:
self.charm.app_peer_data.update({
"s3-block-message": ANOTHER_S3_CLUSTER_REPOSITORY_ERROR_MESSAGE,
"binlogs-collecting": "",
})

if not self.charm._mysql.start_stop_binlogs_collecting(True):
logger.error("Failed to restart binlogs collecting after S3 relation update")

def _on_s3_credentials_gone(self, event: CredentialsGoneEvent) -> None:
if not self.charm.unit.is_leader():
logger.debug("Early exit on _on_s3_credentials_gone: unit is not a leader")
return
self.charm.app_peer_data.update({
"s3-block-message": "",
"binlogs-collecting": "",
})
if not self.charm._mysql.start_stop_binlogs_collecting():
logger.error("Failed to stop binlogs collecting after S3 relation depart")

def update_binlogs_collector_config(self) -> bool:
"""Update binlogs collector service config file.

Returns: whether this operation was successful.
"""
if not self._s3_integrator_relation_exists:
Contributor

it might be useful to delete the config if the s3 integrator relation is removed—but deferring to @paulomach

logger.error(
"Cannot update binlogs collector config: s3 integrator relation does not exist"
)
return False

logger.info("Retrieving s3 parameters from the s3-integrator relation")
s3_parameters, missing_parameters = self._retrieve_s3_parameters()
if missing_parameters:
logger.error(
f"Cannot update binlogs collector config: Missing S3 parameters: {missing_parameters}"
)
return False

bucket_url = (
f"{s3_parameters['bucket']}/{s3_parameters['path']}binlogs"
if s3_parameters["path"][-1] == "/"
else f"{s3_parameters['bucket']}/{s3_parameters['path']}/binlogs"
)

with io.StringIO() as string_io:
yaml.dump(
{
"endpoint": s3_parameters["endpoint"],
"hosts": self.charm._mysql.get_cluster_members(),
"user": SERVER_CONFIG_USERNAME,
"pass": self.charm.get_secret("app", SERVER_CONFIG_PASSWORD_KEY),
"storage_type": "s3",
"s3": {
"access_key_id": s3_parameters["access-key"],
"secret_access_key": s3_parameters["secret-key"],
"bucket_url": bucket_url,
"default_region": s3_parameters["region"],
},
},
string_io,
)
self.charm._mysql.write_content_to_file(
path=MYSQL_BINLOGS_COLLECTOR_CONFIG_FILE,
content=string_io.getvalue(),
)

return True
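
For reference, not part of the diff: a standalone sketch of the shape of the config file this method renders, using obviously fake placeholder values. The key names mirror the yaml.dump call above; the endpoint, hosts, credentials, and bucket path shown here are illustrative only.

import io

import yaml

example_config = {
    "endpoint": "https://s3.example.com",                 # placeholder S3 endpoint
    "hosts": ["10.0.0.11", "10.0.0.12"],                  # cluster member addresses
    "user": "<SERVER_CONFIG_USERNAME>",                   # placeholder for the server-config user
    "pass": "********",
    "storage_type": "s3",
    "s3": {
        "access_key_id": "<access-key>",
        "secret_access_key": "<secret-key>",
        "bucket_url": "my-bucket/mysql/backups/binlogs",  # <bucket>/<path>/binlogs
        "default_region": "us-east-1",
    },
}

with io.StringIO() as string_io:
    yaml.dump(example_config, string_io)
    print(string_io.getvalue())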