From 258892370ba8c03719960b9ca2ffcfda539bf0d7 Mon Sep 17 00:00:00 2001 From: Michael Burman Date: Tue, 19 Sep 2023 22:17:20 +0300 Subject: [PATCH] Validate that MedusaBackup can be used to safely restore the datacenter (#1054) --- CHANGELOG/CHANGELOG-1.9.md | 1 + .../medusa/v1alpha1/medusarestorejob_types.go | 7 + ...medusa.k8ssandra.io_medusarestorejobs.yaml | 20 +- controllers/medusa/controllers_test.go | 2 + .../medusa/medusarestorejob_controller.go | 68 +++++ .../medusarestorejob_controller_test.go | 271 ++++++++++++++++++ 6 files changed, 366 insertions(+), 3 deletions(-) diff --git a/CHANGELOG/CHANGELOG-1.9.md b/CHANGELOG/CHANGELOG-1.9.md index 5290bbb29..8d861033a 100644 --- a/CHANGELOG/CHANGELOG-1.9.md +++ b/CHANGELOG/CHANGELOG-1.9.md @@ -15,6 +15,7 @@ When cutting a new release, update the `unreleased` heading to the tag being gen ## unreleased +* [ENHANCEMENT] [#1045](https://github.com/k8ssandra/k8ssandra-operator/issues/1045) Validate MedusaBackup before doing restore to prevent data loss scenarios * [ENHANCEMENT] [#1046](https://github.com/k8ssandra/k8ssandra-operator/issues/1046) Add detailed backup information in the MedusaBackup CRD status * [BUGFIX] [#1027](https://github.com/k8ssandra/k8ssandra-operator/issues/1027) Point system-logger image to use the v1.16.0 tag instead of latest * [BUGFIX] [#1026](https://github.com/k8ssandra/k8ssandra-operator/issues/1026) Fix DC name overrides not being properly handled diff --git a/apis/medusa/v1alpha1/medusarestorejob_types.go b/apis/medusa/v1alpha1/medusarestorejob_types.go index d0902b975..a96933882 100644 --- a/apis/medusa/v1alpha1/medusarestorejob_types.go +++ b/apis/medusa/v1alpha1/medusarestorejob_types.go @@ -54,10 +54,16 @@ type MedusaRestoreJobStatus struct { Finished []string `json:"finished,omitempty"` Failed []string `json:"failed,omitempty"` + + // Message gives the reason why restore operation failed + Message string `json:"message,omitempty"` } //+kubebuilder:object:root=true //+kubebuilder:subresource:status +//+kubebuilder:printcolumn:name="Started",type=date,JSONPath=".status.startTime",description="Restore start time" +//+kubebuilder:printcolumn:name="Finished",type=date,JSONPath=".status.finishTime",description="Restore finish time" +//+kubebuilder:printcolumn:name="Error",type=string,JSONPath=".status.message",description="Error message" // MedusaRestoreJob is the Schema for the medusarestorejobs API type MedusaRestoreJob struct { @@ -70,6 +76,7 @@ type MedusaRestoreJob struct { type MedusaRestoreMapping struct { // Whether the restore is in-place or not + // +optional InPlace *bool `json:"in_place"` // Mapping between source and target nodes for a restore diff --git a/config/crd/bases/medusa.k8ssandra.io_medusarestorejobs.yaml b/config/crd/bases/medusa.k8ssandra.io_medusarestorejobs.yaml index 5ee2e2402..dfb48f722 100644 --- a/config/crd/bases/medusa.k8ssandra.io_medusarestorejobs.yaml +++ b/config/crd/bases/medusa.k8ssandra.io_medusarestorejobs.yaml @@ -14,7 +14,20 @@ spec: singular: medusarestorejob scope: Namespaced versions: - - name: v1alpha1 + - additionalPrinterColumns: + - description: Restore start time + jsonPath: .status.startTime + name: Started + type: date + - description: Restore finish time + jsonPath: .status.finishTime + name: Finished + type: date + - description: Error message + jsonPath: .status.message + name: Error + type: string + name: v1alpha1 schema: openAPIV3Schema: description: MedusaRestoreJob is the Schema for the medusarestorejobs API @@ -66,6 +79,9 @@ spec: items: type: string type: array + message: + description: Message gives the reason why restore operation failed + type: string restoreKey: description: A unique key that identifies the restore operation. type: string @@ -90,8 +106,6 @@ spec: in_place: description: Whether the restore is in-place or not type: boolean - required: - - in_place type: object restorePrepared: type: boolean diff --git a/controllers/medusa/controllers_test.go b/controllers/medusa/controllers_test.go index 2a5796e30..e431590a8 100644 --- a/controllers/medusa/controllers_test.go +++ b/controllers/medusa/controllers_test.go @@ -47,6 +47,8 @@ func TestCassandraBackupRestore(t *testing.T) { defer testEnv3.Stop(t) defer cancel() t.Run("TestMedusaRestoreDatacenter", testEnv3.ControllerTest(ctx, testMedusaRestoreDatacenter)) + + t.Run("TestValidationErrorStopsRestore", testEnv3.ControllerTest(ctx, testValidationErrorStopsRestore)) } func setupMedusaBackupTestEnv(t *testing.T, ctx context.Context) *testutils.MultiClusterTestEnv { diff --git a/controllers/medusa/medusarestorejob_controller.go b/controllers/medusa/medusarestorejob_controller.go index dbbbfb584..c110a576e 100644 --- a/controllers/medusa/medusarestorejob_controller.go +++ b/controllers/medusa/medusarestorejob_controller.go @@ -104,6 +104,19 @@ func (r *MedusaRestoreJobReconciler) Reconcile(ctx context.Context, req ctrl.Req } } + // Verify the backup can be used for restore + if err := validateBackupForRestore(request.MedusaBackup, cassdc); err != nil { + request.RestoreJob.Status.FinishTime = metav1.Now() + request.RestoreJob.Status.Message = err.Error() + if err = r.Status().Update(ctx, request.RestoreJob); err != nil { + logger.Error(err, "failed to update MedusaRestoreJob with error message", "MedusaRestoreJob", req.NamespacedName.Name) + return ctrl.Result{RequeueAfter: r.DefaultDelay}, err + } + + logger.Error(fmt.Errorf("unable to use target backup for restore of CassandraDatacenter: %s", request.RestoreJob.Status.Message), "backup can not be used for restore") + return ctrl.Result{}, nil // No requeue, because this error is not transient + } + // Prepare the restore by placing a mapping file in the Cassandra data volume. if !request.RestoreJob.Status.RestorePrepared { restorePrepared := false @@ -261,6 +274,61 @@ func (r *MedusaRestoreJobReconciler) prepareRestore(ctx context.Context, request return &medusaRestoreMapping, nil } +func validateBackupForRestore(backup *medusav1alpha1.MedusaBackup, cassdc *cassdcapi.CassandraDatacenter) error { + if backup.Status.TotalNodes == 0 && backup.Status.FinishedNodes == 0 { + // This is an old backup without enough data, need to skip for backwards compatibility + return nil + } + + if backup.Status.FinishTime.IsZero() { + return fmt.Errorf("target backup has not finished") + } + + if backup.Status.FinishedNodes != backup.Status.TotalNodes { + // In Medusa, a failed backup is not considered Finished. In MedusaBackupJob, a failed backup is considered finished, but failed. + return fmt.Errorf("target backup has not completed successfully") + } + + if backup.Status.TotalNodes != cassdc.Spec.Size { + return fmt.Errorf("node counts differ for source backup and destination datacenter") + } + + rackSizes := make(map[string]int) + for _, n := range backup.Status.Nodes { + if n.Datacenter != cassdc.DatacenterName() { + return fmt.Errorf("target datacenter has different name than backup") + } + if c, found := rackSizes[n.Rack]; !found { + rackSizes[n.Rack] = 1 + } else { + rackSizes[n.Rack] = c + 1 + } + } + + if len(cassdc.Spec.Racks) > 0 { + if len(rackSizes) != len(cassdc.Spec.Racks) { + return fmt.Errorf("amount of racks must match in backup and target datacenter") + } + + for _, r := range cassdc.Spec.Racks { + if _, found := rackSizes[r.Name]; !found { + return fmt.Errorf("rack names must match in backup and target datacenter") + } + } + } else { + // cass-operator treats this as single rack setup, with name "default" + if len(rackSizes) > 1 { + return fmt.Errorf("amount of racks must match in backup and target datacenter") + } + + if backup.Status.Nodes[0].Rack != "default" { + return fmt.Errorf("rack names must match in backup and target datacenter") + } + } + + return nil +} + // stopDatacenter sets the Stopped property in the Datacenter spec to true. Returns true if // the datacenter is stopped. func stopDatacenterRestoreJob(req *medusa.RestoreRequest) bool { diff --git a/controllers/medusa/medusarestorejob_controller_test.go b/controllers/medusa/medusarestorejob_controller_test.go index 519de68d7..33239addb 100644 --- a/controllers/medusa/medusarestorejob_controller_test.go +++ b/controllers/medusa/medusarestorejob_controller_test.go @@ -147,6 +147,28 @@ func testMedusaRestoreDatacenter(t *testing.T, ctx context.Context, f *framework err = f.Create(ctx, backupKey, backup) require.NoError(err, "failed to create CassandraBackup") + patch := client.MergeFrom(backup.DeepCopy()) + backup.Status.FinishTime = metav1.Now() + backup.Status.FinishedNodes = dc1.Spec.Size + backup.Status.TotalNodes = dc1.Spec.Size + backup.Status.Nodes = []*api.MedusaBackupNode{ + { + Datacenter: "real-dc1", + Rack: "default", + }, + { + Datacenter: "real-dc1", + Rack: "default", + }, + { + Datacenter: "real-dc1", + Rack: "default", + }, + } + + err = f.PatchStatus(ctx, backup, patch, backupKey) + require.NoError(err, "failed to patch MedusaBackup") + restore := &api.MedusaRestoreJob{ ObjectMeta: metav1.ObjectMeta{ Namespace: namespace, @@ -274,6 +296,180 @@ func testMedusaRestoreDatacenter(t *testing.T, ctx context.Context, f *framework require.NoError(err, "failed to delete K8ssandraCluster") } +func testValidationErrorStopsRestore(t *testing.T, ctx context.Context, f *framework.Framework, namespace string) { + require := require.New(t) + f.Client.DeleteAllOf(ctx, &corev1.Pod{}, client.InNamespace(namespace)) + k8sCtx0 := f.DataPlaneContexts[0] + + kc := &k8ss.K8ssandraCluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: "test", + }, + Spec: k8ss.K8ssandraClusterSpec{ + Cassandra: &k8ss.CassandraClusterTemplate{ + Datacenters: []k8ss.CassandraDatacenterTemplate{ + { + Meta: k8ss.EmbeddedObjectMeta{ + Name: "dc1", + }, + K8sContext: k8sCtx0, + Size: 3, + DatacenterOptions: k8ss.DatacenterOptions{ + DatacenterName: "real-dc1", + ServerVersion: "3.11.14", + StorageConfig: &cassdcapi.StorageConfig{ + CassandraDataVolumeClaimSpec: &corev1.PersistentVolumeClaimSpec{ + StorageClassName: &defaultStorageClass, + }, + }, + }, + }, + }, + }, + Medusa: &api.MedusaClusterTemplate{ + ContainerImage: &images.Image{ + Repository: medusaImageRepo, + }, + StorageProperties: api.Storage{ + StorageSecretRef: corev1.LocalObjectReference{ + Name: cassandraUserSecret, + }, + }, + CassandraUserSecretRef: corev1.LocalObjectReference{ + Name: cassandraUserSecret, + }, + }, + }, + } + + t.Log("Creating k8ssandracluster with Medusa") + err := f.Client.Create(ctx, kc) + require.NoError(err, "failed to create K8ssandraCluster") + + reconcileReplicatedSecret(ctx, t, f, kc) + reconcileMedusaStandaloneDeployment(ctx, t, f, kc, "real-dc1", f.DataPlaneContexts[0]) + t.Log("check that dc1 was created") + dc1Key := framework.NewClusterKey(f.DataPlaneContexts[0], namespace, "dc1") + require.Eventually(f.DatacenterExists(ctx, dc1Key), timeout, interval) + + t.Log("update datacenter status to scaling up") + err = f.PatchDatacenterStatus(ctx, dc1Key, func(dc *cassdcapi.CassandraDatacenter) { + dc.SetCondition(cassdcapi.DatacenterCondition{ + Type: cassdcapi.DatacenterScalingUp, + Status: corev1.ConditionTrue, + LastTransitionTime: metav1.Now(), + }) + }) + require.NoError(err, "failed to patch datacenter status") + + kcKey := framework.ClusterKey{K8sContext: k8sCtx0, NamespacedName: types.NamespacedName{Namespace: namespace, Name: "test"}} + + t.Log("check that the K8ssandraCluster status is updated") + require.Eventually(func() bool { + kc := &k8ss.K8ssandraCluster{} + err = f.Client.Get(ctx, kcKey.NamespacedName, kc) + + if err != nil { + t.Logf("failed to get K8ssandraCluster: %v", err) + return false + } + + if len(kc.Status.Datacenters) == 0 { + return false + } + + k8ssandraStatus, found := kc.Status.Datacenters[dc1Key.Name] + if !found { + t.Logf("status for datacenter %s not found", dc1Key) + return false + } + + condition := findDatacenterCondition(k8ssandraStatus.Cassandra, cassdcapi.DatacenterScalingUp) + return condition != nil && condition.Status == corev1.ConditionTrue + }, timeout, interval, "timed out waiting for K8ssandraCluster status update") + + dc1 := &cassdcapi.CassandraDatacenter{} + err = f.Get(ctx, dc1Key, dc1) + + t.Log("update dc1 status to ready") + err = f.PatchDatacenterStatus(ctx, dc1Key, func(dc *cassdcapi.CassandraDatacenter) { + dc.Status.CassandraOperatorProgress = cassdcapi.ProgressReady + dc.SetCondition(cassdcapi.DatacenterCondition{ + Type: cassdcapi.DatacenterReady, + Status: corev1.ConditionTrue, + LastTransitionTime: metav1.Now(), + }) + }) + require.NoError(err, "failed to update dc1 status to ready") + + t.Log("creating MedusaBackup") + backup := &api.MedusaBackup{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: restoredBackupName, + }, + Spec: api.MedusaBackupSpec{ + CassandraDatacenter: dc1.Name, + }, + } + + backupKey := framework.NewClusterKey(dc1Key.K8sContext, dc1Key.Namespace, restoredBackupName) + err = f.Create(ctx, backupKey, backup) + require.NoError(err, "failed to create CassandraBackup") + + patch := client.MergeFrom(backup.DeepCopy()) + backup.Status.FinishTime = metav1.Now() + backup.Status.FinishedNodes = dc1.Spec.Size - 1 + backup.Status.TotalNodes = dc1.Spec.Size + backup.Status.Nodes = []*api.MedusaBackupNode{ + { + Datacenter: "real-dc1", + Rack: "default", + }, + { + Datacenter: "real-dc1", + Rack: "default", + }, + { + Datacenter: "real-dc1", + Rack: "default", + }, + } + + err = f.PatchStatus(ctx, backup, patch, backupKey) + require.NoError(err, "failed to patch MedusaBackup") + + restore := &api.MedusaRestoreJob{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: "test-restore", + }, + Spec: api.MedusaRestoreJobSpec{ + Backup: restoredBackupName, + CassandraDatacenter: dc1.Name, + }, + } + + restoreKey := framework.NewClusterKey(dc1Key.K8sContext, dc1Key.Namespace, restore.ObjectMeta.Name) + err = f.Create(ctx, restoreKey, restore) + require.NoError(err, "failed to create MedusaRestoreJob") + + t.Log("check restore status set to failed") + require.Eventually(func() bool { + restore := &api.MedusaRestoreJob{} + err := f.Get(ctx, restoreKey, restore) + if err != nil { + return false + } + + return restore.Status.Message != "" + }, timeout, interval) + + err = f.DeleteK8ssandraCluster(ctx, client.ObjectKey{Namespace: kc.Namespace, Name: kc.Name}, timeout, interval) + require.NoError(err, "failed to delete K8ssandraCluster") +} + func findContainer(containers []corev1.Container, name string) *corev1.Container { for _, container := range containers { if container.Name == name { @@ -288,6 +484,81 @@ func TestMedusaServiceAddress(t *testing.T) { assert.Equal(t, "k8c-cluster-real-dc1-medusa-service.dc-namespace.svc:50051", serviceUrl) } +func TestValidateBackupForRestore(t *testing.T) { + assert := assert.New(t) + + createBackup := func() *api.MedusaBackup { + return &api.MedusaBackup{ + Spec: api.MedusaBackupSpec{}, + Status: api.MedusaBackupStatus{ + FinishTime: metav1.Now(), + TotalNodes: 3, + FinishedNodes: 3, + Nodes: []*api.MedusaBackupNode{ + { + Datacenter: "dc1", + Rack: "r1", + }, + { + Datacenter: "dc1", + Rack: "r2", + }, + { + Datacenter: "dc1", + Rack: "r3", + }, + }, + }, + } + } + + createCassDc := func() *cassdcapi.CassandraDatacenter { + return &cassdcapi.CassandraDatacenter{ + ObjectMeta: metav1.ObjectMeta{ + Name: "dc1", + }, + Spec: cassdcapi.CassandraDatacenterSpec{ + Size: 3, + Racks: []cassdcapi.Rack{ + { + Name: "r1", + }, + { + Name: "r2", + }, + { + Name: "r3", + }, + }, + }, + } + } + + assert.NoError(validateBackupForRestore(createBackup(), createCassDc())) + + backup := createBackup() + backup.Status.FinishedNodes = 2 + assert.Error(validateBackupForRestore(backup, createCassDc())) + + backup = createBackup() + backup.Status.Nodes[0].Datacenter = "dc2" + assert.Error(validateBackupForRestore(backup, createCassDc())) + + cassdc := createCassDc() + cassdc.Spec.Racks = nil + assert.Error(validateBackupForRestore(createBackup(), cassdc)) + + backup = createBackup() + backup.Status.Nodes[0].Rack = "default" + backup.Status.Nodes[1].Rack = "default" + backup.Status.Nodes[2].Rack = "default" + assert.NoError(validateBackupForRestore(backup, cassdc)) + + cassdc = createCassDc() + cassdc.Spec.Size = 6 + assert.Error(validateBackupForRestore(createBackup(), cassdc)) +} + type fakeMedusaRestoreClientFactory struct { clientsMutex sync.Mutex clients map[string]*fakeMedusaRestoreClient