Skip to content

Commit

Permalink
Validate that MedusaBackup can be used to safely restore the datacenter
Browse files Browse the repository at this point in the history
  • Loading branch information
burmanm committed Sep 18, 2023
1 parent 580f2d2 commit 9983617
Show file tree
Hide file tree
Showing 5 changed files with 343 additions and 2 deletions.
4 changes: 4 additions & 0 deletions apis/medusa/v1alpha1/medusarestorejob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ type MedusaRestoreJobStatus struct {
Finished []string `json:"finished,omitempty"`

Failed []string `json:"failed,omitempty"`

// Message gives the reason why restore operation failed
Message string `json:"message,omitempty"`
}

//+kubebuilder:object:root=true
Expand All @@ -70,6 +73,7 @@ type MedusaRestoreJob struct {

type MedusaRestoreMapping struct {
// Whether the restore is in-place or not
// +optional
InPlace *bool `json:"in_place"`

// Mapping between source and target nodes for a restore
Expand Down
5 changes: 3 additions & 2 deletions config/crd/bases/medusa.k8ssandra.io_medusarestorejobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ spec:
items:
type: string
type: array
message:
description: Message gives the reason why restore operation failed
type: string
restoreKey:
description: A unique key that identifies the restore operation.
type: string
Expand All @@ -90,8 +93,6 @@ spec:
in_place:
description: Whether the restore is in-place or not
type: boolean
required:
- in_place
type: object
restorePrepared:
type: boolean
Expand Down
2 changes: 2 additions & 0 deletions controllers/medusa/controllers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ func TestCassandraBackupRestore(t *testing.T) {
defer testEnv3.Stop(t)
defer cancel()
t.Run("TestMedusaRestoreDatacenter", testEnv3.ControllerTest(ctx, testMedusaRestoreDatacenter))

t.Run("TestValidationErrorStopsRestore", testEnv3.ControllerTest(ctx, testValidationErrorStopsRestore))
}

func setupMedusaBackupTestEnv(t *testing.T, ctx context.Context) *testutils.MultiClusterTestEnv {
Expand Down
63 changes: 63 additions & 0 deletions controllers/medusa/medusarestorejob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,19 @@ func (r *MedusaRestoreJobReconciler) Reconcile(ctx context.Context, req ctrl.Req
}
}

// Verify the backup can be used for restore
if err := validateBackupForRestore(request.MedusaBackup, cassdc); err != nil {
request.RestoreJob.Status.FinishTime = metav1.Now()
request.RestoreJob.Status.Message = err.Error()
if err = r.Status().Update(ctx, request.RestoreJob); err != nil {
logger.Error(err, "failed to update MedusaRestoreJob with error message", "MedusaRestoreJob", req.NamespacedName.Name)
return ctrl.Result{RequeueAfter: r.DefaultDelay}, err
}

logger.Error(fmt.Errorf("unable to use target backup for restore of CassandraDatacenter: %s", request.RestoreJob.Status.Message), "backup can not be used for restore")
return ctrl.Result{}, nil // No requeue, because this error is not transient
}

// Prepare the restore by placing a mapping file in the Cassandra data volume.
if !request.RestoreJob.Status.RestorePrepared {
restorePrepared := false
Expand Down Expand Up @@ -261,6 +274,56 @@ func (r *MedusaRestoreJobReconciler) prepareRestore(ctx context.Context, request
return &medusaRestoreMapping, nil
}

// validateBackupForRestore checks whether the given MedusaBackup can be restored
// into the given CassandraDatacenter. It returns nil when every check passes and
// a descriptive error for the first check that fails.
//
// The checks, in order:
//   - the backup has a non-zero finish time;
//   - every node of the backup finished (FinishedNodes == TotalNodes);
//   - the backup's node count equals the datacenter's Spec.Size;
//   - every backed-up node reports the datacenter's name;
//   - the set of rack names in the backup matches the datacenter's racks. A
//     datacenter without explicit racks is compared against a single rack
//     named "default".
func validateBackupForRestore(backup *medusav1alpha1.MedusaBackup, cassdc *cassdcapi.CassandraDatacenter) error {
	if backup.Status.FinishTime.IsZero() {
		return fmt.Errorf("target backup has not finished")
	}

	if backup.Status.FinishedNodes != backup.Status.TotalNodes {
		// In Medusa, a failed backup is not considered Finished. In MedusaBackupJob, a failed backup is considered finished, but failed.
		return fmt.Errorf("target backup has not completed successfully")
	}

	if backup.Status.TotalNodes != cassdc.Spec.Size {
		return fmt.Errorf("node counts differ for source backup and destination datacenter")
	}

	// Collect the distinct racks present in the backup (with their node counts).
	rackSizes := make(map[string]int)
	for _, n := range backup.Status.Nodes {
		if n.Datacenter != cassdc.DatacenterName() {
			return fmt.Errorf("target datacenter has different name than backup")
		}
		rackSizes[n.Rack]++
	}

	if len(cassdc.Spec.Racks) > 0 {
		if len(rackSizes) != len(cassdc.Spec.Racks) {
			return fmt.Errorf("amount of racks must match in backup and target datacenter")
		}

		for _, r := range cassdc.Spec.Racks {
			if _, found := rackSizes[r.Name]; !found {
				return fmt.Errorf("rack names must match in backup and target datacenter")
			}
		}

		return nil
	}

	// cass-operator treats this as single rack setup, with name "default".
	// Require exactly one rack: this also rejects a backup whose node list is
	// empty instead of indexing into it and panicking.
	if len(rackSizes) != 1 {
		return fmt.Errorf("amount of racks must match in backup and target datacenter")
	}

	if _, found := rackSizes["default"]; !found {
		return fmt.Errorf("rack names must match in backup and target datacenter")
	}

	return nil
}

// stopDatacenterRestoreJob sets the Stopped property in the Datacenter spec to true. Returns true if
// the datacenter is stopped.
func stopDatacenterRestoreJob(req *medusa.RestoreRequest) bool {
Expand Down
Loading

0 comments on commit 9983617

Please sign in to comment.