
Commit

Validate that MedusaBackup can be used to safely restore the datacenter (#1054)
burmanm authored Sep 19, 2023
1 parent 78079ca commit 2588923
Showing 6 changed files with 366 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG/CHANGELOG-1.9.md
@@ -15,6 +15,7 @@ When cutting a new release, update the `unreleased` heading to the tag being gen

## unreleased

* [ENHANCEMENT] [#1045](https://github.com/k8ssandra/k8ssandra-operator/issues/1045) Validate MedusaBackup before doing restore to prevent data loss scenarios
* [ENHANCEMENT] [#1046](https://github.com/k8ssandra/k8ssandra-operator/issues/1046) Add detailed backup information in the MedusaBackup CRD status
* [BUGFIX] [#1027](https://github.com/k8ssandra/k8ssandra-operator/issues/1027) Point system-logger image to use the v1.16.0 tag instead of latest
* [BUGFIX] [#1026](https://github.com/k8ssandra/k8ssandra-operator/issues/1026) Fix DC name overrides not being properly handled
7 changes: 7 additions & 0 deletions apis/medusa/v1alpha1/medusarestorejob_types.go
@@ -54,10 +54,16 @@ type MedusaRestoreJobStatus struct {
Finished []string `json:"finished,omitempty"`

Failed []string `json:"failed,omitempty"`

// Message gives the reason why the restore operation failed
Message string `json:"message,omitempty"`
}

//+kubebuilder:object:root=true
//+kubebuilder:subresource:status
//+kubebuilder:printcolumn:name="Started",type=date,JSONPath=".status.startTime",description="Restore start time"
//+kubebuilder:printcolumn:name="Finished",type=date,JSONPath=".status.finishTime",description="Restore finish time"
//+kubebuilder:printcolumn:name="Error",type=string,JSONPath=".status.message",description="Error message"

// MedusaRestoreJob is the Schema for the medusarestorejobs API
type MedusaRestoreJob struct {
@@ -70,6 +76,7 @@ type MedusaRestoreJob struct {

type MedusaRestoreMapping struct {
// Whether the restore is in-place or not
// +optional
InPlace *bool `json:"in_place"`

// Mapping between source and target nodes for a restore
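
Not part of the commit: a minimal sketch of how a caller could read the new Message field (which also feeds the Error printer column) with a controller-runtime client. The namespace and job name are illustrative, and the AddToScheme call assumes the usual generated scheme builder for this API group.

package main

import (
	"context"
	"fmt"

	medusav1alpha1 "github.com/k8ssandra/k8ssandra-operator/apis/medusa/v1alpha1"
	"k8s.io/apimachinery/pkg/runtime"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

func main() {
	scheme := runtime.NewScheme()
	// Assumes the usual generated AddToScheme for the medusa/v1alpha1 group.
	if err := medusav1alpha1.AddToScheme(scheme); err != nil {
		panic(err)
	}

	c, err := client.New(ctrl.GetConfigOrDie(), client.Options{Scheme: scheme})
	if err != nil {
		panic(err)
	}

	// Illustrative namespace and job name.
	job := &medusav1alpha1.MedusaRestoreJob{}
	key := client.ObjectKey{Namespace: "k8ssandra-operator", Name: "restore-dc1"}
	if err := c.Get(context.Background(), key, job); err != nil {
		panic(err)
	}

	// A rejected restore is finished (FinishTime set) and carries a Message
	// explaining why the backup could not be used.
	if !job.Status.FinishTime.IsZero() && job.Status.Message != "" {
		fmt.Println("restore rejected:", job.Status.Message)
	}
}
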
20 changes: 17 additions & 3 deletions config/crd/bases/medusa.k8ssandra.io_medusarestorejobs.yaml
@@ -14,7 +14,20 @@ spec:
singular: medusarestorejob
scope: Namespaced
versions:
- name: v1alpha1
- additionalPrinterColumns:
- description: Restore start time
jsonPath: .status.startTime
name: Started
type: date
- description: Restore finish time
jsonPath: .status.finishTime
name: Finished
type: date
- description: Error message
jsonPath: .status.message
name: Error
type: string
name: v1alpha1
schema:
openAPIV3Schema:
description: MedusaRestoreJob is the Schema for the medusarestorejobs API
@@ -66,6 +79,9 @@ spec:
items:
type: string
type: array
message:
description: Message gives the reason why the restore operation failed
type: string
restoreKey:
description: A unique key that identifies the restore operation.
type: string
@@ -90,8 +106,6 @@ spec:
in_place:
description: Whether the restore is in-place or not
type: boolean
required:
- in_place
type: object
restorePrepared:
type: boolean
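
Not part of the commit: with `in_place` removed from the CRD's required list (matching the `+optional` marker added in the Go types above), a restore mapping no longer has to set the flag. A small sketch of what that means on the Go side; the program around it is illustrative.

package main

import (
	"fmt"

	medusav1alpha1 "github.com/k8ssandra/k8ssandra-operator/apis/medusa/v1alpha1"
)

func main() {
	// InPlace is a *bool, so omitting it leaves the pointer nil, and the CRD
	// no longer rejects objects that do not set in_place.
	mapping := medusav1alpha1.MedusaRestoreMapping{}
	if mapping.InPlace == nil {
		fmt.Println("in_place not set; treated as unspecified")
	}
}
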
2 changes: 2 additions & 0 deletions controllers/medusa/controllers_test.go
@@ -47,6 +47,8 @@ func TestCassandraBackupRestore(t *testing.T) {
defer testEnv3.Stop(t)
defer cancel()
t.Run("TestMedusaRestoreDatacenter", testEnv3.ControllerTest(ctx, testMedusaRestoreDatacenter))

t.Run("TestValidationErrorStopsRestore", testEnv3.ControllerTest(ctx, testValidationErrorStopsRestore))
}

func setupMedusaBackupTestEnv(t *testing.T, ctx context.Context) *testutils.MultiClusterTestEnv {
68 changes: 68 additions & 0 deletions controllers/medusa/medusarestorejob_controller.go
@@ -104,6 +104,19 @@ func (r *MedusaRestoreJobReconciler) Reconcile(ctx context.Context, req ctrl.Req
}
}

// Verify the backup can be used for restore
if err := validateBackupForRestore(request.MedusaBackup, cassdc); err != nil {
request.RestoreJob.Status.FinishTime = metav1.Now()
request.RestoreJob.Status.Message = err.Error()
if err = r.Status().Update(ctx, request.RestoreJob); err != nil {
logger.Error(err, "failed to update MedusaRestoreJob with error message", "MedusaRestoreJob", req.NamespacedName.Name)
return ctrl.Result{RequeueAfter: r.DefaultDelay}, err
}

logger.Error(fmt.Errorf("unable to use target backup for restore of CassandraDatacenter: %s", request.RestoreJob.Status.Message), "backup can not be used for restore")
return ctrl.Result{}, nil // No requeue, because this error is not transient
}

// Prepare the restore by placing a mapping file in the Cassandra data volume.
if !request.RestoreJob.Status.RestorePrepared {
restorePrepared := false
@@ -261,6 +274,61 @@ func (r *MedusaRestoreJobReconciler) prepareRestore(ctx context.Context, request
return &medusaRestoreMapping, nil
}

func validateBackupForRestore(backup *medusav1alpha1.MedusaBackup, cassdc *cassdcapi.CassandraDatacenter) error {
if backup.Status.TotalNodes == 0 && backup.Status.FinishedNodes == 0 {
// This is an old backup without enough data, need to skip for backwards compatibility
return nil
}

if backup.Status.FinishTime.IsZero() {
return fmt.Errorf("target backup has not finished")
}

if backup.Status.FinishedNodes != backup.Status.TotalNodes {
// In Medusa, a failed backup is not considered Finished. In MedusaBackupJob, a failed backup is considered finished, but failed.
return fmt.Errorf("target backup has not completed successfully")
}

if backup.Status.TotalNodes != cassdc.Spec.Size {
return fmt.Errorf("node counts differ for source backup and destination datacenter")
}

rackSizes := make(map[string]int)
for _, n := range backup.Status.Nodes {
if n.Datacenter != cassdc.DatacenterName() {
return fmt.Errorf("target datacenter has different name than backup")
}
if c, found := rackSizes[n.Rack]; !found {
rackSizes[n.Rack] = 1
} else {
rackSizes[n.Rack] = c + 1
}
}

if len(cassdc.Spec.Racks) > 0 {
if len(rackSizes) != len(cassdc.Spec.Racks) {
return fmt.Errorf("amount of racks must match in backup and target datacenter")
}

for _, r := range cassdc.Spec.Racks {
if _, found := rackSizes[r.Name]; !found {
return fmt.Errorf("rack names must match in backup and target datacenter")
}
}
} else {
// cass-operator treats this as single rack setup, with name "default"
if len(rackSizes) > 1 {
return fmt.Errorf("amount of racks must match in backup and target datacenter")
}

if backup.Status.Nodes[0].Rack != "default" {
return fmt.Errorf("rack names must match in backup and target datacenter")
}
}

return nil
}

// stopDatacenter sets the Stopped property in the Datacenter spec to true. Returns true if
// the datacenter is stopped.
func stopDatacenterRestoreJob(req *medusa.RestoreRequest) bool {
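
Not part of the commit: a minimal sketch of exercising the node-count check in validateBackupForRestore from a unit test in the same package. Field names come from the code above; the status struct name MedusaBackupStatus and the cass-operator spec type CassandraDatacenterSpec are assumptions, and the test relies on the package's existing imports (testing, metav1, medusav1alpha1, cassdcapi).

func TestValidateBackupForRestoreNodeCountMismatch(t *testing.T) {
	backup := &medusav1alpha1.MedusaBackup{
		Status: medusav1alpha1.MedusaBackupStatus{ // assumed status type name
			FinishTime:    metav1.Now(), // the backup finished...
			TotalNodes:    3,
			FinishedNodes: 3, // ...and every node reported success
		},
	}
	dc := &cassdcapi.CassandraDatacenter{
		Spec: cassdcapi.CassandraDatacenterSpec{ // assumed spec type name
			Size: 6, // target datacenter is larger than the backed-up one
		},
	}

	// TotalNodes (3) != Spec.Size (6), so validation should refuse the restore.
	if err := validateBackupForRestore(backup, dc); err == nil {
		t.Error("expected an error when backup and datacenter node counts differ")
	}
}
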
