Add medusa validation #1054

Merged · 3 commits · Sep 19, 2023
1 change: 1 addition & 0 deletions CHANGELOG/CHANGELOG-1.9.md
@@ -15,6 +15,7 @@ When cutting a new release, update the `unreleased` heading to the tag being generated

## unreleased

* [ENHANCEMENT] [#1045](https://github.com/k8ssandra/k8ssandra-operator/issues/1045) Validate MedusaBackup before doing restore to prevent data loss scenarios
* [ENHANCEMENT] [#1046](https://github.com/k8ssandra/k8ssandra-operator/issues/1046) Add detailed backup information in the MedusaBackup CRD status
* [BUGFIX] [#1027](https://github.com/k8ssandra/k8ssandra-operator/issues/1027) Point system-logger image to use the v1.16.0 tag instead of latest
* [BUGFIX] [#1026](https://github.com/k8ssandra/k8ssandra-operator/issues/1026) Fix DC name overrides not being properly handled
7 changes: 7 additions & 0 deletions apis/medusa/v1alpha1/medusarestorejob_types.go
@@ -54,10 +54,16 @@ type MedusaRestoreJobStatus struct {
	Finished []string `json:"finished,omitempty"`

	Failed []string `json:"failed,omitempty"`

	// Message gives the reason why restore operation failed
	Message string `json:"message,omitempty"`
Review comment (Contributor): suggestion: putting StartTime, FinishTime and Message as printcolumns would be a nice addition. Can do that in a subsequent ticket.
}

//+kubebuilder:object:root=true
//+kubebuilder:subresource:status
//+kubebuilder:printcolumn:name="Started",type=date,JSONPath=".status.startTime",description="Restore start time"
//+kubebuilder:printcolumn:name="Finished",type=date,JSONPath=".status.finishTime",description="Restore finish time"
//+kubebuilder:printcolumn:name="Error",type=string,JSONPath=".status.message",description="Error message"

// MedusaRestoreJob is the Schema for the medusarestorejobs API
type MedusaRestoreJob struct {
@@ -70,6 +76,7 @@ type MedusaRestoreJob struct {

type MedusaRestoreMapping struct {
	// Whether the restore is in-place or not
	// +optional
	InPlace *bool `json:"in_place"`

	// Mapping between source and target nodes for a restore
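Since InPlace is now a pointer marked +optional, anything consuming a MedusaRestoreMapping has to nil-check the field before dereferencing it. A minimal sketch of such a guard on the consumer side (the helper name and the in-place default are assumptions for illustration, not part of this PR):

// isInPlace is a hypothetical helper showing the nil check the optional
// InPlace field now requires. Treating an omitted field as an in-place
// restore is an assumption made for this sketch only.
func isInPlace(mapping *medusav1alpha1.MedusaRestoreMapping) bool {
	if mapping == nil || mapping.InPlace == nil {
		return true
	}
	return *mapping.InPlace
}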
20 changes: 17 additions & 3 deletions config/crd/bases/medusa.k8ssandra.io_medusarestorejobs.yaml
@@ -14,7 +14,20 @@ spec:
    singular: medusarestorejob
  scope: Namespaced
  versions:
  - name: v1alpha1
  - additionalPrinterColumns:
    - description: Restore start time
      jsonPath: .status.startTime
      name: Started
      type: date
    - description: Restore finish time
      jsonPath: .status.finishTime
      name: Finished
      type: date
    - description: Error message
      jsonPath: .status.message
      name: Error
      type: string
    name: v1alpha1
    schema:
      openAPIV3Schema:
        description: MedusaRestoreJob is the Schema for the medusarestorejobs API
@@ -66,6 +79,9 @@ spec:
                items:
                  type: string
                type: array
              message:
                description: Message gives the reason why restore operation failed
                type: string
              restoreKey:
                description: A unique key that identifies the restore operation.
                type: string
@@ -90,8 +106,6 @@ spec:
                in_place:
                  description: Whether the restore is in-place or not
                  type: boolean
              required:
              - in_place
              type: object
            restorePrepared:
              type: boolean
2 changes: 2 additions & 0 deletions controllers/medusa/controllers_test.go
@@ -47,6 +47,8 @@ func TestCassandraBackupRestore(t *testing.T) {
	defer testEnv3.Stop(t)
	defer cancel()
	t.Run("TestMedusaRestoreDatacenter", testEnv3.ControllerTest(ctx, testMedusaRestoreDatacenter))

	t.Run("TestValidationErrorStopsRestore", testEnv3.ControllerTest(ctx, testValidationErrorStopsRestore))
}

func setupMedusaBackupTestEnv(t *testing.T, ctx context.Context) *testutils.MultiClusterTestEnv {
68 changes: 68 additions & 0 deletions controllers/medusa/medusarestorejob_controller.go
@@ -104,6 +104,19 @@
		}
	}

	// Verify the backup can be used for restore
	if err := validateBackupForRestore(request.MedusaBackup, cassdc); err != nil {
		request.RestoreJob.Status.FinishTime = metav1.Now()
		request.RestoreJob.Status.Message = err.Error()
		if err = r.Status().Update(ctx, request.RestoreJob); err != nil {
			logger.Error(err, "failed to update MedusaRestoreJob with error message", "MedusaRestoreJob", req.NamespacedName.Name)
			return ctrl.Result{RequeueAfter: r.DefaultDelay}, err
		}

		logger.Error(fmt.Errorf("unable to use target backup for restore of CassandraDatacenter: %s", request.RestoreJob.Status.Message), "backup can not be used for restore")
		return ctrl.Result{}, nil // No requeue, because this error is not transient
	}

	// Prepare the restore by placing a mapping file in the Cassandra data volume.
	if !request.RestoreJob.Status.RestorePrepared {
		restorePrepared := false
@@ -261,6 +274,61 @@
	return &medusaRestoreMapping, nil
}

func validateBackupForRestore(backup *medusav1alpha1.MedusaBackup, cassdc *cassdcapi.CassandraDatacenter) error {
	if backup.Status.TotalNodes == 0 && backup.Status.FinishedNodes == 0 {
		// This is an old backup without enough data, need to skip for backwards compatibility
		return nil
	}

	if backup.Status.FinishTime.IsZero() {
		return fmt.Errorf("target backup has not finished")
	}

	if backup.Status.FinishedNodes != backup.Status.TotalNodes {
		// In Medusa, a failed backup is not considered Finished. In MedusaBackupJob, a failed backup is considered finished, but failed.
		return fmt.Errorf("target backup has not completed successfully")
	}

	if backup.Status.TotalNodes != cassdc.Spec.Size {
Review comment (Contributor): issue: This would break for all backups that were created prior to the new fields being added in the MedusaBackup CRD, right? Maybe we should detect that and skip validation in this case? I guess that if backup.Status.TotalNodes = backup.Status.FinishedNodes == 0 then we can skip the validation.

Reply (Contributor Author): Right, backwards compatibility. Will do.

return fmt.Errorf("node counts differ for source backup and destination datacenter")
}

rackSizes := make(map[string]int)
for _, n := range backup.Status.Nodes {
if n.Datacenter != cassdc.DatacenterName() {
return fmt.Errorf("target datacenter has different name than backup")
}
if c, found := rackSizes[n.Rack]; !found {
rackSizes[n.Rack] = 1
} else {
rackSizes[n.Rack] = c + 1
}
}

if len(cassdc.Spec.Racks) > 0 {
if len(rackSizes) != len(cassdc.Spec.Racks) {
return fmt.Errorf("amount of racks must match in backup and target datacenter")
}

Check warning on line 311 in controllers/medusa/medusarestorejob_controller.go

View check run for this annotation

Codecov / codecov/patch

controllers/medusa/medusarestorejob_controller.go#L310-L311

Added lines #L310 - L311 were not covered by tests

for _, r := range cassdc.Spec.Racks {
if _, found := rackSizes[r.Name]; !found {
return fmt.Errorf("rack names must match in backup and target datacenter")
}

Check warning on line 316 in controllers/medusa/medusarestorejob_controller.go

View check run for this annotation

Codecov / codecov/patch

controllers/medusa/medusarestorejob_controller.go#L315-L316

Added lines #L315 - L316 were not covered by tests
}
} else {
// cass-operator treats this as single rack setup, with name "default"
if len(rackSizes) > 1 {
return fmt.Errorf("amount of racks must match in backup and target datacenter")
}

if backup.Status.Nodes[0].Rack != "default" {
return fmt.Errorf("rack names must match in backup and target datacenter")
}

Check warning on line 326 in controllers/medusa/medusarestorejob_controller.go

View check run for this annotation

Codecov / codecov/patch

controllers/medusa/medusarestorejob_controller.go#L325-L326

Added lines #L325 - L326 were not covered by tests
}

return nil
}

// stopDatacenter sets the Stopped property in the Datacenter spec to true. Returns true if
// the datacenter is stopped.
func stopDatacenterRestoreJob(req *medusa.RestoreRequest) bool {
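To make the intent of the new validation concrete, here is a rough, Example-style sketch of how validateBackupForRestore responds to a node-count mismatch and to a legacy backup. It assumes it lives alongside the controller in the controllers/medusa package with the imports already used above, and that TotalNodes/FinishedNodes are directly comparable to cassdc.Spec.Size and FinishTime is assignable from metav1.Now(), as the diff suggests; it is an illustration, not code from this PR.

// ExampleValidateBackupForRestore sketches the two ends of the validation:
// a finished 3-node backup must not be restored onto a 6-node datacenter,
// while a legacy backup (no node counts recorded) skips validation entirely.
func ExampleValidateBackupForRestore() {
	backup := &medusav1alpha1.MedusaBackup{}
	backup.Status.FinishTime = metav1.Now()
	backup.Status.TotalNodes = 3
	backup.Status.FinishedNodes = 3

	cassdc := &cassdcapi.CassandraDatacenter{}
	cassdc.Spec.Size = 6 // target datacenter is larger than the backed-up one

	fmt.Println(validateBackupForRestore(backup, cassdc))

	// A backup created before TotalNodes/FinishedNodes existed (both zero)
	// is accepted for backwards compatibility.
	legacy := &medusav1alpha1.MedusaBackup{}
	fmt.Println(validateBackupForRestore(legacy, cassdc))

	// Output:
	// node counts differ for source backup and destination datacenter
	// <nil>
}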