diff --git a/.github/actions/run-integ-test/action.yml b/.github/actions/run-integ-test/action.yml
index 0e4cb8b2..24eeca2c 100644
--- a/.github/actions/run-integ-test/action.yml
+++ b/.github/actions/run-integ-test/action.yml
@@ -23,7 +23,7 @@ runs:
         sudo rm -rf /usr/share/dotnet
         sudo rm -rf /opt/ghc
     - name: Set up Go
-      uses: actions/setup-go@v4
+      uses: actions/setup-go@v5
       with:
         go-version-file: 'go.mod'
        cache: true
diff --git a/.github/workflows/operatorBuildAndDeploy.yml b/.github/workflows/operatorBuildAndDeploy.yml
index c67422c0..cfba03c7 100644
--- a/.github/workflows/operatorBuildAndDeploy.yml
+++ b/.github/workflows/operatorBuildAndDeploy.yml
@@ -20,7 +20,7 @@ jobs:
       - uses: actions/checkout@v4
         if: github.event_name != 'pull_request'
       - name: Set up Go
-        uses: actions/setup-go@v4
+        uses: actions/setup-go@v5
         with:
           go-version-file: 'go.mod'
           cache: true
@@ -53,7 +53,7 @@ jobs:
       - name: Check out code into the Go module directory
         uses: actions/checkout@v4
       - name: Set up Go
-        uses: actions/setup-go@v4
+        uses: actions/setup-go@v5
         with:
           go-version-file: 'go.mod'
           cache: true
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index f6a516cc..b01aec65 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -15,7 +15,7 @@ jobs:
       - name: Check out code into the Go module directory
         uses: actions/checkout@v4
       - name: Set up Go
-        uses: actions/setup-go@v4
+        uses: actions/setup-go@v5
         with:
           go-version-file: 'go.mod'
           cache: true
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 086ecab0..ba237c2a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,9 @@ Changelog for Cass Operator, new PRs should update the `main / unreleased` secti
 * [FEATURE] [#659](https://github.com/k8ssandra/cass-operator/issues/659) Add support for HCD serverType with versions 1.x.x. It will be deployed like Cassandra >= 4.1 for now.
 * [FEATURE] [#660](https://github.com/k8ssandra/cass-operator/issues/660) Add support for DSE version 6.9.x and remove support for DSE 7.x.x. DSE 6.9 will be deployed like DSE 6.8.
 * [BUGFIX] [#652](https://github.com/k8ssandra/cass-operator/issues/652) Update nodeStatuses info when a pod is recreated
+* [BUGFIX] [#656](https://github.com/k8ssandra/cass-operator/issues/656) After canary upgrade, it was not possible to continue upgrading rest of the nodes if the only change was removing the canary upgrade
+* [BUGFIX] [#657](https://github.com/k8ssandra/cass-operator/issues/657) If a change did not result in StatefulSet changes, a Ready -> Ready state would lose an ObservedGeneration update in the status
+* [BUGFIX] [#382](https://github.com/k8ssandra/cass-operator/issues/382) If StatefulSet has not caught up yet, do not allow any changes to it. Instead, requeue and wait (does not change behavior of forceRacksUpgrade)
 
 ## v1.20.0
diff --git a/pkg/reconciliation/construct_statefulset.go b/pkg/reconciliation/construct_statefulset.go
index e48f3ce6..a9b1ca81 100644
--- a/pkg/reconciliation/construct_statefulset.go
+++ b/pkg/reconciliation/construct_statefulset.go
@@ -152,6 +152,23 @@ func newStatefulSetForCassandraDatacenter(
         result.Spec.ServiceName = sts.Spec.ServiceName
     }
 
+    if dc.Spec.CanaryUpgrade {
+        var partition int32
+        if dc.Spec.CanaryUpgradeCount == 0 || dc.Spec.CanaryUpgradeCount > replicaCountInt32 {
+            partition = replicaCountInt32
+        } else {
+            partition = replicaCountInt32 - dc.Spec.CanaryUpgradeCount
+        }
+
+        strategy := appsv1.StatefulSetUpdateStrategy{
+            Type: appsv1.RollingUpdateStatefulSetStrategyType,
+            RollingUpdate: &appsv1.RollingUpdateStatefulSetStrategy{
+                Partition: &partition,
+            },
+        }
+        result.Spec.UpdateStrategy = strategy
+    }
+
     // add a hash here to facilitate checking if updates are needed
     utils.AddHashAnnotation(result)
diff --git a/pkg/reconciliation/constructor.go b/pkg/reconciliation/constructor.go
index bbd3dad5..9eb15feb 100644
--- a/pkg/reconciliation/constructor.go
+++ b/pkg/reconciliation/constructor.go
@@ -6,11 +6,12 @@ package reconciliation
 // This file defines constructors for k8s objects
 
 import (
+    "fmt"
+
     api "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
     "github.com/k8ssandra/cass-operator/pkg/oplabels"
     "github.com/k8ssandra/cass-operator/pkg/utils"
 
-    corev1 "k8s.io/api/core/v1"
     policyv1 "k8s.io/api/policy/v1"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/apimachinery/pkg/util/intstr"
@@ -48,7 +49,7 @@ func newPodDisruptionBudgetForDatacenter(dc *api.CassandraDatacenter) *policyv1.
 }
 
 func setOperatorProgressStatus(rc *ReconciliationContext, newState api.ProgressState) error {
-    rc.ReqLogger.Info("reconcile_racks::setOperatorProgressStatus")
+    rc.ReqLogger.Info(fmt.Sprintf("reconcile_racks::setOperatorProgressStatus::%v", newState))
     currentState := rc.Datacenter.Status.CassandraOperatorProgress
     if currentState == newState {
         // early return, no need to ping k8s
@@ -57,13 +58,11 @@ func setOperatorProgressStatus(rc *ReconciliationContext, newState api.ProgressS
     patch := client.MergeFrom(rc.Datacenter.DeepCopy())
     rc.Datacenter.Status.CassandraOperatorProgress = newState
 
-    // TODO there may be a better place to push status.observedGeneration in the reconcile loop
+
     if newState == api.ProgressReady {
-        rc.Datacenter.Status.ObservedGeneration = rc.Datacenter.Generation
         if rc.Datacenter.Status.DatacenterName == nil {
             rc.Datacenter.Status.DatacenterName = &rc.Datacenter.Spec.DatacenterName
         }
-        rc.setCondition(api.NewDatacenterCondition(api.DatacenterRequiresUpdate, corev1.ConditionFalse))
     }
     if err := rc.Client.Status().Patch(rc.Ctx, rc.Datacenter, patch); err != nil {
         rc.ReqLogger.Error(err, "error updating the Cassandra Operator Progress state")
diff --git a/pkg/reconciliation/handler.go b/pkg/reconciliation/handler.go
index dd015714..ebb474dd 100644
--- a/pkg/reconciliation/handler.go
+++ b/pkg/reconciliation/handler.go
@@ -7,9 +7,11 @@ import (
     "fmt"
     "sync"
 
+    "sigs.k8s.io/controller-runtime/pkg/client"
     "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
     "sigs.k8s.io/controller-runtime/pkg/reconcile"
 
+    corev1 "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/types"
 
     api "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
@@ -65,9 +67,28 @@ func (rc *ReconciliationContext) CalculateReconciliationActions() (reconcile.Res
         return result.Error(err).Output()
     }
 
-    result, err := rc.ReconcileAllRacks()
+    res, err := rc.ReconcileAllRacks()
+    if err != nil {
+        return result.Error(err).Output()
+    }
+
+    if err := rc.updateStatus(); err != nil {
+        return result.Error(err).Output()
+    }
 
-    return result, err
+    return res, nil
+}
+
+func (rc *ReconciliationContext) updateStatus() error {
+    patch := client.MergeFrom(rc.Datacenter.DeepCopy())
+    rc.Datacenter.Status.ObservedGeneration = rc.Datacenter.Generation
+    rc.setCondition(api.NewDatacenterCondition(api.DatacenterRequiresUpdate, corev1.ConditionFalse))
+    if err := rc.Client.Status().Patch(rc.Ctx, rc.Datacenter, patch); err != nil {
+        rc.ReqLogger.Error(err, "error updating the Cassandra Operator Progress state")
+        return err
+    }
+
+    return nil
 }
 
 // This file contains various definitions and plumbing setup used for reconciliation.
diff --git a/pkg/reconciliation/reconcile_racks.go b/pkg/reconciliation/reconcile_racks.go
index af914313..9cd2fe5c 100644
--- a/pkg/reconciliation/reconcile_racks.go
+++ b/pkg/reconciliation/reconcile_racks.go
@@ -186,6 +186,30 @@ func (rc *ReconciliationContext) CheckRackPodTemplate() result.ReconcileResult {
         }
         statefulSet := rc.statefulSets[idx]
 
+        status := statefulSet.Status
+
+        updatedReplicas := status.UpdatedReplicas
+        if status.CurrentRevision != status.UpdateRevision {
+            // Previous update was a canary upgrade, so we have pods in different versions
+            updatedReplicas = status.CurrentReplicas + status.UpdatedReplicas
+        }
+
+        if statefulSet.Generation != status.ObservedGeneration ||
+            status.Replicas != status.ReadyReplicas ||
+            status.Replicas != updatedReplicas {
+
+            logger.Info(
+                "waiting for upgrade to finish on statefulset",
+                "statefulset", statefulSet.Name,
+                "replicas", status.Replicas,
+                "readyReplicas", status.ReadyReplicas,
+                "currentReplicas", status.CurrentReplicas,
+                "updatedReplicas", status.UpdatedReplicas,
+            )
+
+            return result.RequeueSoon(10)
+        }
+
         desiredSts, err := newStatefulSetForCassandraDatacenter(statefulSet, rackName, dc, int(*statefulSet.Spec.Replicas))
 
         if err != nil {
@@ -232,22 +256,6 @@ func (rc *ReconciliationContext) CheckRackPodTemplate() result.ReconcileResult {
         // selector must match podTemplate.Labels, those can't be updated either
         desiredSts.Spec.Selector = statefulSet.Spec.Selector
 
-        if dc.Spec.CanaryUpgrade {
-            var partition int32
-            if dc.Spec.CanaryUpgradeCount == 0 || dc.Spec.CanaryUpgradeCount > int32(rc.desiredRackInformation[idx].NodeCount) {
-                partition = int32(rc.desiredRackInformation[idx].NodeCount)
-            } else {
-                partition = int32(rc.desiredRackInformation[idx].NodeCount) - dc.Spec.CanaryUpgradeCount
-            }
-
-            strategy := appsv1.StatefulSetUpdateStrategy{
-                Type: appsv1.RollingUpdateStatefulSetStrategyType,
-                RollingUpdate: &appsv1.RollingUpdateStatefulSetStrategy{
-                    Partition: &partition,
-                },
-            }
-            desiredSts.Spec.UpdateStrategy = strategy
-        }
         stateMeta, err := meta.Accessor(statefulSet)
         resVersion := stateMeta.GetResourceVersion()
         if err != nil {
@@ -300,29 +308,6 @@ func (rc *ReconciliationContext) CheckRackPodTemplate() result.ReconcileResult {
             // we just updated k8s and pods will be knocked out of ready state, so let k8s
             // call us back when these changes are done and the new pods are back to ready
             return result.Done()
-        } else {
-
-            // the pod template is right, but if any pods don't match it,
-            // or are missing, we should not move onto the next rack,
-            // because there's an upgrade in progress
-
-            status := statefulSet.Status
-            if statefulSet.Generation != status.ObservedGeneration ||
-                status.Replicas != status.ReadyReplicas ||
-                status.Replicas != status.CurrentReplicas ||
-                status.Replicas != status.UpdatedReplicas {
-
-                logger.Info(
-                    "waiting for upgrade to finish on statefulset",
-                    "statefulset", statefulSet.Name,
-                    "replicas", status.Replicas,
-                    "readyReplicas", status.ReadyReplicas,
-                    "currentReplicas", status.CurrentReplicas,
-                    "updatedReplicas", status.UpdatedReplicas,
-                )
-
-                return result.RequeueSoon(10)
-            }
         }
     }
diff --git a/pkg/reconciliation/reconcile_racks_test.go b/pkg/reconciliation/reconcile_racks_test.go
index bbfcf48c..c33e7d21 100644
--- a/pkg/reconciliation/reconcile_racks_test.go
+++ b/pkg/reconciliation/reconcile_racks_test.go
@@ -293,7 +293,6 @@ func TestCheckRackPodTemplate_CanaryUpgrade(t *testing.T) {
     rc, _, cleanpMockSrc := setupTest()
     defer cleanpMockSrc()
 
-    rc.Datacenter.Spec.ServerVersion = "6.8.2"
     rc.Datacenter.Spec.Racks = []api.Rack{
         {Name: "rack1", DeprecatedZone: "zone-1"},
     }
@@ -314,11 +313,10 @@ func TestCheckRackPodTemplate_CanaryUpgrade(t *testing.T) {
     assert.True(t, result.Completed())
     assert.Nil(t, err)
 
-    previousLabels := rc.statefulSets[0].Spec.Template.Labels
     rc.Datacenter.Spec.CanaryUpgrade = true
     rc.Datacenter.Spec.CanaryUpgradeCount = 1
-    rc.Datacenter.Spec.ServerVersion = "6.8.3"
+    rc.Datacenter.Spec.ServerVersion = "6.8.44"
     partition := rc.Datacenter.Spec.CanaryUpgradeCount
 
     result = rc.CheckRackPodTemplate()
@@ -342,12 +340,14 @@ func TestCheckRackPodTemplate_CanaryUpgrade(t *testing.T) {
     rc.statefulSets[0].Status.ReadyReplicas = 2
     rc.statefulSets[0].Status.CurrentReplicas = 1
     rc.statefulSets[0].Status.UpdatedReplicas = 1
+    rc.statefulSets[0].Status.CurrentRevision = "1"
+    rc.statefulSets[0].Status.UpdateRevision = "2"
 
-    result = rc.CheckRackPodTemplate()
-
-    assert.EqualValues(t, previousLabels, rc.statefulSets[0].Spec.Template.Labels)
+    rc.Datacenter.Spec.CanaryUpgrade = false
 
+    result = rc.CheckRackPodTemplate()
     assert.True(t, result.Completed())
+    assert.NotEqual(t, expectedStrategy, rc.statefulSets[0].Spec.UpdateStrategy)
 }
 
 func TestCheckRackPodTemplate_GenerationCheck(t *testing.T) {
@@ -405,7 +405,11 @@ func TestCheckRackPodTemplate_TemplateLabels(t *testing.T) {
         2)
     require.NoErrorf(err, "error occurred creating statefulset")
 
+    desiredStatefulSet.Generation = 1
     desiredStatefulSet.Spec.Replicas = ptr.To(int32(1))
+    desiredStatefulSet.Status.Replicas = int32(1)
+    desiredStatefulSet.Status.UpdatedReplicas = int32(1)
+    desiredStatefulSet.Status.ObservedGeneration = 1
     desiredStatefulSet.Status.ReadyReplicas = int32(1)
 
     trackObjects := []runtime.Object{
@@ -424,9 +428,10 @@ func TestCheckRackPodTemplate_TemplateLabels(t *testing.T) {
     rc.statefulSets = make([]*appsv1.StatefulSet, len(rackInfo))
     rc.statefulSets[0] = desiredStatefulSet
 
-    rc.Client = fake.NewClientBuilder().WithStatusSubresource(rc.Datacenter).WithRuntimeObjects(trackObjects...).Build()
+    rc.Client = fake.NewClientBuilder().WithStatusSubresource(rc.Datacenter, rc.statefulSets[0]).WithRuntimeObjects(trackObjects...).Build()
 
     res := rc.CheckRackPodTemplate()
     require.Equal(result.Done(), res)
+    rc.statefulSets[0].Status.ObservedGeneration = rc.statefulSets[0].Generation
 
     sts := &appsv1.StatefulSet{}
     require.NoError(rc.Client.Get(rc.Ctx, types.NamespacedName{Name: desiredStatefulSet.Name, Namespace: desiredStatefulSet.Namespace}, sts))
diff --git a/tests/canary_upgrade/canary_upgrade_test.go b/tests/canary_upgrade/canary_upgrade_test.go
index 5758ac0f..0f09c643 100644
--- a/tests/canary_upgrade/canary_upgrade_test.go
+++ b/tests/canary_upgrade/canary_upgrade_test.go
@@ -100,6 +100,22 @@ var _ = Describe(testName, func() {
             ns.WaitForCassandraImages(dcName, images, 300)
             ns.WaitForDatacenterReadyPodCount(dcName, 3)
 
+            // TODO Verify that after the canary upgrade we can issue upgrades to the rest of the nodes
+            step = "remove canary upgrade"
+            json = "{\"spec\": {\"canaryUpgrade\": false}}"
+            k = kubectl.PatchMerge(dcResource, json)
+            ns.ExecAndLog(step, k)
+
+            ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 30)
+            ns.WaitForDatacenterReadyPodCount(dcName, 3)
+
+            images = []string{
+                updated,
+                updated,
+                updated,
+            }
+            ns.WaitForCassandraImages(dcName, images, 300)
+
             step = "deleting the dc"
             k = kubectl.DeleteFromFiles(dcYaml)
             ns.ExecAndLog(step, k)
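Note on the canary partition logic moved into `newStatefulSetForCassandraDatacenter`: a StatefulSet `RollingUpdate` partition of N means only pods with an ordinal greater than or equal to N are moved to the new pod template, so `replicas - partition` pods act as the canaries. Because the strategy is now built in the constructor, it is part of the hashed desired spec, which appears to be what lets a change whose only effect is disabling `canaryUpgrade` still register as an update (#656). A minimal standalone sketch of the arithmetic follows; `canaryUpdateStrategy` and the sample values are illustrative, not operator API:

```go
package main

import (
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
)

// canaryUpdateStrategy mirrors the partition arithmetic in the patch: pods with
// ordinal >= partition are rolled to the new revision, the rest stay put.
func canaryUpdateStrategy(replicas, canaryCount int32) appsv1.StatefulSetUpdateStrategy {
	partition := replicas - canaryCount
	if canaryCount == 0 || canaryCount > replicas {
		// No usable canary count: keep every pod on the old revision.
		partition = replicas
	}

	return appsv1.StatefulSetUpdateStrategy{
		Type: appsv1.RollingUpdateStatefulSetStrategyType,
		RollingUpdate: &appsv1.RollingUpdateStatefulSetStrategy{
			Partition: &partition,
		},
	}
}

func main() {
	// 3 replicas, CanaryUpgradeCount=1 -> partition 2: only the pod with ordinal 2 is upgraded.
	s := canaryUpdateStrategy(3, 1)
	fmt.Println(*s.RollingUpdate.Partition) // 2
}
```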
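The guard added at the top of `CheckRackPodTemplate` requeues until the StatefulSet controller has caught up: the observed generation must match, and every replica must be ready and accounted for by an expected revision. The `CurrentRevision != UpdateRevision` branch covers pods deliberately left on two revisions by a canary upgrade. A self-contained sketch of the same predicate; the `hasCaughtUp` name and the sample status values are illustrative:

```go
package main

import (
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
)

// hasCaughtUp reports whether the StatefulSet controller has observed the
// latest spec and every pod is ready and on an expected revision.
func hasCaughtUp(sts *appsv1.StatefulSet) bool {
	status := sts.Status

	updated := status.UpdatedReplicas
	if status.CurrentRevision != status.UpdateRevision {
		// A canary upgrade leaves pods split across two revisions; together the
		// two counts must still cover every pod.
		updated = status.CurrentReplicas + status.UpdatedReplicas
	}

	return sts.Generation == status.ObservedGeneration &&
		status.Replicas == status.ReadyReplicas &&
		status.Replicas == updated
}

func main() {
	sts := &appsv1.StatefulSet{}
	sts.Generation = 2
	sts.Status = appsv1.StatefulSetStatus{
		ObservedGeneration: 2,
		Replicas:           3,
		ReadyReplicas:      3,
		CurrentReplicas:    2,
		UpdatedReplicas:    1,
		CurrentRevision:    "rev-1",
		UpdateRevision:     "rev-2",
	}
	// true: 2 current + 1 updated account for all 3 ready replicas.
	fmt.Println(hasCaughtUp(sts))
}
```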
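The new `updateStatus` in handler.go patches `status.observedGeneration` and clears the `DatacenterRequiresUpdate` condition after every successful rack reconcile, rather than only on a transition into the Ready progress state, so a Ready -> Ready pass no longer drops the ObservedGeneration update (#657). A minimal sketch of that kind of status merge-patch with the controller-runtime client, written outside the operator's `ReconciliationContext`; the `bumpObservedGeneration` helper is hypothetical:

```go
package reconcilesketch

import (
	"context"

	api "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// bumpObservedGeneration records that the controller has acted on the latest
// spec even when nothing else in the status changed. MergeFrom yields a patch
// containing only the fields modified between DeepCopy and Patch.
func bumpObservedGeneration(ctx context.Context, c client.Client, dc *api.CassandraDatacenter) error {
	patch := client.MergeFrom(dc.DeepCopy())
	dc.Status.ObservedGeneration = dc.Generation
	// Status().Patch touches only the status subresource; the spec is untouched.
	return c.Status().Patch(ctx, dc, patch)
}
```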