Upgrade canary test to fully update the cluster after the canary upgrade (#654)

* Upgrade canary test to fully update the cluster after the canary upgrade

* Update to setup-go v5

* Fix canary upgrade hash calculation by including the canary upgrade information in the hash. Move the generation update to the end of the reconcile, ignoring state changes

* Improve tests to match the current behavior
burmanm authored May 30, 2024
1 parent 8d34919 commit 14d5d61
Showing 10 changed files with 103 additions and 57 deletions.
2 changes: 1 addition & 1 deletion .github/actions/run-integ-test/action.yml
@@ -23,7 +23,7 @@ runs:
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
- name: Set up Go
uses: actions/setup-go@v4
uses: actions/setup-go@v5
with:
go-version-file: 'go.mod'
cache: true
4 changes: 2 additions & 2 deletions .github/workflows/operatorBuildAndDeploy.yml
@@ -20,7 +20,7 @@ jobs:
- uses: actions/checkout@v4
if: github.event_name != 'pull_request'
- name: Set up Go
uses: actions/setup-go@v4
uses: actions/setup-go@v5
with:
go-version-file: 'go.mod'
cache: true
@@ -53,7 +53,7 @@ jobs:
- name: Check out code into the Go module directory
uses: actions/checkout@v4
- name: Set up Go
uses: actions/setup-go@v4
uses: actions/setup-go@v5
with:
go-version-file: 'go.mod'
cache: true
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
@@ -15,7 +15,7 @@ jobs:
- name: Check out code into the Go module directory
uses: actions/checkout@v4
- name: Set up Go
uses: actions/setup-go@v4
uses: actions/setup-go@v5
with:
go-version-file: 'go.mod'
cache: true
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -14,6 +14,9 @@ Changelog for Cass Operator, new PRs should update the `main / unreleased` section
* [FEATURE] [#659](https://github.com/k8ssandra/cass-operator/issues/659) Add support for HCD serverType with versions 1.x.x. It will be deployed like Cassandra >= 4.1 for now.
* [FEATURE] [#660](https://github.com/k8ssandra/cass-operator/issues/660) Add support for DSE version 6.9.x and remove support for DSE 7.x.x. DSE 6.9 will be deployed like DSE 6.8.
* [BUGFIX] [#652](https://github.com/k8ssandra/cass-operator/issues/652) Update nodeStatuses info when a pod is recreated
* [BUGFIX] [#656](https://github.com/k8ssandra/cass-operator/issues/656) After a canary upgrade, it was not possible to continue upgrading the rest of the nodes if the only change was removing the canary upgrade
* [BUGFIX] [#657](https://github.com/k8ssandra/cass-operator/issues/657) If a change did not result in StatefulSet changes, a Ready -> Ready state transition would lose the ObservedGeneration update in the status
* [BUGFIX] [#382](https://github.com/k8ssandra/cass-operator/issues/382) If the StatefulSet has not caught up yet, do not allow any changes to it; instead, requeue and wait (does not change the behavior of forceRacksUpgrade)

## v1.20.0

17 changes: 17 additions & 0 deletions pkg/reconciliation/construct_statefulset.go
@@ -152,6 +152,23 @@ func newStatefulSetForCassandraDatacenter(
result.Spec.ServiceName = sts.Spec.ServiceName
}

if dc.Spec.CanaryUpgrade {
var partition int32
if dc.Spec.CanaryUpgradeCount == 0 || dc.Spec.CanaryUpgradeCount > replicaCountInt32 {
partition = replicaCountInt32
} else {
partition = replicaCountInt32 - dc.Spec.CanaryUpgradeCount
}

strategy := appsv1.StatefulSetUpdateStrategy{
Type: appsv1.RollingUpdateStatefulSetStrategyType,
RollingUpdate: &appsv1.RollingUpdateStatefulSetStrategy{
Partition: &partition,
},
}
result.Spec.UpdateStrategy = strategy
}

// add a hash here to facilitate checking if updates are needed
utils.AddHashAnnotation(result)

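The canary partition logic now lives in newStatefulSetForCassandraDatacenter, which runs before utils.AddHashAnnotation(result). Because the RollingUpdate partition is therefore part of the hashed spec, turning canaryUpgrade on or off changes the hash, and the operator notices the update (#656). The partition follows standard StatefulSet semantics: only pods with an ordinal greater than or equal to the partition are rolled to the new revision. Below is a minimal sketch of the calculation and of the hash-over-spec idea; the helper names and the hashing scheme are illustrative, not the operator's actual utils.AddHashAnnotation.

```go
package main

import (
	"crypto/sha256"
	"encoding/json"
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
)

// canaryPartition mirrors the logic above: with replicas=3 and canaryCount=1
// the partition is 2, so only pod ordinal 2 is moved to the new revision;
// canaryCount=0 (or a count larger than replicas) keeps every pod on the old
// revision because no ordinal is >= replicas.
func canaryPartition(replicas, canaryCount int32) int32 {
	if canaryCount == 0 || canaryCount > replicas {
		return replicas
	}
	return replicas - canaryCount
}

// specHash is a stand-in for the hash-annotation idea: the UpdateStrategy is
// set on the spec before hashing, so a canary toggle produces a different value.
func specHash(spec appsv1.StatefulSetSpec) string {
	b, _ := json.Marshal(spec)
	return fmt.Sprintf("%x", sha256.Sum256(b))
}

func main() {
	fmt.Println(canaryPartition(3, 1)) // 2 -> one canary pod upgraded
	fmt.Println(canaryPartition(3, 0)) // 3 -> no pods upgraded yet
}
```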
9 changes: 4 additions & 5 deletions pkg/reconciliation/constructor.go
@@ -6,11 +6,12 @@ package reconciliation
// This file defines constructors for k8s objects

import (
"fmt"

api "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
"github.com/k8ssandra/cass-operator/pkg/oplabels"
"github.com/k8ssandra/cass-operator/pkg/utils"

corev1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
@@ -48,7 +49,7 @@ func newPodDisruptionBudgetForDatacenter(dc *api.CassandraDatacenter) *policyv1.
}

func setOperatorProgressStatus(rc *ReconciliationContext, newState api.ProgressState) error {
rc.ReqLogger.Info("reconcile_racks::setOperatorProgressStatus")
rc.ReqLogger.Info(fmt.Sprintf("reconcile_racks::setOperatorProgressStatus::%v", newState))
currentState := rc.Datacenter.Status.CassandraOperatorProgress
if currentState == newState {
// early return, no need to ping k8s
@@ -57,13 +58,11 @@ func setOperatorProgressStatus(rc *ReconciliationContext, newState api.ProgressS

patch := client.MergeFrom(rc.Datacenter.DeepCopy())
rc.Datacenter.Status.CassandraOperatorProgress = newState
// TODO there may be a better place to push status.observedGeneration in the reconcile loop

if newState == api.ProgressReady {
rc.Datacenter.Status.ObservedGeneration = rc.Datacenter.Generation
if rc.Datacenter.Status.DatacenterName == nil {
rc.Datacenter.Status.DatacenterName = &rc.Datacenter.Spec.DatacenterName
}
rc.setCondition(api.NewDatacenterCondition(api.DatacenterRequiresUpdate, corev1.ConditionFalse))
}
if err := rc.Client.Status().Patch(rc.Ctx, rc.Datacenter, patch); err != nil {
rc.ReqLogger.Error(err, "error updating the Cassandra Operator Progress state")
25 changes: 23 additions & 2 deletions pkg/reconciliation/handler.go
@@ -7,9 +7,11 @@ import (
"fmt"
"sync"

"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"

api "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
@@ -65,9 +67,28 @@ func (rc *ReconciliationContext) CalculateReconciliationActions() (reconcile.Res
return result.Error(err).Output()
}

result, err := rc.ReconcileAllRacks()
res, err := rc.ReconcileAllRacks()
if err != nil {
return result.Error(err).Output()
}

if err := rc.updateStatus(); err != nil {
return result.Error(err).Output()
}

return result, err
return res, nil
}

func (rc *ReconciliationContext) updateStatus() error {
patch := client.MergeFrom(rc.Datacenter.DeepCopy())
rc.Datacenter.Status.ObservedGeneration = rc.Datacenter.Generation
rc.setCondition(api.NewDatacenterCondition(api.DatacenterRequiresUpdate, corev1.ConditionFalse))
if err := rc.Client.Status().Patch(rc.Ctx, rc.Datacenter, patch); err != nil {
rc.ReqLogger.Error(err, "error updating the Cassandra Operator Progress state")
return err
}

return nil
}

// This file contains various definitions and plumbing setup used for reconciliation.
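The new updateStatus runs at the end of every successful CalculateReconciliationActions, so status.observedGeneration is bumped and the DatacenterRequiresUpdate condition cleared even when the progress state stays Ready. setOperatorProgressStatus alone could not guarantee this because it returns early when the state is unchanged, which is how a Ready -> Ready reconcile used to miss the ObservedGeneration update (#657). A small sketch of how a consumer of the CRD might use the field to tell whether the operator has processed the latest spec; the helper name is illustrative.

```go
package example

import (
	api "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
)

// specObserved reports whether the operator has reconciled the most recently
// applied spec: metadata.generation increments on every spec change, and
// updateStatus copies it into status.observedGeneration once a reconcile pass
// completes, even for Ready -> Ready transitions.
func specObserved(dc *api.CassandraDatacenter) bool {
	return dc.Status.ObservedGeneration == dc.Generation
}
```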
63 changes: 24 additions & 39 deletions pkg/reconciliation/reconcile_racks.go
@@ -186,6 +186,30 @@ func (rc *ReconciliationContext) CheckRackPodTemplate() result.ReconcileResult {
}
statefulSet := rc.statefulSets[idx]

status := statefulSet.Status

updatedReplicas := status.UpdatedReplicas
if status.CurrentRevision != status.UpdateRevision {
// Previous update was a canary upgrade, so we have pods in different versions
updatedReplicas = status.CurrentReplicas + status.UpdatedReplicas
}

if statefulSet.Generation != status.ObservedGeneration ||
status.Replicas != status.ReadyReplicas ||
status.Replicas != updatedReplicas {

logger.Info(
"waiting for upgrade to finish on statefulset",
"statefulset", statefulSet.Name,
"replicas", status.Replicas,
"readyReplicas", status.ReadyReplicas,
"currentReplicas", status.CurrentReplicas,
"updatedReplicas", status.UpdatedReplicas,
)

return result.RequeueSoon(10)
}

desiredSts, err := newStatefulSetForCassandraDatacenter(statefulSet, rackName, dc, int(*statefulSet.Spec.Replicas))

if err != nil {
@@ -232,22 +256,6 @@ func (rc *ReconciliationContext) CheckRackPodTemplate() result.ReconcileResult {
// selector must match podTemplate.Labels, those can't be updated either
desiredSts.Spec.Selector = statefulSet.Spec.Selector

if dc.Spec.CanaryUpgrade {
var partition int32
if dc.Spec.CanaryUpgradeCount == 0 || dc.Spec.CanaryUpgradeCount > int32(rc.desiredRackInformation[idx].NodeCount) {
partition = int32(rc.desiredRackInformation[idx].NodeCount)
} else {
partition = int32(rc.desiredRackInformation[idx].NodeCount) - dc.Spec.CanaryUpgradeCount
}

strategy := appsv1.StatefulSetUpdateStrategy{
Type: appsv1.RollingUpdateStatefulSetStrategyType,
RollingUpdate: &appsv1.RollingUpdateStatefulSetStrategy{
Partition: &partition,
},
}
desiredSts.Spec.UpdateStrategy = strategy
}
stateMeta, err := meta.Accessor(statefulSet)
resVersion := stateMeta.GetResourceVersion()
if err != nil {
@@ -300,29 +308,6 @@ func (rc *ReconciliationContext) CheckRackPodTemplate() result.ReconcileResult {
// we just updated k8s and pods will be knocked out of ready state, so let k8s
// call us back when these changes are done and the new pods are back to ready
return result.Done()
} else {

// the pod template is right, but if any pods don't match it,
// or are missing, we should not move onto the next rack,
// because there's an upgrade in progress

status := statefulSet.Status
if statefulSet.Generation != status.ObservedGeneration ||
status.Replicas != status.ReadyReplicas ||
status.Replicas != status.CurrentReplicas ||
status.Replicas != status.UpdatedReplicas {

logger.Info(
"waiting for upgrade to finish on statefulset",
"statefulset", statefulSet.Name,
"replicas", status.Replicas,
"readyReplicas", status.ReadyReplicas,
"currentReplicas", status.CurrentReplicas,
"updatedReplicas", status.UpdatedReplicas,
)

return result.RequeueSoon(10)
}
}
}

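The upgrade-in-progress guard moved from the tail of CheckRackPodTemplate to its top, so a StatefulSet that has not caught up is never compared against a freshly built desired state; the reconcile simply requeues. The CurrentRevision != UpdateRevision branch covers the case where the previous rollout was a partitioned canary upgrade: pods legitimately sit on two revisions, so replicas on the old revision and replicas on the new one together must account for every pod. A sketch of the same check as a standalone predicate (the function name is illustrative):

```go
package example

import appsv1 "k8s.io/api/apps/v1"

// caughtUp mirrors the guard above: the StatefulSet controller has observed
// the latest spec, every replica is ready, and every replica is accounted for
// either by the update revision or, after a canary rollout, by the current
// (old) revision as well.
func caughtUp(sts *appsv1.StatefulSet) bool {
	st := sts.Status
	updated := st.UpdatedReplicas
	if st.CurrentRevision != st.UpdateRevision {
		// The previous update was a partitioned (canary) rollout, so pods on
		// both revisions count as up to date for this check.
		updated = st.CurrentReplicas + st.UpdatedReplicas
	}
	return sts.Generation == st.ObservedGeneration &&
		st.Replicas == st.ReadyReplicas &&
		st.Replicas == updated
}
```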
19 changes: 12 additions & 7 deletions pkg/reconciliation/reconcile_racks_test.go
@@ -293,7 +293,6 @@ func TestCheckRackPodTemplate_CanaryUpgrade(t *testing.T) {
rc, _, cleanpMockSrc := setupTest()
defer cleanpMockSrc()

rc.Datacenter.Spec.ServerVersion = "6.8.2"
rc.Datacenter.Spec.Racks = []api.Rack{
{Name: "rack1", DeprecatedZone: "zone-1"},
}
@@ -314,11 +313,10 @@

assert.True(t, result.Completed())
assert.Nil(t, err)
previousLabels := rc.statefulSets[0].Spec.Template.Labels

rc.Datacenter.Spec.CanaryUpgrade = true
rc.Datacenter.Spec.CanaryUpgradeCount = 1
rc.Datacenter.Spec.ServerVersion = "6.8.3"
rc.Datacenter.Spec.ServerVersion = "6.8.44"
partition := rc.Datacenter.Spec.CanaryUpgradeCount

result = rc.CheckRackPodTemplate()
@@ -342,12 +340,14 @@
rc.statefulSets[0].Status.ReadyReplicas = 2
rc.statefulSets[0].Status.CurrentReplicas = 1
rc.statefulSets[0].Status.UpdatedReplicas = 1
rc.statefulSets[0].Status.CurrentRevision = "1"
rc.statefulSets[0].Status.UpdateRevision = "2"

result = rc.CheckRackPodTemplate()

assert.EqualValues(t, previousLabels, rc.statefulSets[0].Spec.Template.Labels)
rc.Datacenter.Spec.CanaryUpgrade = false

result = rc.CheckRackPodTemplate()
assert.True(t, result.Completed())
assert.NotEqual(t, expectedStrategy, rc.statefulSets[0].Spec.UpdateStrategy)
}

func TestCheckRackPodTemplate_GenerationCheck(t *testing.T) {
@@ -405,7 +405,11 @@ func TestCheckRackPodTemplate_TemplateLabels(t *testing.T) {
2)
require.NoErrorf(err, "error occurred creating statefulset")

desiredStatefulSet.Generation = 1
desiredStatefulSet.Spec.Replicas = ptr.To(int32(1))
desiredStatefulSet.Status.Replicas = int32(1)
desiredStatefulSet.Status.UpdatedReplicas = int32(1)
desiredStatefulSet.Status.ObservedGeneration = 1
desiredStatefulSet.Status.ReadyReplicas = int32(1)

trackObjects := []runtime.Object{
@@ -424,9 +428,10 @@ func TestCheckRackPodTemplate_TemplateLabels(t *testing.T) {
rc.statefulSets = make([]*appsv1.StatefulSet, len(rackInfo))
rc.statefulSets[0] = desiredStatefulSet

rc.Client = fake.NewClientBuilder().WithStatusSubresource(rc.Datacenter).WithRuntimeObjects(trackObjects...).Build()
rc.Client = fake.NewClientBuilder().WithStatusSubresource(rc.Datacenter, rc.statefulSets[0]).WithRuntimeObjects(trackObjects...).Build()
res := rc.CheckRackPodTemplate()
require.Equal(result.Done(), res)
rc.statefulSets[0].Status.ObservedGeneration = rc.statefulSets[0].Generation

sts := &appsv1.StatefulSet{}
require.NoError(rc.Client.Get(rc.Ctx, types.NamespacedName{Name: desiredStatefulSet.Name, Namespace: desiredStatefulSet.Namespace}, sts))
16 changes: 16 additions & 0 deletions tests/canary_upgrade/canary_upgrade_test.go
@@ -100,6 +100,22 @@ var _ = Describe(testName, func() {
ns.WaitForCassandraImages(dcName, images, 300)
ns.WaitForDatacenterReadyPodCount(dcName, 3)

// TODO Verify that after the canary upgrade we can issue upgrades to the rest of the nodes
step = "remove canary upgrade"
json = "{\"spec\": {\"canaryUpgrade\": false}}"
k = kubectl.PatchMerge(dcResource, json)
ns.ExecAndLog(step, k)

ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 30)
ns.WaitForDatacenterReadyPodCount(dcName, 3)

images = []string{
updated,
updated,
updated,
}
ns.WaitForCassandraImages(dcName, images, 300)

step = "deleting the dc"
k = kubectl.DeleteFromFiles(dcYaml)
ns.ExecAndLog(step, k)
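The added integration-test steps exercise exactly the #656 scenario: a canary upgrade leaves the cluster partially updated, the test then removes spec.canaryUpgrade with a merge patch, and all three pods are expected to converge on the updated image. A hedged sketch of the same patch done programmatically with the controller-runtime client rather than kubectl; the function name is illustrative and not part of the test framework.

```go
package example

import (
	"context"

	api "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// disableCanary flips spec.canaryUpgrade off with a merge patch, the
// programmatic equivalent of the kubectl patch used in the test above.
func disableCanary(ctx context.Context, c client.Client, dc *api.CassandraDatacenter) error {
	patch := client.MergeFrom(dc.DeepCopy())
	dc.Spec.CanaryUpgrade = false
	return c.Patch(ctx, dc, patch)
}
```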
