From e580f21c32b0660930646eba0e798426c7c5b855 Mon Sep 17 00:00:00 2001 From: Michael Burman Date: Mon, 25 Nov 2024 04:44:32 -0800 Subject: [PATCH] Make parallel restart of already bootstrapped nodes the default (#733) --- CHANGELOG.md | 1 + pkg/reconciliation/reconcile_racks.go | 6 +- pkg/reconciliation/reconcile_racks_test.go | 114 ++++++++++++++++-- .../default-three-rack-three-node-dc.yaml | 2 - 4 files changed, 111 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 346c360c..e84cb3b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ Changelog for Cass Operator, new PRs should update the `main / unreleased` secti ## unreleased * [CHANGE] [#720](https://github.com/k8ssandra/cass-operator/issues/720) Always use ObjectMeta.Name for the PodDisruptionBudget resource name, not the DatacenterName +* [CHANGE] [#731](https://github.com/k8ssandra/cass-operator/issues/731) Make the concurrent restart of already bootstrapped nodes the default. If user wishes to revert to older behavior, set annotation ``cassandra.datastax.com/allow-parallel-starts: "false"`` to datacenter. * [FEATURE] [#651](https://github.com/k8ssandra/cass-operator/issues/651) Add tsreload task for DSE deployments and ability to check if sync operation is available on the mgmt-api side * [ENHANCEMENT] [#722](https://github.com/k8ssandra/cass-operator/issues/722) Add back the ability to track cleanup task before marking scale up as done. This is controlled by an annotation cassandra.datastax.com/track-cleanup-tasks * [ENHANCEMENT] [#532](https://github.com/k8ssandra/k8ssandra-operator/issues/532) Extend ImageConfig type to allow additional parameters for k8ssandra-operator requirements. These include per-image PullPolicy / PullSecrets as well as additional image. Also add ability to override a single HCD version image. diff --git a/pkg/reconciliation/reconcile_racks.go b/pkg/reconciliation/reconcile_racks.go index af5f4c36..3cf1ab78 100644 --- a/pkg/reconciliation/reconcile_racks.go +++ b/pkg/reconciliation/reconcile_racks.go @@ -672,6 +672,10 @@ func (rc *ReconciliationContext) checkSeedLabels() (int, error) { return seedCount, nil } +func shouldUseFastPath(dc *api.CassandraDatacenter, seedCount int) bool { + return seedCount > 0 && !(metav1.HasAnnotation(dc.ObjectMeta, api.AllowParallelStartsAnnotations) && dc.Annotations[api.AllowParallelStartsAnnotations] == "false") +} + // CheckPodsReady loops over all the server pods and starts them func (rc *ReconciliationContext) CheckPodsReady(endpointData httphelper.CassMetadataEndpoints) result.ReconcileResult { rc.ReqLogger.Info("reconcile_racks::CheckPodsReady") @@ -719,7 +723,7 @@ func (rc *ReconciliationContext) CheckPodsReady(endpointData httphelper.CassMeta } // step 0 - fastpath - if seedCount > 0 && metav1.HasAnnotation(rc.Datacenter.ObjectMeta, api.AllowParallelStartsAnnotations) && rc.Datacenter.Annotations[api.AllowParallelStartsAnnotations] == "true" { + if shouldUseFastPath(rc.Datacenter, seedCount) { notReadyPods, err := rc.startBootstrappedNodes(endpointData) if err != nil { return result.Error(err) diff --git a/pkg/reconciliation/reconcile_racks_test.go b/pkg/reconciliation/reconcile_racks_test.go index aa1ab0b4..abf6ba77 100644 --- a/pkg/reconciliation/reconcile_racks_test.go +++ b/pkg/reconciliation/reconcile_racks_test.go @@ -210,19 +210,16 @@ func TestReconcileRacks_ReconcilePods(t *testing.T) { rc, _, cleanupMockScr := setupTest() defer cleanupMockScr() - var ( - one = int32(1) - ) - + rc.Datacenter.Spec.Size = 1 desiredStatefulSet, err := newStatefulSetForCassandraDatacenter( nil, "default", rc.Datacenter, - 2) + 1) assert.NoErrorf(t, err, "error occurred creating statefulset") - desiredStatefulSet.Spec.Replicas = &one - desiredStatefulSet.Status.ReadyReplicas = one + desiredStatefulSet.Spec.Replicas = ptr.To[int32](1) + desiredStatefulSet.Status.ReadyReplicas = 1 trackObjects := []runtime.Object{ desiredStatefulSet, @@ -238,8 +235,8 @@ func TestReconcileRacks_ReconcilePods(t *testing.T) { rc.Client = fake.NewClientBuilder().WithStatusSubresource(rc.Datacenter).WithRuntimeObjects(trackObjects...).Build() nextRack := &RackInformation{} - nextRack.RackName = "default" - nextRack.NodeCount = 1 + nextRack.RackName = desiredStatefulSet.Labels[api.RackLabel] + nextRack.NodeCount = int(*desiredStatefulSet.Spec.Replicas) nextRack.SeedCount = 1 rackInfo := []*RackInformation{nextRack} @@ -1175,9 +1172,12 @@ func mockReadyPodsForStatefulSet(sts *appsv1.StatefulSet, cluster, dc string) [] pod.Labels[api.ClusterLabel] = cluster pod.Labels[api.DatacenterLabel] = dc pod.Labels[api.CassNodeState] = "Started" + pod.Labels[api.RackLabel] = sts.Labels[api.RackLabel] pod.Status.ContainerStatuses = []corev1.ContainerStatus{{ + Name: "cassandra", Ready: true, }} + pod.Status.PodIP = fmt.Sprintf("192.168.1.%d", i) pods = append(pods, pod) } return pods @@ -2952,3 +2952,99 @@ func TestCheckRackLabels(t *testing.T) { require.Equal(result.Continue(), res, "Label updates should not cause errors") require.Subset(desiredStatefulSet.Labels, rc.Datacenter.GetRackLabels("default")) } + +func TestCheckPodsReadyAllStarted(t *testing.T) { + rc, _, cleanupMockScr := setupTest() + defer cleanupMockScr() + assert := assert.New(t) + + desiredStatefulSet, err := newStatefulSetForCassandraDatacenter( + nil, + "default", + rc.Datacenter, + 3) + assert.NoErrorf(err, "error occurred creating statefulset") + + desiredStatefulSet.Status.ReadyReplicas = *desiredStatefulSet.Spec.Replicas + + trackObjects := []runtime.Object{ + desiredStatefulSet, + rc.Datacenter, + } + + mockPods := mockReadyPodsForStatefulSet(desiredStatefulSet, rc.Datacenter.Spec.ClusterName, rc.Datacenter.Name) + for idx := range mockPods { + mp := mockPods[idx] + metav1.SetMetaDataLabel(&mp.ObjectMeta, api.SeedNodeLabel, "true") + trackObjects = append(trackObjects, mp) + } + + rc.Client = fake.NewClientBuilder().WithStatusSubresource(rc.Datacenter).WithRuntimeObjects(trackObjects...).Build() + + nextRack := &RackInformation{} + nextRack.RackName = desiredStatefulSet.Labels[api.RackLabel] + nextRack.NodeCount = int(*desiredStatefulSet.Spec.Replicas) + nextRack.SeedCount = 1 + + rackInfo := []*RackInformation{nextRack} + + rc.desiredRackInformation = rackInfo + rc.statefulSets = make([]*appsv1.StatefulSet, len(rackInfo)) + rc.statefulSets[0] = desiredStatefulSet + + rc.clusterPods = mockPods + rc.dcPods = mockPods + + epData := httphelper.CassMetadataEndpoints{ + Entity: []httphelper.EndpointState{}, + } + + for i := 0; i < int(*desiredStatefulSet.Spec.Replicas); i++ { + ep := httphelper.EndpointState{ + RpcAddress: fmt.Sprintf("192.168.1.%d", i+1), + Status: "UN", + } + epData.Entity = append(epData.Entity, ep) + } + + res := &http.Response{ + StatusCode: http.StatusOK, + Body: io.NopCloser(strings.NewReader("OK")), + } + + mockHttpClient := mocks.NewHttpClient(t) + mockHttpClient.On("Do", + mock.MatchedBy( + func(req *http.Request) bool { + return req != nil + })). + Return(res, nil). + Times(len(epData.Entity) * 2) // reloadSeeds * pods + clusterHealthCheck * pods + + client := httphelper.NodeMgmtClient{ + Client: mockHttpClient, + Log: rc.ReqLogger, + Protocol: "http", + } + + rc.NodeMgmtClient = client + + recRes := rc.CheckPodsReady(epData) + assert.Equal(result.Continue(), recRes) // All pods should be up, no need to call anything +} + +func TestShouldUseFastPath(t *testing.T) { + dc := &api.CassandraDatacenter{} + + seedCount := 0 + + assert := assert.New(t) + assert.False(shouldUseFastPath(dc, seedCount)) + seedCount = 1 + assert.True(shouldUseFastPath(dc, seedCount)) + + metav1.SetMetaDataAnnotation(&dc.ObjectMeta, api.AllowParallelStartsAnnotations, "true") + assert.True(shouldUseFastPath(dc, seedCount)) + metav1.SetMetaDataAnnotation(&dc.ObjectMeta, api.AllowParallelStartsAnnotations, "false") + assert.False(shouldUseFastPath(dc, seedCount)) +} diff --git a/tests/testdata/default-three-rack-three-node-dc.yaml b/tests/testdata/default-three-rack-three-node-dc.yaml index b91cf6ce..030a94ae 100644 --- a/tests/testdata/default-three-rack-three-node-dc.yaml +++ b/tests/testdata/default-three-rack-three-node-dc.yaml @@ -2,8 +2,6 @@ apiVersion: cassandra.datastax.com/v1beta1 kind: CassandraDatacenter metadata: name: dc1 - annotations: - cassandra.datastax.com/allow-parallel-starts: "true" spec: clusterName: cluster1 serverType: cassandra