From d8ae58df46d64da0c6a40b1c35b61e9b8feeda42 Mon Sep 17 00:00:00 2001
From: randytqwjp
Date: Thu, 19 Dec 2024 13:46:12 +0900
Subject: [PATCH] fix review comments

---
 .../after/deployment.yaml                  |   3 +-
 .../after/tortoise.yaml                    | 121 ++++++++----
 internal/controller/tortoise_controller.go |  21 +--
 pkg/hpa/service.go                         |  59 ++++-----
 pkg/hpa/service_test.go                    |   7 +-
 5 files changed, 95 insertions(+), 116 deletions(-)

diff --git a/internal/controller/testdata/mutable-autoscalingpolicy-no-hpa-and-add-horizontal/after/deployment.yaml b/internal/controller/testdata/mutable-autoscalingpolicy-no-hpa-and-add-horizontal/after/deployment.yaml
index 96d10022..0d41a091 100644
--- a/internal/controller/testdata/mutable-autoscalingpolicy-no-hpa-and-add-horizontal/after/deployment.yaml
+++ b/internal/controller/testdata/mutable-autoscalingpolicy-no-hpa-and-add-horizontal/after/deployment.yaml
@@ -8,8 +8,7 @@ spec:
   strategy: {}
   template:
     metadata:
-      annotations:
-        kubectl.kubernetes.io/restartedAt: "2023-01-01T00:00:00Z"
+      annotations: null
       creationTimestamp: null
       labels:
         app: mercari
diff --git a/internal/controller/testdata/mutable-autoscalingpolicy-no-hpa-and-add-horizontal/after/tortoise.yaml b/internal/controller/testdata/mutable-autoscalingpolicy-no-hpa-and-add-horizontal/after/tortoise.yaml
index ece6ec11..9af0d17e 100644
--- a/internal/controller/testdata/mutable-autoscalingpolicy-no-hpa-and-add-horizontal/after/tortoise.yaml
+++ b/internal/controller/testdata/mutable-autoscalingpolicy-no-hpa-and-add-horizontal/after/tortoise.yaml
@@ -20,110 +20,97 @@ status:
       cpu: Vertical
       memory: Vertical
   conditions:
+    containerResourceRequests:
+    - containerName: app
+      resource:
+        cpu: "10"
+        memory: 10Gi
+    - containerName: istio-proxy
+      resource:
+        cpu: "4"
+        memory: 4Gi
+    tortoiseConditions:
+    - lastTransitionTime: "2023-01-01T00:00:00Z"
+      lastUpdateTime: "2023-01-01T00:00:00Z"
+      status: "False"
+      type: FailedToReconcile
     containerRecommendationFromVPA:
     - containerName: app
       maxRecommendation:
         cpu:
-          quantity: "3"
-          updatedAt: "2023-01-01T00:00:00Z"
+          quantity: "10"
+          updatedAt: null
         memory:
-          quantity: 3Gi
-          updatedAt: "2023-01-01T00:00:00Z"
+          quantity: 10Gi
+          updatedAt: null
       recommendation:
         cpu:
-          quantity: "3"
-          updatedAt: "2023-01-01T00:00:00Z"
+          quantity: "10"
+          updatedAt: null
         memory:
-          quantity: 3Gi
-          updatedAt: "2023-01-01T00:00:00Z"
+          quantity: 10Gi
+          updatedAt: null
    - containerName: istio-proxy
      maxRecommendation:
        cpu:
-          quantity: "3"
-          updatedAt: "2023-01-01T00:00:00Z"
+          quantity: "4"
+          updatedAt: null
        memory:
-          quantity: 3Gi
-          updatedAt: "2023-01-01T00:00:00Z"
+          quantity: 4Gi
+          updatedAt: null
      recommendation:
        cpu:
-          quantity: "3"
-          updatedAt: "2023-01-01T00:00:00Z"
+          quantity: "4"
+          updatedAt: null
        memory:
-          quantity: 3Gi
-          updatedAt: "2023-01-01T00:00:00Z"
-    containerResourceRequests:
-    - containerName: app
-      resource:
-        cpu: "10"
-        memory: 3Gi
-    - containerName: istio-proxy
-      resource:
-        cpu: "3"
-        memory: 3Gi
-    tortoiseConditions:
-    - lastTransitionTime: "2023-01-01T00:00:00Z"
-      lastUpdateTime: "2023-01-01T00:00:00Z"
-      message: the current number of replicas is not bigger than the preferred max
-        replica number
-      reason: ScaledUpBasedOnPreferredMaxReplicas
-      status: "False"
-      type: ScaledUpBasedOnPreferredMaxReplicas
-    - lastTransitionTime: "2023-01-01T00:00:00Z"
-      lastUpdateTime: "2023-01-01T00:00:00Z"
-      message: The recommendation is provided
-      status: "True"
-      type: VerticalRecommendationUpdated
-    - lastTransitionTime: "2023-01-01T00:00:00Z"
-      lastUpdateTime: "2023-01-01T00:00:00Z"
-      status: "False"
-      type: FailedToReconcile
-  containerResourcePhases:
-  - containerName: app
-    resourcePhases:
-      cpu:
-        lastTransitionTime: "2023-01-01T00:00:00Z"
-        phase: GatheringData
-      memory:
-        lastTransitionTime: "2023-01-01T00:00:00Z"
-        phase: Working
-  - containerName: istio-proxy
-    resourcePhases:
-      cpu:
-        lastTransitionTime: "2023-01-01T00:00:00Z"
-        phase: Working
-      memory:
-        lastTransitionTime: "2023-01-01T00:00:00Z"
-        phase: Working
+          quantity: 4Gi
+          updatedAt: null
   recommendations:
     horizontal:
       maxReplicas:
       - from: 0
         timezone: Local
         to: 24
-        updatedAt: "2023-01-01T00:00:00Z"
+        updatedAt: "2023-10-06T01:15:47Z"
         value: 20
       minReplicas:
       - from: 0
         timezone: Local
         to: 24
-        updatedAt: "2023-01-01T00:00:00Z"
+        updatedAt: "2023-10-06T01:15:47Z"
         value: 5
       targetUtilizations:
       - containerName: app
-        targetUtilization:
-          cpu: 70
+        targetUtilization: {}
       - containerName: istio-proxy
         targetUtilization: {}
     vertical:
       containerResourceRecommendation:
       - RecommendedResource:
           cpu: "10"
-          memory: 3Gi
+          memory: 10Gi
         containerName: app
       - RecommendedResource:
-          cpu: "3"
-          memory: 3Gi
+          cpu: "4"
+          memory: 4Gi
         containerName: istio-proxy
+  containerResourcePhases:
+  - containerName: app
+    resourcePhases:
+      cpu:
+        lastTransitionTime: "2023-01-01T00:00:00Z"
+        phase: GatheringData
+      memory:
+        lastTransitionTime: "2023-01-01T00:00:00Z"
+        phase: Working
+  - containerName: istio-proxy
+    resourcePhases:
+      cpu:
+        lastTransitionTime: "2023-01-01T00:00:00Z"
+        phase: Working
+      memory:
+        lastTransitionTime: "2023-01-01T00:00:00Z"
+        phase: Working
   targets:
     horizontalPodAutoscaler: tortoise-hpa-mercari
     scaleTargetRef:
@@ -132,4 +119,4 @@ status:
     verticalPodAutoscalers:
     - name: tortoise-monitor-mercari
       role: Monitor
-  tortoisePhase: Working
+  tortoisePhase: PartlyWorking
diff --git a/internal/controller/tortoise_controller.go b/internal/controller/tortoise_controller.go
index db7afd1d..0a6fd8d2 100644
--- a/internal/controller/tortoise_controller.go
+++ b/internal/controller/tortoise_controller.go
@@ -90,7 +90,6 @@ var (
 func (r *TortoiseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) {
 	logger := log.FromContext(ctx)
 	now := time.Now()
-	hpaCreated := false
 	if onlyTestNow != nil {
 		now = *onlyTestNow
 	}
@@ -204,7 +203,7 @@ func (r *TortoiseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) {
 		return ctrl.Result{}, fmt.Errorf("add finalizer: %w", err)
 	}
-	tortoise, hpaCreated, err = r.HpaService.UpdateHPASpecFromTortoiseAutoscalingPolicy(ctx, tortoise, hpa, currentDesiredReplicaNum, now)
+	tortoise, err = r.HpaService.UpdateHPASpecFromTortoiseAutoscalingPolicy(ctx, tortoise, hpa, currentDesiredReplicaNum, now)
 	if err != nil {
 		logger.Error(err, "update HPA spec from Tortoise autoscaling policy", "tortoise", req.NamespacedName)
 		return ctrl.Result{}, err
 	}
@@ -234,20 +233,22 @@ func (r *TortoiseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) {
 
 	// VPA is ready, we mark all Vertical scaling resources as Running.
 	tortoise = vpa.SetAllVerticalContainerResourcePhaseWorking(tortoise, now)
+	isReady := false
+
 	logger.Info("VPA created by tortoise is ready, proceeding to generate the recommendation", "tortoise", req.NamespacedName)
-	hpa, err = r.HpaService.GetHPAOnTortoise(ctx, tortoise)
+	hpa, isReady, err = r.HpaService.GetHPAOnTortoise(ctx, tortoise)
 	if err != nil {
 		logger.Error(err, "failed to get HPA", "tortoise", req.NamespacedName)
 		return ctrl.Result{}, err
 	}
-	scalingActive, err := r.HpaService.CheckHpaMetricStatus(ctx, hpa)
-	if err != nil {
-		if tortoise.Status.TortoisePhase == autoscalingv1beta3.TortoisePhaseWorking && !hpaCreated {
-			logger.Error(err, "HPA status abnormal", "tortoise", req.NamespacedName)
-			return ctrl.Result{}, err
-		}
-		err = nil
+
+	if !isReady {
+		// HPA is correctly fetched, but looks like not ready yet. We won't be able to calculate things correctly, and hence stop the reconciliation here.
+		logger.Info("HPA not ready, abort reconcile")
+		return ctrl.Result{}, nil
 	}
+	scalingActive := r.HpaService.CheckHpaMetricStatus(ctx, hpa)
+
 	tortoise, err = r.TortoiseService.UpdateTortoisePhaseIfHPAIsUnhealthy(ctx, scalingActive, tortoise)
 	if err != nil {
 		logger.Error(err, "Tortoise could not switch to emergency mode", "tortoise", req.NamespacedName)
diff --git a/pkg/hpa/service.go b/pkg/hpa/service.go
index 78bdaf2d..1a040ba7 100644
--- a/pkg/hpa/service.go
+++ b/pkg/hpa/service.go
@@ -286,19 +286,23 @@ func (c *Service) GetHPAOnTortoiseSpec(ctx context.Context, tortoise *autoscalin
 	return hpa, nil
 }
 
-func (c *Service) GetHPAOnTortoise(ctx context.Context, tortoise *autoscalingv1beta3.Tortoise) (*v2.HorizontalPodAutoscaler, error) {
+func (c *Service) GetHPAOnTortoise(ctx context.Context, tortoise *autoscalingv1beta3.Tortoise) (*v2.HorizontalPodAutoscaler, bool, error) {
 	if !HasHorizontal(tortoise) {
 		// there should be no HPA
-		return nil, nil
+		return nil, true, nil
 	}
 
 	hpa := &v2.HorizontalPodAutoscaler{}
 	if err := c.c.Get(ctx, types.NamespacedName{Namespace: tortoise.Namespace, Name: tortoise.Status.Targets.HorizontalPodAutoscaler}, hpa); err != nil {
-		return nil, fmt.Errorf("failed to get hpa on tortoise: %w", err)
+		return nil, false, fmt.Errorf("failed to get hpa on tortoise: %w", err)
+	}
+	if reflect.DeepEqual(hpa.Status, v2.HorizontalPodAutoscalerStatus{}) || hpa.Status.Conditions == nil || hpa.Status.CurrentMetrics == nil {
+		// Most likely, HPA is just created and not yet handled by HPA controller.
+		return nil, false, nil
 	}
 
 	recordHPAMetric(ctx, tortoise, hpa)
-	return hpa, nil
+	return hpa, true, nil
 }
 
 func (s *Service) UpdatingHPATargetUtilizationAllowed(tortoise *autoscalingv1beta3.Tortoise, now time.Time) (*autoscalingv1beta3.Tortoise, bool) {
@@ -498,11 +502,10 @@ func (c *Service) UpdateHPASpecFromTortoiseAutoscalingPolicy(
 	givenHPA *v2.HorizontalPodAutoscaler,
 	replicaNum int32,
 	now time.Time,
-) (*autoscalingv1beta3.Tortoise, bool, error) {
-	hpaCreated := false
+) (*autoscalingv1beta3.Tortoise, error) {
 	if tortoise.Spec.UpdateMode == autoscalingv1beta3.UpdateModeOff {
 		// When UpdateMode is Off, we don't update HPA.
-		return tortoise, hpaCreated, nil
+		return tortoise, nil
 	}
 
 	if !HasHorizontal(tortoise) {
@@ -510,21 +513,21 @@ func (c *Service) UpdateHPASpecFromTortoiseAutoscalingPolicy(
 			// HPA should be created by Tortoise, which can be deleted.
 			err := c.DeleteHPACreatedByTortoise(ctx, tortoise)
 			if err != nil && !apierrors.IsNotFound(err) {
-				return tortoise, hpaCreated, fmt.Errorf("delete hpa created by tortoise: %w", err)
+				return tortoise, fmt.Errorf("delete hpa created by tortoise: %w", err)
 			}
 			c.recorder.Event(tortoise, corev1.EventTypeNormal, event.HPADeleted, fmt.Sprintf("Deleted a HPA %s/%s because tortoise has no resource to scale horizontally", tortoise.Namespace, tortoise.Status.Targets.HorizontalPodAutoscaler))
 		} else {
 			// We cannot delete the HPA because it's specified by the user.
 			err := c.disableHPA(ctx, tortoise, replicaNum)
 			if err != nil {
-				return tortoise, hpaCreated, fmt.Errorf("disable hpa: %w", err)
+				return tortoise, fmt.Errorf("disable hpa: %w", err)
 			}
 			c.recorder.Event(tortoise, corev1.EventTypeNormal, event.HPADisabled, fmt.Sprintf("Disabled a HPA %s/%s because tortoise has no resource to scale horizontally", tortoise.Namespace, tortoise.Status.Targets.HorizontalPodAutoscaler))
 		}
 
 		// No need to edit container resource phase.
-		return tortoise, hpaCreated, nil
+		return tortoise, nil
 	}
 
 	hpa := &v2.HorizontalPodAutoscaler{}
@@ -536,15 +539,14 @@ func (c *Service) UpdateHPASpecFromTortoiseAutoscalingPolicy(
 			// - In that case, we need to create an initial HPA or give an annotation to existing HPA.
 			tortoise, err = c.InitializeHPA(ctx, tortoise, replicaNum, now)
 			if err != nil {
-				return tortoise, hpaCreated, fmt.Errorf("initialize hpa: %w", err)
+				return tortoise, fmt.Errorf("initialize hpa: %w", err)
 			}
-			hpaCreated = true
 			c.recorder.Event(tortoise, corev1.EventTypeNormal, event.HPACreated, fmt.Sprintf("Initialized a HPA %s/%s because tortoise has resource to scale horizontally", tortoise.Namespace, tortoise.Status.Targets.HorizontalPodAutoscaler))
 
-			return tortoise, hpaCreated, nil
+			return tortoise, nil
 		}
-		return tortoise, hpaCreated, fmt.Errorf("failed to get hpa on tortoise: %w", err)
+		return tortoise, fmt.Errorf("failed to get hpa on tortoise: %w", err)
 	}
 
 	var newhpa *v2.HorizontalPodAutoscaler
@@ -552,7 +554,7 @@ func (c *Service) UpdateHPASpecFromTortoiseAutoscalingPolicy(
 	newhpa, tortoise, isHpaEdited = c.syncHPAMetricsWithTortoiseAutoscalingPolicy(ctx, tortoise, hpa, now)
 	if !isHpaEdited {
 		// User didn't change anything.
-		return tortoise, hpaCreated, nil
+		return tortoise, nil
 	}
 
 	retryNumber := -1
@@ -571,12 +573,12 @@ func (c *Service) UpdateHPASpecFromTortoiseAutoscalingPolicy(
 	}
 
 	if err := retry.RetryOnConflict(retry.DefaultRetry, updateFn); err != nil {
-		return tortoise, hpaCreated, fmt.Errorf("update hpa: %w (%v times retried)", err, replicaNum)
+		return tortoise, fmt.Errorf("update hpa: %w (%v times retried)", err, replicaNum)
 	}
 
 	c.recorder.Event(tortoise, corev1.EventTypeNormal, event.HPAUpdated, fmt.Sprintf("Updated a HPA %s/%s because the autoscaling policy is changed in the tortoise", tortoise.Namespace, tortoise.Status.Targets.HorizontalPodAutoscaler))
 
-	return tortoise, hpaCreated, nil
+	return tortoise, nil
 }
 
 func HasHorizontal(tortoise *autoscalingv1beta3.Tortoise) bool {
@@ -769,25 +771,18 @@ func (c *Service) excludeExternalMetric(ctx context.Context, hpa *v2.HorizontalP
 	return newHPA
 }
 
-func (c *Service) CheckHpaMetricStatus(ctx context.Context, currenthpa *v2.HorizontalPodAutoscaler) (bool, error) {
+func (c *Service) CheckHpaMetricStatus(ctx context.Context, currenthpa *v2.HorizontalPodAutoscaler) bool {
 	//currenthpa = currenthpa.DeepCopy()
 	logger := log.FromContext(ctx)
 
 	if currenthpa == nil {
 		logger.Info("empty HPA passed into status check, ignore")
-		return true, nil
-	}
-
-	if reflect.DeepEqual(currenthpa.Status, v2.HorizontalPodAutoscalerStatus{}) {
-		return false, fmt.Errorf("HPA empty status, switch to emergency mode")
+		return true
 	}
 
-	if currenthpa.Status.Conditions == nil {
-		return false, fmt.Errorf("HPA empty conditions, switch to emergency mode")
+	if reflect.DeepEqual(currenthpa.Status, v2.HorizontalPodAutoscalerStatus{}) || currenthpa.Status.Conditions == nil || currenthpa.Status.CurrentMetrics == nil {
+		return true
 	}
 
-	if currenthpa.Status.CurrentMetrics == nil {
-		return false, fmt.Errorf("HPA no metrics, switch to emergency mode")
-	}
 	conditions := currenthpa.Status.Conditions
 	currentMetrics := currenthpa.Status.CurrentMetrics
@@ -796,7 +791,7 @@ func (c *Service) CheckHpaMetricStatus(ctx context.Context, currenthpa *v2.Horiz
 			if condition.Type == "ScalingActive" && condition.Status == "False" && condition.Reason == "FailedGetResourceMetric" {
 				//switch to Emergency mode since no metrics
 				logger.Info("HPA failed to get resource metrics, switch to emergency mode")
-				return false, nil
+				return false
 			}
 		}
 	}
@@ -805,14 +800,14 @@ func (c *Service) CheckHpaMetricStatus(ctx context.Context, currenthpa *v2.Horiz
 		for _, currentMetric := range currentMetrics {
 			if !currentMetric.ContainerResource.Current.Value.IsZero() {
 				//Can still get metrics for some containers, scale based on those
-				return true, nil
+				return true
 			}
 		}
 
 		logger.Info("HPA all metrics return 0, switch to emergency mode")
-		return false, nil
+		return false
 	}
 
 	logger.Info("HPA status check passed")
-	return true, nil
+	return true
 }
diff --git a/pkg/hpa/service_test.go b/pkg/hpa/service_test.go
index c7780820..fd23d9fe 100644
--- a/pkg/hpa/service_test.go
+++ b/pkg/hpa/service_test.go
@@ -4636,7 +4636,7 @@ func TestService_UpdateHPASpecFromTortoiseAutoscalingPolicy(t *testing.T) {
 				// givenHPA is only non-nil when the tortoise has a reference to an existing HPA
 				givenHPA = tt.initialHPA
 			}
-			tortoise, _, err := c.UpdateHPASpecFromTortoiseAutoscalingPolicy(context.Background(), tt.args.tortoise, givenHPA, tt.args.replicaNum, time.Now())
+			tortoise, err := c.UpdateHPASpecFromTortoiseAutoscalingPolicy(context.Background(), tt.args.tortoise, givenHPA, tt.args.replicaNum, time.Now())
 			if (err != nil) != tt.wantErr {
t.Errorf("Service.UpdateHPASpecFromTortoiseAutoscalingPolicy() error = %v, wantErr %v", err, tt.wantErr) return @@ -5005,10 +5005,7 @@ func TestService_CheckHpaMetricStatus(t *testing.T) { if err != nil { t.Fatalf("New() error = %v", err) } - status, err := c.CheckHpaMetricStatus(context.Background(), tt.HPA) - if err != nil { - t.Fatalf("New() error = %v", err) - } + status := c.CheckHpaMetricStatus(context.Background(), tt.HPA) if status != tt.result { t.Errorf("Service.checkHpaMetricStatus() status test: %s failed", tt.name) return