From 2997fa0a055ec9e036fd0a63620770404f1125e2 Mon Sep 17 00:00:00 2001 From: Kensei Nakada Date: Mon, 2 Oct 2023 17:11:49 +0900 Subject: [PATCH] add(observability): emit events from the controller --- config/rbac/role.yaml | 8 +++++ controllers/tortoise_controller.go | 23 ++++++++----- controllers/tortoise_controller_test.go | 12 ++++--- main.go | 10 +++--- ..._v1_clusterrole_tortoise-manager-role.yaml | 8 +++++ pkg/hpa/service.go | 20 +++++++++-- pkg/hpa/service_test.go | 7 ++-- pkg/tortoise/tortoise.go | 31 +++++++++++++++-- pkg/vpa/service.go | 34 +++++++++++++++---- 9 files changed, 122 insertions(+), 31 deletions(-) diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index ed76cf39..e7e93491 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -66,3 +66,11 @@ rules: - get - patch - update +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch + - update diff --git a/controllers/tortoise_controller.go b/controllers/tortoise_controller.go index cb1a4052..d8f3825f 100644 --- a/controllers/tortoise_controller.go +++ b/controllers/tortoise_controller.go @@ -33,6 +33,7 @@ import ( v1 "k8s.io/api/apps/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/log" @@ -55,17 +56,25 @@ type TortoiseReconciler struct { DeploymentService *deployment.Service TortoiseService *tortoise.Service RecommenderService *recommender.Service + EventRecorder record.EventRecorder } //+kubebuilder:rbac:groups=autoscaling.mercari.com,resources=tortoises,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=autoscaling.mercari.com,resources=tortoises/status,verbs=get;update;patch //+kubebuilder:rbac:groups=autoscaling.mercari.com,resources=tortoises/finalizers,verbs=update +//+kubebuilder:rbac:groups=core,resources=events,verbs=create;update;patch 
//+kubebuilder:rbac:groups=autoscaling.k8s.io,resources=verticalpodautoscalers,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=autoscaling,resources=horizontalpodautoscalers,verbs=get;list;watch;create;update;patch;delete -func (r *TortoiseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { +func (r *TortoiseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, reterr error) { + defer func() { + if reterr != nil { + r.EventRecorder.Event(&autoscalingv1alpha1.Tortoise{}, "Warning", "ReconcileError", reterr.Error()) + } + }() + logger := log.FromContext(ctx) now := time.Now() @@ -90,7 +99,7 @@ func (r *TortoiseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c if err := r.TortoiseService.RemoveFinalizer(ctx, tortoise); err != nil { return ctrl.Result{}, fmt.Errorf("remove finalizer: %w", err) } - return ctrl.Result{}, nil + return ctrl.Result{RequeueAfter: r.Interval}, nil } // tortoise is not deleted. Make sure finalizer is added to tortoise. @@ -117,9 +126,7 @@ func (r *TortoiseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c return ctrl.Result{}, fmt.Errorf("initialize VPAs and HPA: %w", err) } - // VPA and HPA are just created, and they won't start working soon. - // So, return here and wait a few min for them to start to work. 
- return ctrl.Result{RequeueAfter: 2 * time.Minute}, nil + return ctrl.Result{RequeueAfter: r.Interval}, nil } vpa, ready, err := r.VpaService.GetTortoiseMonitorVPA(ctx, tortoise) @@ -135,7 +142,7 @@ func (r *TortoiseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c logger.Error(err, "update Tortoise status", "tortoise", req.NamespacedName) return ctrl.Result{}, err } - return ctrl.Result{RequeueAfter: 2 * time.Minute}, nil + return ctrl.Result{RequeueAfter: r.Interval}, nil } logger.V(4).Info("VPA created by tortoise is ready, proceeding to generate the recommendation", "tortoise", req.NamespacedName) @@ -161,7 +168,7 @@ func (r *TortoiseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c if tortoise.Status.TortoisePhase == autoscalingv1alpha1.TortoisePhaseGatheringData { logger.V(4).Info("tortoise is GatheringData phase; skip applying the recommendation to HPA or VPAs") - return ctrl.Result{}, nil + return ctrl.Result{RequeueAfter: r.Interval}, nil } _, tortoise, err = r.HpaService.UpdateHPAFromTortoiseRecommendation(ctx, tortoise, now) @@ -182,7 +189,7 @@ func (r *TortoiseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c return ctrl.Result{}, err } - return ctrl.Result{}, nil + return ctrl.Result{RequeueAfter: r.Interval}, nil } func (r *TortoiseReconciler) deleteVPAAndHPA(ctx context.Context, tortoise *autoscalingv1alpha1.Tortoise, now time.Time) error { diff --git a/controllers/tortoise_controller_test.go b/controllers/tortoise_controller_test.go index 99576159..5b25542a 100644 --- a/controllers/tortoise_controller_test.go +++ b/controllers/tortoise_controller_test.go @@ -16,6 +16,7 @@ import ( autoscalingv1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/client/clientset/versioned" "k8s.io/client-go/rest" + "k8s.io/client-go/tools/record" "k8s.io/client-go/util/retry" "k8s.io/utils/pointer" ctrl "sigs.k8s.io/controller-runtime" @@ -67,13 +68,14 @@ 
var _ = Describe("Test TortoiseController", func() { }) Expect(err).ShouldNot(HaveOccurred()) - tortoiseService, err := tortoise.New(mgr.GetClient(), 1, "Asia/Tokyo", 1000*time.Minute, "weekly") + tortoiseService, err := tortoise.New(mgr.GetClient(), record.NewFakeRecorder(10), 1, "Asia/Tokyo", 1000*time.Minute, "weekly") Expect(err).ShouldNot(HaveOccurred()) - cli, err := vpa.New(mgr.GetConfig()) + cli, err := vpa.New(mgr.GetConfig(), record.NewFakeRecorder(10)) Expect(err).ShouldNot(HaveOccurred()) reconciler := &TortoiseReconciler{ Scheme: scheme, - HpaService: hpa.New(mgr.GetClient(), 0.95, 90), + HpaService: hpa.New(mgr.GetClient(), record.NewFakeRecorder(10), 0.95, 90), + EventRecorder: record.NewFakeRecorder(10), VpaService: cli, DeploymentService: deployment.New(mgr.GetClient()), TortoiseService: tortoiseService, @@ -1632,7 +1634,7 @@ func (t *testCase) initializeResources(ctx context.Context, k8sClient client.Cli } if t.before.hpa == nil { // create default HPA. - HpaClient := hpa.New(k8sClient, 0.95, 90) + HpaClient := hpa.New(k8sClient, record.NewFakeRecorder(10), 0.95, 90) t.before.hpa, t.before.tortoise, err = HpaClient.CreateHPA(ctx, t.before.tortoise, t.before.deployment) if err != nil { return err @@ -1641,7 +1643,7 @@ func (t *testCase) initializeResources(ctx context.Context, k8sClient client.Cli if t.before.vpa == nil { // create default VPAs. 
t.before.vpa = map[v1alpha1.VerticalPodAutoscalerRole]*autoscalingv1.VerticalPodAutoscaler{} - VpaClient, err := vpa.New(config) + VpaClient, err := vpa.New(config, record.NewFakeRecorder(10)) if err != nil { return err } diff --git a/main.go b/main.go index 8ff03790..5dd85b0c 100644 --- a/main.go +++ b/main.go @@ -144,19 +144,20 @@ func main() { setupLog.Error(err, "unable to start manager") os.Exit(1) } - tortoiseService, err := tortoise.New(mgr.GetClient(), rangeOfMinMaxReplicasRecommendationHours, timeZone, tortoiseUpdateInterval, minMaxReplicasRoutine) + eventRecorder := mgr.GetEventRecorderFor("tortoise-controller") + tortoiseService, err := tortoise.New(mgr.GetClient(), eventRecorder, rangeOfMinMaxReplicasRecommendationHours, timeZone, tortoiseUpdateInterval, minMaxReplicasRoutine) if err != nil { setupLog.Error(err, "unable to start tortoise service") os.Exit(1) } - vpaClient, err := vpa.New(mgr.GetConfig()) + vpaClient, err := vpa.New(mgr.GetConfig(), eventRecorder) if err != nil { setupLog.Error(err, "unable to start vpa client") os.Exit(1) } - hpaService := hpa.New(mgr.GetClient(), replicaReductionFactor, upperTargetResourceUtilization) + hpaService := hpa.New(mgr.GetClient(), eventRecorder, replicaReductionFactor, upperTargetResourceUtilization) if err = (&controllers.TortoiseReconciler{ Scheme: mgr.GetScheme(), @@ -165,7 +166,8 @@ func main() { DeploymentService: deployment.New(mgr.GetClient()), RecommenderService: recommender.New(tTLHoursOfMinMaxReplicasRecommendation, maxReplicasFactor, minReplicasFactor, upperTargetResourceUtilization, minimumMinReplicas, preferredReplicaNumUpperLimit, maxCPUPerContainer, maxMemoryPerContainer), TortoiseService: tortoiseService, - Interval: 30 * time.Second, + Interval: tortoiseUpdateInterval, + EventRecorder: eventRecorder, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Tortoise") os.Exit(1) diff --git 
a/manifests/default/rbac.authorization.k8s.io_v1_clusterrole_tortoise-manager-role.yaml b/manifests/default/rbac.authorization.k8s.io_v1_clusterrole_tortoise-manager-role.yaml index d39dc756..a4e0ca99 100644 --- a/manifests/default/rbac.authorization.k8s.io_v1_clusterrole_tortoise-manager-role.yaml +++ b/manifests/default/rbac.authorization.k8s.io_v1_clusterrole_tortoise-manager-role.yaml @@ -65,3 +65,11 @@ rules: - get - patch - update +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch + - update diff --git a/pkg/hpa/service.go b/pkg/hpa/service.go index 5817cd49..787de86c 100644 --- a/pkg/hpa/service.go +++ b/pkg/hpa/service.go @@ -13,6 +13,7 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" "k8s.io/client-go/util/retry" "k8s.io/utils/pointer" "sigs.k8s.io/controller-runtime/pkg/client" @@ -27,13 +28,15 @@ type Service struct { replicaReductionFactor float64 upperTargetResourceUtilization int32 + recorder record.EventRecorder } -func New(c client.Client, replicaReductionFactor float64, upperTargetResourceUtilization int) *Service { +func New(c client.Client, recorder record.EventRecorder, replicaReductionFactor float64, upperTargetResourceUtilization int) *Service { return &Service{ c: c, replicaReductionFactor: replicaReductionFactor, upperTargetResourceUtilization: int32(upperTargetResourceUtilization), + recorder: recorder, } } @@ -44,6 +47,9 @@ func (c *Service) InitializeHPA(ctx context.Context, tortoise *autoscalingv1alph if err != nil { return tortoise, fmt.Errorf("give annotations on a hpa specified in targetrefs: %w", err) } + + c.recorder.Event(tortoise, corev1.EventTypeNormal, "HPAUpdated", fmt.Sprintf("Updated HPA %s/%s", tortoise.Namespace, tortoise.Status.Targets.HorizontalPodAutoscaler)) + return tortoise, nil } @@ -53,6 +59,8 @@ func (c *Service) InitializeHPA(ctx context.Context, tortoise 
*autoscalingv1alph return tortoise, fmt.Errorf("create hpa: %w", err) } + c.recorder.Event(tortoise, corev1.EventTypeNormal, "HPACreated", fmt.Sprintf("Initialized an HPA %s/%s", tortoise.Namespace, tortoise.Status.Targets.HorizontalPodAutoscaler)) + return tortoise, nil } @@ -271,7 +279,15 @@ func (c *Service) UpdateHPAFromTortoiseRecommendation(ctx context.Context, torto return c.c.Update(ctx, hpa) } - return retHPA, retTortoise, retry.RetryOnConflict(retry.DefaultRetry, updateFn) + if err := retry.RetryOnConflict(retry.DefaultRetry, updateFn); err != nil { + return nil, nil, err + } + + if tortoise.Spec.UpdateMode != autoscalingv1alpha1.UpdateModeOff { + c.recorder.Event(tortoise, corev1.EventTypeNormal, "HPAUpdated", fmt.Sprintf("HPA %s/%s is updated by the recommendation", retHPA.Namespace, retHPA.Name)) + } + + return retHPA, retTortoise, nil } // GetReplicasRecommendation finds the corresponding recommendations. diff --git a/pkg/hpa/service_test.go b/pkg/hpa/service_test.go index 0f404d83..637948c3 100644 --- a/pkg/hpa/service_test.go +++ b/pkg/hpa/service_test.go @@ -11,6 +11,7 @@ import ( v2 "k8s.io/api/autoscaling/v2" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/record" "k8s.io/utils/pointer" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -694,7 +695,7 @@ func TestClient_UpdateHPAFromTortoiseRecommendation(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - c := New(fake.NewClientBuilder().WithRuntimeObjects(tt.initialHPA).Build(), 0.95, 90) + c := New(fake.NewClientBuilder().WithRuntimeObjects(tt.initialHPA).Build(), record.NewFakeRecorder(10), 0.95, 90) got, tortoise, err := c.UpdateHPAFromTortoiseRecommendation(tt.args.ctx, tt.args.tortoise, tt.args.now) if (err != nil) != tt.wantErr { t.Errorf("UpdateHPAFromTortoiseRecommendation() error = %v, wantErr %v", err, tt.wantErr) @@ -902,9 +903,9 @@ func TestService_InitializeHPA(t
*testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - c := New(fake.NewClientBuilder().Build(), 0.95, 90) + c := New(fake.NewClientBuilder().Build(), record.NewFakeRecorder(10), 0.95, 90) if tt.initialHPA != nil { - c = New(fake.NewClientBuilder().WithRuntimeObjects(tt.initialHPA).Build(), 0.95, 90) + c = New(fake.NewClientBuilder().WithRuntimeObjects(tt.initialHPA).Build(), record.NewFakeRecorder(10), 0.95, 90) } _, err := c.InitializeHPA(context.Background(), tt.args.tortoise, tt.args.dm) if (err != nil) != tt.wantErr { diff --git a/pkg/tortoise/tortoise.go b/pkg/tortoise/tortoise.go index ffd57249..f47d3179 100644 --- a/pkg/tortoise/tortoise.go +++ b/pkg/tortoise/tortoise.go @@ -12,6 +12,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" v1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" + "k8s.io/client-go/tools/record" "k8s.io/client-go/util/retry" "k8s.io/klog/v2" "k8s.io/utils/pointer" @@ -25,7 +26,9 @@ import ( const tortoiseFinalizer = "tortoise.autoscaling.mercari.com/finalizer" type Service struct { - c client.Client + c client.Client + recorder record.EventRecorder + rangeOfMinMaxReplicasRecommendationHour int timeZone *time.Location tortoiseUpdateInterval time.Duration @@ -38,7 +41,7 @@ type Service struct { lastTimeUpdateTortoise map[client.ObjectKey]time.Time } -func New(c client.Client, rangeOfMinMaxReplicasRecommendationHour int, timeZone string, tortoiseUpdateInterval time.Duration, minMaxReplicasRoutine string) (*Service, error) { +func New(c client.Client, recorder record.EventRecorder, rangeOfMinMaxReplicasRecommendationHour int, timeZone string, tortoiseUpdateInterval time.Duration, minMaxReplicasRoutine string) (*Service, error) { jst, err := time.LoadLocation(timeZone) if err != nil { return nil, fmt.Errorf("load location: %w", err) @@ -47,6 +50,7 @@ func New(c client.Client, rangeOfMinMaxReplicasRecommendationHour int, timeZone return &Service{ c: 
c, + recorder: recorder, rangeOfMinMaxReplicasRecommendationHour: rangeOfMinMaxReplicasRecommendationHour, minMaxReplicasRoutine: minMaxReplicasRoutine, timeZone: jst, @@ -75,16 +79,35 @@ func (s *Service) UpdateTortoisePhase(tortoise *v1alpha1.Tortoise, dm *appv1.Dep switch tortoise.Status.TortoisePhase { case "": tortoise = s.initializeTortoise(tortoise, dm) + r := "1 week" + if s.minMaxReplicasRoutine == "daily" { + r = "1 day" + } + s.recorder.Event(tortoise, corev1.EventTypeNormal, "Initialized", fmt.Sprintf("Tortoise is initialized and starts to gather data to make recommendations. It will take %s to finish gathering data and then tortoise starts to work actually", r)) + case v1alpha1.TortoisePhaseInitializing: // change it to GatheringData anyway. Later the controller may change it back to initialize if VPA isn't ready. tortoise.Status.TortoisePhase = v1alpha1.TortoisePhaseGatheringData case v1alpha1.TortoisePhaseGatheringData: tortoise = s.checkIfTortoiseFinishedGatheringData(tortoise) + if tortoise.Status.TortoisePhase == v1alpha1.TortoisePhaseWorking { + s.recorder.Event(tortoise, corev1.EventTypeNormal, "Working", "Tortoise finishes gathering data and it starts to work on autoscaling") + } + case v1alpha1.TortoisePhaseEmergency: + if tortoise.Spec.UpdateMode != v1alpha1.UpdateModeEmergency { + // Emergency mode is turned off. + s.recorder.Event(tortoise, corev1.EventTypeNormal, "Working", "Emergency mode is turned off. 
Tortoise starts to work on autoscaling normally") + tortoise.Status.TortoisePhase = v1alpha1.TortoisePhaseWorking + } } if tortoise.Spec.UpdateMode == v1alpha1.UpdateModeEmergency { - tortoise.Status.TortoisePhase = v1alpha1.TortoisePhaseEmergency + if tortoise.Status.TortoisePhase != v1alpha1.TortoisePhaseEmergency { + s.recorder.Event(tortoise, corev1.EventTypeNormal, "Emergency", "Tortoise is in Emergency mode") + tortoise.Status.TortoisePhase = v1alpha1.TortoisePhaseEmergency + } } + return tortoise } @@ -287,6 +310,8 @@ func (s *Service) UpdateTortoiseStatus(ctx context.Context, originalTortoise *v1 return originalTortoise, err } + s.recorder.Event(originalTortoise, corev1.EventTypeNormal, "RecommendationUpdated", "The recommendation on Tortoise status is updated") + s.updateLastTimeUpdateTortoise(originalTortoise, now) return originalTortoise, nil diff --git a/pkg/vpa/service.go b/pkg/vpa/service.go index 17dd597b..994c2ffa 100644 --- a/pkg/vpa/service.go +++ b/pkg/vpa/service.go @@ -11,6 +11,7 @@ import ( v1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/client/clientset/versioned" "k8s.io/client-go/rest" + "k8s.io/client-go/tools/record" "k8s.io/client-go/util/retry" autoscalingv1alpha1 "github.com/mercari/tortoise/api/v1alpha1" @@ -19,15 +20,16 @@ import ( ) type Service struct { - c versioned.Interface + c versioned.Interface + recorder record.EventRecorder } -func New(c *rest.Config) (*Service, error) { +func New(c *rest.Config, recorder record.EventRecorder) (*Service, error) { cli, err := versioned.NewForConfig(c) if err != nil { return nil, err } - return &Service{c: cli}, nil + return &Service{c: cli, recorder: recorder}, nil } const TortoiseMonitorVPANamePrefix = "tortoise-monitor-" @@ -136,7 +138,13 @@ func (c *Service) CreateTortoiseUpdaterVPA(ctx context.Context, tortoise *autosc Role: autoscalingv1alpha1.VerticalPodAutoscalerRoleUpdater, }) vpa, err :=
c.c.AutoscalingV1().VerticalPodAutoscalers(vpa.Namespace).Create(ctx, vpa, metav1.CreateOptions{}) - return vpa, tortoise, err + if err != nil { + return nil, tortoise, err + } + + c.recorder.Event(tortoise, corev1.EventTypeNormal, "VPACreated", fmt.Sprintf("Initialized an updater VPA %s/%s", vpa.Namespace, vpa.Name)) + + return vpa, tortoise, nil + } func (c *Service) CreateTortoiseMonitorVPA(ctx context.Context, tortoise *autoscalingv1alpha1.Tortoise) (*v1.VerticalPodAutoscaler, *autoscalingv1alpha1.Tortoise, error) { @@ -177,7 +185,13 @@ func (c *Service) CreateTortoiseMonitorVPA(ctx context.Context, tortoise *autosc }) vpa, err := c.c.AutoscalingV1().VerticalPodAutoscalers(vpa.Namespace).Create(ctx, vpa, metav1.CreateOptions{}) - return vpa, tortoise, err + if err != nil { + return nil, tortoise, err + } + + c.recorder.Event(tortoise, corev1.EventTypeNormal, "VPACreated", fmt.Sprintf("Initialized a monitor VPA %s/%s", vpa.Namespace, vpa.Name)) + + return vpa, tortoise, nil + } func (c *Service) UpdateVPAFromTortoiseRecommendation(ctx context.Context, tortoise *autoscalingv1alpha1.Tortoise) (*v1.VerticalPodAutoscaler, error) { @@ -220,7 +234,15 @@ func (c *Service) UpdateVPAFromTortoiseRecommendation(ctx context.Context, torto return err } - return retVPA, retry.RetryOnConflict(retry.DefaultRetry, updateFn) + if err := retry.RetryOnConflict(retry.DefaultRetry, updateFn); err != nil { + return retVPA, fmt.Errorf("update VPA status: %w", err) + } + + if tortoise.Spec.UpdateMode != autoscalingv1alpha1.UpdateModeOff { + c.recorder.Event(tortoise, corev1.EventTypeNormal, "VPAUpdated", fmt.Sprintf("VPA %s/%s is updated by the recommendation. The Pods should also be updated with new resources soon by VPA if needed", retVPA.Namespace, retVPA.Name)) + } + + return retVPA, nil + } func (c *Service) GetTortoiseUpdaterVPA(ctx context.Context, tortoise *autoscalingv1alpha1.Tortoise) (*v1.VerticalPodAutoscaler, error) {