From 3b66cef3437f32f3cf3b34c94da785d89e539345 Mon Sep 17 00:00:00 2001 From: adrianc Date: Wed, 21 Aug 2024 17:27:04 +0300 Subject: [PATCH] Convey failed maintenance by requestor This commit adds support for requestors to convey failed node maintenance. - Add RequestorFailed condition type, to be set by requestors with reason MaintenanceFailed if maintenance operation failed - Add new condition reason to Ready condition : MaintenanceFailed. this reason is set by controller if Ready condition is with Ready reason and RequestorFailed condition is preset with value True. Failed condition type conveys failure in performing node maintenance by the requestor. Signed-off-by: adrianc --- api/v1alpha1/nodemaintenance_types.go | 19 +++- ...intenance.nvidia.com_nodemaintenances.yaml | 3 + config/rbac/role.yaml | 2 + .../controller/nodemaintenance_controller.go | 90 +++++++++++++++---- 4 files changed, 92 insertions(+), 22 deletions(-) diff --git a/api/v1alpha1/nodemaintenance_types.go b/api/v1alpha1/nodemaintenance_types.go index 8ee191a..91c186c 100644 --- a/api/v1alpha1/nodemaintenance_types.go +++ b/api/v1alpha1/nodemaintenance_types.go @@ -21,11 +21,17 @@ import ( ) const ( - // ConditionTypeReady is the Ready Condition.Type for NodeMaintenance + // ConditionTypeReady is the Ready Condition type for NodeMaintenance. + // It is used to convey node readiness for maintenance operation by the requestor. ConditionTypeReady string = "Ready" + // ConditionTypeRequestorFailed is the RequestorFailed Condition type for NodeMaintenance. + // It is used to convey failure during node maintenance operation by the requestor. + ConditionTypeRequestorFailed string = "RequestorFailed" ) const ( + // Condition Reasons for ConditionTypeReady condition type. + // ConditionReasonUninitialized means that Condition is unset for NodeMaintenance ConditionReasonUninitialized string = "" // ConditionReasonPending means that NodeMaintenance is in Pending state @@ -40,8 +46,14 @@ const ( ConditionReasonDraining string = "Draining" // ConditionReasonReady means that NodeMaintenance is in Ready state ConditionReasonReady string = "Ready" - // ConditionReasonAborted means that NodeMaintenance is in Aborted state - ConditionReasonAborted string = "Aborted" + // ConditionReasonRequestorFailed means that NodeMaintenance operation by the requestor has failed + // garbage collection will not occur if this reason is set. + ConditionReasonRequestorFailed string = ConditionTypeRequestorFailed + + // Condition Reasons for ConditionTypeRequestorFailed condition type. + + // ConditionReasonFailedMaintenance means that maintenance operation failed in a non-recoverable way + ConditionReasonFailedMaintenance string = "FailedMaintenance" ) const ( @@ -184,6 +196,7 @@ type DrainStatus struct { // +kubebuilder:printcolumn:name="Requestor",type="string",JSONPath=`.spec.requestorID` // +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=`.status.conditions[?(@.type=='Ready')].status` // +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=`.status.conditions[?(@.type=='Ready')].reason` +// +kubebuilder:printcolumn:name="Failed",type="string",JSONPath=`.status.conditions[?(@.type=='Failed')].reason` // NodeMaintenance is the Schema for the nodemaintenances API type NodeMaintenance struct { diff --git a/config/crd/bases/maintenance.nvidia.com_nodemaintenances.yaml b/config/crd/bases/maintenance.nvidia.com_nodemaintenances.yaml index 44f3ee3..b8e8701 100644 --- a/config/crd/bases/maintenance.nvidia.com_nodemaintenances.yaml +++ b/config/crd/bases/maintenance.nvidia.com_nodemaintenances.yaml @@ -27,6 +27,9 @@ spec: - jsonPath: .status.conditions[?(@.type=='Ready')].reason name: Phase type: string + - jsonPath: .status.conditions[?(@.type=='Failed')].reason + name: Failed + type: string name: v1alpha1 schema: openAPIV3Schema: diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index bcb7bde..574cf8a 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -10,6 +10,8 @@ rules: - events verbs: - create + - patch + - update - apiGroups: - "" resources: diff --git a/internal/controller/nodemaintenance_controller.go b/internal/controller/nodemaintenance_controller.go index c55da0a..45d68f6 100644 --- a/internal/controller/nodemaintenance_controller.go +++ b/internal/controller/nodemaintenance_controller.go @@ -27,6 +27,7 @@ import ( "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/runtime" @@ -114,7 +115,7 @@ type NodeMaintenanceReconciler struct { //+kubebuilder:rbac:groups=maintenance.nvidia.com,resources=nodemaintenances,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=maintenance.nvidia.com,resources=nodemaintenances/status,verbs=get;update;patch //+kubebuilder:rbac:groups=maintenance.nvidia.com,resources=nodemaintenances/finalizers,verbs=update -//+kubebuilder:rbac:groups="",resources=events,verbs=create +//+kubebuilder:rbac:groups="",resources=events,verbs=create;update;patch //+kubebuilder:rbac:groups="",resources=nodes,verbs=get;update;patch //+kubebuilder:rbac:groups="",resources=pods,verbs=get;watch;list;update;patch;delete //+kubebuilder:rbac:groups="",resources=pods/eviction,verbs=create;get;list;update;patch;delete @@ -198,8 +199,8 @@ func (r *NodeMaintenanceReconciler) Reconcile(ctx context.Context, req ctrl.Requ if err != nil { reqLog.Error(err, "failed to handle drain state for NodeMaintenance object") } - case maintenancev1.ConditionReasonReady: - res, err = r.handleReadyState(ctx, reqLog, nm, node) + case maintenancev1.ConditionReasonReady, maintenancev1.ConditionReasonRequestorFailed: + res, err = r.handleTerminalState(ctx, reqLog, nm, node) if err != nil { reqLog.Error(err, "failed to handle Ready state for NodeMaintenance object") } @@ -506,8 +507,8 @@ func (r *NodeMaintenanceReconciler) updateDrainStatus(ctx context.Context, nm *m return nil } -func (r *NodeMaintenanceReconciler) handleReadyState(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance, node *corev1.Node) (ctrl.Result, error) { - reqLog.Info("Handle Ready NodeMaintenance") +func (r *NodeMaintenanceReconciler) handleTerminalState(ctx context.Context, reqLog logr.Logger, nm *maintenancev1.NodeMaintenance, node *corev1.Node) (ctrl.Result, error) { + reqLog.Info("Handle Ready/RequestorFailed NodeMaintenance") var err error var res ctrl.Result @@ -539,12 +540,40 @@ func (r *NodeMaintenanceReconciler) handleReadyState(ctx context.Context, reqLog if !metav1.HasAnnotation(nm.ObjectMeta, ReadyTimeAnnotation) || nm.Annotations[ReadyTimeAnnotation] == "" { metav1.SetMetaDataAnnotation(&nm.ObjectMeta, ReadyTimeAnnotation, time.Now().UTC().Format(time.RFC3339)) - err := r.Update(ctx, nm) + err = r.Update(ctx, nm) if err != nil { return res, err } } + // check RequestorFailed condition + currentReason := k8sutils.GetReadyConditionReason(nm) + conditionChanged := false + cond := meta.FindStatusCondition(nm.Status.Conditions, maintenancev1.ConditionTypeRequestorFailed) + if cond != nil && cond.Status == metav1.ConditionTrue { + // set Ready condition to RequestorFailed reason if not already set + if currentReason == maintenancev1.ConditionReasonReady { + err = k8sutils.SetReadyConditionReason(ctx, r.Client, nm, maintenancev1.ConditionReasonRequestorFailed) + conditionChanged = true + } + + } else { + // set Ready condition to Ready reason if not already set + if currentReason == maintenancev1.ConditionReasonRequestorFailed { + err = k8sutils.SetReadyConditionReason(ctx, r.Client, nm, maintenancev1.ConditionReasonReady) + conditionChanged = true + } + + } + if err != nil { + return res, err + } + + if conditionChanged { + r.EventRecorder.Event( + nm, corev1.EventTypeNormal, maintenancev1.ConditionChangedEventType, k8sutils.GetReadyConditionReason(nm)) + } + return res, nil } @@ -553,32 +582,32 @@ func (r *NodeMaintenanceReconciler) SetupWithManager(mgr ctrl.Manager, log logr. r.EventRecorder = mgr.GetEventRecorderFor("nodemaintenancereconciler") return ctrl.NewControllerManagedBy(mgr). - For(&maintenancev1.NodeMaintenance{}, builder.WithPredicates(NewReadyConditionChangedPredicate(log))). + For(&maintenancev1.NodeMaintenance{}, builder.WithPredicates(NewConditionChangedPredicate(log))). Complete(r) } -// NewReadyConditionChangedPredicate creates a new ReadyConditionChangedPredicate -func NewReadyConditionChangedPredicate(log logr.Logger) ReadyConditionChangedPredicate { - return ReadyConditionChangedPredicate{ +// NewConditionChangedPredicate creates a new ConditionChangedPredicate +func NewConditionChangedPredicate(log logr.Logger) ConditionChangedPredicate { + return ConditionChangedPredicate{ Funcs: predicate.Funcs{}, log: log, } } -// ReadyConditionChangedPredicate will trigger enqueue of Event for reconcile in the following cases: -// 1. A change in NodeMaintenance Ready Condition +// ConditionChangedPredicate will trigger enqueue of Event for reconcile in the following cases: +// 1. A change in NodeMaintenance Conditions // 2. Update to the object occurred and deletion timestamp is set // 3. NodeMaintenance created // 4. NodeMaintenance deleted // 5. generic event received -type ReadyConditionChangedPredicate struct { +type ConditionChangedPredicate struct { predicate.Funcs log logr.Logger } // Update implements Predicate. -func (p ReadyConditionChangedPredicate) Update(e event.TypedUpdateEvent[client.Object]) bool { +func (p ConditionChangedPredicate) Update(e event.TypedUpdateEvent[client.Object]) bool { if e.ObjectOld == nil { p.log.Error(nil, "old object is nil in update event, ignoring event.") return false @@ -600,15 +629,38 @@ func (p ReadyConditionChangedPredicate) Update(e event.TypedUpdateEvent[client.O return false } - oldRCR := k8sutils.GetReadyConditionReason(oldO) - newRCR := k8sutils.GetReadyConditionReason(newO) + condChanged := false + + oldConds := make(map[string]metav1.Condition) + newConds := make(map[string]metav1.Condition) + + for _, c := range oldO.Status.Conditions { + oldConds[c.Type] = c + } + for _, c := range newO.Status.Conditions { + newConds[c.Type] = c + } + + for k, c := range oldConds { + if !reflect.DeepEqual(c, newConds[k]) { + condChanged = true + break + } + } + for k, c := range newConds { + if !reflect.DeepEqual(c, oldConds[k]) { + condChanged = true + break + } + } - process := oldRCR != newRCR || !newO.GetDeletionTimestamp().IsZero() + deleting := !newO.GetDeletionTimestamp().IsZero() + process := condChanged || deleting p.log.V(operatorlog.DebugLevel).Info("Update event for NodeMaintenance", "name", newO.Name, "namespace", newO.Namespace, - "condition-changed", oldRCR != newRCR, "old", oldRCR, "new", newRCR, - "deleting", !newO.GetDeletionTimestamp().IsZero(), "process", process) + "condition-changed", condChanged, + "deleting", deleting, "process", process) return process }