diff --git a/pkg/pillar/docs/zedkube.md b/pkg/pillar/docs/zedkube.md
new file mode 100644
index 0000000000..ad899c8c88
--- /dev/null
+++ b/pkg/pillar/docs/zedkube.md
@@ -0,0 +1,153 @@
+# Clustered EVE nodes (zedkube)
+
+## Overview
+
+zedkube is the pillar microservice hosting node-level Kubernetes operations for clustered EVE-OS nodes.
+This document describes the node drain framework: the kubenodeop and kubeapi components and the
+NodeDrainRequest/NodeDrainStatus pubsub flow that other pillar microservices use to defer disruptive
+operations until workloads have been moved off a node.
+
+## Components
+
+### kubenodeop
+
+kubenodeop handles cordoning, uncordoning, and draining of clustered EVE-OS nodes.
+Any given node could be hosting one or more Longhorn volume replicas and thus could be the rebuild
+source for replicas on other nodes.
+A drain operation should therefore be performed before any node operation or node command that can
+cause an extended outage of a node, such as a reboot, shutdown, or reset.
+kubenodeop handles the NodeDrainRequest objects that zedkube subscribes to, initiates the drain,
+and publishes NodeDrainStatus objects.
+
+### kubeapi
+
+1. `kubeapi.GetNodeDrainStatus()` to determine if the system supports drain
+    - HV!=kubevirt: NOTSUPPORTED
+    - HV=kubevirt will return:
+      - NOTSUPPORTED if in single-node mode
+      - NOTREQUESTED if in cluster mode
+1. `kubeapi.RequestNodeDrain()` to begin a drain
+
+### Drain PubSub setup (node reboot/shutdown)
+
+1. zedagent/handlenodedrain.go:`initNodeDrainPubSub()`
+    - subscribes to NodeDrainStatus from zedkube
+    - creates publication of NodeDrainRequest
+1. nodeagent/handlenodedrain.go:`initNodeDrainPubSub()`
+    - subscribes to NodeDrainStatus from zedkube
+
+### Drain Request path (node reboot/shutdown)
+
+1. zedagent/parseconfig.go:`scheduleDeviceOperation()`
+    - If `shouldDeferForNodeDrain()` is true
+      - set the reboot or shutdown command deferred state in zedagentContext
+1. zedagent/handlenodedrain.go:`shouldDeferForNodeDrain()`
+    - NodeDrainStatus == (NOTREQUESTED || FAILEDCORDON || FAILEDDRAIN):
+      - Drain is requested via `kubeapi.RequestNodeDrain()`
+      - return Defer
+    - NodeDrainStatus == (UNKNOWN || NOTSUPPORTED || COMPLETE):
+      - return !Defer
+    - NodeDrainStatus == (REQUESTED || STARTING || CORDONED || DRAINRETRYING):
+      - return Defer
+
+### Drain Status Handler (node reboot/shutdown)
+
+1. zedagent/handlenodedrain.go:`handleNodeDrainStatusImpl()`
+    - NodeDrainStatus == FAILEDCORDON or FAILEDDRAIN
+      - Unpublish NodeDrainRequest
+1. nodeagent/handlenodedrain.go:`handleNodeDrainStatusImplNA()`
+    - NodeDrainStatus >= REQUESTED and < COMPLETE
+      - republish nodeagentstatus with drainInProgress set
+    - NodeDrainStatus == COMPLETE
+      - republish nodeagentstatus with drainInProgress cleared
+1. zedagent/zedagent.go:`handleNodeAgentStatusImpl()`
+    - If there is:
+      - a deferred device op, and
+      - nodeagent configctx reports drain complete
+    - then process the deferred reboot/shutdown
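+### Drain deferral sketch
+
+The same request/defer pattern is reused by baseosmgr for eve-image updates (next sections). Below is
+a minimal sketch of the deferral decision using `kubeapi.GetNodeDrainStatus()` and
+`kubeapi.RequestNodeDrain()`. It assumes the caller already holds the NodeDrainStatus subscription and
+NodeDrainRequest publication created in `initNodeDrainPubSub()`; the package name, variable names, and
+context string are illustrative, not the exact pillar implementation.
+
+```go
+package example
+
+import (
+	"github.com/lf-edge/eve/pkg/pillar/base"
+	"github.com/lf-edge/eve/pkg/pillar/kubeapi"
+	"github.com/lf-edge/eve/pkg/pillar/pubsub"
+)
+
+// shouldDeferForNodeDrain sketches the decision described above: (re)request a
+// drain when none is in flight and defer the device operation until zedkube
+// reports COMPLETE.
+func shouldDeferForNodeDrain(log *base.LogObject, subNodeDrainStatus pubsub.Subscription,
+	pubNodeDrainRequest pubsub.Publication) bool {
+	status := kubeapi.GetNodeDrainStatus(subNodeDrainStatus, log)
+	switch status.Status {
+	case kubeapi.UNKNOWN, kubeapi.NOTSUPPORTED, kubeapi.COMPLETE:
+		// Drain is not possible or already done: proceed with the operation.
+		return false
+	case kubeapi.NOTREQUESTED, kubeapi.FAILEDCORDON, kubeapi.FAILEDDRAIN:
+		// No drain in flight (or the last one failed): request one and defer.
+		if err := kubeapi.RequestNodeDrain(pubNodeDrainRequest, kubeapi.DEVICEOP,
+			"device reboot/shutdown"); err != nil {
+			log.Errorf("RequestNodeDrain failed: %v", err)
+		}
+		return true
+	default:
+		// REQUESTED, STARTING, CORDONED, DRAINRETRYING: drain in progress.
+		return true
+	}
+}
+```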
+### Drain PubSub setup (node eve-image update)
+
+1. baseosmgr/handlenodedrain.go:`initNodeDrainPubSub()`
+    - subscribes to NodeDrainStatus from zedkube
+    - sets up publication of NodeDrainRequest
+
+### Drain Request path (node eve-image update)
+
+1. baseosmgr/handlebaseos.go:`baseOsHandleStatusUpdateUUID()`
+    - If the BaseOs download is complete (LOADING || LOADED || INSTALLED), it is not currently
+      Activated, and the new config requests it Activated
+    - Check `shouldDeferForNodeDrain()`; if a defer is requested, return here, as drain completion
+      will later resume this BaseOsStatus update.
+1. baseosmgr/handlenodedrain.go:`shouldDeferForNodeDrain()`
+    - NodeDrainStatus == (NOTREQUESTED || FAILEDCORDON || FAILEDDRAIN):
+      - save BaseOsId in baseOsMgrContext.deferredBaseOsID
+      - Drain is requested via `kubeapi.RequestNodeDrain()`
+      - return Defer
+    - NodeDrainStatus == (UNKNOWN || NOTSUPPORTED || COMPLETE):
+      - return !Defer
+    - NodeDrainStatus == (REQUESTED || STARTING || CORDONED || DRAINRETRYING):
+      - return Defer
+
+### Drain Status Handler (node eve-image update)
+
+1. baseosmgr/handlenodedrain.go:`handleNodeDrainStatusImpl()`
+    - NodeDrainStatus == FAILEDCORDON or FAILEDDRAIN:
+      - Unpublish NodeDrainRequest
+    - NodeDrainStatus == COMPLETE:
+      - Resume the deferred baseOsMgrContext.deferredBaseOsID via `baseOsHandleStatusUpdateUUID()`
+
+### General DrainRequest Processing
+
+1. zedkube/zedkube.go:`Run()`
+    - subscribes to NodeDrainRequest from zedagent and baseosmgr
+    - creates publication of NodeDrainStatus
+    - initializes NodeDrainStatus to NOTSUPPORTED
+1. zedkube/zedkube.go:`handleEdgeNodeClusterConfigImpl()`
+    - System switching to cluster membership: NodeDrainStatus -> NOTREQUESTED
+1. zedkube/zedkube.go:`handleEdgeNodeClusterConfigDelete()`
+    - System switching to single node: NodeDrainStatus -> NOTSUPPORTED
+1. zedkube/handlenodedrain.go:`handleNodeDrainRequestImpl()`
+    - NodeDrainStatus -> REQUESTED
+1. zedkube/kubenodeop.go:`cordonAndDrainNode()`
+    - NodeDrainStatus -> STARTING
+    - Retry cordon up to 10 times (in case the k8s API reports the object changed)
+      - when retries are exhausted: NodeDrainStatus -> FAILEDCORDON
+    - NodeDrainStatus -> CORDONED
+    - Retry drain up to 5 times
+      - between tries: NodeDrainStatus -> DRAINRETRYING
+      - on failure: NodeDrainStatus -> FAILEDDRAIN
+    - NodeDrainStatus -> COMPLETE
+
+## Debugging
+
+### PubSub NodeDrainRequest/NodeDrainStatus
+
+- /run/zedagent/NodeDrainRequest/global.json
+- /run/baseosmgr/NodeDrainRequest/global.json
+- /run/zedkube/NodeDrainStatus/global.json
+
+The current node drain progress is available from the global NodeDrainStatus object:
+`cat /run/zedkube/NodeDrainStatus/global.json | jq .`
+
+A NodeDrainStatus can be forced by writing the object (in the pillar service container filesystem) to
+/persist/kube-status/force-NodeDrainStatus-global.json. Status and RequestedBy are the numeric values
+of the DrainStatus and DrainRequester enums defined in kubeapi/kubetypes.go.
+
+eg. to force disable drain:
+echo '{"Status":1,"RequestedBy":1}' > /persist/kube-status/force-NodeDrainStatus-global.json
+
+eg. to force deviceop drain complete:
+echo '{"Status":9,"RequestedBy":2}' > /persist/kube-status/force-NodeDrainStatus-global.json
+
+eg. to force baseosmgr drain complete:
+echo '{"Status":9,"RequestedBy":3}' > /persist/kube-status/force-NodeDrainStatus-global.json
+
+"Cannot evict pod as it would violate the pod's disruption budget":
+NodeDrainStatus can get stuck when attempting to drain a node running a pod that has an explicit
+spec.nodeName == "<drain node>". Delete the pod to continue.
+If the workload is a statefulset declaring spec.nodeName and the node is already cordoned, deleting
+the pod is not sufficient; the statefulset must be deleted.
+
+### NodeDrainRequest/NodeDrainStatus log strings
+
+- NodeDrainRequest
+- NodeDrainStatus
+- cordonNode
+- cordonAndDrainNode
+- scheduleDeviceOperation
+- baseOsHandleStatusUpdateUUID
+- nodedrain-step
+- kubevirt_node_drain_completion_time_seconds
+
+...
+    zgrep 'kubevirt_node_drain_completion_time_seconds' /persist/newlog/keepSentQueue/dev.log.1725511530990.gz | jq -r .content | jq -r .msg | cut -d ':' -f 2
+    s34.559219
+...
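+### Forced NodeDrainStatus encoding
+
+The force-override JSON above is simply a marshalled `kubeapi.NodeDrainStatus`. The sketch below is a
+hypothetical helper, not part of pillar, showing how the numeric values in the echo examples map onto
+the enums:
+
+```go
+package example
+
+import (
+	"encoding/json"
+	"fmt"
+
+	"github.com/lf-edge/eve/pkg/pillar/kubeapi"
+)
+
+// encodeForcedStatus produces the same JSON used above to force a
+// device-operation drain to COMPLETE: {"Status":9,"RequestedBy":2}.
+func encodeForcedStatus() (string, error) {
+	forced := kubeapi.NodeDrainStatus{
+		Status:      kubeapi.COMPLETE, // numeric value 9
+		RequestedBy: kubeapi.DEVICEOP, // numeric value 2
+	}
+	b, err := json.Marshal(forced)
+	if err != nil {
+		return "", fmt.Errorf("marshal NodeDrainStatus: %w", err)
+	}
+	return string(b), nil
+}
+```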
diff --git a/pkg/pillar/kubeapi/kubetypes.go b/pkg/pillar/kubeapi/kubetypes.go
new file mode 100644
index 0000000000..48e973599b
--- /dev/null
+++ b/pkg/pillar/kubeapi/kubetypes.go
@@ -0,0 +1,75 @@
+// Copyright (c) 2024 Zededa, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+package kubeapi
+
+import "time"
+
+// DrainStatus tracks the progress of draining a node of replica disks and workloads
+type DrainStatus uint8
+
+// DrainStatus values, ordered by drain progression
+const (
+	UNKNOWN       DrainStatus = iota + 0 // UNKNOWN Unable to determine
+	NOTSUPPORTED                         // NOTSUPPORTED System is not both HV=kubevirt and clustered
+	NOTREQUESTED                         // NOTREQUESTED Not yet requested
+	REQUESTED                            // REQUESTED From zedagent device operation or baseosmgr new update
+	STARTING                             // STARTING Zedkube goroutine started, not yet cordoned
+	CORDONED                             // CORDONED Node Unschedulable set
+	FAILEDCORDON                         // FAILEDCORDON Node modification could not be applied
+	DRAINRETRYING                        // DRAINRETRYING Drain retry in progress, could be a retried replica rebuild
+	FAILEDDRAIN                          // FAILEDDRAIN Could be a retried replica rebuild
+	COMPLETE                             // COMPLETE All node workloads removed from the node
+)
+
+// String returns a human-readable form of DrainStatus
+func (status DrainStatus) String() string {
+	switch status {
+	case UNKNOWN:
+		return "Unknown"
+	case NOTSUPPORTED:
+		return "Not Supported"
+	case NOTREQUESTED:
+		return "Not Requested"
+	case REQUESTED:
+		return "Requested"
+	case STARTING:
+		return "Starting"
+	case CORDONED:
+		return "Cordoned"
+	case FAILEDCORDON:
+		return "Failed Cordon"
+	case DRAINRETRYING:
+		return "Drain Retrying"
+	case FAILEDDRAIN:
+		return "Failed Drain"
+	case COMPLETE:
+		return "Complete"
+	default:
+		return "Unknown"
+	}
+}
+
+// DrainRequester identifies the user-initiated edge-node operation (from a pillar
+// microservice) that requested the drain
+type DrainRequester uint8
+
+// DrainRequester values
+const (
+	NONE     DrainRequester = iota + 1 // NONE - The default value
+	DEVICEOP                           // DEVICEOP - Node reboot or shutdown
+	UPDATE                             // UPDATE - baseos update
+)
+
+// NodeDrainRequest is the trigger for a NodeDrainStatus
+//
+// Used for reboots, prepare-shutdown, and baseos updates.
+type NodeDrainRequest struct {
+	RequestedAt time.Time
+	RequestedBy DrainRequester
+	Context     string
+}
+
+// NodeDrainStatus is the response to a NodeDrainRequest
+//
+// Subscribe to updates to continue NodeDrainRequest operations.
+type NodeDrainStatus struct {
+	Status      DrainStatus
+	RequestedBy DrainRequester
+}
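A short illustration of how a subscriber consumes these types, modeled on the nodeagent handling
described in docs/zedkube.md above. The view struct and handler name are hypothetical stand-ins, not
actual pillar code:

```go
package example

import "github.com/lf-edge/eve/pkg/pillar/kubeapi"

// nodeagentDrainView is a hypothetical stand-in for the subscriber's context.
type nodeagentDrainView struct {
	drainInProgress bool
}

// handleNodeDrainStatus mirrors the documented nodeagent behavior: the ordered
// DrainStatus values allow a simple range check for "drain in progress".
func (v *nodeagentDrainView) handleNodeDrainStatus(status kubeapi.NodeDrainStatus) {
	switch {
	case status.Status >= kubeapi.REQUESTED && status.Status < kubeapi.COMPLETE:
		v.drainInProgress = true
	case status.Status == kubeapi.COMPLETE:
		v.drainInProgress = false
	}
	// The real handler would now republish NodeAgentStatus so zedagent can
	// resume any deferred reboot/shutdown.
}
```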
diff --git a/pkg/pillar/kubeapi/nodedrain.go b/pkg/pillar/kubeapi/nodedrain.go
new file mode 100644
index 0000000000..d2732921ca
--- /dev/null
+++ b/pkg/pillar/kubeapi/nodedrain.go
@@ -0,0 +1,100 @@
+// Copyright (c) 2024 Zededa, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+//go:build kubevirt
+
+package kubeapi
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"time"
+
+	"github.com/lf-edge/eve/pkg/pillar/base"
+	"github.com/lf-edge/eve/pkg/pillar/pubsub"
+)
+
+// forceNodeDrainPath is an alternate path used to force a drain status in the
+// event of a drain issue.
+const forceNodeDrainPath string = "/persist/kube-status/force-NodeDrainStatus-global.json"
+
+// RequestNodeDrain generates the NodeDrainRequest object and publishes it
+func RequestNodeDrain(pubNodeDrainRequest pubsub.Publication, requester DrainRequester, context string) error {
+	drainReq := NodeDrainRequest{
+		RequestedAt: time.Now(),
+		RequestedBy: requester,
+		Context:     context,
+	}
+	err := pubNodeDrainRequest.Publish("global", drainReq)
+	if err != nil {
+		return fmt.Errorf("RequestNodeDrain: error publishing drain request: %v", err)
+	}
+	return nil
+}
+
+// GetDrainStatusOverride returns a forced drain status for debugging if the
+// override file exists, nil otherwise.
+func GetDrainStatusOverride(log *base.LogObject) *NodeDrainStatus {
+	if _, err := os.Stat(forceNodeDrainPath); err != nil {
+		return nil
+	}
+	b, err := os.ReadFile(forceNodeDrainPath)
+	if err != nil {
+		log.Warnf("Unable to read %s: %v", forceNodeDrainPath, err)
+		return nil
+	}
+	cfg := NodeDrainStatus{}
+	err = json.Unmarshal(b, &cfg)
+	if err != nil {
+		log.Warnf("Unable to unmarshal %s to NodeDrainStatus: %v", forceNodeDrainPath, err)
+		return nil
+	}
+	if cfg.Status == COMPLETE {
+		// A forced COMPLETE is consumed: remove the file so it applies only once.
+		err = os.Remove(forceNodeDrainPath)
+		if err != nil {
+			log.Warnf("could not remove %s: %v", forceNodeDrainPath, err)
+		}
+	}
+	return &cfg
+}
+
+// CleanupDrainStatusOverride is used at microservice startup to clean up a
+// previously user-written override file
+func CleanupDrainStatusOverride(log *base.LogObject) {
+	if _, err := os.Stat(forceNodeDrainPath); err != nil {
+		return
+	}
+	if err := os.Remove(forceNodeDrainPath); err != nil {
+		log.Warnf("CleanupDrainStatusOverride could not remove %s: %v", forceNodeDrainPath, err)
+	}
+}
+
+// DrainStatusFaultInjectionWait reports whether the fault-injection file exists;
+// while it does, the drain status goroutine should wait.
+func DrainStatusFaultInjectionWait() bool {
+	injectFaultPath := "/tmp/DrainStatus_FaultInjection_Wait"
+	_, err := os.Stat(injectFaultPath)
+	return err == nil
+}
+
+// GetNodeDrainStatus is a wrapper that returns the latest NodeDrainStatus,
+// or a forced status from /persist/kube-status/force-NodeDrainStatus-global.json
+func GetNodeDrainStatus(subNodeDrainStatus pubsub.Subscription, log *base.LogObject) *NodeDrainStatus {
+	override := GetDrainStatusOverride(log)
+	if override != nil {
+		return override
+	}
+
+	items := subNodeDrainStatus.GetAll()
+	glbStatus, ok := items["global"].(NodeDrainStatus)
+	if !ok {
+		// This should only be expected on an HV=kubevirt build
+		// and only very early in boot (before zedkube starts).
+		return &NodeDrainStatus{Status: UNKNOWN, RequestedBy: NONE}
+	}
+	return &glbStatus
+}
diff --git a/pkg/pillar/kubeapi/nokube.go b/pkg/pillar/kubeapi/nokube.go
index 16beb48c02..f8f7d86ac5 100644
--- a/pkg/pillar/kubeapi/nokube.go
+++ b/pkg/pillar/kubeapi/nokube.go
@@ -6,6 +6,7 @@
 package kubeapi
 
 import (
+	"fmt"
 	"time"
 
 	"github.com/lf-edge/eve/pkg/pillar/base"
@@ -27,3 +28,15 @@ func CleanupStaleVMI() (int, error) {
 func GetPVCList(*base.LogObject) ([]string, error) {
 	panic("GetPVCList is not built")
 }
+
+// RequestNodeDrain is a stub for non-kubevirt builds
+func RequestNodeDrain(pubsub.Publication, DrainRequester, string) error {
+	// Drain is never initiated on non-kubevirt builds; returning an error
+	// makes an unexpected call visible to the caller.
+	return fmt.Errorf("nokube requested drain, should not get here")
+}
+
+// GetNodeDrainStatus is a stub for non-kubevirt builds, matching the kubevirt
+// signature so common callers build on both hypervisors.
+func GetNodeDrainStatus(pubsub.Subscription, *base.LogObject) *NodeDrainStatus {
+	// Drain is never supported or in progress on non-kubevirt builds.
+	return &NodeDrainStatus{Status: NOTSUPPORTED}
+}
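The kubevirt implementation and the nokube stub of `GetNodeDrainStatus` share the same signature, so
common (non-build-tagged) pillar code can use a single call site; on non-kubevirt builds the stub's
NOTSUPPORTED answer simply short-circuits drain coordination. A minimal sketch of such a caller
(names are illustrative, not pillar code):

```go
package example

import (
	"github.com/lf-edge/eve/pkg/pillar/base"
	"github.com/lf-edge/eve/pkg/pillar/kubeapi"
	"github.com/lf-edge/eve/pkg/pillar/pubsub"
)

// drainNeeded reports whether the caller must wait for a node drain before a
// disruptive operation; on non-kubevirt builds the stub always answers false.
func drainNeeded(log *base.LogObject, subNodeDrainStatus pubsub.Subscription) bool {
	status := kubeapi.GetNodeDrainStatus(subNodeDrainStatus, log)
	return status.Status != kubeapi.NOTSUPPORTED &&
		status.Status != kubeapi.COMPLETE &&
		status.Status != kubeapi.UNKNOWN
}
```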