Implement Clustering and App related functions #4407

Open · wants to merge 2 commits into base: master
28 changes: 25 additions & 3 deletions pkg/pillar/base/kubevirt.go
@@ -67,15 +67,37 @@ func GetAppKubeName(displayName string, uuid uuid.UUID) string {
}

// GetVMINameFromVirtLauncher : get VMI name from the corresponding Kubevirt
// launcher pod name.
// launcher pod name for replicaset generated VMI.
func GetVMINameFromVirtLauncher(podName string) (vmiName string, isVirtLauncher bool) {
if !strings.HasPrefix(podName, VMIPodNamePrefix) {
return "", false
}
vmiName = strings.TrimPrefix(podName, VMIPodNamePrefix)
lastSep := strings.LastIndex(vmiName, "-")
if lastSep != -1 {
vmiName = vmiName[:lastSep]
if lastSep == -1 || lastSep < 5 {
return "", false
}

// Check if the last part is 5 bytes long
if len(vmiName[lastSep+1:]) != 5 {
return "", false
}

// Use the index minus 5 bytes to get the VMI name to remove added
// replicaset suffix
vmiName = vmiName[:lastSep-5]
return vmiName, true
}

// GetReplicaPodName : get the app name from the pod name for replica pods.
func GetReplicaPodName(displayName, podName string, uuid uuid.UUID) (kubeName string, isReplicaPod bool) {
kubeName = GetAppKubeName(displayName, uuid)
if !strings.HasPrefix(podName, kubeName) {
return "", false
}
suffix := strings.TrimPrefix(podName, kubeName)
if strings.HasPrefix(suffix, "-") && len(suffix[1:]) == 5 {
return kubeName, true
}
return "", false
}
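For illustration, here is a minimal standalone sketch (not part of this diff; the names are hypothetical) of the naming convention these helpers assume: a replica pod is named as the app's kube name plus "-" and a 5-character ReplicaSet suffix.

package main

import (
	"fmt"
	"strings"
)

// isReplicaPodOf mirrors the suffix check in GetReplicaPodName above:
// the pod name must be kubeName + "-" + a 5-character suffix.
func isReplicaPodOf(kubeName, podName string) bool {
	if !strings.HasPrefix(podName, kubeName) {
		return false
	}
	suffix := strings.TrimPrefix(podName, kubeName)
	return strings.HasPrefix(suffix, "-") && len(suffix[1:]) == 5
}

func main() {
	fmt.Println(isReplicaPodOf("myapp-8a5f2", "myapp-8a5f2-zx9qk")) // true
	fmt.Println(isReplicaPodOf("myapp-8a5f2", "myapp-8a5f2"))       // false: no ReplicaSet suffix
}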
1 change: 1 addition & 0 deletions pkg/pillar/cipher/cipher.go
@@ -27,6 +27,7 @@ func getEncryptionBlock(
decBlock.CellularNetUsername = zconfigDecBlockPtr.CellularNetUsername
decBlock.CellularNetPassword = zconfigDecBlockPtr.CellularNetPassword
decBlock.ProtectedUserData = zconfigDecBlockPtr.ProtectedUserData
decBlock.ClusterToken = zconfigDecBlockPtr.ClusterToken
return decBlock
}

81 changes: 71 additions & 10 deletions pkg/pillar/cmd/domainmgr/domainmgr.go
@@ -91,6 +91,7 @@ type domainContext struct {
pubDomainStatus pubsub.Publication
subGlobalConfig pubsub.Subscription
subZFSPoolStatus pubsub.Subscription
subEdgeNodeInfo pubsub.Subscription
pubAssignableAdapters pubsub.Publication
pubDomainMetric pubsub.Publication
pubHostMemory pubsub.Publication
@@ -126,6 +127,7 @@ type domainContext struct {
cpuPinningSupported bool
// Is it kubevirt eve
hvTypeKube bool
nodeName string
}

// AddAgentSpecificCLIFlags adds CLI options
@@ -414,9 +416,24 @@ func Run(ps *pubsub.PubSub, loggerArg *logrus.Logger, logArg *base.LogObject, ar
domainCtx.subZFSPoolStatus = subZFSPoolStatus
subZFSPoolStatus.Activate()

// Look for edge node info
subEdgeNodeInfo, err := ps.NewSubscription(pubsub.SubscriptionOptions{
AgentName: "zedagent",
MyAgentName: agentName,
TopicImpl: types.EdgeNodeInfo{},
Persistent: true,
Activate: false,
})
if err != nil {
log.Fatal(err)
}
domainCtx.subEdgeNodeInfo = subEdgeNodeInfo
_ = subEdgeNodeInfo.Activate()

// Parse any existing ConfigIntemValueMap but continue if there
// is none
for !domainCtx.GCComplete {
waitEdgeNodeInfo := true
for !domainCtx.GCComplete || (domainCtx.hvTypeKube && waitEdgeNodeInfo) {
log.Noticef("waiting for GCComplete")
select {
case change := <-subGlobalConfig.MsgChan():
@@ -425,12 +442,22 @@ func Run(ps *pubsub.PubSub, loggerArg *logrus.Logger, logArg *base.LogObject, ar
case <-domainCtx.publishTicker.C:
publishProcessesHandler(&domainCtx)

case change := <-subEdgeNodeInfo.MsgChan():
subEdgeNodeInfo.ProcessChange(change)
waitEdgeNodeInfo = false

case <-stillRunning.C:
}
ps.StillRunning(agentName, warningTime, errorTime)
}
log.Noticef("processed GCComplete")

// Get the EdgeNode info, needed for kubevirt clustering
err = domainCtx.retrieveDeviceNodeName()
if err != nil {
log.Fatal(err)
Contributor:
You can't fatal here since you only wait if hvTypeKube.
How about making Eden/Adam send the EdgeNodeInfo and wait even if not hvTypeKube?
That would remove some special cases like this issue.

}

if !domainCtx.setInitialUsbAccess {
log.Functionf("GCComplete but not setInitialUsbAccess => first boot")
// Enable USB keyboard and storage
@@ -513,6 +540,9 @@ func Run(ps *pubsub.PubSub, loggerArg *logrus.Logger, logArg *base.LogObject, ar
case change := <-subZFSPoolStatus.MsgChan():
subZFSPoolStatus.ProcessChange(change)

case change := <-subEdgeNodeInfo.MsgChan():
subEdgeNodeInfo.ProcessChange(change)

case <-domainCtx.publishTicker.C:
publishProcessesHandler(&domainCtx)

@@ -651,6 +681,9 @@ func Run(ps *pubsub.PubSub, loggerArg *logrus.Logger, logArg *base.LogObject, ar
case change := <-subPhysicalIOAdapter.MsgChan():
subPhysicalIOAdapter.ProcessChange(change)

case change := <-subEdgeNodeInfo.MsgChan():
subEdgeNodeInfo.ProcessChange(change)

case <-domainCtx.publishTicker.C:
start := time.Now()
err = domainCtx.cipherMetrics.Publish(log, cipherMetricsPub, "global")
@@ -977,12 +1010,15 @@ func verifyStatus(ctx *domainContext, status *types.DomainStatus) {
status.SetErrorDescription(errDescription)
}

//cleanup app instance tasks
if err := hyper.Task(status).Delete(status.DomainName); err != nil {
log.Errorf("failed to delete domain: %s (%v)", status.DomainName, err)
}
if err := hyper.Task(status).Cleanup(status.DomainName); err != nil {
log.Errorf("failed to cleanup domain: %s (%v)", status.DomainName, err)
// in cluster mode, we can not delete the pod due to failing to get app info
Contributor:
Was the issue this is fixing something appearing when there is a failover/takeover and another node in the cluster starts running the app instance?
Or is it something which could happen when an app instance is first provisioned on the cluster?

Contributor Author:
This can happen even on the first node of the app deployment. Sometimes we cannot get the status from the k3s cluster, or it takes time for the app to come up into the running state, but we should not remove this Kubernetes configuration: the cluster has the config stored in its database and has its own scheduling and control process to eventually bring the app to the intended state. If we delete the config from the cluster, then we need to wait another 10 minutes to retry, etc., and it will cause confusion.

Contributor Author (@naiming-zededa, Nov 7, 2024):
So a new boolean, DomainConfigDeleted, is introduced in the DomainStatus. It allows the designated node, once it knows for sure the app instance has been removed from the device, to go ahead and delete the app/domain from the cluster.

Contributor:
It would be good to capture the above explanation either in a comment here or in pkg/pillar/docs/zedkube.md

Contributor Author:
Sure. I documented this in zedkube.md and referenced it from domainmgr.go.

Contributor:
Hmm - re-reading it and it still looks odd.
Does the kubevirt Info() return an error or does it return HALTED when the issue is merely that it can't (yet) fetch the status from k3s?
Can we not fix that Info() to return something more accurate?

If it returns an error or HALTED then this function will set an error, and that error might be propagated to the controller, which would be confusing if the task is slow at starting, or is starting on some other node.

So I think this check is in the wrong place.

Contributor Author:
The current logic in kubevirt.go is:

  1. if the app is not found, it returns status "" and an error: logError("getVMIStatus: No VMI %s found with the given nodeName %s", repVmiName, nodeName)
  2. if the app is found on another node in the cluster, it returns status "nolocal" and no error
  3. if the app is found on this node, it returns whatever the Kubernetes running status is

With the above conditions, if an error is returned, the status is set to types.Broken.
We further map the status string with this mapping:

// Use few states  for now
var stateMap = map[string]types.SwState{
    "Paused":     types.PAUSED,
    "Running":    types.RUNNING,
    "NonLocal":   types.RUNNING,
    "shutdown":   types.HALTING,
    "suspended":  types.PAUSED,
    "Pending":    types.PENDING,
    "Scheduling": types.SCHEDULING,
    "Failed":     types.FAILED,
}

This is a good point; the error condition when the app is not yet running would be confusing. I can change condition 1) above to return 'Scheduling', which is a currently defined state and roughly reflects the Kubernetes app status.
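As a rough sketch of that proposal (hypothetical helper, not the PR's actual getVMIStatus code), the per-node decision could look like:

// vmiStatusSketch illustrates the three conditions above, with the proposed
// "Scheduling" fallback instead of an error when no VMI is found yet.
// The returned string would then be mapped through stateMap, e.g.
// "Scheduling" -> types.SCHEDULING and "NonLocal" -> types.RUNNING.
func vmiStatusSketch(found, onThisNode bool, k8sPhase string) (string, error) {
	switch {
	case !found:
		return "Scheduling", nil // proposed: previously "" plus an error, which mapped to types.Broken
	case !onThisNode:
		return "NonLocal", nil // app instance is running on another cluster node
	default:
		return k8sPhase, nil // e.g. "Running", "Paused", "Failed"
	}
}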

if !ctx.hvTypeKube {
Contributor:
And I get the feeling that most or all of these hvTypeKube checks should not be in domainmgr but rather inside the kubevirt hypervisor package.

If for instance zedmanager is telling domainmgr to shut down or delete a task and if for kubevirt this isn't needed except on some designated node, can't that check whether designated or not be done inside the hypervisor/kubevirt code?

Contributor Author:
I can move this into the hypervisor functions. Mainly we need to pass in whether this domain config is being deleted or not (DomainStatus.DomainConfigDeleted), so we need to change the API from '.Delete(domainName string)' to '.Delete(status *types.DomainStatus)' (which would also change for the other hypervisors).
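A minimal sketch of the signature change being discussed (hypothetical; the interface and method names here are illustrative, not code from this PR):

// Passing the full DomainStatus lets the kubevirt hypervisor check
// DomainStatus.DomainConfigDeleted (and whether this node is the designated
// one) itself, instead of domainmgr doing hvTypeKube checks.
type taskDeleter interface {
	// was: Delete(domainName string) error
	Delete(status *types.DomainStatus) error
}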

Contributor:
Did you move it there? If so, this change and the preceding comment can be removed from verifyStatus(), right?

//cleanup app instance tasks
if err := hyper.Task(status).Delete(status.DomainName); err != nil {
log.Errorf("failed to delete domain: %s (%v)", status.DomainName, err)
}
if err := hyper.Task(status).Cleanup(status.DomainName); err != nil {
log.Errorf("failed to cleanup domain: %s (%v)", status.DomainName, err)
}
}
}
status.DomainId = 0
@@ -1071,6 +1107,7 @@ func maybeRetryBoot(ctx *domainContext, status *types.DomainStatus) {
if !status.BootFailed {
return
}

if status.Activated && status.BootFailed {
log.Functionf("maybeRetryBoot(%s) clearing bootFailed since Activated",
status.Key())
@@ -1138,6 +1175,11 @@ func maybeRetryBoot(ctx *domainContext, status *types.DomainStatus) {
log.Errorf("Failed to setup vTPM for %s: %s", status.DomainName, err)
}

// pass nodeName to hypervisor call Setup
if status.NodeName == "" {
Contributor:
Can't NodeName be set when DomainStatus is created in handleCreate?

status.NodeName = ctx.nodeName
}

if err := hyper.Task(status).Setup(*status, *config, ctx.assignableAdapters, nil, file); err != nil {
//it is retry, so omit error
log.Errorf("Failed to create DomainStatus from %+v: %s",
@@ -1684,6 +1726,11 @@ func doActivate(ctx *domainContext, config types.DomainConfig,
log.Errorf("Failed to setup vTPM for %s: %s", status.DomainName, err)
}

// pass nodeName to hypervisor call Setup
if status.NodeName == "" {
Contributor:
Ditto

status.NodeName = ctx.nodeName
}

globalConfig := agentlog.GetGlobalConfig(log, ctx.subGlobalConfig)
if err := hyper.Task(status).Setup(*status, config, ctx.assignableAdapters, globalConfig, file); err != nil {
log.Errorf("Failed to create DomainStatus from %+v: %s",
@@ -1780,6 +1827,7 @@ func doActivateTail(ctx *domainContext, status *types.DomainStatus,
status.SetErrorNow(err.Error())
log.Errorf("doActivateTail(%v) failed for %s: %s",
status.UUIDandVersion, status.DisplayName, err)

// Delete
if err := hyper.Task(status).Delete(status.DomainName); err != nil {
log.Errorf("failed to delete domain: %s (%v)", status.DomainName, err)
@@ -1844,7 +1892,7 @@ func doInactivate(ctx *domainContext, status *types.DomainStatus, impatient bool
if doShutdown {
// If the Shutdown fails we don't wait; assume failure
// was due to no PV tools
if err := DomainShutdown(*status, false); err != nil {
if err := DomainShutdown(ctx, *status, false); err != nil {
log.Errorf("DomainShutdown %s failed: %s",
status.DomainName, err)
} else {
@@ -1864,7 +1912,7 @@
// the domain is already on the way down.
// In case of errors we proceed directly to deleting the task,
// and after that we waitForDomainGone
if err := DomainShutdown(*status, true); err != nil {
if err := DomainShutdown(ctx, *status, true); err != nil {
log.Warnf("DomainShutdown -F %s failed: %s",
status.DomainName, err)
} else {
@@ -2508,13 +2556,14 @@ func DomainCreate(ctx *domainContext, status types.DomainStatus) (int, error) {
}

// DomainShutdown is a wrapper for domain shutdown
func DomainShutdown(status types.DomainStatus, force bool) error {
func DomainShutdown(ctx *domainContext, status types.DomainStatus, force bool) error {

var err error
log.Functionf("DomainShutdown force-%v %s %d", force, status.DomainName, status.DomainId)

// Stop the domain
log.Functionf("Stopping domain - %s", status.DomainName)

err = hyper.Task(&status).Stop(status.DomainName, force)

return err
@@ -3603,3 +3652,15 @@ func lookupCapabilities(ctx *domainContext) (*types.Capabilities, error) {
}
return &capabilities, nil
}

func (ctx *domainContext) retrieveDeviceNodeName() error {
NodeInfo, err := ctx.subEdgeNodeInfo.Get("global")
if err != nil {
log.Errorf("retrieveDeviceNodeName: can't get edgeNodeInfo %v", err)
return err
}
enInfo := NodeInfo.(types.EdgeNodeInfo)
ctx.nodeName = strings.ReplaceAll(strings.ToLower(enInfo.DeviceName), "_", "-")
log.Noticef("retrieveDeviceNodeName: devicename, NodeInfo %v", NodeInfo) // XXX
Contributor:
Why the XXX comment?

return nil
}
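A quick illustration of the normalization above (hypothetical device name); presumably the lowercasing and underscore-to-hyphen replacement is there because Kubernetes node names must be valid lowercase RFC 1123 DNS labels:

package main

import (
	"fmt"
	"strings"
)

func main() {
	deviceName := "My_Edge_Node" // hypothetical EdgeNodeInfo.DeviceName
	nodeName := strings.ReplaceAll(strings.ToLower(deviceName), "_", "-")
	fmt.Println(nodeName) // my-edge-node
}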
57 changes: 57 additions & 0 deletions pkg/pillar/cmd/zedagent/parseconfig.go
@@ -151,6 +151,10 @@ func parseConfig(getconfigCtx *getconfigContext, config *zconfig.EdgeDevConfig,

if source != fromBootstrap {
activateNewBaseOS := parseBaseOS(getconfigCtx, config)
// Parse EdgeNode Cluster configuration
parseEdgeNodeClusterConfig(getconfigCtx, config)

parseNetworkInstanceConfig(getconfigCtx, config)
parseContentInfoConfig(getconfigCtx, config)
parseVolumeConfig(getconfigCtx, config)
@@ -764,6 +768,10 @@ func parseAppInstanceConfig(getconfigCtx *getconfigContext,
// Add config submitted via local profile server.
addLocalAppConfig(getconfigCtx, &appInstance)

// XXX add Designated ID to the appInstance
// XXX Keep this here for now to allow the kubevirt single-node working, the later PR to EVE main will remove this
appInstance.DesignatedNodeID = devUUID

// Verify that it fits and if not publish with error
checkAndPublishAppInstanceConfig(getconfigCtx, appInstance)
}
@@ -3199,3 +3207,52 @@ func handleDeviceOperation(ctxPtr *zedagentContext, op types.DeviceOperation) {
shutdownAppsGlobal(ctxPtr)
// nothing else to be done
}

func parseEdgeNodeClusterConfig(getconfigCtx *getconfigContext,
config *zconfig.EdgeDevConfig) {

ctx := getconfigCtx.zedagentCtx
zcfgCluster := config.GetCluster()
if zcfgCluster == nil {
log.Functionf("parseEdgeNodeClusterConfig: Unpublishing EdgeNodeClusterConfig")
ctx.pubEdgeNodeClusterConfig.Unpublish("global")
return
}
ipAddr, ipNet, err := net.ParseCIDR(zcfgCluster.GetClusterIpPrefix())
if err != nil {
log.Errorf("parseEdgeNodeClusterConfig: ParseCIDR failed %s", err)
return
}
ipNet.IP = ipAddr

joinServerIP := net.ParseIP(zcfgCluster.GetJoinServerIp())
if joinServerIP == nil {
log.Errorf("handleEdgeNodeConfigItem: parse JoinServerIP failed")
return
}
var isJoinNode bool
// deduce the bootstrap node status from clusterIPPrefix and joinServerIP
if ipAddr.Equal(joinServerIP) {
isJoinNode = true
}

id, err := uuid.FromString(zcfgCluster.GetClusterId())
if err != nil {
log.Errorf("parseEdgeNodeClusterConfig: failed to parse UUID: %v", err)
return
}
enClusterConfig := types.EdgeNodeClusterConfig{
ClusterName: zcfgCluster.GetClusterName(),
ClusterID: types.UUIDandVersion{UUID: id},
ClusterInterface: zcfgCluster.GetClusterInterface(),
ClusterIPPrefix: ipNet,
IsWorkerNode: zcfgCluster.GetIsWorkerNode(),
JoinServerIP: joinServerIP,
BootstrapNode: isJoinNode,
// XXX EncryptedClusterToken is only for gcp config
}
enClusterConfig.CipherToken = parseCipherBlock(getconfigCtx,
enClusterConfig.Key(), zcfgCluster.GetEncryptedClusterToken())
log.Functionf("parseEdgeNodeClusterConfig: ENCluster API, Config %+v, %v", zcfgCluster, enClusterConfig)
ctx.pubEdgeNodeClusterConfig.Publish("global", enClusterConfig)
}
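One subtlety in the ClusterIPPrefix handling above: net.ParseCIDR masks the host bits in the returned *IPNet, so re-assigning ipNet.IP = ipAddr keeps the node's own address together with the prefix length. A small sketch with a hypothetical prefix:

package main

import (
	"fmt"
	"net"
)

func main() {
	ipAddr, ipNet, _ := net.ParseCIDR("10.244.1.7/24") // hypothetical cluster prefix
	fmt.Println(ipNet) // 10.244.1.0/24 - ParseCIDR returns the network address
	ipNet.IP = ipAddr
	fmt.Println(ipNet) // 10.244.1.7/24 - host address preserved with the prefix length
}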
13 changes: 13 additions & 0 deletions pkg/pillar/cmd/zedagent/zedagent.go
@@ -229,6 +229,9 @@ type zedagentContext struct {
// Is Kubevirt eve
hvTypeKube bool

// EN cluster config
pubEdgeNodeClusterConfig pubsub.Publication

// Netdump
netDumper *netdump.NetDumper // nil if netdump is disabled
netdumpInterval time.Duration
@@ -1103,6 +1106,16 @@ func initPublications(zedagentCtx *zedagentContext) {
}
getconfigCtx.pubZedAgentStatus.ClearRestarted()

zedagentCtx.pubEdgeNodeClusterConfig, err = ps.NewPublication(pubsub.PublicationOptions{
AgentName: agentName,
Persistent: true,
Contributor:
Why does this need to be persistent? That makes it much more painful to add fields to the EdgeNodeClusterConfig in future EVE releases.

TopicType: types.EdgeNodeClusterConfig{},
})
if err != nil {
log.Fatal(err)
}
zedagentCtx.pubEdgeNodeClusterConfig.ClearRestarted()

getconfigCtx.pubPhysicalIOAdapters, err = ps.NewPublication(pubsub.PublicationOptions{
AgentName: agentName,
TopicType: types.PhysicalIOAdapterList{},
6 changes: 3 additions & 3 deletions pkg/pillar/cmd/zedkube/applogs.go
@@ -154,7 +154,7 @@ func (z *zedkube) checkAppsStatus() {
}

pub := z.pubENClusterAppStatus
stItmes := pub.GetAll()
stItems := pub.GetAll()

Please update this function from the POC branch, it changed a lot.

Contributor Author:
@zedi-pramodh as discussed, since some of the type booleans changed, updating the POC code in applogs.go would require updating many other files. I'll leave it to your later PR to add those changes.

ok no problem, I will take care of those and submit my PR once this one is merged.

var oldStatus *types.ENClusterAppStatus
for _, item := range items {
aiconfig := item.(types.AppInstanceConfig)
@@ -179,7 +179,7 @@
}
}

for _, st := range stItmes {
for _, st := range stItems {
aiStatus := st.(types.ENClusterAppStatus)
if aiStatus.AppUUID == aiconfig.UUIDandVersion.UUID {
oldStatus = &aiStatus
@@ -204,7 +204,7 @@ func (z *zedkube) getnodeNameAndUUID() error {
return err
}
enInfo := NodeInfo.(types.EdgeNodeInfo)
z.nodeName = strings.ToLower(enInfo.DeviceName)
z.nodeName = strings.ReplaceAll(strings.ToLower(enInfo.DeviceName), "_", "-")
z.nodeuuid = enInfo.DeviceID.String()
}
return nil