From 003b2026ed2c80ef0b4901e0131a004f7358b552 Mon Sep 17 00:00:00 2001 From: Liran Rotenberg Date: Tue, 31 Oct 2023 14:50:38 +0200 Subject: [PATCH 1/4] Keep the populator pod on failure This patch will keep the populator pod existence when failing. It won't restart or recreated. This is in the mind of aligning with v2v pod failure and the thought that once we fail, we will most likely keep failing. This change will allow to get the populator logs in case of failure in a easy way. Signed-off-by: Liran Rotenberg --- pkg/controller/plan/migration.go | 18 ++++++++++++++++-- .../populator-machinery/BUILD.bazel | 1 - .../populator-machinery/controller.go | 8 -------- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/pkg/controller/plan/migration.go b/pkg/controller/plan/migration.go index 7fc33572f..07b340484 100644 --- a/pkg/controller/plan/migration.go +++ b/pkg/controller/plan/migration.go @@ -1434,7 +1434,7 @@ func (r *Migration) updateCopyProgress(vm *plan.VMStatus, step *plan.Step) (err if r.Plan.Spec.Warm && len(importer.Status.ContainerStatuses) > 0 { vm.Warm.Failures = int(importer.Status.ContainerStatuses[0].RestartCount) } - if RestartLimitExceeded(importer) { + if restartLimitExceeded(importer) { task.MarkedCompleted() msg, _ := terminationMessage(importer) task.AddError(msg) @@ -1579,6 +1579,10 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St if err != nil { return } + populatorPods, err := r.kubevirt.getPopulatorPods() + if err != nil { + return + } for _, pvc := range pvcs { if _, ok := pvc.Annotations["lun"]; ok { @@ -1597,6 +1601,16 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St continue } + for _, pod := range populatorPods { + pvcId := strings.Split(pod.Name, "populate-")[1] + if string(pvc.UID) != pvcId { + continue + } + if pod.Status.Phase == core.PodFailed { + return fmt.Errorf("populator pod %s/%s failed for PVC %s. Please check the pod logs.", pod.Namespace, pod.Name, pvcId) + } + } + if pvc.Status.Phase == core.ClaimBound { task.Phase = Completed task.Reason = TransferCompleted @@ -1682,7 +1696,7 @@ func terminationMessage(pod *core.Pod) (msg string, ok bool) { } // Return whether the pod has failed and restarted too many times. -func RestartLimitExceeded(pod *core.Pod) (exceeded bool) { +func restartLimitExceeded(pod *core.Pod) (exceeded bool) { if len(pod.Status.ContainerStatuses) == 0 { return } diff --git a/pkg/lib-volume-populator/populator-machinery/BUILD.bazel b/pkg/lib-volume-populator/populator-machinery/BUILD.bazel index d9ec31e6f..af0c99c7b 100644 --- a/pkg/lib-volume-populator/populator-machinery/BUILD.bazel +++ b/pkg/lib-volume-populator/populator-machinery/BUILD.bazel @@ -10,7 +10,6 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/apis/forklift/v1beta1", - "//pkg/controller/plan", "//vendor/k8s.io/api/core/v1:core", "//vendor/k8s.io/api/storage/v1:storage", "//vendor/k8s.io/apimachinery/pkg/api/errors", diff --git a/pkg/lib-volume-populator/populator-machinery/controller.go b/pkg/lib-volume-populator/populator-machinery/controller.go index 28d862585..44f40cb42 100644 --- a/pkg/lib-volume-populator/populator-machinery/controller.go +++ b/pkg/lib-volume-populator/populator-machinery/controller.go @@ -31,7 +31,6 @@ import ( "time" "github.com/konveyor/forklift-controller/pkg/apis/forklift/v1beta1" - "github.com/konveyor/forklift-controller/pkg/controller/plan" corev1 "k8s.io/api/core/v1" storagev1 "k8s.io/api/storage/v1" "k8s.io/apimachinery/pkg/api/errors" @@ -698,13 +697,6 @@ func (c *controller) syncPvc(ctx context.Context, key, pvcNamespace, pvcName str if corev1.PodSucceeded != pod.Status.Phase { if corev1.PodFailed == pod.Status.Phase { c.recorder.Eventf(pvc, corev1.EventTypeWarning, reasonPodFailed, "Populator failed: %s", pod.Status.Message) - // Delete failed pods so we can try again - if !plan.RestartLimitExceeded(pod) { - err = c.kubeClient.CoreV1().Pods(populatorNamespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}) - if err != nil { - return err - } - } } // We'll get called again later when the pod succeeds return nil From 90733c4dd6eb0efddfd8f995df22810e7d2ddfc7 Mon Sep 17 00:00:00 2001 From: Liran Rotenberg Date: Tue, 31 Oct 2023 15:41:40 +0200 Subject: [PATCH 2/4] Decreace updatePopulatorCopyProgress complexity Signed-off-by: Liran Rotenberg --- pkg/controller/plan/migration.go | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/pkg/controller/plan/migration.go b/pkg/controller/plan/migration.go index 07b340484..488e3a6fa 100644 --- a/pkg/controller/plan/migration.go +++ b/pkg/controller/plan/migration.go @@ -1601,14 +1601,9 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St continue } - for _, pod := range populatorPods { - pvcId := strings.Split(pod.Name, "populate-")[1] - if string(pvc.UID) != pvcId { - continue - } - if pod.Status.Phase == core.PodFailed { - return fmt.Errorf("populator pod %s/%s failed for PVC %s. Please check the pod logs.", pod.Namespace, pod.Name, pvcId) - } + err = r.isPopulatorFailed(populatorPods, string(pvc.UID)) + if err != nil { + return } if pvc.Status.Phase == core.ClaimBound { @@ -1633,6 +1628,19 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St return } +func (r *Migration) isPopulatorFailed(populatorPods []core.Pod, givenPvcId string) error { + for _, pod := range populatorPods { + pvcId := strings.Split(pod.Name, "populate-")[1] + if givenPvcId != pvcId { + continue + } + if pod.Status.Phase == core.PodFailed { + return fmt.Errorf("populator pod %s/%s failed for PVC %s. Please check the pod logs.", pod.Namespace, pod.Name, pvcId) + } + } + return nil +} + func (r *Migration) setPopulatorPodsWithLabels(vm *plan.VMStatus, migrationID string) { podList, err := r.kubevirt.GetPodsWithLabels(map[string]string{}) if err != nil { From bbdd9ce126e446515be9ea0157c0d59fe9c2d807 Mon Sep 17 00:00:00 2001 From: Liran Rotenberg Date: Wed, 1 Nov 2023 16:35:11 +0200 Subject: [PATCH 3/4] Refactor the code Signed-off-by: Liran Rotenberg --- pkg/controller/plan/kubevirt.go | 4 ++-- pkg/controller/plan/migration.go | 16 +++++++++------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pkg/controller/plan/kubevirt.go b/pkg/controller/plan/kubevirt.go index c71b2e463..e670314e7 100644 --- a/pkg/controller/plan/kubevirt.go +++ b/pkg/controller/plan/kubevirt.go @@ -862,7 +862,7 @@ func (r *KubeVirt) SetPopulatorPodOwnership(vm *plan.VMStatus) (err error) { return } for _, pod := range pods { - pvcId := strings.Split(pod.Name, "populate-")[1] + pvcId := pod.Name[len(PopulatorPodPrefix):] for _, pvc := range pvcs { if string(pvc.UID) != pvcId { continue @@ -898,7 +898,7 @@ func (r *KubeVirt) getPopulatorPods() (pods []core.Pod, err error) { return nil, liberr.Wrap(err) } for _, pod := range migrationPods.Items { - if strings.HasPrefix(pod.Name, "populate-") { + if strings.HasPrefix(pod.Name, PopulatorPodPrefix) { pods = append(pods, pod) } } diff --git a/pkg/controller/plan/migration.go b/pkg/controller/plan/migration.go index 488e3a6fa..8fca6f624 100644 --- a/pkg/controller/plan/migration.go +++ b/pkg/controller/plan/migration.go @@ -84,7 +84,8 @@ const ( ) const ( - TransferCompleted = "Transfer completed." + TransferCompleted = "Transfer completed." + PopulatorPodPrefix = "populate-" ) var ( @@ -1601,7 +1602,7 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St continue } - err = r.isPopulatorFailed(populatorPods, string(pvc.UID)) + _, err = r.isPopulatorFailed(populatorPods, string(pvc.UID)) if err != nil { return } @@ -1628,17 +1629,18 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St return } -func (r *Migration) isPopulatorFailed(populatorPods []core.Pod, givenPvcId string) error { +func (r *Migration) isPopulatorFailed(populatorPods []core.Pod, givenPvcId string) (bool, error) { for _, pod := range populatorPods { - pvcId := strings.Split(pod.Name, "populate-")[1] + pvcId := pod.Name[len(PopulatorPodPrefix):] if givenPvcId != pvcId { continue } if pod.Status.Phase == core.PodFailed { - return fmt.Errorf("populator pod %s/%s failed for PVC %s. Please check the pod logs.", pod.Namespace, pod.Name, pvcId) + return true, fmt.Errorf("populator pod %s/%s failed for PVC %s. Please check the pod logs.", pod.Namespace, pod.Name, pvcId) } + break } - return nil + return false, nil } func (r *Migration) setPopulatorPodsWithLabels(vm *plan.VMStatus, migrationID string) { @@ -1647,7 +1649,7 @@ func (r *Migration) setPopulatorPodsWithLabels(vm *plan.VMStatus, migrationID st return } for _, pod := range podList.Items { - if strings.HasPrefix(pod.Name, "populate-") { + if strings.HasPrefix(pod.Name, PopulatorPodPrefix) { // it's populator pod if _, ok := pod.Labels["migration"]; !ok { // un-labeled pod, we need to set it From c4d645d42723b609246306dc48a09127b19df47b Mon Sep 17 00:00:00 2001 From: Liran Rotenberg Date: Tue, 7 Nov 2023 18:27:16 +0200 Subject: [PATCH 4/4] Save annotation of restarts This change will count the number of restarts for the populator pod on the destination PVC. That allows us to limit the number of recreations of the pod. After 3 attempts, it will stop recreating it. Signed-off-by: Liran Rotenberg --- pkg/controller/plan/migration.go | 31 ++++++++++-------- .../populator-machinery/controller.go | 32 ++++++++++++++++++- 2 files changed, 49 insertions(+), 14 deletions(-) diff --git a/pkg/controller/plan/migration.go b/pkg/controller/plan/migration.go index 8fca6f624..c38131366 100644 --- a/pkg/controller/plan/migration.go +++ b/pkg/controller/plan/migration.go @@ -1580,10 +1580,6 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St if err != nil { return } - populatorPods, err := r.kubevirt.getPopulatorPods() - if err != nil { - return - } for _, pvc := range pvcs { if _, ok := pvc.Annotations["lun"]; ok { @@ -1602,11 +1598,6 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St continue } - _, err = r.isPopulatorFailed(populatorPods, string(pvc.UID)) - if err != nil { - return - } - if pvc.Status.Phase == core.ClaimBound { task.Phase = Completed task.Reason = TransferCompleted @@ -1622,25 +1613,39 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St } percent := float64(transferredBytes/0x100000) / float64(task.Progress.Total) - task.Progress.Completed = int64(percent * float64(task.Progress.Total)) + newProgress := int64(percent * float64(task.Progress.Total)) + if newProgress == task.Progress.Completed { + pvcId := string(pvc.UID) + populatorFailed := r.isPopulatorPodFailed(pvcId) + if populatorFailed { + return fmt.Errorf("populator pod failed for PVC %s. Please check the pod logs", pvcId) + } + } + task.Progress.Completed = newProgress } step.ReflectTasks() return } -func (r *Migration) isPopulatorFailed(populatorPods []core.Pod, givenPvcId string) (bool, error) { +// Checks if the populator pod failed when the progress didn't change +func (r *Migration) isPopulatorPodFailed(givenPvcId string) bool { + populatorPods, err := r.kubevirt.getPopulatorPods() + if err != nil { + r.Log.Error(err, "couldn't get the populator pods") + return false + } for _, pod := range populatorPods { pvcId := pod.Name[len(PopulatorPodPrefix):] if givenPvcId != pvcId { continue } if pod.Status.Phase == core.PodFailed { - return true, fmt.Errorf("populator pod %s/%s failed for PVC %s. Please check the pod logs.", pod.Namespace, pod.Name, pvcId) + return true } break } - return false, nil + return false } func (r *Migration) setPopulatorPodsWithLabels(vm *plan.VMStatus, migrationID string) { diff --git a/pkg/lib-volume-populator/populator-machinery/controller.go b/pkg/lib-volume-populator/populator-machinery/controller.go index 44f40cb42..e92f690b8 100644 --- a/pkg/lib-volume-populator/populator-machinery/controller.go +++ b/pkg/lib-volume-populator/populator-machinery/controller.go @@ -76,6 +76,7 @@ const ( reasonPVCCreationError = "PopulatorPVCCreationError" reasonPopulatorProgress = "PopulatorProgress" AnnDefaultNetwork = "v1.multus-cni.io/default-network" + AnnPopulatorReCreations = "recreations" qemuGroup = 107 ) @@ -696,7 +697,18 @@ func (c *controller) syncPvc(ctx context.Context, key, pvcNamespace, pvcName str if corev1.PodSucceeded != pod.Status.Phase { if corev1.PodFailed == pod.Status.Phase { - c.recorder.Eventf(pvc, corev1.EventTypeWarning, reasonPodFailed, "Populator failed: %s", pod.Status.Message) + restarts, ok := pvc.Annotations[AnnPopulatorReCreations] + if !ok { + return c.retryFailedPopulator(ctx, pvc, populatorNamespace, pod.Name, 1) + } + restartsInteger, err := strconv.Atoi(restarts) + if err != nil { + return err + } + if restartsInteger < 3 { + return c.retryFailedPopulator(ctx, pvc, populatorNamespace, pod.Name, restartsInteger+1) + } + c.recorder.Eventf(pvc, corev1.EventTypeWarning, reasonPodFailed, "Populator failed after few (3) attempts: Please check the logs of the populator pod, %s/%s", populatorNamespace, pod.Name) } // We'll get called again later when the pod succeeds return nil @@ -791,6 +803,24 @@ func (c *controller) syncPvc(ctx context.Context, key, pvcNamespace, pvcName str return nil } +func (c *controller) retryFailedPopulator(ctx context.Context, pvc *corev1.PersistentVolumeClaim, namespace, podName string, counter int) error { + pvc.Annotations[AnnPopulatorReCreations] = strconv.Itoa(counter) + err := c.updatePvc(ctx, pvc, namespace) + if err != nil { + return err + } + err = c.kubeClient.CoreV1().Pods(namespace).Delete(ctx, podName, metav1.DeleteOptions{}) + if err != nil { + return err + } + return nil +} + +func (c *controller) updatePvc(ctx context.Context, pvc *corev1.PersistentVolumeClaim, namespace string) (err error) { + _, err = c.kubeClient.CoreV1().PersistentVolumeClaims(namespace).Update(ctx, pvc, metav1.UpdateOptions{}) + return err +} + func (c *controller) updateProgress(pvc *corev1.PersistentVolumeClaim, podIP string, cr *unstructured.Unstructured) error { populatorKind := pvc.Spec.DataSourceRef.Kind var diskRegex = regexp.MustCompile(fmt.Sprintf(`volume_populators_%s\{%s=\"([0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12})"\} (\d{1,3}.*)`,