From 96c7491d700241c8365d9af8be0fae4a38846b51 Mon Sep 17 00:00:00 2001 From: Liran Rotenberg Date: Tue, 7 Nov 2023 18:27:16 +0200 Subject: [PATCH] Save annotation of restarts This change will count the number of restarts for the populator pod on the destination PVC. That allows us to limit the number of recreations of the pod. After 3 attempts, it will stop recreating it. Signed-off-by: Liran Rotenberg --- pkg/controller/plan/migration.go | 30 ++++++++------- .../populator-machinery/controller.go | 37 ++++++++++++++++++- 2 files changed, 53 insertions(+), 14 deletions(-) diff --git a/pkg/controller/plan/migration.go b/pkg/controller/plan/migration.go index 61df1d0e1..f10e258db 100644 --- a/pkg/controller/plan/migration.go +++ b/pkg/controller/plan/migration.go @@ -1599,10 +1599,6 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St if err != nil { return } - populatorPods, err := r.kubevirt.getPopulatorPods() - if err != nil { - return - } for _, pvc := range pvcs { if _, ok := pvc.Annotations["lun"]; ok { @@ -1621,11 +1617,6 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St continue } - _, err = r.isPopulatorFailed(populatorPods, string(pvc.UID)) - if err != nil { - return - } - if pvc.Status.Phase == core.ClaimBound { task.Phase = Completed task.Reason = TransferCompleted @@ -1641,25 +1632,38 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St } percent := float64(transferredBytes/0x100000) / float64(task.Progress.Total) - task.Progress.Completed = int64(percent * float64(task.Progress.Total)) + newProgress := int64(percent * float64(task.Progress.Total)) + err = r.checkPopulatorPod(newProgress, task.Progress.Completed, string(pvc.UID)) + if err != nil { + return + } + task.Progress.Completed = newProgress } step.ReflectTasks() return } -func (r *Migration) isPopulatorFailed(populatorPods []core.Pod, givenPvcId string) (bool, error) { +// Checks if the populator pod failed when the progress didn't change +func (r *Migration) checkPopulatorPod(newProgress, oldProgress int64, givenPvcId string) error { + if newProgress != oldProgress { + return nil + } + populatorPods, err := r.kubevirt.getPopulatorPods() + if err != nil { + return err + } for _, pod := range populatorPods { pvcId := pod.Name[len(PopulatorPodPrefix):] if givenPvcId != pvcId { continue } if pod.Status.Phase == core.PodFailed { - return true, fmt.Errorf("populator pod %s/%s failed for PVC %s. Please check the pod logs.", pod.Namespace, pod.Name, pvcId) + return fmt.Errorf("populator pod %s/%s failed for PVC %s. Please check the pod logs", pod.Namespace, pod.Name, pvcId) } break } - return false, nil + return nil } func (r *Migration) setPopulatorPodsWithLabels(vm *plan.VMStatus, migrationID string) { diff --git a/pkg/lib-volume-populator/populator-machinery/controller.go b/pkg/lib-volume-populator/populator-machinery/controller.go index 44f40cb42..8a45da493 100644 --- a/pkg/lib-volume-populator/populator-machinery/controller.go +++ b/pkg/lib-volume-populator/populator-machinery/controller.go @@ -76,6 +76,7 @@ const ( reasonPVCCreationError = "PopulatorPVCCreationError" reasonPopulatorProgress = "PopulatorProgress" AnnDefaultNetwork = "v1.multus-cni.io/default-network" + AnnPopulatorReCreations = "recreations" qemuGroup = 107 ) @@ -696,7 +697,20 @@ func (c *controller) syncPvc(ctx context.Context, key, pvcNamespace, pvcName str if corev1.PodSucceeded != pod.Status.Phase { if corev1.PodFailed == pod.Status.Phase { - c.recorder.Eventf(pvc, corev1.EventTypeWarning, reasonPodFailed, "Populator failed: %s", pod.Status.Message) + restarts, ok := pvc.Annotations[AnnPopulatorReCreations] + if !ok { + err = c.retryFailedPopulator(ctx, pvc, populatorNamespace, pod.Name, "1") + return err + } + restartsInteger, err := strconv.Atoi(restarts) + if err != nil { + return err + } + if restartsInteger < 3 { + err = c.retryFailedPopulator(ctx, pvc, populatorNamespace, pod.Name, strconv.Itoa(restartsInteger+1)) + return err + } + c.recorder.Eventf(pvc, corev1.EventTypeWarning, reasonPodFailed, "Populator failed after few (3) attempts: Please check the logs of the populator pod, %s/%s", populatorNamespace, pod.Name) } // We'll get called again later when the pod succeeds return nil @@ -791,6 +805,27 @@ func (c *controller) syncPvc(ctx context.Context, key, pvcNamespace, pvcName str return nil } +func (c *controller) retryFailedPopulator(ctx context.Context, pvc *corev1.PersistentVolumeClaim, namespace, podName, counter string) (err error) { + pvc.Annotations[AnnPopulatorReCreations] = counter + err = c.updatePvc(ctx, pvc, namespace) + if err != nil { + return err + } + err = c.kubeClient.CoreV1().Pods(namespace).Delete(ctx, podName, metav1.DeleteOptions{}) + if err != nil { + return err + } + return +} + +func (c *controller) updatePvc(ctx context.Context, pvc *corev1.PersistentVolumeClaim, namespace string) (err error) { + _, err = c.kubeClient.CoreV1().PersistentVolumeClaims(namespace).Update(ctx, pvc, metav1.UpdateOptions{}) + if err != nil { + return err + } + return +} + func (c *controller) updateProgress(pvc *corev1.PersistentVolumeClaim, podIP string, cr *unstructured.Unstructured) error { populatorKind := pvc.Spec.DataSourceRef.Kind var diskRegex = regexp.MustCompile(fmt.Sprintf(`volume_populators_%s\{%s=\"([0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12})"\} (\d{1,3}.*)`,