From d2b06afeee8340e2b19122b78a61324a286f681a Mon Sep 17 00:00:00 2001 From: Liran Rotenberg Date: Tue, 7 Nov 2023 18:27:16 +0200 Subject: [PATCH] Save annotation of restarts This change will count the number of restarts for the populator pod on the destination PVC. That allows us to limit the number of recreations of the pod. After 3 attempts, it will stop recreating it. Signed-off-by: Liran Rotenberg --- pkg/controller/plan/migration.go | 31 +++++++++------- .../populator-machinery/controller.go | 37 ++++++++++++++++++- 2 files changed, 54 insertions(+), 14 deletions(-) diff --git a/pkg/controller/plan/migration.go b/pkg/controller/plan/migration.go index 61df1d0e1..a29323c83 100644 --- a/pkg/controller/plan/migration.go +++ b/pkg/controller/plan/migration.go @@ -1599,10 +1599,6 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St if err != nil { return } - populatorPods, err := r.kubevirt.getPopulatorPods() - if err != nil { - return - } for _, pvc := range pvcs { if _, ok := pvc.Annotations["lun"]; ok { @@ -1621,11 +1617,6 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St continue } - _, err = r.isPopulatorFailed(populatorPods, string(pvc.UID)) - if err != nil { - return - } - if pvc.Status.Phase == core.ClaimBound { task.Phase = Completed task.Reason = TransferCompleted @@ -1641,25 +1632,39 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St } percent := float64(transferredBytes/0x100000) / float64(task.Progress.Total) - task.Progress.Completed = int64(percent * float64(task.Progress.Total)) + newProgress := int64(percent * float64(task.Progress.Total)) + if newProgress == task.Progress.Completed { + pvcId := string(pvc.UID) + populatorFailed := r.isPopulatorPodFailed(pvcId) + if populatorFailed { + return fmt.Errorf("populator pod failed for PVC %s. Please check the pod logs", pvcId) + } + } + task.Progress.Completed = newProgress } step.ReflectTasks() return } -func (r *Migration) isPopulatorFailed(populatorPods []core.Pod, givenPvcId string) (bool, error) { +// Checks if the populator pod failed when the progress didn't change +func (r *Migration) isPopulatorPodFailed(givenPvcId string) bool { + populatorPods, err := r.kubevirt.getPopulatorPods() + if err != nil { + r.Log.Error(err, "couldn't get the populator pods") + return false + } for _, pod := range populatorPods { pvcId := pod.Name[len(PopulatorPodPrefix):] if givenPvcId != pvcId { continue } if pod.Status.Phase == core.PodFailed { - return true, fmt.Errorf("populator pod %s/%s failed for PVC %s. Please check the pod logs.", pod.Namespace, pod.Name, pvcId) + return true } break } - return false, nil + return false } func (r *Migration) setPopulatorPodsWithLabels(vm *plan.VMStatus, migrationID string) { diff --git a/pkg/lib-volume-populator/populator-machinery/controller.go b/pkg/lib-volume-populator/populator-machinery/controller.go index 44f40cb42..8a45da493 100644 --- a/pkg/lib-volume-populator/populator-machinery/controller.go +++ b/pkg/lib-volume-populator/populator-machinery/controller.go @@ -76,6 +76,7 @@ const ( reasonPVCCreationError = "PopulatorPVCCreationError" reasonPopulatorProgress = "PopulatorProgress" AnnDefaultNetwork = "v1.multus-cni.io/default-network" + AnnPopulatorReCreations = "recreations" qemuGroup = 107 ) @@ -696,7 +697,20 @@ func (c *controller) syncPvc(ctx context.Context, key, pvcNamespace, pvcName str if corev1.PodSucceeded != pod.Status.Phase { if corev1.PodFailed == pod.Status.Phase { - c.recorder.Eventf(pvc, corev1.EventTypeWarning, reasonPodFailed, "Populator failed: %s", pod.Status.Message) + restarts, ok := pvc.Annotations[AnnPopulatorReCreations] + if !ok { + err = c.retryFailedPopulator(ctx, pvc, populatorNamespace, pod.Name, "1") + return err + } + restartsInteger, err := strconv.Atoi(restarts) + if err != nil { + return err + } + if restartsInteger < 3 { + err = c.retryFailedPopulator(ctx, pvc, populatorNamespace, pod.Name, strconv.Itoa(restartsInteger+1)) + return err + } + c.recorder.Eventf(pvc, corev1.EventTypeWarning, reasonPodFailed, "Populator failed after few (3) attempts: Please check the logs of the populator pod, %s/%s", populatorNamespace, pod.Name) } // We'll get called again later when the pod succeeds return nil @@ -791,6 +805,27 @@ func (c *controller) syncPvc(ctx context.Context, key, pvcNamespace, pvcName str return nil } +func (c *controller) retryFailedPopulator(ctx context.Context, pvc *corev1.PersistentVolumeClaim, namespace, podName, counter string) (err error) { + pvc.Annotations[AnnPopulatorReCreations] = counter + err = c.updatePvc(ctx, pvc, namespace) + if err != nil { + return err + } + err = c.kubeClient.CoreV1().Pods(namespace).Delete(ctx, podName, metav1.DeleteOptions{}) + if err != nil { + return err + } + return +} + +func (c *controller) updatePvc(ctx context.Context, pvc *corev1.PersistentVolumeClaim, namespace string) (err error) { + _, err = c.kubeClient.CoreV1().PersistentVolumeClaims(namespace).Update(ctx, pvc, metav1.UpdateOptions{}) + if err != nil { + return err + } + return +} + func (c *controller) updateProgress(pvc *corev1.PersistentVolumeClaim, podIP string, cr *unstructured.Unstructured) error { populatorKind := pvc.Spec.DataSourceRef.Kind var diskRegex = regexp.MustCompile(fmt.Sprintf(`volume_populators_%s\{%s=\"([0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12})"\} (\d{1,3}.*)`,