From c4d645d42723b609246306dc48a09127b19df47b Mon Sep 17 00:00:00 2001 From: Liran Rotenberg Date: Tue, 7 Nov 2023 18:27:16 +0200 Subject: [PATCH] Save annotation of restarts This change will count the number of restarts for the populator pod on the destination PVC. That allows us to limit the number of recreations of the pod. After 3 attempts, it will stop recreating it. Signed-off-by: Liran Rotenberg --- pkg/controller/plan/migration.go | 31 ++++++++++-------- .../populator-machinery/controller.go | 32 ++++++++++++++++++- 2 files changed, 49 insertions(+), 14 deletions(-) diff --git a/pkg/controller/plan/migration.go b/pkg/controller/plan/migration.go index 8fca6f624..c38131366 100644 --- a/pkg/controller/plan/migration.go +++ b/pkg/controller/plan/migration.go @@ -1580,10 +1580,6 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St if err != nil { return } - populatorPods, err := r.kubevirt.getPopulatorPods() - if err != nil { - return - } for _, pvc := range pvcs { if _, ok := pvc.Annotations["lun"]; ok { @@ -1602,11 +1598,6 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St continue } - _, err = r.isPopulatorFailed(populatorPods, string(pvc.UID)) - if err != nil { - return - } - if pvc.Status.Phase == core.ClaimBound { task.Phase = Completed task.Reason = TransferCompleted @@ -1622,25 +1613,39 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St } percent := float64(transferredBytes/0x100000) / float64(task.Progress.Total) - task.Progress.Completed = int64(percent * float64(task.Progress.Total)) + newProgress := int64(percent * float64(task.Progress.Total)) + if newProgress == task.Progress.Completed { + pvcId := string(pvc.UID) + populatorFailed := r.isPopulatorPodFailed(pvcId) + if populatorFailed { + return fmt.Errorf("populator pod failed for PVC %s. Please check the pod logs", pvcId) + } + } + task.Progress.Completed = newProgress } step.ReflectTasks() return } -func (r *Migration) isPopulatorFailed(populatorPods []core.Pod, givenPvcId string) (bool, error) { +// Checks if the populator pod failed when the progress didn't change +func (r *Migration) isPopulatorPodFailed(givenPvcId string) bool { + populatorPods, err := r.kubevirt.getPopulatorPods() + if err != nil { + r.Log.Error(err, "couldn't get the populator pods") + return false + } for _, pod := range populatorPods { pvcId := pod.Name[len(PopulatorPodPrefix):] if givenPvcId != pvcId { continue } if pod.Status.Phase == core.PodFailed { - return true, fmt.Errorf("populator pod %s/%s failed for PVC %s. Please check the pod logs.", pod.Namespace, pod.Name, pvcId) + return true } break } - return false, nil + return false } func (r *Migration) setPopulatorPodsWithLabels(vm *plan.VMStatus, migrationID string) { diff --git a/pkg/lib-volume-populator/populator-machinery/controller.go b/pkg/lib-volume-populator/populator-machinery/controller.go index 44f40cb42..e92f690b8 100644 --- a/pkg/lib-volume-populator/populator-machinery/controller.go +++ b/pkg/lib-volume-populator/populator-machinery/controller.go @@ -76,6 +76,7 @@ const ( reasonPVCCreationError = "PopulatorPVCCreationError" reasonPopulatorProgress = "PopulatorProgress" AnnDefaultNetwork = "v1.multus-cni.io/default-network" + AnnPopulatorReCreations = "recreations" qemuGroup = 107 ) @@ -696,7 +697,18 @@ func (c *controller) syncPvc(ctx context.Context, key, pvcNamespace, pvcName str if corev1.PodSucceeded != pod.Status.Phase { if corev1.PodFailed == pod.Status.Phase { - c.recorder.Eventf(pvc, corev1.EventTypeWarning, reasonPodFailed, "Populator failed: %s", pod.Status.Message) + restarts, ok := pvc.Annotations[AnnPopulatorReCreations] + if !ok { + return c.retryFailedPopulator(ctx, pvc, populatorNamespace, pod.Name, 1) + } + restartsInteger, err := strconv.Atoi(restarts) + if err != nil { + return err + } + if restartsInteger < 3 { + return c.retryFailedPopulator(ctx, pvc, populatorNamespace, pod.Name, restartsInteger+1) + } + c.recorder.Eventf(pvc, corev1.EventTypeWarning, reasonPodFailed, "Populator failed after few (3) attempts: Please check the logs of the populator pod, %s/%s", populatorNamespace, pod.Name) } // We'll get called again later when the pod succeeds return nil @@ -791,6 +803,24 @@ func (c *controller) syncPvc(ctx context.Context, key, pvcNamespace, pvcName str return nil } +func (c *controller) retryFailedPopulator(ctx context.Context, pvc *corev1.PersistentVolumeClaim, namespace, podName string, counter int) error { + pvc.Annotations[AnnPopulatorReCreations] = strconv.Itoa(counter) + err := c.updatePvc(ctx, pvc, namespace) + if err != nil { + return err + } + err = c.kubeClient.CoreV1().Pods(namespace).Delete(ctx, podName, metav1.DeleteOptions{}) + if err != nil { + return err + } + return nil +} + +func (c *controller) updatePvc(ctx context.Context, pvc *corev1.PersistentVolumeClaim, namespace string) (err error) { + _, err = c.kubeClient.CoreV1().PersistentVolumeClaims(namespace).Update(ctx, pvc, metav1.UpdateOptions{}) + return err +} + func (c *controller) updateProgress(pvc *corev1.PersistentVolumeClaim, podIP string, cr *unstructured.Unstructured) error { populatorKind := pvc.Spec.DataSourceRef.Kind var diskRegex = regexp.MustCompile(fmt.Sprintf(`volume_populators_%s\{%s=\"([0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12})"\} (\d{1,3}.*)`,