diff --git a/pkg/controller/plan/kubevirt.go b/pkg/controller/plan/kubevirt.go index c71b2e463..e670314e7 100644 --- a/pkg/controller/plan/kubevirt.go +++ b/pkg/controller/plan/kubevirt.go @@ -862,7 +862,7 @@ func (r *KubeVirt) SetPopulatorPodOwnership(vm *plan.VMStatus) (err error) { return } for _, pod := range pods { - pvcId := strings.Split(pod.Name, "populate-")[1] + pvcId := pod.Name[len(PopulatorPodPrefix):] for _, pvc := range pvcs { if string(pvc.UID) != pvcId { continue @@ -898,7 +898,7 @@ func (r *KubeVirt) getPopulatorPods() (pods []core.Pod, err error) { return nil, liberr.Wrap(err) } for _, pod := range migrationPods.Items { - if strings.HasPrefix(pod.Name, "populate-") { + if strings.HasPrefix(pod.Name, PopulatorPodPrefix) { pods = append(pods, pod) } } diff --git a/pkg/controller/plan/migration.go b/pkg/controller/plan/migration.go index 7fc33572f..c38131366 100644 --- a/pkg/controller/plan/migration.go +++ b/pkg/controller/plan/migration.go @@ -84,7 +84,8 @@ const ( ) const ( - TransferCompleted = "Transfer completed." + TransferCompleted = "Transfer completed." + PopulatorPodPrefix = "populate-" ) var ( @@ -1434,7 +1435,7 @@ func (r *Migration) updateCopyProgress(vm *plan.VMStatus, step *plan.Step) (err if r.Plan.Spec.Warm && len(importer.Status.ContainerStatuses) > 0 { vm.Warm.Failures = int(importer.Status.ContainerStatuses[0].RestartCount) } - if RestartLimitExceeded(importer) { + if restartLimitExceeded(importer) { task.MarkedCompleted() msg, _ := terminationMessage(importer) task.AddError(msg) @@ -1612,20 +1613,48 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St } percent := float64(transferredBytes/0x100000) / float64(task.Progress.Total) - task.Progress.Completed = int64(percent * float64(task.Progress.Total)) + newProgress := int64(percent * float64(task.Progress.Total)) + if newProgress == task.Progress.Completed { + pvcId := string(pvc.UID) + populatorFailed := r.isPopulatorPodFailed(pvcId) + if populatorFailed { + return fmt.Errorf("populator pod failed for PVC %s. Please check the pod logs", pvcId) + } + } + task.Progress.Completed = newProgress } step.ReflectTasks() return } +// Checks if the populator pod failed when the progress didn't change +func (r *Migration) isPopulatorPodFailed(givenPvcId string) bool { + populatorPods, err := r.kubevirt.getPopulatorPods() + if err != nil { + r.Log.Error(err, "couldn't get the populator pods") + return false + } + for _, pod := range populatorPods { + pvcId := pod.Name[len(PopulatorPodPrefix):] + if givenPvcId != pvcId { + continue + } + if pod.Status.Phase == core.PodFailed { + return true + } + break + } + return false +} + func (r *Migration) setPopulatorPodsWithLabels(vm *plan.VMStatus, migrationID string) { podList, err := r.kubevirt.GetPodsWithLabels(map[string]string{}) if err != nil { return } for _, pod := range podList.Items { - if strings.HasPrefix(pod.Name, "populate-") { + if strings.HasPrefix(pod.Name, PopulatorPodPrefix) { // it's populator pod if _, ok := pod.Labels["migration"]; !ok { // un-labeled pod, we need to set it @@ -1682,7 +1711,7 @@ func terminationMessage(pod *core.Pod) (msg string, ok bool) { } // Return whether the pod has failed and restarted too many times. -func RestartLimitExceeded(pod *core.Pod) (exceeded bool) { +func restartLimitExceeded(pod *core.Pod) (exceeded bool) { if len(pod.Status.ContainerStatuses) == 0 { return } diff --git a/pkg/lib-volume-populator/populator-machinery/BUILD.bazel b/pkg/lib-volume-populator/populator-machinery/BUILD.bazel index d9ec31e6f..af0c99c7b 100644 --- a/pkg/lib-volume-populator/populator-machinery/BUILD.bazel +++ b/pkg/lib-volume-populator/populator-machinery/BUILD.bazel @@ -10,7 +10,6 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/apis/forklift/v1beta1", - "//pkg/controller/plan", "//vendor/k8s.io/api/core/v1:core", "//vendor/k8s.io/api/storage/v1:storage", "//vendor/k8s.io/apimachinery/pkg/api/errors", diff --git a/pkg/lib-volume-populator/populator-machinery/controller.go b/pkg/lib-volume-populator/populator-machinery/controller.go index 28d862585..e92f690b8 100644 --- a/pkg/lib-volume-populator/populator-machinery/controller.go +++ b/pkg/lib-volume-populator/populator-machinery/controller.go @@ -31,7 +31,6 @@ import ( "time" "github.com/konveyor/forklift-controller/pkg/apis/forklift/v1beta1" - "github.com/konveyor/forklift-controller/pkg/controller/plan" corev1 "k8s.io/api/core/v1" storagev1 "k8s.io/api/storage/v1" "k8s.io/apimachinery/pkg/api/errors" @@ -77,6 +76,7 @@ const ( reasonPVCCreationError = "PopulatorPVCCreationError" reasonPopulatorProgress = "PopulatorProgress" AnnDefaultNetwork = "v1.multus-cni.io/default-network" + AnnPopulatorReCreations = "recreations" qemuGroup = 107 ) @@ -697,14 +697,18 @@ func (c *controller) syncPvc(ctx context.Context, key, pvcNamespace, pvcName str if corev1.PodSucceeded != pod.Status.Phase { if corev1.PodFailed == pod.Status.Phase { - c.recorder.Eventf(pvc, corev1.EventTypeWarning, reasonPodFailed, "Populator failed: %s", pod.Status.Message) - // Delete failed pods so we can try again - if !plan.RestartLimitExceeded(pod) { - err = c.kubeClient.CoreV1().Pods(populatorNamespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}) - if err != nil { - return err - } + restarts, ok := pvc.Annotations[AnnPopulatorReCreations] + if !ok { + return c.retryFailedPopulator(ctx, pvc, populatorNamespace, pod.Name, 1) + } + restartsInteger, err := strconv.Atoi(restarts) + if err != nil { + return err } + if restartsInteger < 3 { + return c.retryFailedPopulator(ctx, pvc, populatorNamespace, pod.Name, restartsInteger+1) + } + c.recorder.Eventf(pvc, corev1.EventTypeWarning, reasonPodFailed, "Populator failed after few (3) attempts: Please check the logs of the populator pod, %s/%s", populatorNamespace, pod.Name) } // We'll get called again later when the pod succeeds return nil @@ -799,6 +803,24 @@ func (c *controller) syncPvc(ctx context.Context, key, pvcNamespace, pvcName str return nil } +func (c *controller) retryFailedPopulator(ctx context.Context, pvc *corev1.PersistentVolumeClaim, namespace, podName string, counter int) error { + pvc.Annotations[AnnPopulatorReCreations] = strconv.Itoa(counter) + err := c.updatePvc(ctx, pvc, namespace) + if err != nil { + return err + } + err = c.kubeClient.CoreV1().Pods(namespace).Delete(ctx, podName, metav1.DeleteOptions{}) + if err != nil { + return err + } + return nil +} + +func (c *controller) updatePvc(ctx context.Context, pvc *corev1.PersistentVolumeClaim, namespace string) (err error) { + _, err = c.kubeClient.CoreV1().PersistentVolumeClaims(namespace).Update(ctx, pvc, metav1.UpdateOptions{}) + return err +} + func (c *controller) updateProgress(pvc *corev1.PersistentVolumeClaim, podIP string, cr *unstructured.Unstructured) error { populatorKind := pvc.Spec.DataSourceRef.Kind var diskRegex = regexp.MustCompile(fmt.Sprintf(`volume_populators_%s\{%s=\"([0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12})"\} (\d{1,3}.*)`,