Skip to content

Commit

Permalink
Save annotation of restarts
Browse files Browse the repository at this point in the history
This change records, as an annotation on the destination PVC, the number of
times the populator pod has been recreated. That allows us to limit the number
of recreations of the pod: after 3 attempts, it will stop recreating it.

Signed-off-by: Liran Rotenberg <[email protected]>
  • Loading branch information
liranr23 committed Nov 8, 2023
1 parent 92195e1 commit 96c7491
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 14 deletions.
30 changes: 17 additions & 13 deletions pkg/controller/plan/migration.go
Original file line number Diff line number Diff line change
Expand Up @@ -1599,10 +1599,6 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St
if err != nil {
return
}
populatorPods, err := r.kubevirt.getPopulatorPods()
if err != nil {
return
}

for _, pvc := range pvcs {
if _, ok := pvc.Annotations["lun"]; ok {
Expand All @@ -1621,11 +1617,6 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St
continue
}

_, err = r.isPopulatorFailed(populatorPods, string(pvc.UID))
if err != nil {
return
}

if pvc.Status.Phase == core.ClaimBound {
task.Phase = Completed
task.Reason = TransferCompleted
Expand All @@ -1641,25 +1632,38 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St
}

percent := float64(transferredBytes/0x100000) / float64(task.Progress.Total)
task.Progress.Completed = int64(percent * float64(task.Progress.Total))
newProgress := int64(percent * float64(task.Progress.Total))
err = r.checkPopulatorPod(newProgress, task.Progress.Completed, string(pvc.UID))
if err != nil {
return
}
task.Progress.Completed = newProgress
}

step.ReflectTasks()
return
}

func (r *Migration) isPopulatorFailed(populatorPods []core.Pod, givenPvcId string) (bool, error) {
// Checks if the populator pod failed when the progress didn't change
func (r *Migration) checkPopulatorPod(newProgress, oldProgress int64, givenPvcId string) error {
if newProgress != oldProgress {
return nil
}
populatorPods, err := r.kubevirt.getPopulatorPods()
if err != nil {
return err
}
for _, pod := range populatorPods {
pvcId := pod.Name[len(PopulatorPodPrefix):]
if givenPvcId != pvcId {
continue
}
if pod.Status.Phase == core.PodFailed {
return true, fmt.Errorf("populator pod %s/%s failed for PVC %s. Please check the pod logs.", pod.Namespace, pod.Name, pvcId)
return fmt.Errorf("populator pod %s/%s failed for PVC %s. Please check the pod logs", pod.Namespace, pod.Name, pvcId)
}
break
}
return false, nil
return nil
}

func (r *Migration) setPopulatorPodsWithLabels(vm *plan.VMStatus, migrationID string) {
Expand Down
37 changes: 36 additions & 1 deletion pkg/lib-volume-populator/populator-machinery/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ const (
reasonPVCCreationError = "PopulatorPVCCreationError"
reasonPopulatorProgress = "PopulatorProgress"
AnnDefaultNetwork = "v1.multus-cni.io/default-network"
AnnPopulatorReCreations = "recreations"

qemuGroup = 107
)
Expand Down Expand Up @@ -696,7 +697,20 @@ func (c *controller) syncPvc(ctx context.Context, key, pvcNamespace, pvcName str

if corev1.PodSucceeded != pod.Status.Phase {
if corev1.PodFailed == pod.Status.Phase {
c.recorder.Eventf(pvc, corev1.EventTypeWarning, reasonPodFailed, "Populator failed: %s", pod.Status.Message)
restarts, ok := pvc.Annotations[AnnPopulatorReCreations]
if !ok {
err = c.retryFailedPopulator(ctx, pvc, populatorNamespace, pod.Name, "1")
return err
}
restartsInteger, err := strconv.Atoi(restarts)
if err != nil {
return err
}
if restartsInteger < 3 {
err = c.retryFailedPopulator(ctx, pvc, populatorNamespace, pod.Name, strconv.Itoa(restartsInteger+1))
return err
}
c.recorder.Eventf(pvc, corev1.EventTypeWarning, reasonPodFailed, "Populator failed after few (3) attempts: Please check the logs of the populator pod, %s/%s", populatorNamespace, pod.Name)
}
// We'll get called again later when the pod succeeds
return nil
Expand Down Expand Up @@ -791,6 +805,27 @@ func (c *controller) syncPvc(ctx context.Context, key, pvcNamespace, pvcName str
return nil
}

// retryFailedPopulator records a recreation attempt for a failed populator pod
// and then deletes that pod.
//
// counter is stored under the AnnPopulatorReCreations annotation of the PVC,
// the PVC is persisted through updatePvc, and finally the pod podName in
// namespace is deleted. The annotation is written before the pod is deleted,
// so if the update fails the pod is left in place and the error is returned.
//
// The annotation is set on a deep copy of pvc, so the caller's object is never
// modified. NOTE(review): pvc presumably comes from a shared informer cache,
// whose objects must not be mutated — confirm against the caller.
//
// Returns a wrapped error if either the PVC update or the pod deletion fails.
func (c *controller) retryFailedPopulator(ctx context.Context, pvc *corev1.PersistentVolumeClaim, namespace, podName, counter string) (err error) {
	updated := pvc.DeepCopy()
	// A PVC without any annotation has a nil map; writing to it would panic.
	if updated.Annotations == nil {
		updated.Annotations = make(map[string]string, 1)
	}
	updated.Annotations[AnnPopulatorReCreations] = counter

	if err = c.updatePvc(ctx, updated, namespace); err != nil {
		return fmt.Errorf("recording populator recreation %s on PVC %s/%s: %w", counter, namespace, pvc.Name, err)
	}
	if err = c.kubeClient.CoreV1().Pods(namespace).Delete(ctx, podName, metav1.DeleteOptions{}); err != nil {
		return fmt.Errorf("deleting failed populator pod %s/%s: %w", namespace, podName, err)
	}
	return nil
}

// updatePvc submits pvc through c.kubeClient's PersistentVolumeClaims client
// scoped to namespace. The object returned by the Update call is discarded;
// only the call's error (nil on success) is returned.
func (c *controller) updatePvc(ctx context.Context, pvc *corev1.PersistentVolumeClaim, namespace string) (err error) {
	claims := c.kubeClient.CoreV1().PersistentVolumeClaims(namespace)
	_, err = claims.Update(ctx, pvc, metav1.UpdateOptions{})
	return err
}

func (c *controller) updateProgress(pvc *corev1.PersistentVolumeClaim, podIP string, cr *unstructured.Unstructured) error {
populatorKind := pvc.Spec.DataSourceRef.Kind
var diskRegex = regexp.MustCompile(fmt.Sprintf(`volume_populators_%s\{%s=\"([0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12})"\} (\d{1,3}.*)`,
Expand Down

0 comments on commit 96c7491

Please sign in to comment.