Skip to content

Commit

Permalink
Save annotation of restarts
Browse files Browse the repository at this point in the history
This change counts how many times the populator pod has been recreated and
stores that count as an annotation on the destination PVC. That allows us to
limit the number of recreations of the pod: after 3 attempts, it will stop recreating it.

Signed-off-by: Liran Rotenberg <[email protected]>
  • Loading branch information
liranr23 committed Nov 12, 2023
1 parent 92195e1 commit d2b06af
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 14 deletions.
31 changes: 18 additions & 13 deletions pkg/controller/plan/migration.go
Original file line number Diff line number Diff line change
Expand Up @@ -1599,10 +1599,6 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St
if err != nil {
return
}
populatorPods, err := r.kubevirt.getPopulatorPods()
if err != nil {
return
}

for _, pvc := range pvcs {
if _, ok := pvc.Annotations["lun"]; ok {
Expand All @@ -1621,11 +1617,6 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St
continue
}

_, err = r.isPopulatorFailed(populatorPods, string(pvc.UID))
if err != nil {
return
}

if pvc.Status.Phase == core.ClaimBound {
task.Phase = Completed
task.Reason = TransferCompleted
Expand All @@ -1641,25 +1632,39 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St
}

percent := float64(transferredBytes/0x100000) / float64(task.Progress.Total)
task.Progress.Completed = int64(percent * float64(task.Progress.Total))
newProgress := int64(percent * float64(task.Progress.Total))
if newProgress == task.Progress.Completed {
pvcId := string(pvc.UID)
populatorFailed := r.isPopulatorPodFailed(pvcId)
if populatorFailed {
return fmt.Errorf("populator pod failed for PVC %s. Please check the pod logs", pvcId)
}
}
task.Progress.Completed = newProgress
}

step.ReflectTasks()
return
}

func (r *Migration) isPopulatorFailed(populatorPods []core.Pod, givenPvcId string) (bool, error) {
// Checks if the populator pod failed when the progress didn't change
func (r *Migration) isPopulatorPodFailed(givenPvcId string) bool {
populatorPods, err := r.kubevirt.getPopulatorPods()
if err != nil {
r.Log.Error(err, "couldn't get the populator pods")
return false
}
for _, pod := range populatorPods {
pvcId := pod.Name[len(PopulatorPodPrefix):]
if givenPvcId != pvcId {
continue
}
if pod.Status.Phase == core.PodFailed {
return true, fmt.Errorf("populator pod %s/%s failed for PVC %s. Please check the pod logs.", pod.Namespace, pod.Name, pvcId)
return true
}
break
}
return false, nil
return false
}

func (r *Migration) setPopulatorPodsWithLabels(vm *plan.VMStatus, migrationID string) {
Expand Down
37 changes: 36 additions & 1 deletion pkg/lib-volume-populator/populator-machinery/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ const (
reasonPVCCreationError = "PopulatorPVCCreationError"
reasonPopulatorProgress = "PopulatorProgress"
AnnDefaultNetwork = "v1.multus-cni.io/default-network"
AnnPopulatorReCreations = "recreations"

qemuGroup = 107
)
Expand Down Expand Up @@ -696,7 +697,20 @@ func (c *controller) syncPvc(ctx context.Context, key, pvcNamespace, pvcName str

if corev1.PodSucceeded != pod.Status.Phase {
if corev1.PodFailed == pod.Status.Phase {
c.recorder.Eventf(pvc, corev1.EventTypeWarning, reasonPodFailed, "Populator failed: %s", pod.Status.Message)
restarts, ok := pvc.Annotations[AnnPopulatorReCreations]
if !ok {
err = c.retryFailedPopulator(ctx, pvc, populatorNamespace, pod.Name, "1")
return err
}
restartsInteger, err := strconv.Atoi(restarts)
if err != nil {
return err
}
if restartsInteger < 3 {
err = c.retryFailedPopulator(ctx, pvc, populatorNamespace, pod.Name, strconv.Itoa(restartsInteger+1))
return err
}
c.recorder.Eventf(pvc, corev1.EventTypeWarning, reasonPodFailed, "Populator failed after few (3) attempts: Please check the logs of the populator pod, %s/%s", populatorNamespace, pod.Name)
}
// We'll get called again later when the pod succeeds
return nil
Expand Down Expand Up @@ -791,6 +805,27 @@ func (c *controller) syncPvc(ctx context.Context, key, pvcNamespace, pvcName str
return nil
}

// retryFailedPopulator records a recreation attempt on the PVC and then
// deletes the failed populator pod.
//
// It stores counter in the AnnPopulatorReCreations annotation of pvc, persists
// the PVC through updatePvc, and only once that update has succeeded deletes
// the pod named podName in namespace. The first error encountered is returned
// and the remaining steps are skipped, so the pod is never deleted without the
// counter having been saved first.
//
// NOTE(review): namespace is used for both the PVC update and the pod
// deletion; confirm the PVC and the populator pod always share a namespace.
func (c *controller) retryFailedPopulator(ctx context.Context, pvc *corev1.PersistentVolumeClaim, namespace, podName, counter string) (err error) {
	// A PVC that carries no annotations has a nil map, and assigning into a
	// nil map panics, so allocate it before the first write.
	if pvc.Annotations == nil {
		pvc.Annotations = make(map[string]string, 1)
	}
	pvc.Annotations[AnnPopulatorReCreations] = counter

	if err = c.updatePvc(ctx, pvc, namespace); err != nil {
		return err
	}
	// Presumably the controller recreates the pod on a later sync once it is
	// gone — TODO confirm against syncPvc.
	return c.kubeClient.CoreV1().Pods(namespace).Delete(ctx, podName, metav1.DeleteOptions{})
}

// updatePvc submits pvc as an update through the PersistentVolumeClaims client
// scoped to namespace. The object returned by the API call is discarded; only
// the error (nil on success) is reported to the caller.
func (c *controller) updatePvc(ctx context.Context, pvc *corev1.PersistentVolumeClaim, namespace string) (err error) {
	claims := c.kubeClient.CoreV1().PersistentVolumeClaims(namespace)
	_, err = claims.Update(ctx, pvc, metav1.UpdateOptions{})
	return err
}

func (c *controller) updateProgress(pvc *corev1.PersistentVolumeClaim, podIP string, cr *unstructured.Unstructured) error {
populatorKind := pvc.Spec.DataSourceRef.Kind
var diskRegex = regexp.MustCompile(fmt.Sprintf(`volume_populators_%s\{%s=\"([0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12})"\} (\d{1,3}.*)`,
Expand Down

0 comments on commit d2b06af

Please sign in to comment.