Skip to content

Commit

Permalink
Save annotation of restarts
Browse files Browse the repository at this point in the history
This change records, as an annotation on the destination PVC, the number of
times the populator pod has been re-created. That allows us to limit the number
of re-creations of the pod: after 3 attempts, it will stop re-creating it.

Signed-off-by: Liran Rotenberg <[email protected]>
  • Loading branch information
liranr23 committed Nov 12, 2023
1 parent 92195e1 commit c677f4f
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 14 deletions.
31 changes: 18 additions & 13 deletions pkg/controller/plan/migration.go
Original file line number Diff line number Diff line change
Expand Up @@ -1599,10 +1599,6 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St
if err != nil {
return
}
populatorPods, err := r.kubevirt.getPopulatorPods()
if err != nil {
return
}

for _, pvc := range pvcs {
if _, ok := pvc.Annotations["lun"]; ok {
Expand All @@ -1621,11 +1617,6 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St
continue
}

_, err = r.isPopulatorFailed(populatorPods, string(pvc.UID))
if err != nil {
return
}

if pvc.Status.Phase == core.ClaimBound {
task.Phase = Completed
task.Reason = TransferCompleted
Expand All @@ -1641,25 +1632,39 @@ func (r *Migration) updatePopulatorCopyProgress(vm *plan.VMStatus, step *plan.St
}

percent := float64(transferredBytes/0x100000) / float64(task.Progress.Total)
task.Progress.Completed = int64(percent * float64(task.Progress.Total))
newProgress := int64(percent * float64(task.Progress.Total))
if newProgress == task.Progress.Completed {
pvcId := string(pvc.UID)
populatorFailed := r.isPopulatorPodFailed(pvcId)
if populatorFailed {
return fmt.Errorf("populator pod failed for PVC %s. Please check the pod logs", pvcId)
}
}
task.Progress.Completed = newProgress
}

step.ReflectTasks()
return
}

func (r *Migration) isPopulatorFailed(populatorPods []core.Pod, givenPvcId string) (bool, error) {
// Checks if the populator pod failed when the progress didn't change
func (r *Migration) isPopulatorPodFailed(givenPvcId string) bool {
populatorPods, err := r.kubevirt.getPopulatorPods()
if err != nil {
r.Log.Error(err, "couldn't get the populator pods")
return false
}
for _, pod := range populatorPods {
pvcId := pod.Name[len(PopulatorPodPrefix):]
if givenPvcId != pvcId {
continue
}
if pod.Status.Phase == core.PodFailed {
return true, fmt.Errorf("populator pod %s/%s failed for PVC %s. Please check the pod logs.", pod.Namespace, pod.Name, pvcId)
return true
}
break
}
return false, nil
return false
}

func (r *Migration) setPopulatorPodsWithLabels(vm *plan.VMStatus, migrationID string) {
Expand Down
35 changes: 34 additions & 1 deletion pkg/lib-volume-populator/populator-machinery/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ const (
reasonPVCCreationError = "PopulatorPVCCreationError"
reasonPopulatorProgress = "PopulatorProgress"
AnnDefaultNetwork = "v1.multus-cni.io/default-network"
AnnPopulatorReCreations = "recreations"

qemuGroup = 107
)
Expand Down Expand Up @@ -696,7 +697,18 @@ func (c *controller) syncPvc(ctx context.Context, key, pvcNamespace, pvcName str

if corev1.PodSucceeded != pod.Status.Phase {
if corev1.PodFailed == pod.Status.Phase {
c.recorder.Eventf(pvc, corev1.EventTypeWarning, reasonPodFailed, "Populator failed: %s", pod.Status.Message)
restarts, ok := pvc.Annotations[AnnPopulatorReCreations]
if !ok {
return c.retryFailedPopulator(ctx, pvc, populatorNamespace, pod.Name, 1)
}
restartsInteger, err := strconv.Atoi(restarts)
if err != nil {
return err
}
if restartsInteger < 3 {
return c.retryFailedPopulator(ctx, pvc, populatorNamespace, pod.Name, restartsInteger+1)
}
c.recorder.Eventf(pvc, corev1.EventTypeWarning, reasonPodFailed, "Populator failed after few (3) attempts: Please check the logs of the populator pod, %s/%s", populatorNamespace, pod.Name)
}
// We'll get called again later when the pod succeeds
return nil
Expand Down Expand Up @@ -791,6 +803,27 @@ func (c *controller) syncPvc(ctx context.Context, key, pvcNamespace, pvcName str
return nil
}

// retryFailedPopulator records a re-creation attempt for a failed populator
// pod and then deletes that pod.
//
// It stores counter under the AnnPopulatorReCreations annotation of pvc,
// persists the PVC through updatePvc, and finally deletes the pod named
// podName in namespace. The annotation is written before the pod is deleted,
// so a failed PVC update leaves the pod untouched. The first error hit is
// returned wrapped with context; nil is returned when both calls succeed.
func (c *controller) retryFailedPopulator(ctx context.Context, pvc *corev1.PersistentVolumeClaim, namespace, podName string, counter int) error {
	// A PVC that carries no annotations has a nil map, and assigning into a
	// nil map panics, so allocate it before recording the counter.
	if pvc.Annotations == nil {
		pvc.Annotations = make(map[string]string, 1)
	}
	pvc.Annotations[AnnPopulatorReCreations] = strconv.Itoa(counter)

	if err := c.updatePvc(ctx, pvc, namespace); err != nil {
		return fmt.Errorf("recording populator re-creation %d on PVC %s/%s: %w", counter, namespace, pvc.Name, err)
	}
	if err := c.kubeClient.CoreV1().Pods(namespace).Delete(ctx, podName, metav1.DeleteOptions{}); err != nil {
		return fmt.Errorf("deleting failed populator pod %s/%s: %w", namespace, podName, err)
	}
	return nil
}

// updatePvc sends pvc to the API server as an update of the
// PersistentVolumeClaim in namespace. The updated object returned by the
// client is discarded; only the error (nil on success) is reported.
func (c *controller) updatePvc(ctx context.Context, pvc *corev1.PersistentVolumeClaim, namespace string) (err error) {
	claims := c.kubeClient.CoreV1().PersistentVolumeClaims(namespace)
	_, err = claims.Update(ctx, pvc, metav1.UpdateOptions{})
	return err
}

func (c *controller) updateProgress(pvc *corev1.PersistentVolumeClaim, podIP string, cr *unstructured.Unstructured) error {
populatorKind := pvc.Spec.DataSourceRef.Kind
var diskRegex = regexp.MustCompile(fmt.Sprintf(`volume_populators_%s\{%s=\"([0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12})"\} (\d{1,3}.*)`,
Expand Down

0 comments on commit c677f4f

Please sign in to comment.