From 963bf4cd98380153bc0912cca35caf1db26410cc Mon Sep 17 00:00:00 2001 From: Martin Necas Date: Tue, 3 Dec 2024 15:05:58 +0100 Subject: [PATCH] MTV-1717 | Wait for DV status Issue: When CDI is managing lot of DVs at once it can take some time before the CDI reconciles the DVs status and start the import. In the function `updateCopyProgress` we check the DV status and if it is paused we do continue and as next phase we have snapshot removal. So if the DV is still in paused state and we start the cutover and MTV skips over the phase thinking it already finished as the DV is in paused. Fix: Add WaitForDataVolumesStatus phase to wait until the DVs do not have the paused status. Ref: https://issues.redhat.com/browse/MTV-1717 https://github.com/kubev2v/forklift/blob/ea83264881b3dcd4a88dd627c1ca75da2483f408/pkg/controller/plan/migration.go#L1573-L1575 Signed-off-by: Martin Necas --- .../forkliftcontroller/defaults/main.yml | 1 + .../controller/deployment-controller.yml.j2 | 4 + pkg/controller/plan/migration.go | 131 +++++++++++++----- pkg/settings/migration.go | 6 + 4 files changed, 106 insertions(+), 36 deletions(-) diff --git a/operator/roles/forkliftcontroller/defaults/main.yml b/operator/roles/forkliftcontroller/defaults/main.yml index 1b0828f0d..8963a66f9 100644 --- a/operator/roles/forkliftcontroller/defaults/main.yml +++ b/operator/roles/forkliftcontroller/defaults/main.yml @@ -34,6 +34,7 @@ controller_precopy_interval: 60 controller_snapshot_removal_timeout_minuts: 120 controller_snapshot_status_check_rate_seconds: 10 controller_cleanup_retries: 10 +controller_dv_status_check_retries: 10 controller_vsphere_incremental_backup: true controller_ovirt_warm_migration: true controller_max_vm_inflight: 20 diff --git a/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 b/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 index 7082b2b8b..824fb9a8e 100644 --- a/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 +++ b/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 @@ -81,6 +81,10 @@ spec: - name: CLEANUP_RETRIES value: "{{ controller_cleanup_retries }}" {% endif %} +{% if controller_dv_status_check_retries is number %} + - name: DV_STATUS_CHECK_RETRIES + value: "{{ controller_dv_status_check_retries }}" +{% endif %} {% if controller_max_vm_inflight is number %} - name: MAX_VM_INFLIGHT value: "{{ controller_max_vm_inflight }}" diff --git a/pkg/controller/plan/migration.go b/pkg/controller/plan/migration.go index 4ef55f4cb..1b9f56131 100644 --- a/pkg/controller/plan/migration.go +++ b/pkg/controller/plan/migration.go @@ -55,36 +55,38 @@ var ( // Phases. const ( - Started = "Started" - PreHook = "PreHook" - StorePowerState = "StorePowerState" - PowerOffSource = "PowerOffSource" - WaitForPowerOff = "WaitForPowerOff" - CreateDataVolumes = "CreateDataVolumes" - CreateVM = "CreateVM" - CopyDisks = "CopyDisks" - AllocateDisks = "AllocateDisks" - CopyingPaused = "CopyingPaused" - AddCheckpoint = "AddCheckpoint" - AddFinalCheckpoint = "AddFinalCheckpoint" - CreateSnapshot = "CreateSnapshot" - CreateInitialSnapshot = "CreateInitialSnapshot" - CreateFinalSnapshot = "CreateFinalSnapshot" - Finalize = "Finalize" - CreateGuestConversionPod = "CreateGuestConversionPod" - ConvertGuest = "ConvertGuest" - CopyDisksVirtV2V = "CopyDisksVirtV2V" - PostHook = "PostHook" - Completed = "Completed" - WaitForSnapshot = "WaitForSnapshot" - WaitForInitialSnapshot = "WaitForInitialSnapshot" - WaitForFinalSnapshot = "WaitForFinalSnapshot" - ConvertOpenstackSnapshot = "ConvertOpenstackSnapshot" - StoreSnapshotDeltas = "StoreSnapshotDeltas" - StoreInitialSnapshotDeltas = "StoreInitialSnapshotDeltas" - RemovePreviousSnapshot = "RemovePreviousSnapshot" - RemovePenultimateSnapshot = "RemovePenultimateSnapshot" - RemoveFinalSnapshot = "RemoveFinalSnapshot" + Started = "Started" + PreHook = "PreHook" + StorePowerState = "StorePowerState" + PowerOffSource = "PowerOffSource" + WaitForPowerOff = "WaitForPowerOff" + CreateDataVolumes = "CreateDataVolumes" + WaitForDataVolumesStatus = "WaitForDataVolumesStatus" + WaitForFinalDataVolumesStatus = "WaitForFinalDataVolumesStatus" + CreateVM = "CreateVM" + CopyDisks = "CopyDisks" + AllocateDisks = "AllocateDisks" + CopyingPaused = "CopyingPaused" + AddCheckpoint = "AddCheckpoint" + AddFinalCheckpoint = "AddFinalCheckpoint" + CreateSnapshot = "CreateSnapshot" + CreateInitialSnapshot = "CreateInitialSnapshot" + CreateFinalSnapshot = "CreateFinalSnapshot" + Finalize = "Finalize" + CreateGuestConversionPod = "CreateGuestConversionPod" + ConvertGuest = "ConvertGuest" + CopyDisksVirtV2V = "CopyDisksVirtV2V" + PostHook = "PostHook" + Completed = "Completed" + WaitForSnapshot = "WaitForSnapshot" + WaitForInitialSnapshot = "WaitForInitialSnapshot" + WaitForFinalSnapshot = "WaitForFinalSnapshot" + ConvertOpenstackSnapshot = "ConvertOpenstackSnapshot" + StoreSnapshotDeltas = "StoreSnapshotDeltas" + StoreInitialSnapshotDeltas = "StoreInitialSnapshotDeltas" + RemovePreviousSnapshot = "RemovePreviousSnapshot" + RemovePenultimateSnapshot = "RemovePenultimateSnapshot" + RemoveFinalSnapshot = "RemoveFinalSnapshot" ) // Steps. @@ -100,8 +102,9 @@ const ( ) const ( - TransferCompleted = "Transfer completed." - PopulatorPodPrefix = "populate-" + TransferCompleted = "Transfer completed." + PopulatorPodPrefix = "populate-" + DvStatusCheckRetriesAnnotation = "dvStatusCheckRetries" ) var ( @@ -134,6 +137,7 @@ var ( {Name: WaitForInitialSnapshot}, {Name: StoreInitialSnapshotDeltas, All: VSphere}, {Name: CreateDataVolumes}, + {Name: WaitForDataVolumesStatus}, {Name: CopyDisks}, {Name: CopyingPaused}, {Name: RemovePreviousSnapshot, All: VSphere}, @@ -148,6 +152,7 @@ var ( {Name: CreateFinalSnapshot}, {Name: WaitForFinalSnapshot}, {Name: AddFinalCheckpoint}, + {Name: WaitForFinalDataVolumesStatus}, {Name: Finalize}, {Name: RemoveFinalSnapshot, All: VSphere}, {Name: CreateGuestConversionPod, All: RequiresConversion}, @@ -662,9 +667,9 @@ func (r *Migration) step(vm *plan.VMStatus) (step string) { step = Initialize case AllocateDisks: step = DiskAllocation - case CopyDisks, CopyingPaused, RemovePreviousSnapshot, CreateSnapshot, WaitForSnapshot, StoreSnapshotDeltas, AddCheckpoint, ConvertOpenstackSnapshot: + case CopyDisks, CopyingPaused, RemovePreviousSnapshot, CreateSnapshot, WaitForSnapshot, StoreSnapshotDeltas, AddCheckpoint, ConvertOpenstackSnapshot, WaitForDataVolumesStatus: step = DiskTransfer - case RemovePenultimateSnapshot, CreateFinalSnapshot, WaitForFinalSnapshot, AddFinalCheckpoint, Finalize, RemoveFinalSnapshot: + case RemovePenultimateSnapshot, CreateFinalSnapshot, WaitForFinalSnapshot, AddFinalCheckpoint, Finalize, RemoveFinalSnapshot, WaitForFinalDataVolumesStatus: step = Cutover case CreateGuestConversionPod, ConvertGuest: step = ImageConversion @@ -1039,6 +1044,51 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) { if ready { vm.Phase = r.next(vm.Phase) } + case WaitForDataVolumesStatus, WaitForFinalDataVolumesStatus: + step, found := vm.FindStep(r.step(vm)) + if !found { + vm.AddError(fmt.Sprintf("Step '%s' not found", r.step(vm))) + break + } + + dvs, err := r.kubevirt.getDVs(vm) + if err != nil { + step.AddError(err.Error()) + err = nil + break + } + if !r.hasPausedDv(dvs) { + vm.Phase = r.next(vm.Phase) + // Reset for next precopy + step.Annotations[DvStatusCheckRetriesAnnotation] = "0" + } else { + var retries int + retriesAnnotation := step.Annotations[DvStatusCheckRetriesAnnotation] + if retriesAnnotation != "" { + retries, err = strconv.Atoi(retriesAnnotation) + if err != nil { + step.AddError(err.Error()) + err = nil + break + } + if retries >= settings.Settings.DvStatusCheckRetries { + // Do not fail the step as this can happen when the user runs the warm migration but the VM is already shutdown + // In that case we don't create any delta and don't change the CDI DV status. + r.Log.Info( + "DataVolume status check exceeded the retry limit."+ + "If this causes the problems with the snapshot removal in the CDI please bump the controller_dv_status_check_retries.", + "vm", + vm.String()) + vm.Phase = r.next(vm.Phase) + // Reset for next precopy + step.Annotations[DvStatusCheckRetriesAnnotation] = "0" + } else { + step.Annotations[DvStatusCheckRetriesAnnotation] = strconv.Itoa(retries + 1) + } + } else { + step.Annotations[DvStatusCheckRetriesAnnotation] = "0" + } + } case StoreInitialSnapshotDeltas, StoreSnapshotDeltas: step, found := vm.FindStep(r.step(vm)) if !found { @@ -1073,9 +1123,9 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) { switch vm.Phase { case AddCheckpoint: - vm.Phase = CopyDisks + vm.Phase = WaitForDataVolumesStatus case AddFinalCheckpoint: - vm.Phase = Finalize + vm.Phase = WaitForFinalDataVolumesStatus } case StorePowerState: step, found := vm.FindStep(r.step(vm)) @@ -1259,6 +1309,15 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) { return } +func (r *Migration) hasPausedDv(dvs []ExtendedDataVolume) bool { + for _, dv := range dvs { + if dv.Status.Phase == Paused { + return true + } + } + return false +} + func (r *Migration) resetPrecopyTasks(vm *plan.VMStatus, step *plan.Step) { step.Completed = nil for _, task := range step.Tasks { diff --git a/pkg/settings/migration.go b/pkg/settings/migration.go index 70413e400..0b8096156 100644 --- a/pkg/settings/migration.go +++ b/pkg/settings/migration.go @@ -24,6 +24,7 @@ const ( FileSystemOverhead = "FILESYSTEM_OVERHEAD" BlockOverhead = "BLOCK_OVERHEAD" CleanupRetries = "CLEANUP_RETRIES" + DvStatusCheckRetries = "DV_STATUS_CHECK_RETRIES" OvirtOsConfigMap = "OVIRT_OS_MAP" VsphereOsConfigMap = "VSPHERE_OS_MAP" VirtCustomizeConfigMap = "VIRT_CUSTOMIZE_MAP" @@ -58,6 +59,8 @@ type Migration struct { BlockOverhead int64 // Cleanup retries CleanupRetries int + // DvStatusCheckRetries retries + DvStatusCheckRetries int // oVirt OS config map name OvirtOsConfigMap string // vSphere OS config map name @@ -100,6 +103,9 @@ func (r *Migration) Load() (err error) { if r.CleanupRetries, err = getPositiveEnvLimit(CleanupRetries, 10); err != nil { return liberr.Wrap(err) } + if r.DvStatusCheckRetries, err = getPositiveEnvLimit(DvStatusCheckRetries, 10); err != nil { + return liberr.Wrap(err) + } if virtV2vImage, ok := os.LookupEnv(VirtV2vImage); ok { r.VirtV2vImage = virtV2vImage } else if Settings.Role.Has(MainRole) {