MTV-1717 | Wait for DV status
Issue: When CDI is managing a lot of DVs at once, it can take some time
before CDI reconciles the DV status and starts the import. In the
function `updateCopyProgress` we check the DV status, and if it is paused
we continue, with snapshot removal as the next phase. So if the DV is
still paused when we start the cutover, MTV skips over the phase,
thinking it has already finished because the DV is paused.

Fix: Add a WaitForDataVolumesStatus phase that waits until the DVs no
longer report the paused status.

Ref: https://issues.redhat.com/browse/MTV-1717

https://github.com/kubev2v/forklift/blob/ea83264881b3dcd4a88dd627c1ca75da2483f408/pkg/controller/plan/migration.go#L1573-L1575

Signed-off-by: Martin Necas <[email protected]>
mnecas committed Dec 9, 2024
1 parent f18f41e commit b2c304d
Showing 4 changed files with 106 additions and 36 deletions.
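
In essence, the new phase polls the DataVolumes each reconcile and only advances once none of them report the Paused phase, giving up after a configurable number of retries. Below is a minimal, self-contained sketch of that retry logic; the `DataVolume` type, `waitForDataVolumesStatus` function, and plain integer retry counter are illustrative stand-ins, not the controller's actual API (which tracks retries in a step annotation).

```go
package main

import "fmt"

// DataVolume is a stand-in for the CDI DataVolume status we care about here.
type DataVolume struct {
	Name  string
	Phase string // e.g. "Paused", "ImportInProgress", "Succeeded"
}

// hasPausedDV reports whether any DataVolume is still in the Paused phase.
func hasPausedDV(dvs []DataVolume) bool {
	for _, dv := range dvs {
		if dv.Phase == "Paused" {
			return true
		}
	}
	return false
}

// waitForDataVolumesStatus models one reconcile pass of the new phase:
// advance when no DV is paused; otherwise bump a retry counter and, once the
// limit is reached, move on anyway instead of failing the migration.
func waitForDataVolumesStatus(dvs []DataVolume, retries, maxRetries int) (advance bool, nextRetries int) {
	if !hasPausedDV(dvs) {
		return true, 1 // reset the counter for the next precopy
	}
	if retries >= maxRetries {
		fmt.Println("DV status check exceeded the retry limit; continuing anyway")
		return true, 1
	}
	return false, retries + 1
}

func main() {
	dvs := []DataVolume{{Name: "disk-0", Phase: "Paused"}}
	advance, retries := waitForDataVolumesStatus(dvs, 1, 10)
	fmt.Println(advance, retries) // false 2 — requeue and check again later
}
```

Exhausting the retries advances the phase rather than failing it: per the comment in the change, a warm migration of a VM that is already shut down produces no deltas, so CDI never changes the DV status and waiting forever would only stall the plan.
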
1 change: 1 addition & 0 deletions operator/roles/forkliftcontroller/defaults/main.yml
@@ -34,6 +34,7 @@ controller_precopy_interval: 60
controller_snapshot_removal_timeout_minuts: 120
controller_snapshot_status_check_rate_seconds: 10
controller_cleanup_retries: 10
controller_dv_status_check_retries: 10
controller_vsphere_incremental_backup: true
controller_ovirt_warm_migration: true
controller_retain_precopy_importer_pods: false
@@ -81,6 +81,10 @@ spec:
- name: CLEANUP_RETRIES
value: "{{ controller_cleanup_retries }}"
{% endif %}
{% if controller_dv_status_check_retries is number %}
- name: DV_STATUS_CHECK_RETRIES
value: "{{ controller_dv_status_check_retries }}"
{% endif %}
{% if controller_max_vm_inflight is number %}
- name: MAX_VM_INFLIGHT
value: "{{ controller_max_vm_inflight }}"
131 changes: 95 additions & 36 deletions pkg/controller/plan/migration.go
@@ -55,36 +55,38 @@ var (

// Phases.
const (
Started = "Started"
PreHook = "PreHook"
StorePowerState = "StorePowerState"
PowerOffSource = "PowerOffSource"
WaitForPowerOff = "WaitForPowerOff"
CreateDataVolumes = "CreateDataVolumes"
CreateVM = "CreateVM"
CopyDisks = "CopyDisks"
AllocateDisks = "AllocateDisks"
CopyingPaused = "CopyingPaused"
AddCheckpoint = "AddCheckpoint"
AddFinalCheckpoint = "AddFinalCheckpoint"
CreateSnapshot = "CreateSnapshot"
CreateInitialSnapshot = "CreateInitialSnapshot"
CreateFinalSnapshot = "CreateFinalSnapshot"
Finalize = "Finalize"
CreateGuestConversionPod = "CreateGuestConversionPod"
ConvertGuest = "ConvertGuest"
CopyDisksVirtV2V = "CopyDisksVirtV2V"
PostHook = "PostHook"
Completed = "Completed"
WaitForSnapshot = "WaitForSnapshot"
WaitForInitialSnapshot = "WaitForInitialSnapshot"
WaitForFinalSnapshot = "WaitForFinalSnapshot"
ConvertOpenstackSnapshot = "ConvertOpenstackSnapshot"
StoreSnapshotDeltas = "StoreSnapshotDeltas"
StoreInitialSnapshotDeltas = "StoreInitialSnapshotDeltas"
RemovePreviousSnapshot = "RemovePreviousSnapshot"
RemovePenultimateSnapshot = "RemovePenultimateSnapshot"
RemoveFinalSnapshot = "RemoveFinalSnapshot"
Started = "Started"
PreHook = "PreHook"
StorePowerState = "StorePowerState"
PowerOffSource = "PowerOffSource"
WaitForPowerOff = "WaitForPowerOff"
CreateDataVolumes = "CreateDataVolumes"
WaitForDataVolumesStatus = "WaitForDataVolumesStatus"
WaitForFinalDataVolumesStatus = "WaitForFinalDataVolumesStatus"
CreateVM = "CreateVM"
CopyDisks = "CopyDisks"
AllocateDisks = "AllocateDisks"
CopyingPaused = "CopyingPaused"
AddCheckpoint = "AddCheckpoint"
AddFinalCheckpoint = "AddFinalCheckpoint"
CreateSnapshot = "CreateSnapshot"
CreateInitialSnapshot = "CreateInitialSnapshot"
CreateFinalSnapshot = "CreateFinalSnapshot"
Finalize = "Finalize"
CreateGuestConversionPod = "CreateGuestConversionPod"
ConvertGuest = "ConvertGuest"
CopyDisksVirtV2V = "CopyDisksVirtV2V"
PostHook = "PostHook"
Completed = "Completed"
WaitForSnapshot = "WaitForSnapshot"
WaitForInitialSnapshot = "WaitForInitialSnapshot"
WaitForFinalSnapshot = "WaitForFinalSnapshot"
ConvertOpenstackSnapshot = "ConvertOpenstackSnapshot"
StoreSnapshotDeltas = "StoreSnapshotDeltas"
StoreInitialSnapshotDeltas = "StoreInitialSnapshotDeltas"
RemovePreviousSnapshot = "RemovePreviousSnapshot"
RemovePenultimateSnapshot = "RemovePenultimateSnapshot"
RemoveFinalSnapshot = "RemoveFinalSnapshot"
)

// Steps.
@@ -100,8 +102,9 @@ const (
)

const (
TransferCompleted = "Transfer completed."
PopulatorPodPrefix = "populate-"
TransferCompleted = "Transfer completed."
PopulatorPodPrefix = "populate-"
DvStatusCheckRetriesAnnotation = "dvStatusCheckRetries"
)

var (
@@ -134,6 +137,7 @@ var (
{Name: WaitForInitialSnapshot},
{Name: StoreInitialSnapshotDeltas, All: VSphere},
{Name: CreateDataVolumes},
{Name: WaitForDataVolumesStatus},
{Name: CopyDisks},
{Name: CopyingPaused},
{Name: RemovePreviousSnapshot, All: VSphere},
@@ -148,6 +152,7 @@ var (
{Name: CreateFinalSnapshot},
{Name: WaitForFinalSnapshot},
{Name: AddFinalCheckpoint},
{Name: WaitForFinalDataVolumesStatus},
{Name: Finalize},
{Name: RemoveFinalSnapshot, All: VSphere},
{Name: CreateGuestConversionPod, All: RequiresConversion},
@@ -662,9 +667,9 @@ func (r *Migration) step(vm *plan.VMStatus) (step string) {
step = Initialize
case AllocateDisks:
step = DiskAllocation
case CopyDisks, CopyingPaused, RemovePreviousSnapshot, CreateSnapshot, WaitForSnapshot, StoreSnapshotDeltas, AddCheckpoint, ConvertOpenstackSnapshot:
case CopyDisks, CopyingPaused, RemovePreviousSnapshot, CreateSnapshot, WaitForSnapshot, StoreSnapshotDeltas, AddCheckpoint, ConvertOpenstackSnapshot, WaitForDataVolumesStatus:
step = DiskTransfer
case RemovePenultimateSnapshot, CreateFinalSnapshot, WaitForFinalSnapshot, AddFinalCheckpoint, Finalize, RemoveFinalSnapshot:
case RemovePenultimateSnapshot, CreateFinalSnapshot, WaitForFinalSnapshot, AddFinalCheckpoint, Finalize, RemoveFinalSnapshot, WaitForFinalDataVolumesStatus:
step = Cutover
case CreateGuestConversionPod, ConvertGuest:
step = ImageConversion
@@ -1039,6 +1044,51 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) {
if ready {
vm.Phase = r.next(vm.Phase)
}
case WaitForDataVolumesStatus, WaitForFinalDataVolumesStatus:
step, found := vm.FindStep(r.step(vm))
if !found {
vm.AddError(fmt.Sprintf("Step '%s' not found", r.step(vm)))
break
}

dvs, err := r.kubevirt.getDVs(vm)
if err != nil {
step.AddError(err.Error())
err = nil
break
}
if !r.hasPausedDv(dvs) {
vm.Phase = r.next(vm.Phase)
// Reset for next precopy
step.Annotations[DvStatusCheckRetriesAnnotation] = "1"
} else {
var retries int
retriesAnnotation := step.Annotations[DvStatusCheckRetriesAnnotation]
if retriesAnnotation == "" {
step.Annotations[DvStatusCheckRetriesAnnotation] = "1"
} else {
retries, err = strconv.Atoi(retriesAnnotation)
if err != nil {
step.AddError(err.Error())
err = nil
break
}
if retries >= settings.Settings.DvStatusCheckRetries {
// Do not fail the step, as this can happen when the user runs a warm migration but the VM is already shut down.
// In that case we don't create any delta and don't change the CDI DV status.
r.Log.Info(
"DataVolume status check exceeded the retry limit. "+
"If this causes problems with snapshot removal in CDI, please bump controller_dv_status_check_retries.",
"vm",
vm.String())
vm.Phase = r.next(vm.Phase)
// Reset for next precopy
step.Annotations[DvStatusCheckRetriesAnnotation] = "1"
} else {
step.Annotations[DvStatusCheckRetriesAnnotation] = strconv.Itoa(retries + 1)
}
}
}
case StoreInitialSnapshotDeltas, StoreSnapshotDeltas:
step, found := vm.FindStep(r.step(vm))
if !found {
@@ -1073,9 +1123,9 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) {

switch vm.Phase {
case AddCheckpoint:
vm.Phase = CopyDisks
vm.Phase = WaitForDataVolumesStatus
case AddFinalCheckpoint:
vm.Phase = Finalize
vm.Phase = WaitForFinalDataVolumesStatus
}
case StorePowerState:
step, found := vm.FindStep(r.step(vm))
@@ -1259,6 +1309,15 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) {
return
}

func (r *Migration) hasPausedDv(dvs []ExtendedDataVolume) bool {
for _, dv := range dvs {
if dv.Status.Phase == Paused {
return true
}
}
return false
}

func (r *Migration) resetPrecopyTasks(vm *plan.VMStatus, step *plan.Step) {
step.Completed = nil
for _, task := range step.Tasks {
6 changes: 6 additions & 0 deletions pkg/settings/migration.go
@@ -24,6 +24,7 @@ const (
FileSystemOverhead = "FILESYSTEM_OVERHEAD"
BlockOverhead = "BLOCK_OVERHEAD"
CleanupRetries = "CLEANUP_RETRIES"
DvStatusCheckRetries = "DV_STATUS_CHECK_RETRIES"
OvirtOsConfigMap = "OVIRT_OS_MAP"
VsphereOsConfigMap = "VSPHERE_OS_MAP"
VirtCustomizeConfigMap = "VIRT_CUSTOMIZE_MAP"
@@ -58,6 +59,8 @@ type Migration struct {
BlockOverhead int64
// Cleanup retries
CleanupRetries int
// DV status check retries
DvStatusCheckRetries int
// oVirt OS config map name
OvirtOsConfigMap string
// vSphere OS config map name
@@ -100,6 +103,9 @@ func (r *Migration) Load() (err error) {
if r.CleanupRetries, err = getPositiveEnvLimit(CleanupRetries, 10); err != nil {
return liberr.Wrap(err)
}
if r.DvStatusCheckRetries, err = getPositiveEnvLimit(DvStatusCheckRetries, 10); err != nil {
return liberr.Wrap(err)
}
if virtV2vImage, ok := os.LookupEnv(VirtV2vImage); ok {
r.VirtV2vImage = virtV2vImage
} else if Settings.Role.Has(MainRole) {
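
The retry cap itself comes from the DV_STATUS_CHECK_RETRIES environment variable, which the operator populates from `controller_dv_status_check_retries` (defaulting to 10 in both places). A rough sketch of parsing such a positive-integer limit is shown below; the `positiveEnvLimit` helper is a hypothetical stand-in for the controller's internal `getPositiveEnvLimit`, and unlike that helper it silently falls back to the default rather than returning an error.

```go
package main

import (
	"fmt"
	"os"
	"strconv"
)

// positiveEnvLimit reads an integer environment variable and falls back to a
// default when the variable is unset, non-numeric, or not positive.
func positiveEnvLimit(name string, def int) int {
	raw, ok := os.LookupEnv(name)
	if !ok {
		return def
	}
	n, err := strconv.Atoi(raw)
	if err != nil || n <= 0 {
		return def
	}
	return n
}

func main() {
	retries := positiveEnvLimit("DV_STATUS_CHECK_RETRIES", 10)
	fmt.Println("DV status check retries:", retries)
}
```
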
