diff --git a/operator/roles/forkliftcontroller/defaults/main.yml b/operator/roles/forkliftcontroller/defaults/main.yml index 92db25c0f..86f7c8b39 100644 --- a/operator/roles/forkliftcontroller/defaults/main.yml +++ b/operator/roles/forkliftcontroller/defaults/main.yml @@ -35,6 +35,7 @@ controller_snapshot_removal_timeout_minuts: 120 controller_snapshot_status_check_rate_seconds: 10 controller_cleanup_retries: 10 controller_dv_status_check_retries: 10 +controller_snapshot_removal_check_retries: 20 controller_vsphere_incremental_backup: true controller_ovirt_warm_migration: true controller_retain_precopy_importer_pods: false diff --git a/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 b/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 index 84cc6d480..28bfbcc77 100644 --- a/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 +++ b/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 @@ -85,6 +85,10 @@ spec: - name: DV_STATUS_CHECK_RETRIES value: "{{ controller_dv_status_check_retries }}" {% endif %} +{% if controller_snapshot_removal_check_retries is number %} + - name: SNAPSHOT_REMOVAL_CHECK_RETRIES + value: "{{ controller_snapshot_removal_check_retries }}" +{% endif %} {% if controller_max_vm_inflight is number %} - name: MAX_VM_INFLIGHT value: "{{ controller_max_vm_inflight }}" diff --git a/pkg/controller/plan/migration.go b/pkg/controller/plan/migration.go index 0a7c7878b..979afe3bd 100644 --- a/pkg/controller/plan/migration.go +++ b/pkg/controller/plan/migration.go @@ -55,38 +55,41 @@ var ( // Phases. const ( - Started = "Started" - PreHook = "PreHook" - StorePowerState = "StorePowerState" - PowerOffSource = "PowerOffSource" - WaitForPowerOff = "WaitForPowerOff" - CreateDataVolumes = "CreateDataVolumes" - WaitForDataVolumesStatus = "WaitForDataVolumesStatus" - WaitForFinalDataVolumesStatus = "WaitForFinalDataVolumesStatus" - CreateVM = "CreateVM" - CopyDisks = "CopyDisks" - AllocateDisks = "AllocateDisks" - CopyingPaused = "CopyingPaused" - AddCheckpoint = "AddCheckpoint" - AddFinalCheckpoint = "AddFinalCheckpoint" - CreateSnapshot = "CreateSnapshot" - CreateInitialSnapshot = "CreateInitialSnapshot" - CreateFinalSnapshot = "CreateFinalSnapshot" - Finalize = "Finalize" - CreateGuestConversionPod = "CreateGuestConversionPod" - ConvertGuest = "ConvertGuest" - CopyDisksVirtV2V = "CopyDisksVirtV2V" - PostHook = "PostHook" - Completed = "Completed" - WaitForSnapshot = "WaitForSnapshot" - WaitForInitialSnapshot = "WaitForInitialSnapshot" - WaitForFinalSnapshot = "WaitForFinalSnapshot" - ConvertOpenstackSnapshot = "ConvertOpenstackSnapshot" - StoreSnapshotDeltas = "StoreSnapshotDeltas" - StoreInitialSnapshotDeltas = "StoreInitialSnapshotDeltas" - RemovePreviousSnapshot = "RemovePreviousSnapshot" - RemovePenultimateSnapshot = "RemovePenultimateSnapshot" - RemoveFinalSnapshot = "RemoveFinalSnapshot" + Started = "Started" + PreHook = "PreHook" + StorePowerState = "StorePowerState" + PowerOffSource = "PowerOffSource" + WaitForPowerOff = "WaitForPowerOff" + CreateDataVolumes = "CreateDataVolumes" + WaitForDataVolumesStatus = "WaitForDataVolumesStatus" + WaitForFinalDataVolumesStatus = "WaitForFinalDataVolumesStatus" + CreateVM = "CreateVM" + CopyDisks = "CopyDisks" + AllocateDisks = "AllocateDisks" + CopyingPaused = "CopyingPaused" + AddCheckpoint = "AddCheckpoint" + AddFinalCheckpoint = "AddFinalCheckpoint" + CreateSnapshot = "CreateSnapshot" + CreateInitialSnapshot = "CreateInitialSnapshot" + CreateFinalSnapshot = "CreateFinalSnapshot" + Finalize = "Finalize" + CreateGuestConversionPod = "CreateGuestConversionPod" + ConvertGuest = "ConvertGuest" + CopyDisksVirtV2V = "CopyDisksVirtV2V" + PostHook = "PostHook" + Completed = "Completed" + WaitForSnapshot = "WaitForSnapshot" + WaitForInitialSnapshot = "WaitForInitialSnapshot" + WaitForFinalSnapshot = "WaitForFinalSnapshot" + ConvertOpenstackSnapshot = "ConvertOpenstackSnapshot" + StoreSnapshotDeltas = "StoreSnapshotDeltas" + StoreInitialSnapshotDeltas = "StoreInitialSnapshotDeltas" + RemovePreviousSnapshot = "RemovePreviousSnapshot" + RemovePenultimateSnapshot = "RemovePenultimateSnapshot" + RemoveFinalSnapshot = "RemoveFinalSnapshot" + WaitForFinalSnapshotRemoval = "WaitForFinalSnapshotRemoval" + WaitForPreviousSnapshotRemoval = "WaitForPreviousSnapshotRemoval" + WaitForPenultimateSnapshotRemoval = "WaitForPenultimateSnapshotRemoval" ) // Steps. @@ -105,6 +108,7 @@ const ( TransferCompleted = "Transfer completed." PopulatorPodPrefix = "populate-" DvStatusCheckRetriesAnnotation = "dvStatusCheckRetries" + SnapshotRemovalCheckRetries = "snapshotRemovalCheckRetries" ) var ( @@ -141,6 +145,7 @@ var ( {Name: CopyDisks}, {Name: CopyingPaused}, {Name: RemovePreviousSnapshot, All: VSphere}, + {Name: WaitForPreviousSnapshotRemoval, All: VSphere}, {Name: CreateSnapshot}, {Name: WaitForSnapshot}, {Name: StoreSnapshotDeltas, All: VSphere}, @@ -149,12 +154,14 @@ var ( {Name: PowerOffSource}, {Name: WaitForPowerOff}, {Name: RemovePenultimateSnapshot, All: VSphere}, + {Name: WaitForPenultimateSnapshotRemoval, All: VSphere}, {Name: CreateFinalSnapshot}, {Name: WaitForFinalSnapshot}, {Name: AddFinalCheckpoint}, {Name: WaitForFinalDataVolumesStatus}, {Name: Finalize}, {Name: RemoveFinalSnapshot, All: VSphere}, + {Name: WaitForFinalSnapshotRemoval, All: VSphere}, {Name: CreateGuestConversionPod, All: RequiresConversion}, {Name: ConvertGuest, All: RequiresConversion}, {Name: CreateVM}, @@ -667,9 +674,9 @@ func (r *Migration) step(vm *plan.VMStatus) (step string) { step = Initialize case AllocateDisks: step = DiskAllocation - case CopyDisks, CopyingPaused, RemovePreviousSnapshot, CreateSnapshot, WaitForSnapshot, StoreSnapshotDeltas, AddCheckpoint, ConvertOpenstackSnapshot, WaitForDataVolumesStatus: + case CopyDisks, CopyingPaused, RemovePreviousSnapshot, WaitForPreviousSnapshotRemoval, CreateSnapshot, WaitForSnapshot, StoreSnapshotDeltas, AddCheckpoint, ConvertOpenstackSnapshot, WaitForDataVolumesStatus: step = DiskTransfer - case RemovePenultimateSnapshot, CreateFinalSnapshot, WaitForFinalSnapshot, AddFinalCheckpoint, Finalize, RemoveFinalSnapshot, WaitForFinalDataVolumesStatus: + case RemovePenultimateSnapshot, WaitForPenultimateSnapshotRemoval, CreateFinalSnapshot, WaitForFinalSnapshot, AddFinalCheckpoint, Finalize, RemoveFinalSnapshot, WaitForFinalSnapshotRemoval, WaitForFinalDataVolumesStatus: step = Cutover case CreateGuestConversionPod, ConvertGuest: step = ImageConversion @@ -1008,6 +1015,34 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) { break } vm.Phase = r.next(vm.Phase) + case WaitForPreviousSnapshotRemoval, WaitForPenultimateSnapshotRemoval, WaitForFinalSnapshotRemoval: + step, found := vm.FindStep(r.step(vm)) + if !found { + vm.AddError(fmt.Sprintf("Step '%s' not found", r.step(vm))) + break + } + // FIXME: This is just temporary timeout to unblock the migrations which get stuck on issue https://issues.redhat.com/browse/MTV-1753 + // This should be fixed properly by adding the task manager inside the inventory and monitor the task status + // from the main controller. + var retries int + retriesAnnotation := step.Annotations[SnapshotRemovalCheckRetries] + if retriesAnnotation == "" { + step.Annotations[SnapshotRemovalCheckRetries] = "1" + } else { + retries, err = strconv.Atoi(retriesAnnotation) + if err != nil { + step.AddError(err.Error()) + err = nil + break + } + if retries >= settings.Settings.SnapshotRemovalCheckRetries { + vm.Phase = r.next(vm.Phase) + // Reset for next precopy + step.Annotations[SnapshotRemovalCheckRetries] = "1" + } else { + step.Annotations[SnapshotRemovalCheckRetries] = strconv.Itoa(retries + 1) + } + } case CreateInitialSnapshot, CreateSnapshot, CreateFinalSnapshot: step, found := vm.FindStep(r.step(vm)) if !found { diff --git a/pkg/settings/migration.go b/pkg/settings/migration.go index 0b8096156..354eadd18 100644 --- a/pkg/settings/migration.go +++ b/pkg/settings/migration.go @@ -12,25 +12,26 @@ import ( // Environment variables. const ( - MaxVmInFlight = "MAX_VM_INFLIGHT" - HookRetry = "HOOK_RETRY" - ImporterRetry = "IMPORTER_RETRY" - VirtV2vImage = "VIRT_V2V_IMAGE" - PrecopyInterval = "PRECOPY_INTERVAL" - VirtV2vDontRequestKVM = "VIRT_V2V_DONT_REQUEST_KVM" - SnapshotRemovalTimeout = "SNAPSHOT_REMOVAL_TIMEOUT" - SnapshotStatusCheckRate = "SNAPSHOT_STATUS_CHECK_RATE" - CDIExportTokenTTL = "CDI_EXPORT_TOKEN_TTL" - FileSystemOverhead = "FILESYSTEM_OVERHEAD" - BlockOverhead = "BLOCK_OVERHEAD" - CleanupRetries = "CLEANUP_RETRIES" - DvStatusCheckRetries = "DV_STATUS_CHECK_RETRIES" - OvirtOsConfigMap = "OVIRT_OS_MAP" - VsphereOsConfigMap = "VSPHERE_OS_MAP" - VirtCustomizeConfigMap = "VIRT_CUSTOMIZE_MAP" - VddkJobActiveDeadline = "VDDK_JOB_ACTIVE_DEADLINE" - VirtV2vExtraArgs = "VIRT_V2V_EXTRA_ARGS" - VirtV2vExtraConfConfigMap = "VIRT_V2V_EXTRA_CONF_CONFIG_MAP" + MaxVmInFlight = "MAX_VM_INFLIGHT" + HookRetry = "HOOK_RETRY" + ImporterRetry = "IMPORTER_RETRY" + VirtV2vImage = "VIRT_V2V_IMAGE" + PrecopyInterval = "PRECOPY_INTERVAL" + VirtV2vDontRequestKVM = "VIRT_V2V_DONT_REQUEST_KVM" + SnapshotRemovalTimeout = "SNAPSHOT_REMOVAL_TIMEOUT" + SnapshotStatusCheckRate = "SNAPSHOT_STATUS_CHECK_RATE" + CDIExportTokenTTL = "CDI_EXPORT_TOKEN_TTL" + FileSystemOverhead = "FILESYSTEM_OVERHEAD" + BlockOverhead = "BLOCK_OVERHEAD" + CleanupRetries = "CLEANUP_RETRIES" + DvStatusCheckRetries = "DV_STATUS_CHECK_RETRIES" + SnapshotRemovalCheckRetries = "SNAPSHOT_REMOVAL_CHECK_RETRIES" + OvirtOsConfigMap = "OVIRT_OS_MAP" + VsphereOsConfigMap = "VSPHERE_OS_MAP" + VirtCustomizeConfigMap = "VIRT_CUSTOMIZE_MAP" + VddkJobActiveDeadline = "VDDK_JOB_ACTIVE_DEADLINE" + VirtV2vExtraArgs = "VIRT_V2V_EXTRA_ARGS" + VirtV2vExtraConfConfigMap = "VIRT_V2V_EXTRA_CONF_CONFIG_MAP" ) // Migration settings @@ -61,6 +62,8 @@ type Migration struct { CleanupRetries int // DvStatusCheckRetries retries DvStatusCheckRetries int + // SnapshotRemovalCheckRetries retries + SnapshotRemovalCheckRetries int // oVirt OS config map name OvirtOsConfigMap string // vSphere OS config map name @@ -106,6 +109,9 @@ func (r *Migration) Load() (err error) { if r.DvStatusCheckRetries, err = getPositiveEnvLimit(DvStatusCheckRetries, 10); err != nil { return liberr.Wrap(err) } + if r.SnapshotRemovalCheckRetries, err = getPositiveEnvLimit(SnapshotRemovalCheckRetries, 20); err != nil { + return liberr.Wrap(err) + } if virtV2vImage, ok := os.LookupEnv(VirtV2vImage); ok { r.VirtV2vImage = virtV2vImage } else if Settings.Role.Has(MainRole) {