From 0a308574f98cd88d6aadaff8bd83129892c791f6 Mon Sep 17 00:00:00 2001 From: Martin Necas Date: Fri, 4 Oct 2024 18:20:54 +0200 Subject: [PATCH] MTV-1543 | Fix warm migration scheduler Issue: When running a warm migration with more VMs than the MaxInFlight disk limit allows, the VMs over this limit won't start migrating until the cutover. Once the cutover is started, the VMs which were not started migrate the same way as in the old migration, so there is a larger downtime. Fix: Ignore the cost when in `CopyingPaused`. This allows the other migrations to start, as the cost is reduced when the VM disk transfer is finished. Note: This patch also improves the cold migration, as it ignores the cost when creating the VM so the other VM migrations can get started. It also dramatically improves the warm migration time: we no longer wait for the guest conversion, because we already have the disk, so we start the guest conversion and do not halt the scheduler. Fixes: https://issues.redhat.com/browse/MTV-1543 Signed-off-by: Martin Necas --- .../plan/scheduler/vsphere/scheduler.go | 49 +++++++++++++++---- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/pkg/controller/plan/scheduler/vsphere/scheduler.go b/pkg/controller/plan/scheduler/vsphere/scheduler.go index 0b2abef98..e103c1213 100644 --- a/pkg/controller/plan/scheduler/vsphere/scheduler.go +++ b/pkg/controller/plan/scheduler/vsphere/scheduler.go @@ -14,6 +14,21 @@ import ( liberr "github.com/konveyor/forklift-controller/pkg/lib/error" ) +// Phases. +const ( + CopyingPaused = "CopyingPaused" + CreateGuestConversionPod = "CreateGuestConversionPod" + ConvertGuest = "ConvertGuest" + CreateVM = "CreateVM" + PostHook = "PostHook" + Completed = "Completed" +) + +// Steps. 
+const ( + DiskTransfer = "DiskTransfer" +) + // Package level mutex to ensure that // multiple concurrent reconciles don't // attempt to schedule VMs into the same @@ -107,7 +122,7 @@ func (r *Scheduler) buildInFlight() (err error) { return } if vmStatus.Running() { - r.inFlight[vm.Host] += r.cost(vm) + r.inFlight[vm.Host] += r.cost(vm, vmStatus) } } @@ -153,7 +168,7 @@ func (r *Scheduler) buildInFlight() (err error) { } return err } - r.inFlight[vm.Host] += r.cost(vm) + r.inFlight[vm.Host] += r.cost(vm, vmStatus) } } @@ -170,11 +185,10 @@ func (r *Scheduler) buildPending() (err error) { if err != nil { return } - if !vmStatus.MarkedStarted() && !vmStatus.MarkedCompleted() { pending := &pendingVM{ status: vmStatus, - cost: r.cost(vm), + cost: r.cost(vm, vmStatus), } r.pending[vm.Host] = append(r.pending[vm.Host], pending) } @@ -182,13 +196,28 @@ func (r *Scheduler) buildPending() (err error) { return } -func (r *Scheduler) cost(vm *model.VM) int { - if coldLocal, _ := r.Plan.VSphereColdLocal(); coldLocal { - /// virt-v2v transfers one disk at a time - return 1 +func (r *Scheduler) cost(vm *model.VM, vmStatus *plan.VMStatus) int { + coldLocal, _ := r.Plan.VSphereColdLocal() + if coldLocal { + switch vmStatus.Phase { + case CreateVM, PostHook, Completed: + // In these phases we already have the disk transferred and are left only to create the VM + // By setting the cost to 0 other VMs can start migrating + return 0 + default: + return 1 + } } else { - // CDI transfers the disks in parallel by different pods - return len(vm.Disks) + switch vmStatus.Phase { + case CreateVM, PostHook, Completed, CopyingPaused, ConvertGuest, CreateGuestConversionPod: + // In warm/remote migrations these phases run on already transferred disks, + // so we can start other VM migrations at this point. + // By setting the cost to 0 other VMs can start migrating + return 0 + default: + // CDI transfers the disks in parallel by different pods + return len(vm.Disks) + } } }