From 7214190c7487ffe6b257b7940b1f80f8c5b61f5e Mon Sep 17 00:00:00 2001
From: Martin Necas
Date: Wed, 11 Dec 2024 13:23:57 +0100
Subject: [PATCH 1/7] MTV-1774 | Start all available VMs from scheduler

Issue:
This issue becomes visible when two problems combine. The first problem
is a step/phase that takes a long time to finish and halts the process;
MTV-1775 is an example of such an issue. The second is that we do not
start all available VMs at once but add them one by one, one per
reconcile, so the VM migration startup itself takes time. In a
large-scale migration this adds up: at a scale of 200 VMs with a 3s
reconcile interval, starting all VMs would take 10 minutes even in the
best case.

Fix:
Start all available VMs from the scheduler at once.

Ref: https://issues.redhat.com/browse/MTV-1774

Signed-off-by: Martin Necas
---
 pkg/controller/plan/migration.go | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/pkg/controller/plan/migration.go b/pkg/controller/plan/migration.go
index 979afe3bd..2535e97c8 100644
--- a/pkg/controller/plan/migration.go
+++ b/pkg/controller/plan/migration.go
@@ -222,16 +222,22 @@ func (r *Migration) Run() (reQ time.Duration, err error) {
 			return
 		}
 	}
-
-	vm, hasNext, err := r.scheduler.Next()
-	if err != nil {
-		return
-	}
-	if hasNext {
-		err = r.execute(vm)
+	for {
+		var hasNext bool
+		var vm *plan.VMStatus
+		vm, hasNext, err = r.scheduler.Next()
 		if err != nil {
 			return
 		}
+		if hasNext {
+			err = r.execute(vm)
+			if err != nil {
+				return
+			}
+		} else {
+			r.Log.Info("The scheduler does not have any additional VMs.")
+			break
+		}
 	}
 
 	completed, err := r.end()

From 7c622494a6f82a30b2a9b12b11530634bd7cee19 Mon Sep 17 00:00:00 2001
From: Martin Necas
Date: Wed, 11 Dec 2024 13:31:55 +0100
Subject: [PATCH 2/7] Move waiting for tasks to separate phase to unblock process

Issue:
When Forklift creates a snapshot, we need to wait for the task to
finish. Right now we use task.WaitForResult, which makes the whole
process wait for the snapshot creation and blocks other VM migrations.
We have the same problem with snapshot removal: if the ESXi host is
busy when we start the removal, it can take longer than the reconcile
cycle (3s) and we can fail because of it.

Fix:
Instead of using task.WaitForResult, Forklift will query the latest
tasks per VM (10 tasks by default). This querying is done in a separate
phase than the creation/deletion, so we have a WaitFor phase for each
of the object manipulations. We find the specific task for the
creation/deletion and check its status. This has the advantage that we
get not only the status of the task but also its results, so we can
propagate them to the user in case the creation/deletion fails.
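In essence, the wait phases reduce to a non-blocking lookup of the
latest tasks recorded for the VM. A minimal sketch of the idea,
mirroring the getLatestTaskByName helper added in the vSphere client
below (the standalone function and its wiring here are illustrative):

    import (
        "context"
        "fmt"

        "github.com/vmware/govmomi/task"
        "github.com/vmware/govmomi/vim25"
        "github.com/vmware/govmomi/vim25/types"
    )

    // Fetch the most recent tasks for one VM and pick out a task by name,
    // e.g. "CreateSnapshot_Task", without blocking on its completion.
    func latestTaskByName(ctx context.Context, c *vim25.Client, vmID, name string) (*types.TaskInfo, error) {
        collector, err := task.NewManager(c).CreateCollectorForTasks(ctx, types.TaskFilterSpec{
            Entity: &types.TaskFilterSpecByEntity{
                Entity:    types.ManagedObjectReference{Type: "VirtualMachine", Value: vmID},
                Recursion: types.TaskFilterSpecRecursionOptionSelf,
            },
        })
        if err != nil {
            return nil, err
        }
        //nolint:errcheck
        defer collector.Destroy(ctx)
        // The latest page holds the most recent tasks (10 by default).
        tasks, err := collector.LatestPage(ctx)
        if err != nil {
            return nil, err
        }
        for i := range tasks {
            if tasks[i].Name == name {
                return &tasks[i], nil
            }
        }
        return nil, fmt.Errorf("no task named %s found for VM %s", name, vmID)
    }

The caller then switches on TaskInfo.State: TaskInfoStateSuccess means
ready, TaskInfoStateError propagates the task's LocalizedMessage to the
user, and anything else means the check is retried on the next
reconcile.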
Ref:
- https://issues.redhat.com/browse/MTV-1753
- https://issues.redhat.com/browse/MTV-1775

Signed-off-by: Martin Necas
---
 .../forkliftcontroller/defaults/main.yml      |  1 -
 .../controller/deployment-controller.yml.j2   |  4 -
 pkg/controller/plan/adapter/base/doc.go       |  4 +-
 pkg/controller/plan/adapter/ocp/client.go     |  7 +-
 .../plan/adapter/openstack/client.go          |  7 +-
 pkg/controller/plan/adapter/ova/client.go     |  7 +-
 pkg/controller/plan/adapter/ovirt/client.go   |  7 +-
 .../plan/adapter/vsphere/BUILD.bazel          |  1 +
 pkg/controller/plan/adapter/vsphere/client.go | 88 +++++++++++++++----
 pkg/controller/plan/migration.go              | 39 ++++----
 pkg/settings/migration.go                     | 44 ++++------
 11 files changed, 136 insertions(+), 73 deletions(-)

diff --git a/operator/roles/forkliftcontroller/defaults/main.yml b/operator/roles/forkliftcontroller/defaults/main.yml
index 5733c6868..29f491f98 100644
--- a/operator/roles/forkliftcontroller/defaults/main.yml
+++ b/operator/roles/forkliftcontroller/defaults/main.yml
@@ -35,7 +35,6 @@ controller_snapshot_removal_timeout_minuts: 120
 controller_snapshot_status_check_rate_seconds: 10
 controller_cleanup_retries: 10
 controller_dv_status_check_retries: 10
-controller_snapshot_removal_check_retries: 20
 controller_vsphere_incremental_backup: true
 controller_ovirt_warm_migration: true
 controller_max_vm_inflight: 20
diff --git a/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 b/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2
index c346dbfbe..824fb9a8e 100644
--- a/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2
+++ b/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2
@@ -85,10 +85,6 @@ spec:
               - name: DV_STATUS_CHECK_RETRIES
                 value: "{{ controller_dv_status_check_retries }}"
 {% endif %}
-{% if controller_snapshot_removal_check_retries is number %}
-              - name: SNAPSHOT_REMOVAL_CHECK_RETRIES
-                value: "{{ controller_snapshot_removal_check_retries }}"
-{% endif %}
 {% if controller_max_vm_inflight is number %}
               - name: MAX_VM_INFLIGHT
                 value: "{{ controller_max_vm_inflight }}"
diff --git a/pkg/controller/plan/adapter/base/doc.go b/pkg/controller/plan/adapter/base/doc.go
index 0900f3397..c08f75464 100644
--- a/pkg/controller/plan/adapter/base/doc.go
+++ b/pkg/controller/plan/adapter/base/doc.go
@@ -108,7 +108,9 @@ type Client interface {
 	// Remove a snapshot.
 	RemoveSnapshot(vmRef ref.Ref, snapshot string, hostsFunc util.HostsFunc) error
 	// Check if a snapshot is ready to transfer.
-	CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, err error)
+	CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, snapshotId string, err error)
+	// Check if a snapshot is removed.
+	CheckSnapshotRemoved(vmRef ref.Ref, snapshot string) (ready bool, err error)
 	// Set DataVolume checkpoints.
 	SetCheckpoints(vmRef ref.Ref, precopies []planapi.Precopy, datavolumes []cdi.DataVolume, final bool, hostsFunc util.HostsFunc) (err error)
 	// Close connections to the provider API.
diff --git a/pkg/controller/plan/adapter/ocp/client.go b/pkg/controller/plan/adapter/ocp/client.go
index c17cf49a8..951264857 100644
--- a/pkg/controller/plan/adapter/ocp/client.go
+++ b/pkg/controller/plan/adapter/ocp/client.go
@@ -25,7 +25,12 @@ type Client struct {
 }
 
 // CheckSnapshotReady implements base.Client
-func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (bool, error) {
+func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, snapshotId string, err error) {
+	return
+}
+
+// CheckSnapshotRemoved implements base.Client
+func (r *Client) CheckSnapshotRemoved(vmRef ref.Ref, snapshot string) (bool, error) {
 	return false, nil
 }
 
diff --git a/pkg/controller/plan/adapter/openstack/client.go b/pkg/controller/plan/adapter/openstack/client.go
index e8e7a89da..6291a095c 100644
--- a/pkg/controller/plan/adapter/openstack/client.go
+++ b/pkg/controller/plan/adapter/openstack/client.go
@@ -115,10 +115,15 @@ func (r *Client) CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (imageI
 }
 
 // Check if a snapshot is ready to transfer.
-func (r *Client) CheckSnapshotReady(vmRef ref.Ref, imageID string) (ready bool, err error) {
+func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, snapshotId string, err error) {
 	return
 }
 
+// CheckSnapshotRemoved implements base.Client
+func (r *Client) CheckSnapshotRemoved(vmRef ref.Ref, snapshot string) (bool, error) {
+	return false, nil
+}
+
 // Set DataVolume checkpoints.
 func (r *Client) SetCheckpoints(vmRef ref.Ref, precopies []planapi.Precopy, datavolumes []cdi.DataVolume, final bool, hostsFunc util.HostsFunc) error {
 	return nil
diff --git a/pkg/controller/plan/adapter/ova/client.go b/pkg/controller/plan/adapter/ova/client.go
index c1ef2838d..c0a676e1c 100644
--- a/pkg/controller/plan/adapter/ova/client.go
+++ b/pkg/controller/plan/adapter/ova/client.go
@@ -61,10 +61,15 @@ func (r *Client) GetSnapshotDeltas(vmRef ref.Ref, snapshot string, hostsFunc uti
 }
 
 // Check if a snapshot is ready to transfer, to avoid importer restarts.
-func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, err error) {
+func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, snapshotId string, err error) {
 	return
 }
 
+// CheckSnapshotRemoved implements base.Client
+func (r *Client) CheckSnapshotRemoved(vmRef ref.Ref, snapshot string) (bool, error) {
+	return false, nil
+}
+
 // Set DataVolume checkpoints.
 func (r *Client) SetCheckpoints(vmRef ref.Ref, precopies []planapi.Precopy, datavolumes []cdi.DataVolume, final bool, hostsFunc util.HostsFunc) (err error) {
 	return
diff --git a/pkg/controller/plan/adapter/ovirt/client.go b/pkg/controller/plan/adapter/ovirt/client.go
index 4a7a7fec5..11d404bdc 100644
--- a/pkg/controller/plan/adapter/ovirt/client.go
+++ b/pkg/controller/plan/adapter/ovirt/client.go
@@ -75,7 +75,7 @@ func (r *Client) CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (snapsh
 }
 
 // Check if a snapshot is ready to transfer, to avoid importer restarts.
-func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, err error) {
+func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, snapshotId string, err error) {
 	correlationID, err := r.getSnapshotCorrelationID(vmRef, &snapshot)
 	if err != nil {
 		err = liberr.Wrap(err)
@@ -104,6 +104,11 @@ func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool,
 	return
 }
 
+// CheckSnapshotRemoved implements base.Client
+func (r *Client) CheckSnapshotRemoved(vmRef ref.Ref, snapshot string) (bool, error) {
+	return false, nil
+}
+
 // Remove a VM snapshot. No-op for this provider.
 func (r *Client) RemoveSnapshot(vmRef ref.Ref, snapshot string, hostsFunc util.HostsFunc) (err error) {
 	return
diff --git a/pkg/controller/plan/adapter/vsphere/BUILD.bazel b/pkg/controller/plan/adapter/vsphere/BUILD.bazel
index c744b9e7c..68d93a55a 100644
--- a/pkg/controller/plan/adapter/vsphere/BUILD.bazel
+++ b/pkg/controller/plan/adapter/vsphere/BUILD.bazel
@@ -36,6 +36,7 @@ go_library(
         "//vendor/github.com/vmware/govmomi/find",
         "//vendor/github.com/vmware/govmomi/object",
         "//vendor/github.com/vmware/govmomi/session",
+        "//vendor/github.com/vmware/govmomi/task",
        "//vendor/github.com/vmware/govmomi/vim25",
         "//vendor/github.com/vmware/govmomi/vim25/mo",
         "//vendor/github.com/vmware/govmomi/vim25/soap",
diff --git a/pkg/controller/plan/adapter/vsphere/client.go b/pkg/controller/plan/adapter/vsphere/client.go
index a48af1af5..6e52d5793 100644
--- a/pkg/controller/plan/adapter/vsphere/client.go
+++ b/pkg/controller/plan/adapter/vsphere/client.go
@@ -16,6 +16,7 @@ import (
 	"github.com/vmware/govmomi"
 	"github.com/vmware/govmomi/object"
 	"github.com/vmware/govmomi/session"
+	"github.com/vmware/govmomi/task"
 	"github.com/vmware/govmomi/vim25"
 	"github.com/vmware/govmomi/vim25/mo"
 	"github.com/vmware/govmomi/vim25/soap"
@@ -27,8 +28,11 @@ import (
 )
 
 const (
-	snapshotName = "forklift-migration-precopy"
-	snapshotDesc = "Forklift Operator warm migration precopy"
+	snapshotName       = "forklift-migration-precopy"
+	snapshotDesc       = "Forklift Operator warm migration precopy"
+	VirtualMachine     = "VirtualMachine"
+	CreateSnapshotTask = "CreateSnapshot_Task"
+	RemoveSnapshotTask = "RemoveSnapshot_Task"
 )
 
 // vSphere VM Client
@@ -39,9 +43,9 @@ type Client struct {
 }
 
 // Create a VM snapshot and return its ID.
-func (r *Client) CreateSnapshot(vmRef ref.Ref, hosts util.HostsFunc) (id string, err error) {
+func (r *Client) CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (id string, err error) {
 	r.Log.V(1).Info("Creating snapshot", "vmRef", vmRef)
-	vm, err := r.getVM(vmRef, hosts)
+	vm, err := r.getVM(vmRef, hostsFunc)
 	if err != nil {
 		return
 	}
@@ -50,28 +54,82 @@ func (r *Client) CreateSnapshot(vmRef ref.Ref, hosts util.HostsFunc) (id string,
 		err = liberr.Wrap(err)
 		return
 	}
-	res, err := task.WaitForResult(context.TODO(), nil)
+	return task.Common.Reference().Value, nil
+}
+
+// Check if a snapshot is ready to transfer.
+func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, snapshotId string, err error) {
+	taskInfo, err := r.getLatestTaskByName(vmRef, CreateSnapshotTask)
 	if err != nil {
-		err = liberr.Wrap(err)
-		return
+		return false, "", liberr.Wrap(err)
 	}
-	id = res.Result.(types.ManagedObjectReference).Value
-	r.Log.Info("Created snapshot", "vmRef", vmRef, "id", id)
+	ready, err = r.checkTaskStatus(taskInfo)
+	if err != nil {
+		return false, "", liberr.Wrap(err)
+	}
+	if ready {
+		return true, taskInfo.Result.(types.ManagedObjectReference).Value, nil
+	} else {
+		// The snapshot is not ready, retry the check
+		return false, "", nil
+	}
+}
 
-	return
+// Check if a snapshot is removed.
+func (r *Client) CheckSnapshotRemoved(vmRef ref.Ref, snapshot string) (ready bool, err error) {
+	taskInfo, err := r.getLatestTaskByName(vmRef, RemoveSnapshotTask)
+	if err != nil {
+		return false, liberr.Wrap(err)
+	}
+	return r.checkTaskStatus(taskInfo)
 }
 
-// Check if a snapshot is ready to transfer.
-func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, err error) {
-	return true, nil
+func (r *Client) checkTaskStatus(taskInfo *types.TaskInfo) (ready bool, err error) {
+	r.Log.Info("Snapshot task", "task", taskInfo.Task.Value, "name", taskInfo.Name, "status", taskInfo.State)
+	switch taskInfo.State {
+	case types.TaskInfoStateSuccess:
+		return true, nil
+	case types.TaskInfoStateError:
+		return false, fmt.Errorf(taskInfo.Error.LocalizedMessage)
+	default:
+		return false, nil
+	}
+}
+
+func (r *Client) getLatestTaskByName(vmRef ref.Ref, taskName string) (*types.TaskInfo, error) {
+	taskManager := task.NewManager(r.client.Client)
+	taskCollector, err := taskManager.CreateCollectorForTasks(context.TODO(), types.TaskFilterSpec{
+		Entity: &types.TaskFilterSpecByEntity{
+			Entity: types.ManagedObjectReference{
+				Type:  VirtualMachine,
+				Value: vmRef.ID,
+			},
+			Recursion: types.TaskFilterSpecRecursionOptionSelf,
+		},
+	})
+	if err != nil {
+		return nil, err
+	}
+	//nolint:errcheck
+	defer taskCollector.Destroy(context.Background())
+	tasks, err := taskCollector.LatestPage(context.TODO())
+	if err != nil {
+		return nil, err
+	}
+	for _, taskInfo := range tasks {
+		if taskInfo.Name == taskName {
+			return &taskInfo, nil
+		}
+	}
+	return nil, fmt.Errorf("no task found with name %s, vmRef %v", taskName, vmRef)
 }
 
 // Remove a VM snapshot.
-func (r *Client) RemoveSnapshot(vmRef ref.Ref, snapshot string, hosts util.HostsFunc) (err error) {
+func (r *Client) RemoveSnapshot(vmRef ref.Ref, snapshot string, hostsFunc util.HostsFunc) (err error) {
 	r.Log.V(1).Info("RemoveSnapshot", "vmRef", vmRef, "snapshot", snapshot)
-	err = r.removeSnapshot(vmRef, snapshot, false, hosts)
+	err = r.removeSnapshot(vmRef, snapshot, false, hostsFunc)
 	return
 }
 
diff --git a/pkg/controller/plan/migration.go b/pkg/controller/plan/migration.go
index 2535e97c8..692fb7edc 100644
--- a/pkg/controller/plan/migration.go
+++ b/pkg/controller/plan/migration.go
@@ -108,7 +108,6 @@ const (
 	TransferCompleted              = "Transfer completed."
 	PopulatorPodPrefix             = "populate-"
 	DvStatusCheckRetriesAnnotation = "dvStatusCheckRetries"
-	SnapshotRemovalCheckRetries    = "snapshotRemovalCheckRetries"
 )
 
 var (
@@ -1027,27 +1026,15 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) {
 			vm.AddError(fmt.Sprintf("Step '%s' not found", r.step(vm)))
 			break
 		}
-		// FIXME: This is just temporary timeout to unblock the migrations which get stuck on issue https://issues.redhat.com/browse/MTV-1753
-		// This should be fixed properly by adding the task manager inside the inventory and monitor the task status
-		// from the main controller.
-		var retries int
-		retriesAnnotation := step.Annotations[SnapshotRemovalCheckRetries]
-		if retriesAnnotation == "" {
-			step.Annotations[SnapshotRemovalCheckRetries] = "1"
-		} else {
-			retries, err = strconv.Atoi(retriesAnnotation)
-			if err != nil {
-				step.AddError(err.Error())
-				err = nil
-				break
-			}
-			if retries >= settings.Settings.SnapshotRemovalCheckRetries {
-				vm.Phase = r.next(vm.Phase)
-				// Reset for next precopy
-				step.Annotations[SnapshotRemovalCheckRetries] = "1"
-			} else {
-				step.Annotations[SnapshotRemovalCheckRetries] = strconv.Itoa(retries + 1)
-			}
+		snapshot := vm.Warm.Precopies[len(vm.Warm.Precopies)-1].Snapshot
+		ready, err := r.provider.CheckSnapshotRemoved(vm.Ref, snapshot)
+		if err != nil {
+			step.AddError(err.Error())
+			err = nil
+			break
+		}
+		if ready {
+			vm.Phase = r.next(vm.Phase)
 		}
 	case CreateInitialSnapshot, CreateSnapshot, CreateFinalSnapshot:
 		step, found := vm.FindStep(r.step(vm))
@@ -1076,12 +1063,18 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) {
 			break
 		}
 		snapshot := vm.Warm.Precopies[len(vm.Warm.Precopies)-1].Snapshot
-		ready, err := r.provider.CheckSnapshotReady(vm.Ref, snapshot)
+		ready, snapshotId, err := r.provider.CheckSnapshotReady(vm.Ref, snapshot)
 		if err != nil {
 			step.AddError(err.Error())
 			err = nil
 			break
 		}
+		// If the provider does not directly create the snapshot, but we need to wait for the snapshot to be created
+		// We start the creation task in CreateSnapshot, set the task ID as a snapshot id which needs to be replaced
+		// by the snapshot id after the task finishes.
+		if snapshotId != "" {
+			vm.Warm.Precopies[len(vm.Warm.Precopies)-1].Snapshot = snapshotId
+		}
 		if ready {
 			vm.Phase = r.next(vm.Phase)
 		}
diff --git a/pkg/settings/migration.go b/pkg/settings/migration.go
index 354eadd18..0b8096156 100644
--- a/pkg/settings/migration.go
+++ b/pkg/settings/migration.go
@@ -12,26 +12,25 @@ import (
 
 // Environment variables.
 const (
-	MaxVmInFlight               = "MAX_VM_INFLIGHT"
-	HookRetry                   = "HOOK_RETRY"
-	ImporterRetry               = "IMPORTER_RETRY"
-	VirtV2vImage                = "VIRT_V2V_IMAGE"
-	PrecopyInterval             = "PRECOPY_INTERVAL"
-	VirtV2vDontRequestKVM       = "VIRT_V2V_DONT_REQUEST_KVM"
-	SnapshotRemovalTimeout      = "SNAPSHOT_REMOVAL_TIMEOUT"
-	SnapshotStatusCheckRate     = "SNAPSHOT_STATUS_CHECK_RATE"
-	CDIExportTokenTTL           = "CDI_EXPORT_TOKEN_TTL"
-	FileSystemOverhead          = "FILESYSTEM_OVERHEAD"
-	BlockOverhead               = "BLOCK_OVERHEAD"
-	CleanupRetries              = "CLEANUP_RETRIES"
-	DvStatusCheckRetries        = "DV_STATUS_CHECK_RETRIES"
-	SnapshotRemovalCheckRetries = "SNAPSHOT_REMOVAL_CHECK_RETRIES"
-	OvirtOsConfigMap            = "OVIRT_OS_MAP"
-	VsphereOsConfigMap          = "VSPHERE_OS_MAP"
-	VirtCustomizeConfigMap      = "VIRT_CUSTOMIZE_MAP"
-	VddkJobActiveDeadline       = "VDDK_JOB_ACTIVE_DEADLINE"
-	VirtV2vExtraArgs            = "VIRT_V2V_EXTRA_ARGS"
-	VirtV2vExtraConfConfigMap   = "VIRT_V2V_EXTRA_CONF_CONFIG_MAP"
+	MaxVmInFlight             = "MAX_VM_INFLIGHT"
+	HookRetry                 = "HOOK_RETRY"
+	ImporterRetry             = "IMPORTER_RETRY"
+	VirtV2vImage              = "VIRT_V2V_IMAGE"
+	PrecopyInterval           = "PRECOPY_INTERVAL"
+	VirtV2vDontRequestKVM     = "VIRT_V2V_DONT_REQUEST_KVM"
+	SnapshotRemovalTimeout    = "SNAPSHOT_REMOVAL_TIMEOUT"
+	SnapshotStatusCheckRate   = "SNAPSHOT_STATUS_CHECK_RATE"
+	CDIExportTokenTTL         = "CDI_EXPORT_TOKEN_TTL"
+	FileSystemOverhead        = "FILESYSTEM_OVERHEAD"
+	BlockOverhead             = "BLOCK_OVERHEAD"
+	CleanupRetries            = "CLEANUP_RETRIES"
+	DvStatusCheckRetries      = "DV_STATUS_CHECK_RETRIES"
+	OvirtOsConfigMap          = "OVIRT_OS_MAP"
+	VsphereOsConfigMap        = "VSPHERE_OS_MAP"
+	VirtCustomizeConfigMap    = "VIRT_CUSTOMIZE_MAP"
+	VddkJobActiveDeadline     = "VDDK_JOB_ACTIVE_DEADLINE"
+	VirtV2vExtraArgs          = "VIRT_V2V_EXTRA_ARGS"
+	VirtV2vExtraConfConfigMap = "VIRT_V2V_EXTRA_CONF_CONFIG_MAP"
 )
 
 // Migration settings
@@ -62,8 +61,6 @@ type Migration struct {
 	CleanupRetries int
 	// DvStatusCheckRetries retries
 	DvStatusCheckRetries int
-	// SnapshotRemovalCheckRetries retries
-	SnapshotRemovalCheckRetries int
 	// oVirt OS config map name
 	OvirtOsConfigMap string
 	// vSphere OS config map name
@@ -109,9 +106,6 @@ func (r *Migration) Load() (err error) {
 	if r.DvStatusCheckRetries, err = getPositiveEnvLimit(DvStatusCheckRetries, 10); err != nil {
 		return liberr.Wrap(err)
 	}
-	if r.SnapshotRemovalCheckRetries, err = getPositiveEnvLimit(SnapshotRemovalCheckRetries, 20); err != nil {
-		return liberr.Wrap(err)
-	}
 	if virtV2vImage, ok := os.LookupEnv(VirtV2vImage); ok {
 		r.VirtV2vImage = virtV2vImage
 	} else if Settings.Role.Has(MainRole) {

From ec358b5d522fcd5adc6ade89d281a6514e213ed6 Mon Sep 17 00:00:00 2001
From: Martin Necas
Date: Thu, 12 Dec 2024 20:19:05 +0100
Subject: [PATCH 3/7] Revert "Move waiting for tasks to separate phase to unblock process"

This reverts commit 4208890b0d98b3541a7e744cc39aad9633a80468.
Signed-off-by: Martin Necas
---
 .../forkliftcontroller/defaults/main.yml      |  1 +
 .../controller/deployment-controller.yml.j2   |  4 +
 pkg/controller/plan/adapter/base/doc.go       |  4 +-
 pkg/controller/plan/adapter/ocp/client.go     |  7 +-
 .../plan/adapter/openstack/client.go          |  7 +-
 pkg/controller/plan/adapter/ova/client.go     |  7 +-
 pkg/controller/plan/adapter/ovirt/client.go   |  7 +-
 .../plan/adapter/vsphere/BUILD.bazel          |  1 -
 pkg/controller/plan/adapter/vsphere/client.go | 88 ++++--------------
 pkg/controller/plan/migration.go              | 39 ++++----
 pkg/settings/migration.go                     | 44 ++++++----
 11 files changed, 73 insertions(+), 136 deletions(-)

diff --git a/operator/roles/forkliftcontroller/defaults/main.yml b/operator/roles/forkliftcontroller/defaults/main.yml
index 29f491f98..5733c6868 100644
--- a/operator/roles/forkliftcontroller/defaults/main.yml
+++ b/operator/roles/forkliftcontroller/defaults/main.yml
@@ -35,6 +35,7 @@ controller_snapshot_removal_timeout_minuts: 120
 controller_snapshot_status_check_rate_seconds: 10
 controller_cleanup_retries: 10
 controller_dv_status_check_retries: 10
+controller_snapshot_removal_check_retries: 20
 controller_vsphere_incremental_backup: true
 controller_ovirt_warm_migration: true
 controller_max_vm_inflight: 20
diff --git a/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 b/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2
index 824fb9a8e..c346dbfbe 100644
--- a/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2
+++ b/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2
@@ -85,6 +85,10 @@ spec:
               - name: DV_STATUS_CHECK_RETRIES
                 value: "{{ controller_dv_status_check_retries }}"
 {% endif %}
+{% if controller_snapshot_removal_check_retries is number %}
+              - name: SNAPSHOT_REMOVAL_CHECK_RETRIES
+                value: "{{ controller_snapshot_removal_check_retries }}"
+{% endif %}
 {% if controller_max_vm_inflight is number %}
               - name: MAX_VM_INFLIGHT
                 value: "{{ controller_max_vm_inflight }}"
diff --git a/pkg/controller/plan/adapter/base/doc.go b/pkg/controller/plan/adapter/base/doc.go
index c08f75464..0900f3397 100644
--- a/pkg/controller/plan/adapter/base/doc.go
+++ b/pkg/controller/plan/adapter/base/doc.go
@@ -108,9 +108,7 @@ type Client interface {
 	// Remove a snapshot.
 	RemoveSnapshot(vmRef ref.Ref, snapshot string, hostsFunc util.HostsFunc) error
 	// Check if a snapshot is ready to transfer.
-	CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, snapshotId string, err error)
-	// Check if a snapshot is removed.
-	CheckSnapshotRemoved(vmRef ref.Ref, snapshot string) (ready bool, err error)
+	CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, err error)
 	// Set DataVolume checkpoints.
 	SetCheckpoints(vmRef ref.Ref, precopies []planapi.Precopy, datavolumes []cdi.DataVolume, final bool, hostsFunc util.HostsFunc) (err error)
 	// Close connections to the provider API.
diff --git a/pkg/controller/plan/adapter/ocp/client.go b/pkg/controller/plan/adapter/ocp/client.go
index 951264857..c17cf49a8 100644
--- a/pkg/controller/plan/adapter/ocp/client.go
+++ b/pkg/controller/plan/adapter/ocp/client.go
@@ -25,12 +25,7 @@ type Client struct {
 }
 
 // CheckSnapshotReady implements base.Client
-func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, snapshotId string, err error) {
-	return
-}
-
-// CheckSnapshotRemoved implements base.Client
-func (r *Client) CheckSnapshotRemoved(vmRef ref.Ref, snapshot string) (bool, error) {
+func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (bool, error) {
 	return false, nil
 }
 
diff --git a/pkg/controller/plan/adapter/openstack/client.go b/pkg/controller/plan/adapter/openstack/client.go
index 6291a095c..e8e7a89da 100644
--- a/pkg/controller/plan/adapter/openstack/client.go
+++ b/pkg/controller/plan/adapter/openstack/client.go
@@ -115,15 +115,10 @@ func (r *Client) CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (imageI
 }
 
 // Check if a snapshot is ready to transfer.
-func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, snapshotId string, err error) {
+func (r *Client) CheckSnapshotReady(vmRef ref.Ref, imageID string) (ready bool, err error) {
 	return
 }
 
-// CheckSnapshotRemoved implements base.Client
-func (r *Client) CheckSnapshotRemoved(vmRef ref.Ref, snapshot string) (bool, error) {
-	return false, nil
-}
-
 // Set DataVolume checkpoints.
 func (r *Client) SetCheckpoints(vmRef ref.Ref, precopies []planapi.Precopy, datavolumes []cdi.DataVolume, final bool, hostsFunc util.HostsFunc) error {
 	return nil
diff --git a/pkg/controller/plan/adapter/ova/client.go b/pkg/controller/plan/adapter/ova/client.go
index c0a676e1c..c1ef2838d 100644
--- a/pkg/controller/plan/adapter/ova/client.go
+++ b/pkg/controller/plan/adapter/ova/client.go
@@ -61,15 +61,10 @@ func (r *Client) GetSnapshotDeltas(vmRef ref.Ref, snapshot string, hostsFunc uti
 }
 
 // Check if a snapshot is ready to transfer, to avoid importer restarts.
-func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, snapshotId string, err error) {
+func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, err error) {
 	return
 }
 
-// CheckSnapshotRemoved implements base.Client
-func (r *Client) CheckSnapshotRemoved(vmRef ref.Ref, snapshot string) (bool, error) {
-	return false, nil
-}
-
 // Set DataVolume checkpoints.
 func (r *Client) SetCheckpoints(vmRef ref.Ref, precopies []planapi.Precopy, datavolumes []cdi.DataVolume, final bool, hostsFunc util.HostsFunc) (err error) {
 	return
diff --git a/pkg/controller/plan/adapter/ovirt/client.go b/pkg/controller/plan/adapter/ovirt/client.go
index 11d404bdc..4a7a7fec5 100644
--- a/pkg/controller/plan/adapter/ovirt/client.go
+++ b/pkg/controller/plan/adapter/ovirt/client.go
@@ -75,7 +75,7 @@ func (r *Client) CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (snapsh
 }
 
 // Check if a snapshot is ready to transfer, to avoid importer restarts.
-func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, snapshotId string, err error) {
+func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, err error) {
 	correlationID, err := r.getSnapshotCorrelationID(vmRef, &snapshot)
 	if err != nil {
 		err = liberr.Wrap(err)
@@ -104,11 +104,6 @@ func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool,
 	return
 }
 
-// CheckSnapshotRemoved implements base.Client
-func (r *Client) CheckSnapshotRemoved(vmRef ref.Ref, snapshot string) (bool, error) {
-	return false, nil
-}
-
 // Remove a VM snapshot. No-op for this provider.
 func (r *Client) RemoveSnapshot(vmRef ref.Ref, snapshot string, hostsFunc util.HostsFunc) (err error) {
 	return
diff --git a/pkg/controller/plan/adapter/vsphere/BUILD.bazel b/pkg/controller/plan/adapter/vsphere/BUILD.bazel
index 68d93a55a..c744b9e7c 100644
--- a/pkg/controller/plan/adapter/vsphere/BUILD.bazel
+++ b/pkg/controller/plan/adapter/vsphere/BUILD.bazel
@@ -36,7 +36,6 @@ go_library(
         "//vendor/github.com/vmware/govmomi/find",
         "//vendor/github.com/vmware/govmomi/object",
         "//vendor/github.com/vmware/govmomi/session",
-        "//vendor/github.com/vmware/govmomi/task",
         "//vendor/github.com/vmware/govmomi/vim25",
         "//vendor/github.com/vmware/govmomi/vim25/mo",
         "//vendor/github.com/vmware/govmomi/vim25/soap",
diff --git a/pkg/controller/plan/adapter/vsphere/client.go b/pkg/controller/plan/adapter/vsphere/client.go
index 6e52d5793..a48af1af5 100644
--- a/pkg/controller/plan/adapter/vsphere/client.go
+++ b/pkg/controller/plan/adapter/vsphere/client.go
@@ -16,7 +16,6 @@ import (
 	"github.com/vmware/govmomi"
 	"github.com/vmware/govmomi/object"
 	"github.com/vmware/govmomi/session"
-	"github.com/vmware/govmomi/task"
 	"github.com/vmware/govmomi/vim25"
 	"github.com/vmware/govmomi/vim25/mo"
 	"github.com/vmware/govmomi/vim25/soap"
@@ -28,11 +27,8 @@ import (
 )
 
 const (
-	snapshotName       = "forklift-migration-precopy"
-	snapshotDesc       = "Forklift Operator warm migration precopy"
-	VirtualMachine     = "VirtualMachine"
-	CreateSnapshotTask = "CreateSnapshot_Task"
-	RemoveSnapshotTask = "RemoveSnapshot_Task"
+	snapshotName = "forklift-migration-precopy"
+	snapshotDesc = "Forklift Operator warm migration precopy"
 )
 
 // vSphere VM Client
@@ -43,9 +39,9 @@ type Client struct {
 }
 
 // Create a VM snapshot and return its ID.
-func (r *Client) CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (id string, err error) {
+func (r *Client) CreateSnapshot(vmRef ref.Ref, hosts util.HostsFunc) (id string, err error) {
 	r.Log.V(1).Info("Creating snapshot", "vmRef", vmRef)
-	vm, err := r.getVM(vmRef, hostsFunc)
+	vm, err := r.getVM(vmRef, hosts)
 	if err != nil {
 		return
 	}
@@ -54,82 +50,28 @@ func (r *Client) CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (id str
 		err = liberr.Wrap(err)
 		return
 	}
-	return task.Common.Reference().Value, nil
-}
-
-// Check if a snapshot is ready to transfer.
-func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, snapshotId string, err error) {
-	taskInfo, err := r.getLatestTaskByName(vmRef, CreateSnapshotTask)
+	res, err := task.WaitForResult(context.TODO(), nil)
 	if err != nil {
-		return false, "", liberr.Wrap(err)
-	}
-	ready, err = r.checkTaskStatus(taskInfo)
-	if err != nil {
-		return false, "", liberr.Wrap(err)
-	}
-	if ready {
-		return true, taskInfo.Result.(types.ManagedObjectReference).Value, nil
-	} else {
-		// The snapshot is not ready, retry the check
-		return false, "", nil
-	}
-}
-
-// Check if a snapshot is removed.
-func (r *Client) CheckSnapshotRemoved(vmRef ref.Ref, snapshot string) (ready bool, err error) {
-	taskInfo, err := r.getLatestTaskByName(vmRef, RemoveSnapshotTask)
-	if err != nil {
-		return false, liberr.Wrap(err)
+		err = liberr.Wrap(err)
+		return
 	}
-	return r.checkTaskStatus(taskInfo)
-}
+	id = res.Result.(types.ManagedObjectReference).Value
+	r.Log.Info("Created snapshot", "vmRef", vmRef, "id", id)
 
-func (r *Client) checkTaskStatus(taskInfo *types.TaskInfo) (ready bool, err error) {
-	r.Log.Info("Snapshot task", "task", taskInfo.Task.Value, "name", taskInfo.Name, "status", taskInfo.State)
-	switch taskInfo.State {
-	case types.TaskInfoStateSuccess:
-		return true, nil
-	case types.TaskInfoStateError:
-		return false, fmt.Errorf(taskInfo.Error.LocalizedMessage)
-	default:
-		return false, nil
-	}
+	return
 }
 
-func (r *Client) getLatestTaskByName(vmRef ref.Ref, taskName string) (*types.TaskInfo, error) {
-	taskManager := task.NewManager(r.client.Client)
-	taskCollector, err := taskManager.CreateCollectorForTasks(context.TODO(), types.TaskFilterSpec{
-		Entity: &types.TaskFilterSpecByEntity{
-			Entity: types.ManagedObjectReference{
-				Type:  VirtualMachine,
-				Value: vmRef.ID,
-			},
-			Recursion: types.TaskFilterSpecRecursionOptionSelf,
-		},
-	})
-	if err != nil {
-		return nil, err
-	}
-	//nolint:errcheck
-	defer taskCollector.Destroy(context.Background())
-	tasks, err := taskCollector.LatestPage(context.TODO())
-	if err != nil {
-		return nil, err
-	}
-	for _, taskInfo := range tasks {
-		if taskInfo.Name == taskName {
-			return &taskInfo, nil
-		}
-	}
-	return nil, fmt.Errorf("no task found with name %s, vmRef %v", taskName, vmRef)
+// Check if a snapshot is ready to transfer.
+func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, err error) {
+	return true, nil
 }
 
 // Remove a VM snapshot.
-func (r *Client) RemoveSnapshot(vmRef ref.Ref, snapshot string, hostsFunc util.HostsFunc) (err error) {
+func (r *Client) RemoveSnapshot(vmRef ref.Ref, snapshot string, hosts util.HostsFunc) (err error) {
 	r.Log.V(1).Info("RemoveSnapshot", "vmRef", vmRef, "snapshot", snapshot)
-	err = r.removeSnapshot(vmRef, snapshot, false, hostsFunc)
+	err = r.removeSnapshot(vmRef, snapshot, false, hosts)
 	return
 }
 
diff --git a/pkg/controller/plan/migration.go b/pkg/controller/plan/migration.go
index 692fb7edc..2535e97c8 100644
--- a/pkg/controller/plan/migration.go
+++ b/pkg/controller/plan/migration.go
@@ -108,6 +108,7 @@ const (
 	TransferCompleted              = "Transfer completed."
 	PopulatorPodPrefix             = "populate-"
 	DvStatusCheckRetriesAnnotation = "dvStatusCheckRetries"
+	SnapshotRemovalCheckRetries    = "snapshotRemovalCheckRetries"
 )
 
 var (
@@ -1026,15 +1027,27 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) {
 			vm.AddError(fmt.Sprintf("Step '%s' not found", r.step(vm)))
 			break
 		}
-		snapshot := vm.Warm.Precopies[len(vm.Warm.Precopies)-1].Snapshot
-		ready, err := r.provider.CheckSnapshotRemoved(vm.Ref, snapshot)
-		if err != nil {
-			step.AddError(err.Error())
-			err = nil
-			break
-		}
-		if ready {
-			vm.Phase = r.next(vm.Phase)
+		// FIXME: This is just temporary timeout to unblock the migrations which get stuck on issue https://issues.redhat.com/browse/MTV-1753
+		// This should be fixed properly by adding the task manager inside the inventory and monitor the task status
+		// from the main controller.
+		var retries int
+		retriesAnnotation := step.Annotations[SnapshotRemovalCheckRetries]
+		if retriesAnnotation == "" {
+			step.Annotations[SnapshotRemovalCheckRetries] = "1"
+		} else {
+			retries, err = strconv.Atoi(retriesAnnotation)
+			if err != nil {
+				step.AddError(err.Error())
+				err = nil
+				break
+			}
+			if retries >= settings.Settings.SnapshotRemovalCheckRetries {
+				vm.Phase = r.next(vm.Phase)
+				// Reset for next precopy
+				step.Annotations[SnapshotRemovalCheckRetries] = "1"
+			} else {
+				step.Annotations[SnapshotRemovalCheckRetries] = strconv.Itoa(retries + 1)
+			}
 		}
 	case CreateInitialSnapshot, CreateSnapshot, CreateFinalSnapshot:
 		step, found := vm.FindStep(r.step(vm))
@@ -1063,18 +1076,12 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) {
 			break
 		}
 		snapshot := vm.Warm.Precopies[len(vm.Warm.Precopies)-1].Snapshot
-		ready, snapshotId, err := r.provider.CheckSnapshotReady(vm.Ref, snapshot)
+		ready, err := r.provider.CheckSnapshotReady(vm.Ref, snapshot)
 		if err != nil {
 			step.AddError(err.Error())
 			err = nil
 			break
 		}
-		// If the provider does not directly create the snapshot, but we need to wait for the snapshot to be created
-		// We start the creation task in CreateSnapshot, set the task ID as a snapshot id which needs to be replaced
-		// by the snapshot id after the task finishes.
-		if snapshotId != "" {
-			vm.Warm.Precopies[len(vm.Warm.Precopies)-1].Snapshot = snapshotId
-		}
 		if ready {
 			vm.Phase = r.next(vm.Phase)
 		}
diff --git a/pkg/settings/migration.go b/pkg/settings/migration.go
index 0b8096156..354eadd18 100644
--- a/pkg/settings/migration.go
+++ b/pkg/settings/migration.go
@@ -12,25 +12,26 @@ import (
 
 // Environment variables.
 const (
-	MaxVmInFlight             = "MAX_VM_INFLIGHT"
-	HookRetry                 = "HOOK_RETRY"
-	ImporterRetry             = "IMPORTER_RETRY"
-	VirtV2vImage              = "VIRT_V2V_IMAGE"
-	PrecopyInterval           = "PRECOPY_INTERVAL"
-	VirtV2vDontRequestKVM     = "VIRT_V2V_DONT_REQUEST_KVM"
-	SnapshotRemovalTimeout    = "SNAPSHOT_REMOVAL_TIMEOUT"
-	SnapshotStatusCheckRate   = "SNAPSHOT_STATUS_CHECK_RATE"
-	CDIExportTokenTTL         = "CDI_EXPORT_TOKEN_TTL"
-	FileSystemOverhead        = "FILESYSTEM_OVERHEAD"
-	BlockOverhead             = "BLOCK_OVERHEAD"
-	CleanupRetries            = "CLEANUP_RETRIES"
-	DvStatusCheckRetries      = "DV_STATUS_CHECK_RETRIES"
-	OvirtOsConfigMap          = "OVIRT_OS_MAP"
-	VsphereOsConfigMap        = "VSPHERE_OS_MAP"
-	VirtCustomizeConfigMap    = "VIRT_CUSTOMIZE_MAP"
-	VddkJobActiveDeadline     = "VDDK_JOB_ACTIVE_DEADLINE"
-	VirtV2vExtraArgs          = "VIRT_V2V_EXTRA_ARGS"
-	VirtV2vExtraConfConfigMap = "VIRT_V2V_EXTRA_CONF_CONFIG_MAP"
+	MaxVmInFlight               = "MAX_VM_INFLIGHT"
+	HookRetry                   = "HOOK_RETRY"
+	ImporterRetry               = "IMPORTER_RETRY"
+	VirtV2vImage                = "VIRT_V2V_IMAGE"
+	PrecopyInterval             = "PRECOPY_INTERVAL"
+	VirtV2vDontRequestKVM       = "VIRT_V2V_DONT_REQUEST_KVM"
+	SnapshotRemovalTimeout      = "SNAPSHOT_REMOVAL_TIMEOUT"
+	SnapshotStatusCheckRate     = "SNAPSHOT_STATUS_CHECK_RATE"
+	CDIExportTokenTTL           = "CDI_EXPORT_TOKEN_TTL"
+	FileSystemOverhead          = "FILESYSTEM_OVERHEAD"
+	BlockOverhead               = "BLOCK_OVERHEAD"
+	CleanupRetries              = "CLEANUP_RETRIES"
+	DvStatusCheckRetries        = "DV_STATUS_CHECK_RETRIES"
+	SnapshotRemovalCheckRetries = "SNAPSHOT_REMOVAL_CHECK_RETRIES"
+	OvirtOsConfigMap            = "OVIRT_OS_MAP"
+	VsphereOsConfigMap          = "VSPHERE_OS_MAP"
+	VirtCustomizeConfigMap      = "VIRT_CUSTOMIZE_MAP"
+	VddkJobActiveDeadline       = "VDDK_JOB_ACTIVE_DEADLINE"
+	VirtV2vExtraArgs            = "VIRT_V2V_EXTRA_ARGS"
+	VirtV2vExtraConfConfigMap   = "VIRT_V2V_EXTRA_CONF_CONFIG_MAP"
 )
 
 // Migration settings
@@ -61,6 +62,8 @@ type Migration struct {
 	CleanupRetries int
 	// DvStatusCheckRetries retries
 	DvStatusCheckRetries int
+	// SnapshotRemovalCheckRetries retries
+	SnapshotRemovalCheckRetries int
 	// oVirt OS config map name
 	OvirtOsConfigMap string
 	// vSphere OS config map name
@@ -106,6 +109,9 @@ func (r *Migration) Load() (err error) {
 	if r.DvStatusCheckRetries, err = getPositiveEnvLimit(DvStatusCheckRetries, 10); err != nil {
 		return liberr.Wrap(err)
 	}
+	if r.SnapshotRemovalCheckRetries, err = getPositiveEnvLimit(SnapshotRemovalCheckRetries, 20); err != nil {
+		return liberr.Wrap(err)
+	}
 	if virtV2vImage, ok := os.LookupEnv(VirtV2vImage); ok {
 		r.VirtV2vImage = virtV2vImage
 	} else if Settings.Role.Has(MainRole) {

From b33d521c6d1035ef74c342d7419638627d362b37 Mon Sep 17 00:00:00 2001
From: Martin Necas
Date: Wed, 11 Dec 2024 14:19:42 +0100
Subject: [PATCH 4/7] MTV-1632 | Add secureboot to vsphere

Issue:
When creating a VM from vSphere on KubeVirt, MTV always defaulted
secure boot to false.

Fix:
Add secure boot to the inventory and to the main controller so it can
be passed on to KubeVirt.

Ref: https://issues.redhat.com/browse/MTV-1632

Signed-off-by: Martin Necas
---
 pkg/controller/plan/adapter/vsphere/builder.go         | 8 +-------
 pkg/controller/provider/container/vsphere/collector.go | 2 ++
 pkg/controller/provider/container/vsphere/model.go     | 6 ++++++
 pkg/controller/provider/model/vsphere/model.go         | 1 +
 pkg/controller/provider/web/vsphere/vm.go              | 2 ++
 5 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/pkg/controller/plan/adapter/vsphere/builder.go b/pkg/controller/plan/adapter/vsphere/builder.go
index 8b8e77610..e46081aaf 100644
--- a/pkg/controller/plan/adapter/vsphere/builder.go
+++ b/pkg/controller/plan/adapter/vsphere/builder.go
@@ -668,15 +668,9 @@ func (r *Builder) mapFirmware(vm *model.VM, object *cnv.VirtualMachineSpec) {
 	}
 	switch vm.Firmware {
 	case Efi:
-		// We don't distinguish between UEFI and UEFI with secure boot, but we anyway would have
-		// disabled secure boot, even if we knew it was enabled on the source, because the guest
-		// OS won't be able to boot without getting the NVRAM data. By starting the VM without
-		// secure boot we ease the procedure users need to do in order to make a guest OS that
-		// was previously configured with secure boot bootable.
-		secureBootEnabled := false
 		firmware.Bootloader = &cnv.Bootloader{
 			EFI: &cnv.EFI{
-				SecureBoot: &secureBootEnabled,
+				SecureBoot: &vm.SecureBoot,
 			}}
 	default:
 		firmware.Bootloader = &cnv.Bootloader{BIOS: &cnv.BIOS{}}
diff --git a/pkg/controller/provider/container/vsphere/collector.go b/pkg/controller/provider/container/vsphere/collector.go
index 6f6a759f8..02a4b5243 100644
--- a/pkg/controller/provider/container/vsphere/collector.go
+++ b/pkg/controller/provider/container/vsphere/collector.go
@@ -100,6 +100,7 @@ const (
 	fUUID                = "config.uuid"
 	fFirmware            = "config.firmware"
 	fFtInfo              = "config.ftInfo"
+	fBootOptions         = "config.bootOptions"
 	fCpuAffinity         = "config.cpuAffinity"
 	fCpuHotAddEnabled    = "config.cpuHotAddEnabled"
 	fCpuHotRemoveEnabled = "config.cpuHotRemoveEnabled"
@@ -726,6 +727,7 @@ func (r *Collector) vmPathSet() []string {
 		fFirmware,
 		fFtInfo,
 		fCpuAffinity,
+		fBootOptions,
 		fCpuHotAddEnabled,
 		fCpuHotRemoveEnabled,
 		fMemoryHotAddEnabled,
diff --git a/pkg/controller/provider/container/vsphere/model.go b/pkg/controller/provider/container/vsphere/model.go
index b5e1eec59..78708fe4b 100644
--- a/pkg/controller/provider/container/vsphere/model.go
+++ b/pkg/controller/provider/container/vsphere/model.go
@@ -544,6 +544,12 @@ func (v *VmAdapter) Apply(u types.ObjectUpdate) {
 			if a, cast := p.Val.(types.VirtualMachineAffinityInfo); cast {
 				v.model.CpuAffinity = a.AffinitySet
 			}
+		case fBootOptions:
+			if a, cast := p.Val.(types.VirtualMachineBootOptions); cast {
+				if a.EfiSecureBootEnabled != nil {
+					v.model.SecureBoot = *a.EfiSecureBootEnabled
+				}
+			}
 		case fCpuHotAddEnabled:
 			if b, cast := p.Val.(bool); cast {
 				v.model.CpuHotAddEnabled = b
diff --git a/pkg/controller/provider/model/vsphere/model.go b/pkg/controller/provider/model/vsphere/model.go
index f8ae2ebf6..6d23312b3 100644
--- a/pkg/controller/provider/model/vsphere/model.go
+++ b/pkg/controller/provider/model/vsphere/model.go
@@ -265,6 +265,7 @@ type VM struct {
 	Concerns      []Concern      `sql:""`
 	GuestNetworks []GuestNetwork `sql:""`
 	GuestIpStacks []GuestIpStack `sql:""`
+	SecureBoot    bool           `sql:""`
 }
 
 // Determine if current revision has been validated.
diff --git a/pkg/controller/provider/web/vsphere/vm.go b/pkg/controller/provider/web/vsphere/vm.go
index b019c4595..097b900a7 100644
--- a/pkg/controller/provider/web/vsphere/vm.go
+++ b/pkg/controller/provider/web/vsphere/vm.go
@@ -235,6 +235,7 @@ type VM struct {
 	NICs          []model.NIC          `json:"nics"`
 	GuestNetworks []model.GuestNetwork `json:"guestNetworks"`
 	GuestIpStacks []model.GuestIpStack `json:"guestIpStacks"`
+	SecureBoot    bool                 `json:"secureBoot"`
 }
 
 // Build the resource using the model.
@@ -265,6 +266,7 @@ func (r *VM) With(m *model.VM) {
 	r.NICs = m.NICs
 	r.GuestNetworks = m.GuestNetworks
 	r.GuestIpStacks = m.GuestIpStacks
+	r.SecureBoot = m.SecureBoot
 }
 
 // Build self link (URI).

From e88a88fa11b5c6bf575f3dfb6af447abab61ef50 Mon Sep 17 00:00:00 2001
From: Martin Necas
Date: Fri, 13 Dec 2024 10:38:12 +0100
Subject: [PATCH 5/7] Add a wait phase for snapshot tasks

Issue:
The main problem behind MTV-1753 and MTV-1775 is that we either do not
wait for the VMware task to finish or, when we do wait, we halt the
whole controller process. This causes performance issues or even
migration failures, so we need a mechanism to wait for the tasks
without halting the whole process.

Fix:
My first attempt was PR #1262, which used the event manager. On the
surface this was an easy approach that did not require any additional
changes to the CR.
The problem there was that some of the tasks were not reported to the
taskManager. These tasks have the prefix haTask. After some
investigation, I found out that these tasks live directly on the ESXi
host and are not sent to vSphere, so we can't use the taskManager.

This PR adds the task IDs to the status CR so the additional wait
phases can monitor the tasks. The main controller gets the ESXi client
and creates a property collector to request the specific task from the
host.

Ref:
- https://issues.redhat.com/browse/MTV-1753
- https://issues.redhat.com/browse/MTV-1775
- https://github.com/kubev2v/forklift/pull/1262
- https://github.com/kubev2v/forklift/pull/1265

Signed-off-by: Martin Necas
---
 .../forklift.konveyor.io_migrations.yaml      |   4 +
 .../crd/bases/forklift.konveyor.io_plans.yaml |   4 +
 pkg/apis/forklift/v1beta1/plan/vm.go          |  10 +-
 pkg/controller/plan/adapter/base/doc.go       |   8 +-
 pkg/controller/plan/adapter/ocp/client.go     |  13 +-
 .../plan/adapter/openstack/client.go          |  11 +-
 pkg/controller/plan/adapter/ova/client.go     |  11 +-
 pkg/controller/plan/adapter/ovirt/client.go   |  15 ++-
 .../plan/adapter/vsphere/BUILD.bazel          |   1 +
 pkg/controller/plan/adapter/vsphere/client.go | 117 ++++++++++++++----
 pkg/controller/plan/migration.go              |  50 ++++----
 11 files changed, 170 insertions(+), 74 deletions(-)

diff --git a/operator/config/crd/bases/forklift.konveyor.io_migrations.yaml b/operator/config/crd/bases/forklift.konveyor.io_migrations.yaml
index d76178348..b5d5771d3 100644
--- a/operator/config/crd/bases/forklift.konveyor.io_migrations.yaml
+++ b/operator/config/crd/bases/forklift.konveyor.io_migrations.yaml
@@ -539,6 +539,8 @@ spec:
                 items:
                   description: Precopy durations
                   properties:
+                    createTaskId:
+                      type: string
                     deltas:
                       items:
                         properties:
@@ -554,6 +556,8 @@ spec:
                     end:
                       format: date-time
                       type: string
+                    removeTaskId:
+                      type: string
                    snapshot:
                       type: string
                     start:
diff --git a/operator/config/crd/bases/forklift.konveyor.io_plans.yaml b/operator/config/crd/bases/forklift.konveyor.io_plans.yaml
index 2ac95b87d..7ddf1136a 100644
--- a/operator/config/crd/bases/forklift.konveyor.io_plans.yaml
+++ b/operator/config/crd/bases/forklift.konveyor.io_plans.yaml
@@ -1051,6 +1051,8 @@ spec:
                 items:
                   description: Precopy durations
                   properties:
+                    createTaskId:
+                      type: string
                     deltas:
                       items:
                         properties:
@@ -1066,6 +1068,8 @@ spec:
                     end:
                       format: date-time
                       type: string
+                    removeTaskId:
+                      type: string
                     snapshot:
                       type: string
                     start:
diff --git a/pkg/apis/forklift/v1beta1/plan/vm.go b/pkg/apis/forklift/v1beta1/plan/vm.go
index 65d6b37c3..8b3fdf000 100644
--- a/pkg/apis/forklift/v1beta1/plan/vm.go
+++ b/pkg/apis/forklift/v1beta1/plan/vm.go
@@ -98,10 +98,12 @@ const (
 
 // Precopy durations
 type Precopy struct {
-	Start    *meta.Time  `json:"start,omitempty"`
-	End      *meta.Time  `json:"end,omitempty"`
-	Snapshot string      `json:"snapshot,omitempty"`
-	Deltas   []DiskDelta `json:"deltas,omitempty"`
+	Start        *meta.Time  `json:"start,omitempty"`
+	End          *meta.Time  `json:"end,omitempty"`
+	Snapshot     string      `json:"snapshot,omitempty"`
+	CreateTaskId string      `json:"createTaskId,omitempty"`
+	RemoveTaskId string      `json:"removeTaskId,omitempty"`
+	Deltas       []DiskDelta `json:"deltas,omitempty"`
 }
 
 func (r *Precopy) WithDeltas(deltas map[string]string) {
diff --git a/pkg/controller/plan/adapter/base/doc.go b/pkg/controller/plan/adapter/base/doc.go
index 0900f3397..b57ae58ef 100644
--- a/pkg/controller/plan/adapter/base/doc.go
+++ b/pkg/controller/plan/adapter/base/doc.go
@@ -104,11 +104,13 @@ type Client interface {
 	// Return whether the source VM is powered off.
 	PoweredOff(vmRef ref.Ref) (bool, error)
 	// Create a snapshot of the source VM.
-	CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (string, error)
+	CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (snapshotId string, creationTaskId string, err error)
 	// Remove a snapshot.
-	RemoveSnapshot(vmRef ref.Ref, snapshot string, hostsFunc util.HostsFunc) error
+	RemoveSnapshot(vmRef ref.Ref, snapshot string, hostsFunc util.HostsFunc) (removeTaskId string, err error)
 	// Check if a snapshot is ready to transfer.
-	CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, err error)
+	CheckSnapshotReady(vmRef ref.Ref, precopy planapi.Precopy, hosts util.HostsFunc) (ready bool, snapshotId string, err error)
+	// Check if a snapshot is removed.
+	CheckSnapshotRemove(vmRef ref.Ref, precopy planapi.Precopy, hosts util.HostsFunc) (ready bool, err error)
 	// Set DataVolume checkpoints.
 	SetCheckpoints(vmRef ref.Ref, precopies []planapi.Precopy, datavolumes []cdi.DataVolume, final bool, hostsFunc util.HostsFunc) (err error)
 	// Close connections to the provider API.
diff --git a/pkg/controller/plan/adapter/ocp/client.go b/pkg/controller/plan/adapter/ocp/client.go
index c17cf49a8..afbb24643 100644
--- a/pkg/controller/plan/adapter/ocp/client.go
+++ b/pkg/controller/plan/adapter/ocp/client.go
@@ -25,7 +25,12 @@ type Client struct {
 }
 
 // CheckSnapshotReady implements base.Client
-func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (bool, error) {
+func (r *Client) CheckSnapshotReady(vmRef ref.Ref, precopy planapi.Precopy, hosts util.HostsFunc) (bool, string, error) {
+	return false, "", nil
+}
+
+// CheckSnapshotRemove implements base.Client
+func (r *Client) CheckSnapshotRemove(vmRef ref.Ref, precopy planapi.Precopy, hosts util.HostsFunc) (bool, error) {
 	return false, nil
 }
 
@@ -35,12 +40,12 @@ func (r *Client) Close() {
 }
 
 // CreateSnapshot implements base.Client
-func (r *Client) CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (string, error) {
-	return "", nil
+func (r *Client) CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (string, string, error) {
+	return "", "", nil
 }
 
 // Remove a VM snapshot. No-op for this provider.
-func (r *Client) RemoveSnapshot(vmRef ref.Ref, snapshot string, hostsFunc util.HostsFunc) (err error) {
+func (r *Client) RemoveSnapshot(vmRef ref.Ref, snapshot string, hostsFunc util.HostsFunc) (removeTaskId string, err error) {
 	return
 }
 
diff --git a/pkg/controller/plan/adapter/openstack/client.go b/pkg/controller/plan/adapter/openstack/client.go
index e8e7a89da..9f6558810 100644
--- a/pkg/controller/plan/adapter/openstack/client.go
+++ b/pkg/controller/plan/adapter/openstack/client.go
@@ -110,22 +110,27 @@ func (r *Client) PoweredOff(vmRef ref.Ref) (off bool, err error) {
 }
 
 // Create a snapshot of the source VM.
-func (r *Client) CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (imageID string, err error) {
+func (r *Client) CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (snapshotId string, creationTaskId string, err error) {
 	return
 }
 
 // Check if a snapshot is ready to transfer.
-func (r *Client) CheckSnapshotReady(vmRef ref.Ref, imageID string) (ready bool, err error) {
+func (r *Client) CheckSnapshotReady(vmRef ref.Ref, precopy planapi.Precopy, hosts util.HostsFunc) (ready bool, snapshotId string, err error) {
 	return
 }
 
+// CheckSnapshotRemove implements base.Client
+func (r *Client) CheckSnapshotRemove(vmRef ref.Ref, precopy planapi.Precopy, hosts util.HostsFunc) (bool, error) {
+	return false, nil
+}
+
 // Set DataVolume checkpoints.
 func (r *Client) SetCheckpoints(vmRef ref.Ref, precopies []planapi.Precopy, datavolumes []cdi.DataVolume, final bool, hostsFunc util.HostsFunc) error {
 	return nil
 }
 
 // Remove a VM snapshot. No-op for this provider.
-func (r *Client) RemoveSnapshot(vmRef ref.Ref, snapshot string, hostsFunc util.HostsFunc) (err error) {
+func (r *Client) RemoveSnapshot(vmRef ref.Ref, snapshot string, hostsFunc util.HostsFunc) (removeTaskId string, err error) {
 	return
 }
 
diff --git a/pkg/controller/plan/adapter/ova/client.go b/pkg/controller/plan/adapter/ova/client.go
index c1ef2838d..c0d9d96a6 100644
--- a/pkg/controller/plan/adapter/ova/client.go
+++ b/pkg/controller/plan/adapter/ova/client.go
@@ -46,12 +46,12 @@ func (r *Client) connect() (err error) {
 }
 
 // Create a VM snapshot and return its ID. No-op for this provider.
-func (r *Client) CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (snapshot string, err error) {
+func (r *Client) CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (snapshotId string, creationTaskId string, err error) {
 	return
 }
 
 // Remove a VM snapshot. No-op for this provider.
-func (r *Client) RemoveSnapshot(vmRef ref.Ref, snapshot string, hostsFunc util.HostsFunc) (err error) {
+func (r *Client) RemoveSnapshot(vmRef ref.Ref, snapshot string, hostsFunc util.HostsFunc) (removeTaskId string, err error) {
 	return
 }
 
@@ -61,10 +61,15 @@ func (r *Client) GetSnapshotDeltas(vmRef ref.Ref, snapshot string, hostsFunc uti
 }
 
 // Check if a snapshot is ready to transfer, to avoid importer restarts.
-func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, err error) {
+func (r *Client) CheckSnapshotReady(vmRef ref.Ref, precopy planapi.Precopy, hosts util.HostsFunc) (ready bool, snapshotId string, err error) {
 	return
 }
 
+// CheckSnapshotRemove implements base.Client
+func (r *Client) CheckSnapshotRemove(vmRef ref.Ref, precopy planapi.Precopy, hosts util.HostsFunc) (bool, error) {
+	return false, nil
+}
+
 // Set DataVolume checkpoints.
 func (r *Client) SetCheckpoints(vmRef ref.Ref, precopies []planapi.Precopy, datavolumes []cdi.DataVolume, final bool, hostsFunc util.HostsFunc) (err error) {
 	return
diff --git a/pkg/controller/plan/adapter/ovirt/client.go b/pkg/controller/plan/adapter/ovirt/client.go
index 4a7a7fec5..5cc174166 100644
--- a/pkg/controller/plan/adapter/ovirt/client.go
+++ b/pkg/controller/plan/adapter/ovirt/client.go
@@ -40,7 +40,7 @@ type Client struct {
 }
 
 // Create a VM snapshot and return its ID.
-func (r *Client) CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (snapshot string, err error) {
+func (r *Client) CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (snapshotId string, creationTaskId string, err error) {
 	_, vmService, err := r.getVM(vmRef)
 	if err != nil {
 		err = liberr.Wrap(err)
 		return
 	}
@@ -70,13 +70,18 @@ func (r *Client) CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (snapsh
 		}
 		return
 	}
-	snapshot = snap.MustSnapshot().MustId()
+	snapshotId = snap.MustSnapshot().MustId()
 	return
 }
 
+// CheckSnapshotRemove implements base.Client
+func (r *Client) CheckSnapshotRemove(vmRef ref.Ref, precopy planapi.Precopy, hosts util.HostsFunc) (bool, error) {
+	return false, nil
+}
+
 // Check if a snapshot is ready to transfer, to avoid importer restarts.
-func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, err error) {
-	correlationID, err := r.getSnapshotCorrelationID(vmRef, &snapshot)
+func (r *Client) CheckSnapshotReady(vmRef ref.Ref, precopy planapi.Precopy, hosts util.HostsFunc) (ready bool, snapshotId string, err error) {
+	correlationID, err := r.getSnapshotCorrelationID(vmRef, &precopy.Snapshot)
 	if err != nil {
 		err = liberr.Wrap(err)
 		return
@@ -105,7 +110,7 @@ func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool,
 }
 
 // Remove a VM snapshot. No-op for this provider.
-func (r *Client) RemoveSnapshot(vmRef ref.Ref, snapshot string, hostsFunc util.HostsFunc) (err error) {
+func (r *Client) RemoveSnapshot(vmRef ref.Ref, snapshot string, hostsFunc util.HostsFunc) (removeTaskId string, err error) {
 	return
 }
 
diff --git a/pkg/controller/plan/adapter/vsphere/BUILD.bazel b/pkg/controller/plan/adapter/vsphere/BUILD.bazel
index c744b9e7c..cd15868d9 100644
--- a/pkg/controller/plan/adapter/vsphere/BUILD.bazel
+++ b/pkg/controller/plan/adapter/vsphere/BUILD.bazel
@@ -35,6 +35,7 @@ go_library(
         "//vendor/github.com/vmware/govmomi",
         "//vendor/github.com/vmware/govmomi/find",
         "//vendor/github.com/vmware/govmomi/object",
+        "//vendor/github.com/vmware/govmomi/property",
         "//vendor/github.com/vmware/govmomi/session",
         "//vendor/github.com/vmware/govmomi/vim25",
         "//vendor/github.com/vmware/govmomi/vim25/mo",
diff --git a/pkg/controller/plan/adapter/vsphere/client.go b/pkg/controller/plan/adapter/vsphere/client.go
index a48af1af5..87ec0853c 100644
--- a/pkg/controller/plan/adapter/vsphere/client.go
+++ b/pkg/controller/plan/adapter/vsphere/client.go
@@ -15,6 +15,7 @@ import (
 	liberr "github.com/konveyor/forklift-controller/pkg/lib/error"
 	"github.com/vmware/govmomi"
 	"github.com/vmware/govmomi/object"
+	"github.com/vmware/govmomi/property"
 	"github.com/vmware/govmomi/session"
 	"github.com/vmware/govmomi/vim25"
 	"github.com/vmware/govmomi/vim25/mo"
@@ -29,6 +30,7 @@ import (
 const (
 	snapshotName = "forklift-migration-precopy"
 	snapshotDesc = "Forklift Operator warm migration precopy"
+	taskType     = "Task"
 )
 
 // vSphere VM Client
@@ -39,9 +41,9 @@ type Client struct {
 }
 
 // Create a VM snapshot and return its ID.
-func (r *Client) CreateSnapshot(vmRef ref.Ref, hosts util.HostsFunc) (id string, err error) {
+func (r *Client) CreateSnapshot(vmRef ref.Ref, hostsFunc util.HostsFunc) (snapshotId string, creationTaskId string, err error) {
 	r.Log.V(1).Info("Creating snapshot", "vmRef", vmRef)
-	vm, err := r.getVM(vmRef, hosts)
+	vm, err := r.getVM(vmRef, hostsFunc)
 	if err != nil {
 		return
 	}
@@ -50,28 +52,15 @@ func (r *Client) CreateSnapshot(vmRef ref.Ref, hosts util.HostsFunc) (id string,
 		err = liberr.Wrap(err)
 		return
 	}
-	res, err := task.WaitForResult(context.TODO(), nil)
-	if err != nil {
-		err = liberr.Wrap(err)
-		return
-	}
-	id = res.Result.(types.ManagedObjectReference).Value
-	r.Log.Info("Created snapshot", "vmRef", vmRef, "id", id)
-
-	return
-}
-
-// Check if a snapshot is ready to transfer.
-func (r *Client) CheckSnapshotReady(vmRef ref.Ref, snapshot string) (ready bool, err error) {
-	return true, nil
+	return "", task.Reference().Value, nil
 }
 
 // Remove a VM snapshot.
-func (r *Client) RemoveSnapshot(vmRef ref.Ref, snapshot string, hosts util.HostsFunc) (err error) {
+func (r *Client) RemoveSnapshot(vmRef ref.Ref, snapshot string, hosts util.HostsFunc) (taskId string, err error) {
 	r.Log.V(1).Info("RemoveSnapshot", "vmRef", vmRef, "snapshot", snapshot)
-	err = r.removeSnapshot(vmRef, snapshot, false, hosts)
+	taskId, err = r.removeSnapshot(vmRef, snapshot, false, hosts)
 	return
 }
 
@@ -247,6 +236,89 @@ func (r *Client) GetSnapshotDeltas(vmRef ref.Ref, snapshotId string, hosts util.
 	return
 }
 
+// Check if a snapshot is removed
+func (r *Client) CheckSnapshotRemove(vmRef ref.Ref, precopy planapi.Precopy, hosts util.HostsFunc) (bool, error) {
+	r.Log.Info("Check Snapshot Remove", "vmRef", vmRef, "precopy", precopy)
+	taskInfo, err := r.getTaskById(vmRef, precopy.RemoveTaskId, hosts)
+	if err != nil {
+		return false, liberr.Wrap(err)
+	}
+	return r.checkTaskStatus(taskInfo)
+}
+
+// Check if a snapshot is ready to transfer.
+func (r *Client) CheckSnapshotReady(vmRef ref.Ref, precopy planapi.Precopy, hosts util.HostsFunc) (ready bool, snapshotId string, err error) {
+	r.Log.Info("Check Snapshot Ready", "vmRef", vmRef, "precopy", precopy)
+	taskInfo, err := r.getTaskById(vmRef, precopy.CreateTaskId, hosts)
+	if err != nil {
+		return false, snapshotId, liberr.Wrap(err)
+	}
+	ready, err = r.checkTaskStatus(taskInfo)
+	snapshotId = taskInfo.Result.(types.ManagedObjectReference).Value
+	return
+}
+
+func (r *Client) checkTaskStatus(taskInfo *types.TaskInfo) (ready bool, err error) {
+	r.Log.Info("Snapshot task", "task", taskInfo.Task.Value, "name", taskInfo.Name, "status", taskInfo.State)
+	switch taskInfo.State {
+	case types.TaskInfoStateSuccess:
+		return true, nil
+	case types.TaskInfoStateError:
+		return false, fmt.Errorf(taskInfo.Error.LocalizedMessage)
+	default:
+		return false, nil
+	}
+}
+
+func (r *Client) getClientFromVmRef(vmRef ref.Ref, hosts util.HostsFunc) (client *vim25.Client, err error) {
+	vm := &model.VM{}
+	err = r.Source.Inventory.Find(vm, vmRef)
+	if err != nil {
+		return nil, liberr.Wrap(err, "vm", vmRef.String())
+	}
+	return r.getClient(vm, hosts)
+}
+
+func (r *Client) getTaskById(vmRef ref.Ref, taskId string, hosts util.HostsFunc) (*types.TaskInfo, error) {
+	r.Log.V(1).Info("Get task by id", "taskId", taskId, "vmRef", vmRef)
+
+	// Get the ESXi client for the haTasks
+	client, err := r.getClientFromVmRef(vmRef, hosts)
+	if err != nil {
+		return nil, err
+	}
+	// Create a collector to receive the tasks
+	pc := property.DefaultCollector(client)
+	pc, err = pc.Create(context.TODO())
+	if err != nil {
+		return nil, err
+	}
+	//nolint:errcheck
+	defer pc.Destroy(context.TODO())
+
+	// Retrieve the task from ESXi host
+	taskRef := types.ManagedObjectReference{
+		Type:  taskType,
+		Value: taskId,
+	}
+	var content []types.ObjectContent
+	err = pc.RetrieveOne(context.TODO(), taskRef, []string{"info"}, &content)
+	if err != nil {
+		return nil, err
+	}
+	if len(content) == 0 {
+		return nil, fmt.Errorf("task %s not found", taskId)
+	}
+	if len(content[0].PropSet) == 0 {
+		return nil, fmt.Errorf("task %s not found property set", taskId)
+	}
+	if content[0].PropSet[0].Val == nil {
+		return nil, fmt.Errorf("no task value found for task %s", taskId)
+	}
+	task := content[0].PropSet[0].Val.(types.TaskInfo)
+	return &task, nil
+}
+
 func (r *Client) getClient(vm *model.VM, hosts util.HostsFunc) (client *vim25.Client, err error) {
 	if coldLocal, vErr := r.Plan.VSphereColdLocal(); vErr == nil && coldLocal {
 		// when virt-v2v runs the migration, forklift-controller should interact only
@@ -371,7 +443,7 @@ func nullableHosts() (hosts map[string]*v1beta1.Host, err error) {
 }
 
 // Remove a VM snapshot and optionally its children.
-func (r *Client) removeSnapshot(vmRef ref.Ref, snapshot string, children bool, hosts util.HostsFunc) (err error) {
+func (r *Client) removeSnapshot(vmRef ref.Ref, snapshot string, children bool, hosts util.HostsFunc) (taskId string, err error) {
 	r.Log.Info("Removing snapshot",
 		"vmRef", vmRef,
 		"snapshot", snapshot,
@@ -381,12 +453,11 @@ func (r *Client) removeSnapshot(vmRef ref.Ref, snapshot string, children bool, h
 	if err != nil {
 		return
 	}
-	_, err = vm.RemoveSnapshot(context.TODO(), snapshot, children, nil)
+	task, err := vm.RemoveSnapshot(context.TODO(), snapshot, children, nil)
 	if err != nil {
-		err = liberr.Wrap(err)
-		return
+		return "", liberr.Wrap(err)
 	}
-	return
+	return task.Reference().Value, nil
 }
 
 // Connect to the vSphere API.
diff --git a/pkg/controller/plan/migration.go b/pkg/controller/plan/migration.go index 2535e97c8..69e835aa7 100644 --- a/pkg/controller/plan/migration.go +++ b/pkg/controller/plan/migration.go @@ -108,7 +108,6 @@ const ( TransferCompleted = "Transfer completed." PopulatorPodPrefix = "populate-" DvStatusCheckRetriesAnnotation = "dvStatusCheckRetries" - SnapshotRemovalCheckRetries = "snapshotRemovalCheckRetries" ) var ( @@ -536,7 +535,7 @@ func (r *Migration) removeLastWarmSnapshot(vm *plan.VMStatus) { return } snapshot := vm.Warm.Precopies[n-1].Snapshot - if err := r.provider.RemoveSnapshot(vm.Ref, snapshot, r.kubevirt.loadHosts); err != nil { + if _, err := r.provider.RemoveSnapshot(vm.Ref, snapshot, r.kubevirt.loadHosts); err != nil { r.Log.Error( err, "Failed to clean up warm migration snapshots.", @@ -1014,7 +1013,9 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) { break } n := len(vm.Warm.Precopies) - err = r.provider.RemoveSnapshot(vm.Ref, vm.Warm.Precopies[n-1].Snapshot, r.kubevirt.loadHosts) + var taskId string + taskId, err = r.provider.RemoveSnapshot(vm.Ref, vm.Warm.Precopies[n-1].Snapshot, r.kubevirt.loadHosts) + vm.Warm.Precopies[len(vm.Warm.Precopies)-1].RemoveTaskId = taskId if err != nil { step.AddError(err.Error()) err = nil @@ -1027,27 +1028,15 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) { vm.AddError(fmt.Sprintf("Step '%s' not found", r.step(vm))) break } - // FIXME: This is just temporary timeout to unblock the migrations which get stuck on issue https://issues.redhat.com/browse/MTV-1753 - // This should be fixed properly by adding the task manager inside the inventory and monitor the task status - // from the main controller. - var retries int - retriesAnnotation := step.Annotations[SnapshotRemovalCheckRetries] - if retriesAnnotation == "" { - step.Annotations[SnapshotRemovalCheckRetries] = "1" - } else { - retries, err = strconv.Atoi(retriesAnnotation) - if err != nil { - step.AddError(err.Error()) - err = nil - break - } - if retries >= settings.Settings.SnapshotRemovalCheckRetries { - vm.Phase = r.next(vm.Phase) - // Reset for next precopy - step.Annotations[SnapshotRemovalCheckRetries] = "1" - } else { - step.Annotations[SnapshotRemovalCheckRetries] = strconv.Itoa(retries + 1) - } + precopy := vm.Warm.Precopies[len(vm.Warm.Precopies)-1] + ready, err := r.provider.CheckSnapshotRemove(vm.Ref, precopy, r.kubevirt.loadHosts) + if err != nil { + step.AddError(err.Error()) + err = nil + break + } + if ready { + vm.Phase = r.next(vm.Phase) } case CreateInitialSnapshot, CreateSnapshot, CreateFinalSnapshot: step, found := vm.FindStep(r.step(vm)) @@ -1055,8 +1044,8 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) { vm.AddError(fmt.Sprintf("Step '%s' not found", r.step(vm))) break } - var snapshot string - if snapshot, err = r.provider.CreateSnapshot(vm.Ref, r.kubevirt.loadHosts); err != nil { + var snapshot, taskId string + if snapshot, taskId, err = r.provider.CreateSnapshot(vm.Ref, r.kubevirt.loadHosts); err != nil { if errors.As(err, &web.ProviderNotReadyError{}) || errors.As(err, &web.ConflictError{}) { return } @@ -1065,7 +1054,7 @@ func (r *Migration) execute(vm *plan.VMStatus) (err error) { break } now := meta.Now() - precopy := plan.Precopy{Snapshot: snapshot, Start: &now} + precopy := plan.Precopy{Snapshot: snapshot, CreateTaskId: taskId, Start: &now} vm.Warm.Precopies = append(vm.Warm.Precopies, precopy) r.resetPrecopyTasks(vm, step) vm.Phase = r.next(vm.Phase) @@ -1075,14 +1064,17 @@ func (r *Migration) execute(vm 
*plan.VMStatus) (err error) {
 			vm.AddError(fmt.Sprintf("Step '%s' not found", r.step(vm)))
 			break
 		}
-		snapshot := vm.Warm.Precopies[len(vm.Warm.Precopies)-1].Snapshot
-		ready, err := r.provider.CheckSnapshotReady(vm.Ref, snapshot)
+		precopy := vm.Warm.Precopies[len(vm.Warm.Precopies)-1]
+		ready, snapshotId, err := r.provider.CheckSnapshotReady(vm.Ref, precopy, r.kubevirt.loadHosts)
 		if err != nil {
 			step.AddError(err.Error())
 			err = nil
 			break
 		}
 		if ready {
+			if snapshotId != "" {
+				vm.Warm.Precopies[len(vm.Warm.Precopies)-1].Snapshot = snapshotId
+			}
 			vm.Phase = r.next(vm.Phase)
 		}
 	case WaitForDataVolumesStatus, WaitForFinalDataVolumesStatus:

From 3d46597aa1a39ffa0eb1c72fe831cd73980e0d2e Mon Sep 17 00:00:00 2001
From: Martin Necas
Date: Fri, 13 Dec 2024 17:15:19 +0100
Subject: [PATCH 6/7] MTV-1632 | Enable SMM for UEFI SecureBoot

Signed-off-by: Martin Necas
---
 pkg/controller/plan/adapter/vsphere/builder.go | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pkg/controller/plan/adapter/vsphere/builder.go b/pkg/controller/plan/adapter/vsphere/builder.go
index e46081aaf..c68e7f18a 100644
--- a/pkg/controller/plan/adapter/vsphere/builder.go
+++ b/pkg/controller/plan/adapter/vsphere/builder.go
@@ -672,6 +672,13 @@ func (r *Builder) mapFirmware(vm *model.VM, object *cnv.VirtualMachineSpec) {
 			EFI: &cnv.EFI{
 				SecureBoot: &vm.SecureBoot,
 			}}
+		if vm.SecureBoot {
+			object.Template.Spec.Domain.Features = &cnv.Features{
+				SMM: &cnv.FeatureState{
+					Enabled: &vm.SecureBoot,
+				},
+			}
+		}
 	default:
 		firmware.Bootloader = &cnv.Bootloader{BIOS: &cnv.BIOS{}}
 	}

From 29acbabbbdb4412520917d7af3a7f37f5f94a000 Mon Sep 17 00:00:00 2001
From: Martin Necas
Date: Fri, 13 Dec 2024 16:12:21 +0100
Subject: [PATCH 7/7] MTV-1493 | Add missing resource limits and requests

Issue: If the user sets a ClusterResourceQuota, Forklift starts failing
because the pods created by the Forklift Controller have no resource
limits or requests.

Fix: Add new parameters to the Forklift operator which can be
configured depending on the user environment.
Ref: https://issues.redhat.com/browse/MTV-1493 Signed-off-by: Martin Necas --- .../forkliftcontroller/defaults/main.yml | 14 ++ .../controller/deployment-controller.yml.j2 | 24 ++++ pkg/controller/plan/hook.go | 11 ++ pkg/controller/plan/kubevirt.go | 35 ++++- pkg/controller/plan/validation.go | 23 +++- pkg/controller/provider/ova-setup.go | 10 ++ pkg/settings/migration.go | 127 +++++++++++++++--- 7 files changed, 220 insertions(+), 24 deletions(-) diff --git a/operator/roles/forkliftcontroller/defaults/main.yml b/operator/roles/forkliftcontroller/defaults/main.yml index 5733c6868..aa4360ff3 100644 --- a/operator/roles/forkliftcontroller/defaults/main.yml +++ b/operator/roles/forkliftcontroller/defaults/main.yml @@ -110,12 +110,26 @@ populator_controller_container_name: "{{ app_name }}-populator-controller" populator_openstack_image_fqin: "{{ lookup( 'env', 'OPENSTACK_POPULATOR_IMAGE') or lookup( 'env', 'RELATED_IMAGE_OPENSTACK_POPULATOR') }}" must_gather_image_fqin: "{{ lookup( 'env', 'MUST_GATHER_IMAGE') or lookup( 'env', 'RELATED_IMAGE_MUST_GATHER') }}" + virt_v2v_image_fqin: "{{ lookup( 'env', 'VIRT_V2V_IMAGE') or lookup( 'env', 'RELATED_IMAGE_VIRT_V2V') }}" virt_v2v_dont_request_kvm: "{{ lookup( 'env', 'VIRT_V2V_DONT_REQUEST_KVM') }}" virt_v2v_extra_args: "{{ lookup( 'env', 'VIRT_V2V_EXTRA_ARGS') }}" virt_v2v_extra_conf_config_map: "{{ lookup( 'env', 'VIRT_V2V_EXTRA_CONF_CONFIG_MAP') }}" +virt_v2v_container_limits_cpu: "4000m" +virt_v2v_container_limits_memory: "8Gi" +virt_v2v_container_requests_cpu: "1000m" +virt_v2v_container_requests_memory: "1Gi" + +hooks_container_limits_cpu: "1000m" +hooks_container_limits_memory: "1Gi" +hooks_container_requests_cpu: "100m" +hooks_container_requests_memory: "150Mi" ova_provider_server_fqin: "{{ lookup( 'env', 'OVA_PROVIDER_SERVER_IMAGE') or lookup( 'env', 'RELATED_IMAGE_OVA_PROVIDER_SERVER') }}" +ova_container_limits_cpu: "1000m" +ova_container_limits_memory: "1Gi" +ova_container_requests_cpu: "100m" +ova_container_requests_memory: "150Mi" metric_service_name: "{{ app_name }}-metrics" metric_servicemonitor_name: "{{ app_name }}-metrics" diff --git a/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 b/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 index c346dbfbe..e19265e11 100644 --- a/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 +++ b/operator/roles/forkliftcontroller/templates/controller/deployment-controller.yml.j2 @@ -139,6 +139,30 @@ spec: value: "{{ virt_v2v_extra_args }}" - name: VIRT_V2V_EXTRA_CONF_CONFIG_MAP value: "{{ virt_v2v_extra_conf_config_map }}" + - name: VIRT_V2V_CONTAINER_LIMITS_CPU + value: "{{ virt_v2v_container_limits_cpu }}" + - name: VIRT_V2V_CONTAINER_LIMITS_MEMORY + value: "{{ virt_v2v_container_limits_memory }}" + - name: VIRT_V2V_CONTAINER_REQUESTS_CPU + value: "{{ virt_v2v_container_requests_cpu }}" + - name: VIRT_V2V_CONTAINER_REQUESTS_MEMORY + value: "{{ virt_v2v_container_requests_memory }}" + - name: HOOKS_CONTAINER_LIMITS_CPU + value: "{{ hooks_container_limits_cpu }}" + - name: HOOKS_CONTAINER_LIMITS_MEMORY + value: "{{ hooks_container_limits_memory }}" + - name: HOOKS_CONTAINER_REQUESTS_CPU + value: "{{ hooks_container_requests_cpu }}" + - name: HOOKS_CONTAINER_REQUESTS_MEMORY + value: "{{ hooks_container_requests_memory }}" + - name: OVA_CONTAINER_LIMITS_CPU + value: "{{ ova_container_limits_cpu }}" + - name: OVA_CONTAINER_LIMITS_MEMORY + value: "{{ ova_container_limits_memory }}" + - name: 
OVA_CONTAINER_REQUESTS_CPU + value: "{{ ova_container_requests_cpu }}" + - name: OVA_CONTAINER_REQUESTS_MEMORY + value: "{{ ova_container_requests_memory }}" envFrom: - configMapRef: name: {{ controller_configmap_name }} diff --git a/pkg/controller/plan/hook.go b/pkg/controller/plan/hook.go index 296b6e5b9..ccc97e0a1 100644 --- a/pkg/controller/plan/hook.go +++ b/pkg/controller/plan/hook.go @@ -14,6 +14,7 @@ import ( "gopkg.in/yaml.v2" batch "k8s.io/api/batch/v1" core "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" meta "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/kubernetes/scheme" @@ -181,6 +182,16 @@ func (r *HookRunner) template(mp *core.ConfigMap) (template *core.PodTemplateSpe { Name: "hook", Image: r.hook.Spec.Image, + Resources: core.ResourceRequirements{ + Requests: core.ResourceList{ + core.ResourceCPU: resource.MustParse(Settings.Migration.HooksContainerRequestsCpu), + core.ResourceMemory: resource.MustParse(Settings.Migration.HooksContainerRequestsMemory), + }, + Limits: core.ResourceList{ + core.ResourceCPU: resource.MustParse(Settings.Migration.HooksContainerLimitsCpu), + core.ResourceMemory: resource.MustParse(Settings.Migration.HooksContainerLimitsMemory), + }, + }, VolumeMounts: []core.VolumeMount{ { Name: "hook", diff --git a/pkg/controller/plan/kubevirt.go b/pkg/controller/plan/kubevirt.go index af44af2a1..ac7cbe532 100644 --- a/pkg/controller/plan/kubevirt.go +++ b/pkg/controller/plan/kubevirt.go @@ -763,6 +763,16 @@ func (r *KubeVirt) createPodToBindPVCs(vm *plan.VMStatus, pvcNames []string) (er // In that case, we could benefit from pulling the image of the conversion pod, so it will be present on the node. Image: Settings.Migration.VirtV2vImage, Command: []string{"/bin/sh"}, + Resources: core.ResourceRequirements{ + Requests: core.ResourceList{ + core.ResourceCPU: resource.MustParse(Settings.Migration.VirtV2vContainerRequestsCpu), + core.ResourceMemory: resource.MustParse(Settings.Migration.VirtV2vContainerRequestsMemory), + }, + Limits: core.ResourceList{ + core.ResourceCPU: resource.MustParse(Settings.Migration.VirtV2vContainerLimitsCpu), + core.ResourceMemory: resource.MustParse(Settings.Migration.VirtV2vContainerLimitsMemory), + }, + }, SecurityContext: &core.SecurityContext{ AllowPrivilegeEscalation: &allowPrivilageEscalation, RunAsNonRoot: &nonRoot, @@ -1725,6 +1735,16 @@ func (r *KubeVirt) guestConversionPod(vm *plan.VMStatus, vmVolumes []cnv.Volume, MountPath: "/opt", }, }, + Resources: core.ResourceRequirements{ + Requests: core.ResourceList{ + core.ResourceCPU: resource.MustParse("100m"), + core.ResourceMemory: resource.MustParse("150Mi"), + }, + Limits: core.ResourceList{ + core.ResourceCPU: resource.MustParse("1000m"), + core.ResourceMemory: resource.MustParse("500Mi"), + }, + }, SecurityContext: &core.SecurityContext{ AllowPrivilegeEscalation: &allowPrivilageEscalation, Capabilities: &core.Capabilities{ @@ -1805,8 +1825,19 @@ func (r *KubeVirt) guestConversionPod(vm *plan.VMStatus, vmVolumes []cnv.Volume, InitContainers: initContainers, Containers: []core.Container{ { - Name: "virt-v2v", - Env: environment, + Name: "virt-v2v", + Env: environment, + ImagePullPolicy: core.PullAlways, + Resources: core.ResourceRequirements{ + Requests: core.ResourceList{ + core.ResourceCPU: resource.MustParse(Settings.Migration.VirtV2vContainerRequestsCpu), + core.ResourceMemory: resource.MustParse(Settings.Migration.VirtV2vContainerRequestsMemory), + }, + Limits: core.ResourceList{ + core.ResourceCPU: 
resource.MustParse(Settings.Migration.VirtV2vContainerLimitsCpu),
+							core.ResourceMemory: resource.MustParse(Settings.Migration.VirtV2vContainerLimitsMemory),
+						},
+					},
 					EnvFrom: []core.EnvFromSource{
 						{
 							Prefix: "V2V_",
diff --git a/pkg/controller/plan/validation.go b/pkg/controller/plan/validation.go
index 6116c0109..07acbe865 100644
--- a/pkg/controller/plan/validation.go
+++ b/pkg/controller/plan/validation.go
@@ -25,6 +25,7 @@ import (
 	core "k8s.io/api/core/v1"
 	v1 "k8s.io/api/core/v1"
 	k8serr "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/resource"
 	meta "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
 	k8svalidation "k8s.io/apimachinery/pkg/util/validation"
@@ -888,6 +889,16 @@ func createVddkCheckJob(plan *api.Plan, labels map[string]string, vddkImage stri
 					Drop: []core.Capability{"ALL"},
 				},
 			},
+			Resources: core.ResourceRequirements{
+				Requests: core.ResourceList{
+					core.ResourceCPU:    resource.MustParse("100m"),
+					core.ResourceMemory: resource.MustParse("150Mi"),
+				},
+				Limits: core.ResourceList{
+					core.ResourceCPU:    resource.MustParse("1000m"),
+					core.ResourceMemory: resource.MustParse("500Mi"),
+				},
+			},
 		},
 	}
 
@@ -930,7 +941,17 @@ func createVddkCheckJob(plan *api.Plan, labels map[string]string, vddkImage stri
 			InitContainers: initContainers,
 			Containers: []core.Container{
 				{
-					Name: "validator",
+					Name: "validator",
+					Resources: core.ResourceRequirements{
+						Requests: core.ResourceList{
+							core.ResourceCPU:    resource.MustParse("100m"),
+							core.ResourceMemory: resource.MustParse("150Mi"),
+						},
+						Limits: core.ResourceList{
+							core.ResourceCPU:    resource.MustParse("1000m"),
+							core.ResourceMemory: resource.MustParse("500Mi"),
+						},
+					},
 					Image: Settings.Migration.VirtV2vImage,
 					SecurityContext: &core.SecurityContext{
 						AllowPrivilegeEscalation: ptr.To(false),
diff --git a/pkg/controller/provider/ova-setup.go b/pkg/controller/provider/ova-setup.go
index 03b562b73..d137beb6f 100644
--- a/pkg/controller/provider/ova-setup.go
+++ b/pkg/controller/provider/ova-setup.go
@@ -224,6 +224,16 @@ func (r *Reconciler) makeOvaProviderPodSpec(pvcName, providerName, providerNames
 				MountPath: mountPath,
 			},
 		},
+		Resources: core.ResourceRequirements{
+			Requests: core.ResourceList{
+				core.ResourceCPU:    resource.MustParse(Settings.OvaContainerRequestsCpu),
+				core.ResourceMemory: resource.MustParse(Settings.OvaContainerRequestsMemory),
+			},
+			Limits: core.ResourceList{
+				core.ResourceCPU:    resource.MustParse(Settings.OvaContainerLimitsCpu),
+				core.ResourceMemory: resource.MustParse(Settings.OvaContainerLimitsMemory),
+			},
+		},
 		SecurityContext: securityContext,
 	}
 
diff --git a/pkg/settings/migration.go b/pkg/settings/migration.go
index 354eadd18..e8a76db7c 100644
--- a/pkg/settings/migration.go
+++ b/pkg/settings/migration.go
@@ -12,26 +12,38 @@ import (
 
 // Environment variables.
const ( - MaxVmInFlight = "MAX_VM_INFLIGHT" - HookRetry = "HOOK_RETRY" - ImporterRetry = "IMPORTER_RETRY" - VirtV2vImage = "VIRT_V2V_IMAGE" - PrecopyInterval = "PRECOPY_INTERVAL" - VirtV2vDontRequestKVM = "VIRT_V2V_DONT_REQUEST_KVM" - SnapshotRemovalTimeout = "SNAPSHOT_REMOVAL_TIMEOUT" - SnapshotStatusCheckRate = "SNAPSHOT_STATUS_CHECK_RATE" - CDIExportTokenTTL = "CDI_EXPORT_TOKEN_TTL" - FileSystemOverhead = "FILESYSTEM_OVERHEAD" - BlockOverhead = "BLOCK_OVERHEAD" - CleanupRetries = "CLEANUP_RETRIES" - DvStatusCheckRetries = "DV_STATUS_CHECK_RETRIES" - SnapshotRemovalCheckRetries = "SNAPSHOT_REMOVAL_CHECK_RETRIES" - OvirtOsConfigMap = "OVIRT_OS_MAP" - VsphereOsConfigMap = "VSPHERE_OS_MAP" - VirtCustomizeConfigMap = "VIRT_CUSTOMIZE_MAP" - VddkJobActiveDeadline = "VDDK_JOB_ACTIVE_DEADLINE" - VirtV2vExtraArgs = "VIRT_V2V_EXTRA_ARGS" - VirtV2vExtraConfConfigMap = "VIRT_V2V_EXTRA_CONF_CONFIG_MAP" + MaxVmInFlight = "MAX_VM_INFLIGHT" + HookRetry = "HOOK_RETRY" + ImporterRetry = "IMPORTER_RETRY" + VirtV2vImage = "VIRT_V2V_IMAGE" + PrecopyInterval = "PRECOPY_INTERVAL" + VirtV2vDontRequestKVM = "VIRT_V2V_DONT_REQUEST_KVM" + SnapshotRemovalTimeout = "SNAPSHOT_REMOVAL_TIMEOUT" + SnapshotStatusCheckRate = "SNAPSHOT_STATUS_CHECK_RATE" + CDIExportTokenTTL = "CDI_EXPORT_TOKEN_TTL" + FileSystemOverhead = "FILESYSTEM_OVERHEAD" + BlockOverhead = "BLOCK_OVERHEAD" + CleanupRetries = "CLEANUP_RETRIES" + DvStatusCheckRetries = "DV_STATUS_CHECK_RETRIES" + SnapshotRemovalCheckRetries = "SNAPSHOT_REMOVAL_CHECK_RETRIES" + OvirtOsConfigMap = "OVIRT_OS_MAP" + VsphereOsConfigMap = "VSPHERE_OS_MAP" + VirtCustomizeConfigMap = "VIRT_CUSTOMIZE_MAP" + VddkJobActiveDeadline = "VDDK_JOB_ACTIVE_DEADLINE" + VirtV2vExtraArgs = "VIRT_V2V_EXTRA_ARGS" + VirtV2vExtraConfConfigMap = "VIRT_V2V_EXTRA_CONF_CONFIG_MAP" + VirtV2vContainerLimitsCpu = "VIRT_V2V_CONTAINER_LIMITS_CPU" + VirtV2vContainerLimitsMemory = "VIRT_V2V_CONTAINER_LIMITS_MEMORY" + VirtV2vContainerRequestsCpu = "VIRT_V2V_CONTAINER_REQUESTS_CPU" + VirtV2vContainerRequestsMemory = "VIRT_V2V_CONTAINER_REQUESTS_MEMORY" + HooksContainerLimitsCpu = "HOOKS_CONTAINER_LIMITS_CPU" + HooksContainerLimitsMemory = "HOOKS_CONTAINER_LIMITS_MEMORY" + HooksContainerRequestsCpu = "HOOKS_CONTAINER_REQUESTS_CPU" + HooksContainerRequestsMemory = "HOOKS_CONTAINER_REQUESTS_MEMORY" + OvaContainerLimitsCpu = "OVA_CONTAINER_LIMITS_CPU" + OvaContainerLimitsMemory = "OVA_CONTAINER_LIMITS_MEMORY" + OvaContainerRequestsCpu = "OVA_CONTAINER_REQUESTS_CPU" + OvaContainerRequestsMemory = "OVA_CONTAINER_REQUESTS_MEMORY" ) // Migration settings @@ -75,7 +87,19 @@ type Migration struct { // Additional arguments for virt-v2v VirtV2vExtraArgs string // Additional configuration for virt-v2v - VirtV2vExtraConfConfigMap string + VirtV2vExtraConfConfigMap string + VirtV2vContainerLimitsCpu string + VirtV2vContainerLimitsMemory string + VirtV2vContainerRequestsCpu string + VirtV2vContainerRequestsMemory string + HooksContainerLimitsCpu string + HooksContainerLimitsMemory string + HooksContainerRequestsCpu string + HooksContainerRequestsMemory string + OvaContainerLimitsCpu string + OvaContainerLimitsMemory string + OvaContainerRequestsCpu string + OvaContainerRequestsMemory string } // Load settings. 
@@ -157,5 +181,66 @@ func (r *Migration) Load() (err error) {
 	if val, found := os.LookupEnv(VirtV2vExtraConfConfigMap); found {
 		r.VirtV2vExtraConfConfigMap = val
 	}
+	// Container resource configuration.
+	if val, found := os.LookupEnv(VirtV2vContainerLimitsCpu); found {
+		r.VirtV2vContainerLimitsCpu = val
+	} else {
+		r.VirtV2vContainerLimitsCpu = "4000m"
+	}
+	if val, found := os.LookupEnv(VirtV2vContainerLimitsMemory); found {
+		r.VirtV2vContainerLimitsMemory = val
+	} else {
+		r.VirtV2vContainerLimitsMemory = "8Gi"
+	}
+	if val, found := os.LookupEnv(VirtV2vContainerRequestsCpu); found {
+		r.VirtV2vContainerRequestsCpu = val
+	} else {
+		r.VirtV2vContainerRequestsCpu = "1000m"
+	}
+	if val, found := os.LookupEnv(VirtV2vContainerRequestsMemory); found {
+		r.VirtV2vContainerRequestsMemory = val
+	} else {
+		r.VirtV2vContainerRequestsMemory = "1Gi"
+	}
+	if val, found := os.LookupEnv(HooksContainerLimitsCpu); found {
+		r.HooksContainerLimitsCpu = val
+	} else {
+		r.HooksContainerLimitsCpu = "1000m"
+	}
+	if val, found := os.LookupEnv(HooksContainerLimitsMemory); found {
+		r.HooksContainerLimitsMemory = val
+	} else {
+		r.HooksContainerLimitsMemory = "1Gi"
+	}
+	if val, found := os.LookupEnv(HooksContainerRequestsCpu); found {
+		r.HooksContainerRequestsCpu = val
+	} else {
+		r.HooksContainerRequestsCpu = "100m"
+	}
+	if val, found := os.LookupEnv(HooksContainerRequestsMemory); found {
+		r.HooksContainerRequestsMemory = val
+	} else {
+		r.HooksContainerRequestsMemory = "150Mi"
+	}
+	if val, found := os.LookupEnv(OvaContainerLimitsCpu); found {
+		r.OvaContainerLimitsCpu = val
+	} else {
+		r.OvaContainerLimitsCpu = "1000m"
+	}
+	if val, found := os.LookupEnv(OvaContainerLimitsMemory); found {
+		r.OvaContainerLimitsMemory = val
+	} else {
+		r.OvaContainerLimitsMemory = "1Gi"
+	}
+	if val, found := os.LookupEnv(OvaContainerRequestsCpu); found {
+		r.OvaContainerRequestsCpu = val
+	} else {
+		r.OvaContainerRequestsCpu = "100m"
+	}
+	if val, found := os.LookupEnv(OvaContainerRequestsMemory); found {
+		r.OvaContainerRequestsMemory = val
+	} else {
+		r.OvaContainerRequestsMemory = "150Mi"
+	}
 	return
 }
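The twelve lookup-with-default blocks added to Load() above all share one shape, so a small helper would collapse each to a single line. This is a possible refactor sketch, not part of the series; getEnvOrDefault is a hypothetical name.

package settings

import "os"

// getEnvOrDefault returns the named environment variable's value,
// or def when the variable is unset.
func getEnvOrDefault(name, def string) string {
	if val, found := os.LookupEnv(name); found {
		return val
	}
	return def
}

// Hypothetical usage inside Load(); the literals mirror the operator's
// defaults/main.yml and must be kept in sync by hand either way:
//
//	r.VirtV2vContainerLimitsCpu = getEnvOrDefault(VirtV2vContainerLimitsCpu, "4000m")
//	r.HooksContainerRequestsMemory = getEnvOrDefault(HooksContainerRequestsMemory, "150Mi")

Duplicating the defaults in both the operator template and the controller is a deliberate trade-off: the controller stays usable without the operator, at the cost of two places to update when a default changes.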