From acbcf60f9533405433831a40f566b191777a034c Mon Sep 17 00:00:00 2001 From: Thomas Parrott Date: Thu, 20 Oct 2022 13:33:32 +0100 Subject: [PATCH 1/3] lxd/instance/drivers/driver/qemu: Remove incorrect comment about handle caching in getAgentClient Signed-off-by: Thomas Parrott --- lxd/instance/drivers/driver_qemu.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lxd/instance/drivers/driver_qemu.go b/lxd/instance/drivers/driver_qemu.go index cd571ce95671..c35a3d1cffcc 100644 --- a/lxd/instance/drivers/driver_qemu.go +++ b/lxd/instance/drivers/driver_qemu.go @@ -305,8 +305,7 @@ type qemu struct { architectureName string } -// getAgentClient returns the current agent client handle. To avoid TLS setup each time this -// function is called, the handle is cached internally in the Qemu struct. +// getAgentClient returns the current agent client handle. func (d *qemu) getAgentClient() (*http.Client, error) { // Check if the agent is running. monitor, err := qmp.Connect(d.monitorPath(), qemuSerialChardevName, d.getMonitorEventHandler()) From fa5f0d59d65574a03bd960879065c5a72899a4f0 Mon Sep 17 00:00:00 2001 From: Thomas Parrott Date: Thu, 20 Oct 2022 13:28:47 +0100 Subject: [PATCH 2/3] lxd/instance/drivers/driver/qemu: Do not attempt to mount & unmount in generateAgentCert We expect the callers of generateAgentCert to have ascertained whether the VM is running (and therefore mounted) before calling this function. By making this change it prevents a mount/unmount race when LXD starts up during the `devicesRegister` function call. During this function call, each instance has its `IsRunning()` function called to check if it needs its devices registered. For `disk` devices of running instances its important that their `Register`functions are called on LXD start in order to initialise the mount counter for that disk (so that a subsequent unmount call isn't incorrectly run if LXD incorrectly thinks that the disk volume isn't in use anymore). For VMs the initial call to `IsRunning()` is also important as that triggers a reconnection to the QEMU QMP monitor in the running VM and allows for gathering the lxd-agent run status. The mount/unmount race before the `disk` devices have been registered is caused because the `IsRunning` call initialises the `getMonitorEventHandler` and causes it to call the `advertiseVsockAddress` function, which in turn uses `generateAgentCert` as part of creating a connection to the lxd-agent over vsock. The `getMonitorEventHandler` handler is run in a separate goroutine and thus it can race the `disk` device's `Register` function (which would initialise the mount counter so that an unmount call would not be issued as the volume would be considered in use). If the `generateAgentCert` function finishes before the `disk` device's `Register` function then an unmount call is incorrectly attempted on the running VM's root disk causing delays to LXDs start up (as the unmount call blocks for several seconds). By removing the mount management from the `generateAgentCert` function it ensures that no early unmount attempt can occur, and doesn't affect the operation of the function as it is required that the VM be running (and thus mounted) in order to connect to the lxd-agent anyway. Signed-off-by: Thomas Parrott --- lxd/instance/drivers/driver_qemu.go | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/lxd/instance/drivers/driver_qemu.go b/lxd/instance/drivers/driver_qemu.go index c35a3d1cffcc..d7f75aaca856 100644 --- a/lxd/instance/drivers/driver_qemu.go +++ b/lxd/instance/drivers/driver_qemu.go @@ -306,6 +306,8 @@ type qemu struct { } // getAgentClient returns the current agent client handle. +// Callers should check that the instance is running (and therefore mounted) before caling this function, +// otherwise the qmp.Connect call will fail to use the monitor socket file. func (d *qemu) getAgentClient() (*http.Client, error) { // Check if the agent is running. monitor, err := qmp.Connect(d.monitorPath(), qemuSerialChardevName, d.getMonitorEventHandler()) @@ -454,21 +456,13 @@ func (d *qemu) unmount() error { // generateAgentCert creates the necessary server key and certificate if needed. func (d *qemu) generateAgentCert() (string, string, string, string, error) { - // Mount the instance's config volume if needed. - _, err := d.mount() - if err != nil { - return "", "", "", "", err - } - - defer func() { _ = d.unmount() }() - agentCertFile := filepath.Join(d.Path(), "agent.crt") agentKeyFile := filepath.Join(d.Path(), "agent.key") clientCertFile := filepath.Join(d.Path(), "agent-client.crt") clientKeyFile := filepath.Join(d.Path(), "agent-client.key") // Create server certificate. - err = shared.FindOrGenCert(agentCertFile, agentKeyFile, false, false) + err := shared.FindOrGenCert(agentCertFile, agentKeyFile, false, false) if err != nil { return "", "", "", "", err } From ca3645e29d396ce13ac63ef86b57a9ee4153cb88 Mon Sep 17 00:00:00 2001 From: Thomas Parrott Date: Thu, 20 Oct 2022 13:35:16 +0100 Subject: [PATCH 3/3] lxd/instance/drivers/driver/qemu: Make sure instance is running before trying file operations in FileSFTPConn Signed-off-by: Thomas Parrott --- lxd/instance/drivers/driver_qemu.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lxd/instance/drivers/driver_qemu.go b/lxd/instance/drivers/driver_qemu.go index d7f75aaca856..9df381813963 100644 --- a/lxd/instance/drivers/driver_qemu.go +++ b/lxd/instance/drivers/driver_qemu.go @@ -5403,6 +5403,11 @@ func (d *qemu) CGroup() (*cgroup.CGroup, error) { // FileSFTPConn returns a connection to the agent SFTP endpoint. func (d *qemu) FileSFTPConn() (net.Conn, error) { + // VMs, unlike containers, cannot perform file operations if not running and using the lxd-agent. + if !d.IsRunning() { + return nil, fmt.Errorf("Instance is not running") + } + // Connect to the agent. client, err := d.getAgentClient() if err != nil {