From b77e660419dd2a10548a29d96db1e810471de181 Mon Sep 17 00:00:00 2001
From: Simon Murray
Date: Mon, 21 Aug 2023 10:17:55 +0100
Subject: [PATCH] Fix Port Leaks

When an instance fails to come up, there's a bug where its port doesn't
get cleaned up, and eventually you will exhaust your quota. The code
does attempt to garbage collect, but I suspect that at some point in
the past ports were suffixed with an index and the garbage collection
code wasn't made aware of this, so its listing matches nothing.

Replace the API "filtering" that doesn't work with a better algorithm:
reuse the instance creation code to generate the expected port options,
derive the port names from those, and delete each port that actually
exists. Also protected with some nice unit tests.
---
 controllers/openstackcluster_controller.go  |  5 +-
 controllers/openstackmachine_controller.go  |  4 +-
 pkg/cloud/services/compute/instance.go      | 23 +++---
 pkg/cloud/services/compute/instance_test.go | 11 ++-
 pkg/cloud/services/networking/port.go       | 43 ++++++++---
 pkg/cloud/services/networking/port_test.go  | 79 +++++++++++++++++++++
 6 files changed, 138 insertions(+), 27 deletions(-)

diff --git a/controllers/openstackcluster_controller.go b/controllers/openstackcluster_controller.go
index 3103c60907..8b0177de3a 100644
--- a/controllers/openstackcluster_controller.go
+++ b/controllers/openstackcluster_controller.go
@@ -237,8 +237,9 @@ func deleteBastion(scope scope.Scope, cluster *clusterv1.Cluster, openStackClust
 		}
 	}
 
-	rootVolume := openStackCluster.Spec.Bastion.Instance.RootVolume
-	if err = computeService.DeleteInstance(openStackCluster, instanceStatus, instanceName, rootVolume); err != nil {
+	instanceSpec := bastionToInstanceSpec(openStackCluster, cluster.Name)
+
+	if err = computeService.DeleteInstance(openStackCluster, openStackCluster, instanceStatus, instanceSpec); err != nil {
 		handleUpdateOSCError(openStackCluster, fmt.Errorf("failed to delete bastion: %w", err))
 		return fmt.Errorf("failed to delete bastion: %w", err)
 	}
diff --git a/controllers/openstackmachine_controller.go b/controllers/openstackmachine_controller.go
index 02deee62ee..7b50c63a43 100644
--- a/controllers/openstackmachine_controller.go
+++ b/controllers/openstackmachine_controller.go
@@ -273,7 +273,9 @@ func (r *OpenStackMachineReconciler) reconcileDelete(scope scope.Scope, cluster
 		}
 	}
 
-	if err := computeService.DeleteInstance(openStackMachine, instanceStatus, openStackMachine.Name, openStackMachine.Spec.RootVolume); err != nil {
+	instanceSpec := machineToInstanceSpec(openStackCluster, machine, openStackMachine, "")
+
+	if err := computeService.DeleteInstance(openStackCluster, openStackMachine, instanceStatus, instanceSpec); err != nil {
 		conditions.MarkFalse(openStackMachine, infrav1.InstanceReadyCondition, infrav1.InstanceDeleteFailedReason, clusterv1.ConditionSeverityError, "Deleting instance failed: %v", err)
 		return ctrl.Result{}, fmt.Errorf("delete instance: %w", err)
 	}
diff --git a/pkg/cloud/services/compute/instance.go b/pkg/cloud/services/compute/instance.go
index 14d5d2db58..c35f049335 100644
--- a/pkg/cloud/services/compute/instance.go
+++ b/pkg/cloud/services/compute/instance.go
@@ -283,7 +283,7 @@ func (s *Service) createInstanceImpl(eventObject runtime.Object, openStackCluste
 		if len(instanceSpec.Tags) > 0 {
 			iTags = instanceSpec.Tags
 		}
-		portName := getPortName(instanceSpec.Name, portOpts, i)
+		portName := networking.GetPortName(instanceSpec.Name, portOpts, i)
 		port, err := networkingService.GetOrCreatePort(eventObject, clusterName, portName, portOpts, securityGroups, iTags)
 		if err != nil {
 			return nil, err
@@ -380,14 +380,6 @@ func (s *Service) createInstanceImpl(eventObject runtime.Object, openStackCluste
 	return createdInstance, nil
 }
 
-// getPortName appends a suffix to an instance name in order to try and get a unique name per port.
-func getPortName(instanceName string, opts *infrav1.PortOpts, netIndex int) string {
-	if opts != nil && opts.NameSuffix != "" {
-		return fmt.Sprintf("%s-%s", instanceName, opts.NameSuffix)
-	}
-	return fmt.Sprintf("%s-%d", instanceName, netIndex)
-}
-
 func rootVolumeName(instanceName string) string {
 	return fmt.Sprintf("%s-root", instanceName)
 }
@@ -550,7 +542,7 @@ func (s *Service) GetManagementPort(openStackCluster *infrav1.OpenStackCluster,
 	return &allPorts[0], nil
 }
 
-func (s *Service) DeleteInstance(eventObject runtime.Object, instanceStatus *InstanceStatus, instanceName string, rootVolume *infrav1.RootVolume) error {
+func (s *Service) DeleteInstance(openStackCluster *infrav1.OpenStackCluster, eventObject runtime.Object, instanceStatus *InstanceStatus, instanceSpec *InstanceSpec) error {
 	if instanceStatus == nil {
 		/*
 			We create a boot-from-volume instance in 2 steps:
@@ -570,8 +562,8 @@ func (s *Service) DeleteInstance(eventObject runtime.Object, instanceStatus *Ins
 			Note that we don't need to separately delete the root volume when deleting the instance because
 			DeleteOnTermination will ensure it is deleted in that case.
 		*/
-		if hasRootVolume(rootVolume) {
-			name := rootVolumeName(instanceName)
+		if hasRootVolume(instanceSpec.RootVolume) {
+			name := rootVolumeName(instanceSpec.Name)
 			volume, err := s.getVolumeByName(name)
 			if err != nil {
 				return err
@@ -620,7 +612,12 @@ func (s *Service) DeleteInstance(eventObject runtime.Object, instanceStatus *Ins
 
 	// delete port of error instance
 	if instanceStatus.State() == infrav1.InstanceStateError {
-		if err := networkingService.GarbageCollectErrorInstancesPort(eventObject, instanceStatus.Name()); err != nil {
+		portOpts, err := s.constructPorts(openStackCluster, instanceSpec)
+		if err != nil {
+			return err
+		}
+
+		if err := networkingService.GarbageCollectErrorInstancesPort(eventObject, instanceSpec.Name, portOpts); err != nil {
 			return err
 		}
 	}
diff --git a/pkg/cloud/services/compute/instance_test.go b/pkg/cloud/services/compute/instance_test.go
index b6be591860..698392ad85 100644
--- a/pkg/cloud/services/compute/instance_test.go
+++ b/pkg/cloud/services/compute/instance_test.go
@@ -48,6 +48,7 @@ import (
 	infrav1 "sigs.k8s.io/cluster-api-provider-openstack/api/v1alpha7"
 	"sigs.k8s.io/cluster-api-provider-openstack/pkg/clients"
 	"sigs.k8s.io/cluster-api-provider-openstack/pkg/clients/mock"
+	"sigs.k8s.io/cluster-api-provider-openstack/pkg/cloud/services/networking"
 	"sigs.k8s.io/cluster-api-provider-openstack/pkg/scope"
 )
 
@@ -110,7 +111,7 @@ func Test_getPortName(t *testing.T) {
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			if got := getPortName(tt.args.instanceName, tt.args.opts, tt.args.netIndex); got != tt.want {
+			if got := networking.GetPortName(tt.args.instanceName, tt.args.opts, tt.args.netIndex); got != tt.want {
 				t.Errorf("getPortName() = %v, want %v", got, tt.want)
 			}
 		})
@@ -819,7 +820,13 @@ func TestService_DeleteInstance(t *testing.T) {
 			if err != nil {
 				t.Fatalf("Failed to create service: %v", err)
 			}
-			if err := s.DeleteInstance(tt.eventObject, tt.instanceStatus(), openStackMachineName, tt.rootVolume); (err != nil) != tt.wantErr {
+
+			instanceSpec := &InstanceSpec{
+				Name:       openStackMachineName,
+				RootVolume: tt.rootVolume,
+			}
+
+			if err := s.DeleteInstance(&infrav1.OpenStackCluster{}, tt.eventObject, tt.instanceStatus(), instanceSpec); (err != nil) != tt.wantErr {
 				t.Errorf("Service.DeleteInstance() error = %v, wantErr %v", err, tt.wantErr)
 			}
 		})
diff --git a/pkg/cloud/services/networking/port.go b/pkg/cloud/services/networking/port.go
index c354f8c43f..62cd6798fb 100644
--- a/pkg/cloud/services/networking/port.go
+++ b/pkg/cloud/services/networking/port.go
@@ -289,18 +289,43 @@ func (s *Service) DeletePorts(openStackCluster *infrav1.OpenStackCluster) error
 	return nil
 }
 
-func (s *Service) GarbageCollectErrorInstancesPort(eventObject runtime.Object, instanceName string) error {
-	portList, err := s.client.ListPort(ports.ListOpts{
-		Name: instanceName,
-	})
-	if err != nil {
-		return err
-	}
-	for _, p := range portList {
-		if err := s.DeletePort(eventObject, p.ID); err != nil {
+func (s *Service) GarbageCollectErrorInstancesPort(eventObject runtime.Object, instanceName string, portOpts []infrav1.PortOpts) error {
+	for i := range portOpts {
+		portOpt := &portOpts[i]
+
+		portName := GetPortName(instanceName, portOpt, i)
+
+		// TODO: would be nice if gophercloud could be persuaded to accept multiple
+		// names as is allowed by the API in order to reduce API traffic.
+		portList, err := s.client.ListPort(ports.ListOpts{Name: portName})
+		if err != nil {
+			return err
+		}
+
+		// NOTE: https://github.com/kubernetes-sigs/cluster-api-provider-openstack/issues/1476
+		// It is up to the end user to specify a UNIQUE cluster name when provisioning in the
+		// same project, otherwise things will alias and we could delete more than we should.
+		if len(portList) > 1 {
+			return fmt.Errorf("garbage collection of port %s failed, found %d ports with the same name", portName, len(portList))
+		}
+
+		// The port may never have been created, so there is nothing to do.
+		if len(portList) == 0 {
+			continue
+		}
+
+		if err := s.DeletePort(eventObject, portList[0].ID); err != nil {
 			return err
 		}
 	}
 
 	return nil
 }
+
+// GetPortName appends a suffix to an instance name in order to try and get a unique name per port.
+func GetPortName(instanceName string, opts *infrav1.PortOpts, netIndex int) string {
+	if opts != nil && opts.NameSuffix != "" {
+		return fmt.Sprintf("%s-%s", instanceName, opts.NameSuffix)
+	}
+	return fmt.Sprintf("%s-%d", instanceName, netIndex)
+}
diff --git a/pkg/cloud/services/networking/port_test.go b/pkg/cloud/services/networking/port_test.go
index 88c8b9e7b3..7cc74d90d1 100644
--- a/pkg/cloud/services/networking/port_test.go
+++ b/pkg/cloud/services/networking/port_test.go
@@ -535,6 +535,85 @@ func Test_GetOrCreatePort(t *testing.T) {
 	}
 }
 
+func Test_GarbageCollectErrorInstancesPort(t *testing.T) {
+	mockCtrl := gomock.NewController(t)
+	defer mockCtrl.Finish()
+
+	instanceName := "foo"
+	portID1 := "dc6e0ae3-dad6-4240-a9cb-e541916f20d3"
+	portID2 := "a38ab1cb-c2cc-4c1b-9d1d-696ec73356d2"
+	portName1 := GetPortName(instanceName, nil, 0)
+	portName2 := GetPortName(instanceName, nil, 1)
+
+	tests := []struct {
+		// name is the name of the test.
+		name string
+		// expect allows definition of any expected calls to the mock.
+		expect func(m *mock.MockNetworkClientMockRecorder)
+		// portOpts defines the instance ports as defined in the OpenStackMachine spec.
+		portOpts []infrav1.PortOpts
+		// wantErr defines whether the test is supposed to fail.
+		wantErr bool
+	}{
+		{
+			name: "garbage collects all ports for an instance",
+			expect: func(m *mock.MockNetworkClientMockRecorder) {
+				o1 := ports.ListOpts{
+					Name: portName1,
+				}
+				p1 := []ports.Port{
+					{
+						ID:   portID1,
+						Name: portName1,
+					},
+				}
+				m.ListPort(o1).Return(p1, nil)
+				m.DeletePort(portID1)
+				o2 := ports.ListOpts{
+					Name: portName2,
+				}
+				p2 := []ports.Port{
+					{
+						ID:   portID2,
+						Name: portName2,
+					},
+				}
+
+				m.ListPort(o2).Return(p2, nil)
+				m.DeletePort(portID2)
+			},
+			portOpts: []infrav1.PortOpts{
+				{},
+				{},
+			},
+			wantErr: false,
+		},
+	}
+
+	eventObject := &infrav1.OpenStackMachine{}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			g := NewWithT(t)
+			mockClient := mock.NewMockNetworkClient(mockCtrl)
+			tt.expect(mockClient.EXPECT())
+			s := Service{
+				client: mockClient,
+			}
+			err := s.GarbageCollectErrorInstancesPort(
+				eventObject,
+				instanceName,
+				tt.portOpts,
+			)
+			if tt.wantErr {
+				g.Expect(err).To(HaveOccurred())
+			} else {
+				g.Expect(err).NotTo(HaveOccurred())
+			}
+		})
+	}
+}
+
 func pointerTo(b bool) *bool {
 	return &b
 }
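
For reference, the cleanup flow this patch introduces boils down to the loop
below. This is a minimal, self-contained sketch, not the code from the patch:
PortOpts, Port, PortClient and fakeClient are simplified stand-ins invented
for illustration (the real types are infrav1.PortOpts, gophercloud's
ports.Port and the provider's NetworkClient). The control flow mirrors
GarbageCollectErrorInstancesPort: regenerate each expected port name with the
same helper used at creation time, look the name up, and delete at most one
match.

package main

import "fmt"

// PortOpts is a simplified stand-in for infrav1.PortOpts; only the field
// relevant to port naming is kept.
type PortOpts struct {
	NameSuffix string
}

// Port is a simplified stand-in for gophercloud's ports.Port.
type Port struct {
	ID   string
	Name string
}

// PortClient is a stand-in for the Neutron client wrapped by the service.
type PortClient interface {
	ListPort(name string) ([]Port, error)
	DeletePort(id string) error
}

// GetPortName mirrors the helper in the patch: use the configured suffix if
// there is one, otherwise fall back to the network index.
func GetPortName(instanceName string, opts *PortOpts, netIndex int) string {
	if opts != nil && opts.NameSuffix != "" {
		return fmt.Sprintf("%s-%s", instanceName, opts.NameSuffix)
	}
	return fmt.Sprintf("%s-%d", instanceName, netIndex)
}

// GarbageCollectPorts regenerates the name of every port the instance would
// have been created with and deletes each one that exists. This is the core
// of the fix: the old code listed ports by the bare instance name, which
// matches nothing because real port names always carry a suffix.
func GarbageCollectPorts(c PortClient, instanceName string, portOpts []PortOpts) error {
	for i := range portOpts {
		name := GetPortName(instanceName, &portOpts[i], i)

		matches, err := c.ListPort(name)
		if err != nil {
			return err
		}

		// Port names are not unique in Neutron, so refuse to guess which
		// port belongs to this instance.
		if len(matches) > 1 {
			return fmt.Errorf("found %d ports named %s", len(matches), name)
		}

		// The port may never have been created: nothing to clean up.
		if len(matches) == 0 {
			continue
		}

		if err := c.DeletePort(matches[0].ID); err != nil {
			return err
		}
	}

	return nil
}

// fakeClient is an in-memory PortClient used only for this demonstration.
type fakeClient struct {
	ports map[string]string // name -> ID
}

func (f *fakeClient) ListPort(name string) ([]Port, error) {
	if id, ok := f.ports[name]; ok {
		return []Port{{ID: id, Name: name}}, nil
	}
	return nil, nil
}

func (f *fakeClient) DeletePort(id string) error {
	for name, pid := range f.ports {
		if pid == id {
			delete(f.ports, name)
		}
	}
	return nil
}

func main() {
	// Instance "foo" with two networks: port 0 is named by index, port 1 by
	// an explicit suffix, exactly as GetPortName would name them at creation.
	opts := []PortOpts{{}, {NameSuffix: "mgmt"}}
	c := &fakeClient{ports: map[string]string{"foo-0": "id-0", "foo-mgmt": "id-1"}}

	if err := GarbageCollectPorts(c, "foo", opts); err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Println("ports remaining:", len(c.ports)) // ports remaining: 0
}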