Skip to content

Commit

Permalink
Implement node removal
Browse files Browse the repository at this point in the history
Signed-off-by: Peter Sabaini <[email protected]>
  • Loading branch information
sabaini committed Oct 16, 2023
1 parent 66190c4 commit ca4ee00
Show file tree
Hide file tree
Showing 5 changed files with 219 additions and 10 deletions.
14 changes: 14 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,20 @@ jobs:
lxc exec node-head -- sh -c "microceph status" | grep -F -A 1 node-wrk1 | grep -E "^ Services: osd$"
lxc exec node-head -- sh -c "microceph status" | grep -F -A 1 node-wrk3 | grep -E "^ Services: mds, mgr, mon$"
- name: Enable services on wrk1
run: ~/actionutils.sh headexec enable_services node-wrk1

- name: Test remove node wrk3
run: |
set -uex
~/actionutils.sh headexec remove_node node-wrk3
if lxc exec node-head -- sh -c "microceph status" | grep -q "^- node-wrk3 " ; then
echo "Failed: node-wrk3 still present"
exit 1
fi
lxc exec node-head -- sh -c "microceph.ceph -s" | fgrep "mon: 3 daemons"
upgrade-quincy-tests:
name: Test quincy upgrades
runs-on: ubuntu-22.04
Expand Down
16 changes: 8 additions & 8 deletions microceph/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ import (
"time"

"github.com/canonical/lxd/shared/api"
"github.com/canonical/microcluster/client"
microCli "github.com/canonical/microcluster/client"

"github.com/canonical/microceph/microceph/api/types"
)

func SetConfig(ctx context.Context, c *client.Client, data *types.Config) error {
func SetConfig(ctx context.Context, c *microCli.Client, data *types.Config) error {
queryCtx, cancel := context.WithTimeout(ctx, time.Second*200)
defer cancel()

Expand All @@ -26,7 +26,7 @@ func SetConfig(ctx context.Context, c *client.Client, data *types.Config) error
return nil
}

func ClearConfig(ctx context.Context, c *client.Client, data *types.Config) error {
func ClearConfig(ctx context.Context, c *microCli.Client, data *types.Config) error {
queryCtx, cancel := context.WithTimeout(ctx, time.Second*200)
defer cancel()

Expand All @@ -38,7 +38,7 @@ func ClearConfig(ctx context.Context, c *client.Client, data *types.Config) erro
return nil
}

func GetConfig(ctx context.Context, c *client.Client, data *types.Config) (types.Configs, error) {
func GetConfig(ctx context.Context, c *microCli.Client, data *types.Config) (types.Configs, error) {
queryCtx, cancel := context.WithTimeout(ctx, time.Second*5)
defer cancel()

Expand All @@ -53,7 +53,7 @@ func GetConfig(ctx context.Context, c *client.Client, data *types.Config) (types
}

// AddDisk requests Ceph sets up a new OSD.
func AddDisk(ctx context.Context, c *client.Client, data *types.DisksPost) error {
func AddDisk(ctx context.Context, c *microCli.Client, data *types.DisksPost) error {
queryCtx, cancel := context.WithTimeout(ctx, time.Second*120)
defer cancel()

Expand All @@ -66,7 +66,7 @@ func AddDisk(ctx context.Context, c *client.Client, data *types.DisksPost) error
}

// GetDisks returns the list of configured disks.
func GetDisks(ctx context.Context, c *client.Client) (types.Disks, error) {
func GetDisks(ctx context.Context, c *microCli.Client) (types.Disks, error) {
queryCtx, cancel := context.WithTimeout(ctx, time.Second*5)
defer cancel()

Expand All @@ -81,7 +81,7 @@ func GetDisks(ctx context.Context, c *client.Client) (types.Disks, error) {
}

// GetResources returns the list of storage devices on the system.
func GetResources(ctx context.Context, c *client.Client) (*api.ResourcesStorage, error) {
func GetResources(ctx context.Context, c *microCli.Client) (*api.ResourcesStorage, error) {
queryCtx, cancel := context.WithTimeout(ctx, time.Second*5)
defer cancel()

Expand All @@ -96,7 +96,7 @@ func GetResources(ctx context.Context, c *client.Client) (*api.ResourcesStorage,
}

// RemoveDisk requests Ceph removes an OSD.
func RemoveDisk(ctx context.Context, c *client.Client, data *types.DisksDelete) error {
func RemoveDisk(ctx context.Context, c *microCli.Client, data *types.DisksDelete) error {
timeout := time.Second * time.Duration(data.Timeout+5) // wait a bit longer than the operation timeout
queryCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
Expand Down
53 changes: 53 additions & 0 deletions microceph/client/wrap.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package client

import (
"context"
"github.com/canonical/microceph/microceph/api/types"

microCli "github.com/canonical/microcluster/client"
)

// ClientInterface wraps the client functions used by the CLI commands.
// Calling them through an interface allows the implementation to be replaced
// by a mock in unit tests.
type ClientInterface interface {
// GetClusterMembers returns the names of the cluster members.
GetClusterMembers(*microCli.Client) ([]string, error)
// GetDisks returns the list of configured disks.
GetDisks(*microCli.Client) (types.Disks, error)
// GetServices returns the list of services.
GetServices(*microCli.Client) (types.Services, error)
// DeleteService deletes a service; the string arguments are the target
// and the service name, in that order.
DeleteService(*microCli.Client, string, string) error
}

// ClientImpl is the default ClientInterface implementation. It carries no
// state; its methods forward to the real client calls using a background
// context.
type ClientImpl struct{}

// GetClusterMembers returns the names of all cluster members known to cli.
//
// Only the names are returned so that callers and test mocks do not have to
// deal with the member type of the underlying cluster client.
//
// On error the returned slice is nil and the error from the cluster client is
// passed through unchanged.
func (c ClientImpl) GetClusterMembers(cli *microCli.Client) ([]string, error) {
	members, err := cli.GetClusterMembers(context.Background())
	if err != nil {
		return nil, err
	}

	// Length 0 with capacity len(members): a non-zero length here would leave
	// empty-string entries in front of the appended names.
	memberNames := make([]string, 0, len(members))
	for _, member := range members {
		memberNames = append(memberNames, member.Name)
	}

	return memberNames, nil
}

// GetDisks implements ClientInterface by forwarding to the package-level
// GetDisks function with a background context.
func (c ClientImpl) GetDisks(cli *microCli.Client) (types.Disks, error) {
	ctx := context.Background()
	return GetDisks(ctx, cli)
}

// GetServices implements ClientInterface by forwarding to the package-level
// GetServices function with a background context.
func (c ClientImpl) GetServices(cli *microCli.Client) (types.Services, error) {
	ctx := context.Background()
	return GetServices(ctx, cli)
}

// DeleteService implements ClientInterface by forwarding to the package-level
// DeleteService function with a background context. target and service are
// passed through unchanged.
func (c ClientImpl) DeleteService(cli *microCli.Client, target string, service string) error {
	ctx := context.Background()
	return DeleteService(ctx, cli, target, service)
}

// MClient is the ClientInterface instance callers go through. It defaults to
// ClientImpl and serves as the mocking point for unit tests, which can assign
// their own implementation.
var MClient ClientInterface = ClientImpl{}
112 changes: 110 additions & 2 deletions microceph/cmd/microceph/cluster_remove.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,14 @@ package main

import (
"context"
"fmt"
"github.com/canonical/lxd/shared/logger"
microCli "github.com/canonical/microcluster/client"

"github.com/canonical/microcluster/microcluster"
"github.com/spf13/cobra"

"github.com/canonical/microceph/microceph/client"
)

type cmdClusterRemove struct {
Expand Down Expand Up @@ -36,15 +41,118 @@ func (c *cmdClusterRemove) Run(cmd *cobra.Command, args []string) error {
return err
}

client, err := m.LocalClient()
cli, err := m.LocalClient()
if err != nil {
return err
}

err = client.DeleteClusterMember(context.Background(), args[0], c.flagForce)
logger.Debugf("Removing cluster member %v, force: %v", args[0], c.flagForce)

// check prerequisites unless we're forcing
if !c.flagForce {
ok, err := checkPrerequisites(cli, args[0])
if err != nil {
return fmt.Errorf("Error checking prereqs: %v", err)
}
if !ok {
return fmt.Errorf("Prerequisites not met, not removing: %v", err)
}
}

// delete from ceph
err = deleteNodeServices(cli, args[0])
if err != nil {
// forcing makes errs non-fatal
if !c.flagForce {
return err
}
logger.Warnf("Error deleting services from node %v: %v", args[0], err)
}

// delete from cluster db
err = cli.DeleteClusterMember(context.Background(), args[0], c.flagForce)
logger.Debugf("DeleteClusterMember %v: %v", args[0], err)
if err != nil {
return err
}

return nil
}

// checkPrerequisites verifies that the cluster member called name may be
// removed. Three conditions are checked, in this order:
//
//  1. name is among the cluster member names,
//  2. no configured disk is located on name,
//  3. at least one mon, one mgr and one mds service is located on a node
//     other than name.
//
// It returns (true, nil) when all conditions hold. Otherwise it returns false
// together with an error, which is either the error of the failing client
// call or a description of the unmet condition.
func checkPrerequisites(cli *microCli.Client, name string) (bool, error) {
	// Condition 1: the node has to be a cluster member.
	memberNames, err := client.MClient.GetClusterMembers(cli)
	if err != nil {
		return false, err
	}
	isMember := false
	for i := range memberNames {
		if memberNames[i] == name {
			isMember = true
			break
		}
	}
	if !isMember {
		return false, fmt.Errorf("Node %v not found", name)
	}

	// Condition 2: no disks (OSDs) may be left on the node.
	disks, err := client.MClient.GetDisks(cli)
	if err != nil {
		return false, err
	}
	hasDisks := false
	for _, d := range disks {
		if d.Location == name {
			hasDisks = true
			break
		}
	}
	logger.Debugf("Disks: %v, found: %v", disks, hasDisks)
	if hasDisks {
		return false, fmt.Errorf("Node %v still has disks configured, remove before proceeding", name)
	}

	// Condition 3: the node must not hold the last mon, mgr or mds.
	services, err := client.MClient.GetServices(cli)
	if err != nil {
		return false, err
	}
	// Records, per service name, whether an instance exists on some other
	// node. The required services start out as "not seen".
	elsewhere := map[string]bool{
		"mon": false,
		"mgr": false,
		"mds": false,
	}
	for _, svc := range services {
		if svc.Location != name {
			elsewhere[svc.Service] = true
		}
	}
	logger.Debugf("Services: %v, foundMap: %v", services, elsewhere)
	for _, required := range []string{"mon", "mgr", "mds"} {
		if !elsewhere[required] {
			return false, fmt.Errorf("Need at least one mon, mds, and mgr besides %v", name)
		}
	}

	return true, nil
}

// deleteNodeServices deletes every service whose location is the node called
// name.
//
// The service list is fetched once; a failure to fetch it is returned to the
// caller. Failures to delete an individual service are only logged as
// warnings and the remaining services are still processed, so the function
// returns nil whenever the list could be fetched.
//
// NOTE(review): because deletion errors are not propagated, callers cannot
// tell a partial cleanup from a complete one — confirm this is intended.
func deleteNodeServices(cli *microCli.Client, name string) error {
	services, err := client.MClient.GetServices(cli)
	if err != nil {
		return err
	}

	for _, svc := range services {
		logger.Debugf("Check for deletion: %s", svc)
		if svc.Location != name {
			// Service is on another node, leave it alone.
			continue
		}

		logger.Debugf("Deleting service %s", svc)
		delErr := client.MClient.DeleteService(cli, svc.Location, svc.Service)
		if delErr != nil {
			logger.Warnf("Fault deleting service %v on node %v: %v", svc.Service, svc.Location, delErr)
		}
	}

	return nil
}
34 changes: 34 additions & 0 deletions tests/scripts/actionutils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,40 @@ function testrgw() {
( curl -s http://localhost/testbucket/test.txt | grep -F hello-radosgw ) || return -1
}

# Enable the mon, mds and mgr services on a node and wait for its mon.
#
# Arguments:
#   $1  name of the target node (required)
#
# After enabling the services, `microceph.ceph -s` is polled up to 8 times,
# 2 seconds apart, until the mon line mentions the node. The cluster status
# is printed at the end whether or not the mon showed up; the wait loop
# itself never fails the function.
function enable_services() {
    local node="${1?missing}"
    # Keep loop variables out of the caller's scope.
    local s i
    for s in mon mds mgr ; do
        # Quoted so the service and node names are passed as single words.
        sudo microceph enable "$s" --target "$node"
    done
    for i in $(seq 1 8); do
        if sudo microceph.ceph -s | grep -q "mon: .*daemons.*${node}" ; then
            echo "Found mon on ${node}"
            break
        else
            echo -n '.'
            sleep 2
        fi
    done
    sudo microceph.ceph -s
}

# Remove a node from the cluster and wait for its mon to disappear.
#
# Arguments:
#   $1  name of the node to remove (required)
#
# After `microceph cluster remove`, `microceph.ceph -s` is polled up to 8
# times, 2 seconds apart, until the mon line no longer mentions the node.
# The ceph status and microceph status are printed at the end whether or not
# the mon went away; the wait loop itself never fails the function.
function remove_node() {
    local node="${1?missing}"
    # Keep the loop variable out of the caller's scope.
    local i
    # Quoted so the node name is passed as a single word.
    sudo microceph cluster remove "$node"
    for i in $(seq 1 8); do
        if sudo microceph.ceph -s | grep -q "mon: .*daemons.*${node}" ; then
            echo -n '.'
            sleep 2
        else
            echo "No mon on ${node}"
            break
        fi
    done
    sleep 1
    sudo microceph.ceph -s
    sudo microceph status
}

function headexec() {
local run="${1?missing}"
shift
Expand Down

0 comments on commit ca4ee00

Please sign in to comment.