From 0d3441ff1fe2caa44f0a0d7cf40ed5115859cb44 Mon Sep 17 00:00:00 2001 From: Peter Sabaini Date: Wed, 28 Feb 2024 09:48:12 +0100 Subject: [PATCH 1/2] Fix: ensure backward compat config Fixes: #318 Ensure we populate the config key value db with required values if not present. Newer microceph version set these values on bootstrap, older releases might be missing them Also add some testing for ceph.conf Signed-off-by: Peter Sabaini --- .github/workflows/q2q-candidate-upgrade.yml | 3 + .github/workflows/q2r-candidate-upgrade.yaml | 3 + .github/workflows/r2r-candidate-upgrade.yaml | 3 + .github/workflows/tests.yml | 4 +- microceph/ceph/config.go | 111 +++++++++++++++++-- microceph/ceph/start.go | 8 +- tests/scripts/actionutils.sh | 25 +++++ 7 files changed, 142 insertions(+), 15 deletions(-) diff --git a/.github/workflows/q2q-candidate-upgrade.yml b/.github/workflows/q2q-candidate-upgrade.yml index 9963d7ec..fbf00651 100644 --- a/.github/workflows/q2q-candidate-upgrade.yml +++ b/.github/workflows/q2q-candidate-upgrade.yml @@ -58,5 +58,8 @@ jobs: - name: Wait until 3 OSDs are up run: ~/actionutils.sh headexec wait_for_osds 3 + - name: Verify config + run: ~/actionutils.sh test_ceph_conf + - name: Exercise RGW again run: ~/actionutils.sh headexec testrgw diff --git a/.github/workflows/q2r-candidate-upgrade.yaml b/.github/workflows/q2r-candidate-upgrade.yaml index c06b67da..d2a08c57 100644 --- a/.github/workflows/q2r-candidate-upgrade.yaml +++ b/.github/workflows/q2r-candidate-upgrade.yaml @@ -58,5 +58,8 @@ jobs: - name: Wait until 3 OSDs are up run: ~/actionutils.sh headexec wait_for_osds 3 + - name: Verify config + run: ~/actionutils.sh test_ceph_conf + - name: Exercise RGW again run: ~/actionutils.sh headexec testrgw diff --git a/.github/workflows/r2r-candidate-upgrade.yaml b/.github/workflows/r2r-candidate-upgrade.yaml index f00f9c3e..c1f94b1c 100644 --- a/.github/workflows/r2r-candidate-upgrade.yaml +++ b/.github/workflows/r2r-candidate-upgrade.yaml @@ -58,5 +58,8 @@ 
jobs: - name: Wait until 3 OSDs are up run: ~/actionutils.sh headexec wait_for_osds 3 + - name: Verify config + run: ~/actionutils.sh test_ceph_conf + - name: Exercise RGW again run: ~/actionutils.sh headexec testrgw diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 6cbbdd32..8d10a426 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -255,6 +255,9 @@ jobs: - name: Setup cluster run: ~/actionutils.sh cluster_nodes custom + - name: Verify config + run: ~/actionutils.sh test_ceph_conf + - name: Add 2 OSDs run: | for c in node-wrk1 node-wrk2 ; do @@ -489,4 +492,3 @@ jobs: - name: Exercise RGW again run: ~/actionutils.sh headexec testrgw - diff --git a/microceph/ceph/config.go b/microceph/ceph/config.go index 82859024..ffc1f440 100644 --- a/microceph/ceph/config.go +++ b/microceph/ceph/config.go @@ -6,6 +6,7 @@ import ( "encoding/json" "fmt" "github.com/canonical/microceph/microceph/interfaces" + "net" "os" "path/filepath" "strings" @@ -162,36 +163,96 @@ func ListConfigs() (types.Configs, error) { return configs, nil } -// updates the ceph config file. -func UpdateConfig(s interfaces.StateInterface) error { - confPath := filepath.Join(os.Getenv("SNAP_DATA"), "conf") - runPath := filepath.Join(os.Getenv("SNAP_DATA"), "run") +// backwardCompatPubnet ensures that the public_network is set in the database +// this is a backward-compat shim to accommodate older versions of microceph +// which will ensure that the public_network is set in the database +func backwardCompatPubnet(s interfaces.StateInterface) error { + config, err := getConfigDb(s) + if err != nil { + return fmt.Errorf("failed to get config from db: %w", err) + } - // Get the configuration and servers. + // do we have a public_network configured? 
+ pubNet := config["public_network"] + _, _, err = net.ParseCIDR(pubNet) + if err != nil { + // get public network from default address + pubNet, err = common.Network.FindNetworkAddress(s.ClusterState().Address().Hostname()) + if err != nil { + return fmt.Errorf("failed to locate public network: %w", err) + } + // update the database + err = s.ClusterState().Database.Transaction(s.ClusterState().Context, func(ctx context.Context, tx *sql.Tx) error { + _, err = database.CreateConfigItem(ctx, tx, database.ConfigItem{Key: "public_network", Value: pubNet}) + if err != nil { + return fmt.Errorf("failed to record public_network: %w", err) + } + return nil + }) + } + + return nil +} + +// backwardCompatMonitors retrieves monitor addresses from the node list and returns them +// this is a backward-compat shim to accommodate older versions of microceph +func backwardCompatMonitors(s interfaces.StateInterface) ([]string, error) { var err error - var configItems []database.ConfigItem + var monitors []database.Service + serviceName := "mon" err = s.ClusterState().Database.Transaction(s.ClusterState().Context, func(ctx context.Context, tx *sql.Tx) error { - configItems, err = database.GetConfigItems(ctx, tx) + monitors, err = database.GetServices(ctx, tx, database.ServiceFilter{Service: &serviceName}) if err != nil { return err } - return nil }) if err != nil { - return err + return nil, err } - config := map[string]string{} - for _, item := range configItems { - config[item.Key] = item.Value + monitorAddresses := make([]string, len(monitors)) + remotes := s.ClusterState().Remotes().RemotesByName() + for i, monitor := range monitors { + remote, ok := remotes[monitor.Member] + if !ok { + continue + } + monitorAddresses[i] = remote.Address.Addr().String() + } + return monitorAddresses, nil +} + +// UpdateConfig updates the ceph.conf file with the current configuration. 
+func UpdateConfig(s interfaces.StateInterface) error { + confPath := filepath.Join(os.Getenv("SNAP_DATA"), "conf") + runPath := filepath.Join(os.Getenv("SNAP_DATA"), "run") + + err := backwardCompatPubnet(s) + if err != nil { + return fmt.Errorf("failed to ensure backward compat: %w", err) + } + + config, err := getConfigDb(s) + if err != nil { + return fmt.Errorf("failed to get config db: %w", err) } // REF: https://docs.ceph.com/en/quincy/rados/configuration/network-config-ref/#ceph-daemons // The mon host configuration option only needs to be sufficiently up to date such that a // client can reach one monitor that is currently online. monitorAddresses := getMonitorAddresses(config) + + // backward compat: if no mon hosts found, get them from the node addresses but don't + // insert into db, as the join logic will take care of that. + if len(monitorAddresses) == 0 { + monitorAddresses, err = backwardCompatMonitors(s) + if err != nil { + return fmt.Errorf("failed to get monitor addresses: %w", err) + } + } + conf := newCephConfig(confPath) // Check if host has IP address on the configured public network. @@ -199,6 +260,7 @@ func UpdateConfig(s interfaces.StateInterface) error { if err != nil { return fmt.Errorf("failed to locate IP on public network %s: %w", config["public_network"], err) } + clientConfig, err := GetClientConfigForHost(s, s.ClusterState().Name()) if err != nil { logger.Errorf("Failed to pull Client Configurations: %v", err) @@ -225,6 +287,7 @@ func UpdateConfig(s interfaces.StateInterface) error { if err != nil { return fmt.Errorf("couldn't render ceph.conf: %w", err) } + logger.Debugf("updated ceph.conf: %v", conf.GetPath()) // Generate ceph.client.admin.keyring keyring := newCephKeyring(confPath, "ceph.keyring") @@ -242,6 +305,30 @@ func UpdateConfig(s interfaces.StateInterface) error { return nil } +// getConfigDb retrieves the configuration from the database. 
+func getConfigDb(s interfaces.StateInterface) (map[string]string, error) { + var err error + var configItems []database.ConfigItem + + err = s.ClusterState().Database.Transaction(s.ClusterState().Context, func(ctx context.Context, tx *sql.Tx) error { + configItems, err = database.GetConfigItems(ctx, tx) + if err != nil { + return err + } + + return nil + }) + if err != nil { + return nil, err + } + + config := map[string]string{} + for _, item := range configItems { + config[item.Key] = item.Value + } + return config, nil +} + // getMonitorAddresses scans a provided config key/value map and returns a list of mon hosts found. func getMonitorAddresses(configs map[string]string) []string { monHosts := []string{} diff --git a/microceph/ceph/start.go b/microceph/ceph/start.go index aac8ec6b..6af259c6 100644 --- a/microceph/ceph/start.go +++ b/microceph/ceph/start.go @@ -3,6 +3,7 @@ package ceph import ( "context" "database/sql" + "github.com/canonical/lxd/shared/logger" "github.com/canonical/microceph/microceph/interfaces" "reflect" "time" @@ -19,6 +20,7 @@ func Start(s interfaces.StateInterface) error { for { // Check that the database is ready. if !s.ClusterState().Database.IsOpen() { + logger.Debug("start: database not ready, waiting...") time.Sleep(10 * time.Second) continue } @@ -39,26 +41,28 @@ func Start(s interfaces.StateInterface) error { return nil }) if err != nil { + logger.Warnf("start: failed to fetch monitors, retrying: %v", err) time.Sleep(10 * time.Second) continue } // Compare to the previous list. 
if reflect.DeepEqual(oldMonitors, monitors) { + logger.Debugf("start: monitors unchanged, sleeping: %v", monitors) time.Sleep(time.Minute) continue } err = UpdateConfig(s) if err != nil { + logger.Errorf("start: failed to update config, retrying: %v", err) time.Sleep(10 * time.Second) continue } - + logger.Debug("start: updated config, sleeping") oldMonitors = monitors time.Sleep(time.Minute) } - }() return nil diff --git a/tests/scripts/actionutils.sh b/tests/scripts/actionutils.sh index 78302f88..b49d3b84 100755 --- a/tests/scripts/actionutils.sh +++ b/tests/scripts/actionutils.sh @@ -388,6 +388,31 @@ function test_migration() { return -1 } +function test_ceph_conf() { + set -uex + for n in $( lxc ls -c n --format csv ); do + echo "checking node $n" + lxc exec $n -- sh <<'EOF' +# Test: configured rundir must be current +current=$( realpath /var/snap/microceph/current ) +rundir=$( cat /var/snap/microceph/current/conf/ceph.conf | awk '/run dir/{ print $4 }' ) +p=$( dirname $rundir ) +if [ $p != $current ]; then + echo "Error: snap data dir $current, configured run dir: $rundir" + cat /var/snap/microceph/current/conf/ceph.conf + exit -1 +fi + +# Test: must contain public_network +if ! 
grep -q public_net /var/snap/microceph/current/conf/ceph.conf ; then + echo "Error: didn't find public_net in ceph.conf" + cat /var/snap/microceph/current/conf/ceph.conf + exit -1 +fi +EOF + done +} + function headexec() { local run="${1?missing}" shift From 3f3f0efe20ae770ffbe72e8f6887285a31259495 Mon Sep 17 00:00:00 2001 From: Peter Sabaini Date: Thu, 29 Feb 2024 17:26:26 +0100 Subject: [PATCH 2/2] Address review comments Signed-off-by: Peter Sabaini --- microceph/ceph/config.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/microceph/ceph/config.go b/microceph/ceph/config.go index ffc1f440..eabc547a 100644 --- a/microceph/ceph/config.go +++ b/microceph/ceph/config.go @@ -173,6 +173,8 @@ func backwardCompatPubnet(s interfaces.StateInterface) error { } // do we have a public_network configured? + // if it is unset, the below will evaluate to the empty string + // and subsequently fail the net.ParseCIDR check pubNet := config["public_network"] _, _, err = net.ParseCIDR(pubNet) if err != nil {