Skip to content

Commit

Permalink
Fix: ensure backward compat config
Browse files Browse the repository at this point in the history
Fixes: canonical#318

Ensure we populate the config key/value db with required values if they are
not present. Newer microceph versions set these values on bootstrap; older
releases might be missing them.

Also add some testing for ceph.conf

Signed-off-by: Peter Sabaini <[email protected]>
  • Loading branch information
sabaini committed Feb 28, 2024
1 parent b959197 commit 94f442a
Show file tree
Hide file tree
Showing 7 changed files with 145 additions and 14 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/q2q-candidate-upgrade.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,5 +58,8 @@ jobs:
- name: Wait until 3 OSDs are up
run: ~/actionutils.sh headexec wait_for_osds 3

- name: Verify config
run: ~/actionutils.sh test_ceph_conf

- name: Exercise RGW again
run: ~/actionutils.sh headexec testrgw
3 changes: 3 additions & 0 deletions .github/workflows/q2r-candidate-upgrade.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,5 +58,8 @@ jobs:
- name: Wait until 3 OSDs are up
run: ~/actionutils.sh headexec wait_for_osds 3

- name: Verify config
run: ~/actionutils.sh test_ceph_conf

- name: Exercise RGW again
run: ~/actionutils.sh headexec testrgw
3 changes: 3 additions & 0 deletions .github/workflows/r2r-candidate-upgrade.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,5 +58,8 @@ jobs:
- name: Wait until 3 OSDs are up
run: ~/actionutils.sh headexec wait_for_osds 3

- name: Verify config
run: ~/actionutils.sh test_ceph_conf

- name: Exercise RGW again
run: ~/actionutils.sh headexec testrgw
6 changes: 6 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,9 @@ jobs:
- name: Setup cluster
run: ~/actionutils.sh cluster_nodes custom

- name: Verify config
run: ~/actionutils.sh test_ceph_conf

- name: Add 2 OSDs
run: |
for c in node-wrk1 node-wrk2 ; do
Expand Down Expand Up @@ -490,3 +493,6 @@ jobs:
- name: Exercise RGW again
run: ~/actionutils.sh headexec testrgw

- name: Setup upterm session
if: failure()
uses: lhotari/action-upterm@v1
111 changes: 99 additions & 12 deletions microceph/ceph/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"encoding/json"
"fmt"
"github.com/canonical/microceph/microceph/interfaces"
"net"
"os"
"path/filepath"
"strings"
Expand Down Expand Up @@ -162,43 +163,104 @@ func ListConfigs() (types.Configs, error) {
return configs, nil
}

// updates the ceph config file.
func UpdateConfig(s interfaces.StateInterface) error {
confPath := filepath.Join(os.Getenv("SNAP_DATA"), "conf")
runPath := filepath.Join(os.Getenv("SNAP_DATA"), "run")
// backwardCompatPubnet ensures that a parseable public_network is recorded in
// the config database.
//
// This is a backward-compat shim to accommodate clusters set up by older
// versions of microceph, which may not have stored public_network. If the
// stored value is missing or is not a valid CIDR, the network is derived from
// this member's cluster address and written to the database.
//
// It returns an error if the config cannot be read, the network cannot be
// determined, or the database write fails.
func backwardCompatPubnet(s interfaces.StateInterface) error {
	config, err := getConfigDb(s)
	if err != nil {
		return fmt.Errorf("failed to get config from db: %w", err)
	}

	// Do we already have a valid public_network configured? Then nothing to do.
	if _, _, parseErr := net.ParseCIDR(config["public_network"]); parseErr == nil {
		return nil
	}

	// Derive the public network from the default (cluster) address.
	pubNet, err := common.Network.FindNetworkAddress(s.ClusterState().Address().Hostname())
	if err != nil {
		return fmt.Errorf("failed to locate public network: %w", err)
	}

	// Update the database. The transaction result must be checked: otherwise a
	// failed write is reported as success and the caller continues without a
	// public_network.
	// NOTE(review): this creates the item; if a key with an invalid value
	// already exists, CreateConfigItem may fail — confirm whether an update
	// path is needed.
	err = s.ClusterState().Database.Transaction(s.ClusterState().Context, func(ctx context.Context, tx *sql.Tx) error {
		// Use a closure-local error so the outer err is only set by Transaction itself.
		if _, createErr := database.CreateConfigItem(ctx, tx, database.ConfigItem{Key: "public_network", Value: pubNet}); createErr != nil {
			return fmt.Errorf("failed to record public_network: %w", createErr)
		}
		return nil
	})
	if err != nil {
		return fmt.Errorf("failed to persist public_network: %w", err)
	}

	return nil
}

// backwardCompatMonitors retrieves monitor addresses from the node list and returns that
// this a backward-compat shim to accomodate older versions of microceph
func backwardCompatMonitors(s interfaces.StateInterface) ([]string, error) {
var err error
var configItems []database.ConfigItem
var monitors []database.Service
serviceName := "mon"

err = s.ClusterState().Database.Transaction(s.ClusterState().Context, func(ctx context.Context, tx *sql.Tx) error {
configItems, err = database.GetConfigItems(ctx, tx)
monitors, err = database.GetServices(ctx, tx, database.ServiceFilter{Service: &serviceName})
if err != nil {
return err
}

return nil
})
if err != nil {
return err
return nil, err
}

config := map[string]string{}
for _, item := range configItems {
config[item.Key] = item.Value
monitorAddresses := make([]string, len(monitors))
remotes := s.ClusterState().Remotes().RemotesByName()
for i, monitor := range monitors {
remote, ok := remotes[monitor.Member]
if !ok {
continue
}
monitorAddresses[i] = remote.Address.Addr().String()
}
return monitorAddresses, nil
}

// UpdateConfig updates the ceph.conf file with the current configuration.
func UpdateConfig(s interfaces.StateInterface) error {
confPath := filepath.Join(os.Getenv("SNAP_DATA"), "conf")
runPath := filepath.Join(os.Getenv("SNAP_DATA"), "run")

err := backwardCompatPubnet(s)
if err != nil {
return fmt.Errorf("failed to ensure backward compat: %w", err)
}

config, err := getConfigDb(s)
if err != nil {
return fmt.Errorf("failed to get config db: %w", err)
}

// REF: https://docs.ceph.com/en/quincy/rados/configuration/network-config-ref/#ceph-daemons
// The mon host configuration option only needs to be sufficiently up to date such that a
// client can reach one monitor that is currently online.
monitorAddresses := getMonitorAddresses(config)

// backward compat: if no mon hosts found, get them from the node addresses but don't
// insert into db, as the join logic will take care of that.
if len(monitorAddresses) == 0 {
monitorAddresses, err = backwardCompatMonitors(s)
if err != nil {
return fmt.Errorf("failed to get monitor addresses: %w", err)
}
}

conf := newCephConfig(confPath)

// Check if host has IP address on the configured public network.
_, err = common.Network.FindIpOnSubnet(config["public_network"])
if err != nil {
return fmt.Errorf("failed to locate IP on public network %s: %w", config["public_network"], err)
}

clientConfig, err := GetClientConfigForHost(s, s.ClusterState().Name())
if err != nil {
logger.Errorf("Failed to pull Client Configurations: %v", err)
Expand All @@ -225,6 +287,7 @@ func UpdateConfig(s interfaces.StateInterface) error {
if err != nil {
return fmt.Errorf("couldn't render ceph.conf: %w", err)
}
logger.Debugf("updated ceph.conf: %v", conf.GetPath())

// Generate ceph.client.admin.keyring
keyring := newCephKeyring(confPath, "ceph.keyring")
Expand All @@ -242,6 +305,30 @@ func UpdateConfig(s interfaces.StateInterface) error {
return nil
}

// getConfigDb reads all config items from the database inside a single
// transaction and returns them as a key -> value map.
// On any database error it returns a nil map and that error.
func getConfigDb(s interfaces.StateInterface) (map[string]string, error) {
	var items []database.ConfigItem

	txErr := s.ClusterState().Database.Transaction(s.ClusterState().Context, func(ctx context.Context, tx *sql.Tx) error {
		fetched, fetchErr := database.GetConfigItems(ctx, tx)
		if fetchErr != nil {
			return fetchErr
		}
		items = fetched
		return nil
	})
	if txErr != nil {
		return nil, txErr
	}

	// Flatten the rows into a lookup map; a later duplicate key overwrites an earlier one.
	kv := make(map[string]string, len(items))
	for i := range items {
		kv[items[i].Key] = items[i].Value
	}
	return kv, nil
}

// getMonitorAddresses scans a provided config key/value map and returns a list of mon hosts found.
func getMonitorAddresses(configs map[string]string) []string {
monHosts := []string{}
Expand Down
8 changes: 6 additions & 2 deletions microceph/ceph/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package ceph
import (
"context"
"database/sql"
"github.com/canonical/lxd/shared/logger"
"github.com/canonical/microceph/microceph/interfaces"
"reflect"
"time"
Expand All @@ -19,6 +20,7 @@ func Start(s interfaces.StateInterface) error {
for {
// Check that the database is ready.
if !s.ClusterState().Database.IsOpen() {
logger.Debug("start: database not ready, waiting...")
time.Sleep(10 * time.Second)
continue
}
Expand All @@ -39,26 +41,28 @@ func Start(s interfaces.StateInterface) error {
return nil
})
if err != nil {
logger.Warnf("start: failed to fetch monitors, retrying: %v", err)
time.Sleep(10 * time.Second)
continue
}

// Compare to the previous list.
if reflect.DeepEqual(oldMonitors, monitors) {
logger.Debugf("start: monitors unchanged, sleeping: %v", monitors)
time.Sleep(time.Minute)
continue
}

err = UpdateConfig(s)
if err != nil {
logger.Errorf("start: failed to update config, retrying: %v", err)
time.Sleep(10 * time.Second)
continue
}

logger.Debug("start: updated config, sleeping")
oldMonitors = monitors
time.Sleep(time.Minute)
}

}()

return nil
Expand Down
25 changes: 25 additions & 0 deletions tests/scripts/actionutils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,31 @@ function test_migration() {
return -1
}

# test_ceph_conf checks the rendered ceph.conf on every LXD instance:
#   1. the configured run dir must live under the current snap data dir
#   2. the file must contain a public network setting
# The embedded script runs under plain `sh` inside each instance, so it sticks
# to POSIX constructs: quoted operands for `[` and a non-negative exit status.
function test_ceph_conf() {
    set -uex
    for n in $( lxc ls -c n --format csv ); do
        echo "checking node $n"
        lxc exec "$n" -- sh <<'EOF'
conf=/var/snap/microceph/current/conf/ceph.conf
# Test: configured rundir must be current
current=$( realpath /var/snap/microceph/current )
rundir=$( awk '/run dir/{ print $4 }' "$conf" )
# An empty rundir must fail explicitly; unquoted it turned the comparison
# below into a malformed test and the check passed silently.
if [ -z "$rundir" ]; then
    echo "Error: no run dir configured in ceph.conf"
    cat "$conf"
    exit 1
fi
p=$( dirname "$rundir" )
if [ "$p" != "$current" ]; then
    echo "Error: snap data dir $current, configured run dir: $rundir"
    cat "$conf"
    exit 1
fi
# Test: must contain public_network
if ! grep -q public_net "$conf" ; then
    echo "Error: didn't find public_net in ceph.conf"
    cat "$conf"
    exit 1
fi
EOF
    done
}

function headexec() {
local run="${1?missing}"
shift
Expand Down

0 comments on commit 94f442a

Please sign in to comment.