Skip to content

Commit

Permalink
Merge pull request canonical#321 from UtkarshBhatthere/feature/prohib…
Browse files Browse the repository at this point in the history
…itScaledown

Add flag to prohibit scaledown of crush rules on disk removal.
  • Loading branch information
UtkarshBhatthere authored Mar 5, 2024
2 parents 21acc74 + d5559fe commit 1396b7b
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 24 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,18 @@ jobs:
- name: Enable services on wrk1
run: ~/actionutils.sh headexec enable_services node-wrk1

- name: Remove OSD again but don't scaledown crush rule.
run: |
set -uex
lxc exec node-wrk0 -- sh -c "microceph disk remove 3 --prohibit-crush-scaledown --bypass-safety-checks"
lxc exec node-wrk0 -- sh -c "microceph.ceph osd crush rule ls" | grep -F microceph_auto_host
- name: Re-Add OSD
run: |
set -uex
~/actionutils.sh add_osd_to_node node-wrk0
~/actionutils.sh headexec wait_for_osds 3
- name: Test remove node wrk3
run: |
set -uex
Expand Down
28 changes: 16 additions & 12 deletions microceph/api/disks.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,18 +119,22 @@ func cmdDisksDelete(s *state.State, r *http.Request) response.Response {
defer mu.Unlock()

cs := interfaces.CephState{State: s}
needDowngrade, err := ceph.IsDowngradeNeeded(cs, osdid)
if err != nil {
return response.InternalError(err)
}
if needDowngrade && !req.ConfirmDowngrade {
errorMsg := fmt.Errorf(
"Removing osd.%s would require a downgrade of the automatic crush rule from 'host' to 'osd' level. "+
"Likely this will result in additional data movement. Please confirm by setting the "+
"'--confirm-failure-domain-downgrade' flag to true",
osd,
)
return response.BadRequest(errorMsg)

// if check for crush rule scaledown only if crush change is not prohibited.
if !req.ProhibitCrushScaledown {
needDowngrade, err := ceph.IsDowngradeNeeded(cs, osdid)
if err != nil {
return response.InternalError(err)
}
if needDowngrade && !req.ConfirmDowngrade {
errorMsg := fmt.Errorf(
"removing osd.%s would require a downgrade of the automatic crush rule from 'host' to 'osd' level. "+
"Likely this will result in additional data movement. Please confirm by setting the "+
"'--confirm-failure-domain-downgrade' flag to true",
osd,
)
return response.BadRequest(errorMsg)
}
}

err = ceph.RemoveOSD(cs, osdid, req.BypassSafety, req.Timeout)
Expand Down
9 changes: 5 additions & 4 deletions microceph/api/types/disks.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,11 @@ type DiskAddResponse struct {

// DisksDelete holds an OSD number and a flag for forcing the removal
type DisksDelete struct {
OSD int64 `json:"osdid" yaml:"osdid"`
BypassSafety bool `json:"bypass_safety" yaml:"bypass_safety"`
ConfirmDowngrade bool `json:"confirm_downgrade" yaml:"confirm_downgrade"`
Timeout int64 `json:"timeout" yaml:"timeout"`
OSD int64 `json:"osdid" yaml:"osdid"`
BypassSafety bool `json:"bypass_safety" yaml:"bypass_safety"`
ConfirmDowngrade bool `json:"confirm_downgrade" yaml:"confirm_downgrade"`
ProhibitCrushScaledown bool `json:"prohibit_crush_scaledown" yaml:"prohibit_crush_scaledown"`
Timeout int64 `json:"timeout" yaml:"timeout"`
}

// Disks is a slice of disks
Expand Down
21 changes: 14 additions & 7 deletions microceph/cmd/microceph/disk_remove.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@ type cmdDiskRemove struct {
common *CmdControl
disk *cmdDisk

flagBypassSafety bool
flagConfirmDowngrade bool
flagTimeout int64
flagBypassSafety bool
flagConfirmDowngrade bool
flagProhibitCrushScaledown bool
flagTimeout int64
}

func (c *cmdDiskRemove) Command() *cobra.Command {
Expand All @@ -31,6 +32,7 @@ func (c *cmdDiskRemove) Command() *cobra.Command {
cmd.PersistentFlags().Int64Var(&c.flagTimeout, "timeout", 1800, "Timeout to wait for safe removal (seconds), default=1800")
cmd.PersistentFlags().BoolVar(&c.flagBypassSafety, "bypass-safety-checks", false, "Bypass safety checks")
cmd.PersistentFlags().BoolVar(&c.flagConfirmDowngrade, "confirm-failure-domain-downgrade", false, "Confirm failure domain downgrade if required")
cmd.PersistentFlags().BoolVar(&c.flagProhibitCrushScaledown, "prohibit-crush-scaledown", false, "Remove OSD without scaling down the crush failure domain")

return cmd
}
Expand Down Expand Up @@ -63,11 +65,16 @@ func (c *cmdDiskRemove) Run(cmd *cobra.Command, args []string) error {
}
}

if c.flagConfirmDowngrade && c.flagProhibitCrushScaledown {
return fmt.Errorf("bad Request, --confirm-failure-domain-downgrade and --prohibit-crush-scaledown flags are exclusive to each other")
}

req := &types.DisksDelete{
OSD: osd,
BypassSafety: c.flagBypassSafety,
ConfirmDowngrade: c.flagConfirmDowngrade,
Timeout: c.flagTimeout,
OSD: osd,
BypassSafety: c.flagBypassSafety,
ConfirmDowngrade: c.flagConfirmDowngrade,
ProhibitCrushScaledown: c.flagProhibitCrushScaledown,
Timeout: c.flagTimeout,
}

fmt.Printf("Removing osd.%d, timeout %ds\n", osd, req.Timeout)
Expand Down
2 changes: 1 addition & 1 deletion tests/scripts/actionutils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ function cluster_nodes() {

function add_osd_to_node() {
local container="${1?missing}"
lxc exec $container -- sh -c "microceph disk add /dev/sdia"
lxc exec $container -- sh -c "microceph disk add /dev/sdia --wipe"
sleep 1
}

Expand Down

0 comments on commit 1396b7b

Please sign in to comment.