From efc3f6a578c4c1d5e44333ecca4add1b22ce2980 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Wed, 27 Nov 2024 16:21:13 +0100 Subject: [PATCH 01/28] host volume struct update --- api/host_volumes.go | 8 ++++++++ nomad/structs/host_volumes.go | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/api/host_volumes.go b/api/host_volumes.go index 661ac77c183..0f62c906419 100644 --- a/api/host_volumes.go +++ b/api/host_volumes.go @@ -62,6 +62,14 @@ type HostVolume struct { // created. We record this to make debugging easier. HostPath string `mapstructure:"host_path" hcl:"host_path"` + // Sticky property specifies whether the scheduler should treat this volume + // as assigned to a particular allocation. If marked sticky, the ID of this + // volume will be added to an allocation that uses it during scheduling, + // and every time that allocation gets rescheduled it will only be on a + // node that has this Volume ID present, thus allowing stateful + // deployments. + Sticky bool + // State represents the overall state of the volume. One of pending, ready, // deleted. State HostVolumeState diff --git a/nomad/structs/host_volumes.go b/nomad/structs/host_volumes.go index c254bf72902..a242e82f0f4 100644 --- a/nomad/structs/host_volumes.go +++ b/nomad/structs/host_volumes.go @@ -70,6 +70,14 @@ type HostVolume struct { // created. We record this to make debugging easier. HostPath string + // Sticky property specifies whether the scheduler should treat this volume + // as assigned to a particular allocation. If marked sticky, the ID of this + // volume will be added to an allocation that uses it during scheduling, + // and every time that allocation gets rescheduled it will only be on a + // node that has this Volume ID present, thus allowing stateful + // deployments. + Sticky bool + // State represents the overall state of the volume. One of pending, ready, // deleted. State HostVolumeState From f30f44e14c2a0cb035a72f65043c1c5b17e09878 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Wed, 27 Nov 2024 16:21:21 +0100 Subject: [PATCH 02/28] Allocation update --- nomad/structs/structs.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 2d28c003150..6883fbac2e9 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -11114,6 +11114,9 @@ type Allocation struct { // AllocatedResources is the total resources allocated for the task group. AllocatedResources *AllocatedResources + // VolumeID is the ID of the host volume that this allocation requires. + VolumeID *string + // Metrics associated with this allocation Metrics *AllocMetric From d321fc410eebe14a5f42f837085da9afe8c45030 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Tue, 3 Dec 2024 08:58:16 +0100 Subject: [PATCH 03/28] notes --- nomad/structs/structs.go | 1 + scheduler/feasible.go | 2 ++ scheduler/generic_sched.go | 3 +++ 3 files changed, 6 insertions(+) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 6883fbac2e9..1894bcf10da 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -11115,6 +11115,7 @@ type Allocation struct { AllocatedResources *AllocatedResources // VolumeID is the ID of the host volume that this allocation requires. + // FIXME:could be multiple, could be CSI? 
can't just be a string VolumeID *string // Metrics associated with this allocation diff --git a/scheduler/feasible.go b/scheduler/feasible.go index 60442f92e7f..323aa78bfe8 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -159,6 +159,8 @@ func (h *HostVolumeChecker) SetVolumes(allocName string, ns string, volumes map[ continue // filter CSI volumes } + // FIXME: if there's a sticky vol set, adjust this to look for an ID + if req.PerAlloc { // provide a unique volume source per allocation copied := req.Copy() diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index f9fd669e592..f05293784fc 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -894,11 +894,14 @@ func updateRescheduleTracker(alloc *structs.Allocation, prev *structs.Allocation // findPreferredNode finds the preferred node for an allocation func (s *GenericScheduler) findPreferredNode(place placementResult) (*structs.Node, error) { + // TODO: is previous allocation set on destrouctive updates? prev := place.PreviousAllocation() if prev == nil { return nil, nil } if place.TaskGroup().EphemeralDisk.Sticky || place.TaskGroup().EphemeralDisk.Migrate { + // TODO: this could be a good place where we'd stick the sticky volume + // logic, to find the node that must have the right vol id var preferredNode *structs.Node ws := memdb.NewWatchSet() preferredNode, err := s.state.NodeByID(ws, prev.NodeID) From 75b2527254302bb9e5cfcf67c6cfa2f87a930e44 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Tue, 3 Dec 2024 11:04:48 +0100 Subject: [PATCH 04/28] VolumeRequest and VolumeMount update --- api/host_volumes.go | 8 -------- nomad/structs/host_volumes.go | 8 -------- nomad/structs/volumes.go | 6 ++++-- nomad/structs/volumes_test.go | 13 +++++++++++++ 4 files changed, 17 insertions(+), 18 deletions(-) diff --git a/api/host_volumes.go b/api/host_volumes.go index 0f62c906419..661ac77c183 100644 --- a/api/host_volumes.go +++ b/api/host_volumes.go @@ -62,14 +62,6 @@ type HostVolume struct { // created. We record this to make debugging easier. HostPath string `mapstructure:"host_path" hcl:"host_path"` - // Sticky property specifies whether the scheduler should treat this volume - // as assigned to a particular allocation. If marked sticky, the ID of this - // volume will be added to an allocation that uses it during scheduling, - // and every time that allocation gets rescheduled it will only be on a - // node that has this Volume ID present, thus allowing stateful - // deployments. - Sticky bool - // State represents the overall state of the volume. One of pending, ready, // deleted. State HostVolumeState diff --git a/nomad/structs/host_volumes.go b/nomad/structs/host_volumes.go index a242e82f0f4..c254bf72902 100644 --- a/nomad/structs/host_volumes.go +++ b/nomad/structs/host_volumes.go @@ -70,14 +70,6 @@ type HostVolume struct { // created. We record this to make debugging easier. HostPath string - // Sticky property specifies whether the scheduler should treat this volume - // as assigned to a particular allocation. If marked sticky, the ID of this - // volume will be added to an allocation that uses it during scheduling, - // and every time that allocation gets rescheduled it will only be on a - // node that has this Volume ID present, thus allowing stateful - // deployments. - Sticky bool - // State represents the overall state of the volume. One of pending, ready, // deleted. 
State HostVolumeState diff --git a/nomad/structs/volumes.go b/nomad/structs/volumes.go index b8c95fc2862..29609742162 100644 --- a/nomad/structs/volumes.go +++ b/nomad/structs/volumes.go @@ -145,8 +145,7 @@ func (v *VolumeRequest) Equal(o *VolumeRequest) bool { } func (v *VolumeRequest) Validate(jobType string, taskGroupCount, canaries int) error { - if !(v.Type == VolumeTypeHost || - v.Type == VolumeTypeCSI) { + if !(v.Type == VolumeTypeHost || v.Type == VolumeTypeCSI) { return fmt.Errorf("volume has unrecognized type %s", v.Type) } @@ -181,6 +180,9 @@ func (v *VolumeRequest) Validate(jobType string, taskGroupCount, canaries int) e } case VolumeTypeCSI: + if v.Sticky { + addErr("CSI volumes cannot be set to sticky") + } switch v.AttachmentMode { case CSIVolumeAttachmentModeUnknown: diff --git a/nomad/structs/volumes_test.go b/nomad/structs/volumes_test.go index 58585932d7c..1b7efc01d7b 100644 --- a/nomad/structs/volumes_test.go +++ b/nomad/structs/volumes_test.go @@ -85,6 +85,19 @@ func TestVolumeRequest_Validate(t *testing.T) { PerAlloc: true, }, }, + { + name: "Sticky CSI", + expected: []string{ + "CSI volumes cannot be set to sticky", + }, + req: &VolumeRequest{ + Source: "source", + Type: VolumeTypeCSI, + Sticky: true, + AttachmentMode: CSIVolumeAttachmentModeBlockDevice, + AccessMode: CSIVolumeAccessModeMultiNodeMultiWriter, + }, + }, } for _, tc := range testCases { From 8927c5d7e820f1c3bd912c4b34e2f9444a819840 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Wed, 4 Dec 2024 17:17:29 +0100 Subject: [PATCH 05/28] super hacky prototype --- nomad/structs/structs.go | 6 ++--- scheduler/feasible.go | 12 +++++----- script.sh | 49 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 9 deletions(-) create mode 100755 script.sh diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 1894bcf10da..de87e373448 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -11114,9 +11114,9 @@ type Allocation struct { // AllocatedResources is the total resources allocated for the task group. AllocatedResources *AllocatedResources - // VolumeID is the ID of the host volume that this allocation requires. - // FIXME:could be multiple, could be CSI? can't just be a string - VolumeID *string + // VolumeIDs is a list of volume IDs (host or CSI) that this allocation + // requires. 
+ VolumeIDs []string // Metrics associated with this allocation Metrics *AllocMetric diff --git a/scheduler/feasible.go b/scheduler/feasible.go index 323aa78bfe8..a6d21998d2d 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -20,6 +20,7 @@ import ( const ( FilterConstraintHostVolumes = "missing compatible host volumes" + FilterConstraintHostVolumesLookupFailed = "host volume lookup failed" FilterConstraintCSIPluginTemplate = "CSI plugin %s is missing from client %s" FilterConstraintCSIPluginUnhealthyTemplate = "CSI plugin %s is unhealthy on client %s" FilterConstraintCSIPluginMaxVolumesTemplate = "CSI plugin %s has the maximum number of volumes on client %s" @@ -159,8 +160,6 @@ func (h *HostVolumeChecker) SetVolumes(allocName string, ns string, volumes map[ continue // filter CSI volumes } - // FIXME: if there's a sticky vol set, adjust this to look for an ID - if req.PerAlloc { // provide a unique volume source per allocation copied := req.Copy() @@ -174,11 +173,12 @@ func (h *HostVolumeChecker) SetVolumes(allocName string, ns string, volumes map[ } func (h *HostVolumeChecker) Feasible(candidate *structs.Node) bool { - if h.hasVolumes(candidate) { + ok, failure := h.hasVolumes(candidate) + if ok { return true } - h.ctx.Metrics().FilterNode(candidate, FilterConstraintHostVolumes) + h.ctx.Metrics().FilterNode(candidate, failure) return false } @@ -192,7 +192,7 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) bool { for _, req := range h.volumeReqs { volCfg, ok := n.HostVolumes[req.Source] if !ok { - return false + return false, FilterConstraintHostVolumes } if volCfg.ID != "" { // dynamic host volume @@ -227,7 +227,7 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) bool { } } - return true + return true, "" } type CSIVolumeChecker struct { diff --git a/script.sh b/script.sh new file mode 100755 index 00000000000..9bcdc3b7711 --- /dev/null +++ b/script.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +set -o errexit + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +# Create test namespaces. +nomad namespace apply -description "test namespace" test-1 +nomad namespace apply -description "test namespace" test-2 +nomad namespace apply -description "test namespace" test-3 +nomad namespace apply -description "test namespace" test-4 +nomad namespace apply -description "test namespace" test-5 +nomad namespace apply -description "test namespace" test-6 +nomad namespace apply -description "test namespace" test-7 +nomad namespace apply -description "test namespace" test-8 +nomad namespace apply -description "test namespace" test-9 + +# Create test ACL policies. 
+nomad acl policy apply -description "test acl policy" test-1 "$SCRIPT_DIR"/_test_acl_policy.hcl +nomad acl policy apply -description "test acl policy" test-2 "$SCRIPT_DIR"/_test_acl_policy.hcl +nomad acl policy apply -description "test acl policy" test-3 "$SCRIPT_DIR"/_test_acl_policy.hcl +nomad acl policy apply -description "test acl policy" test-4 "$SCRIPT_DIR"/_test_acl_policy.hcl +nomad acl policy apply -description "test acl policy" test-5 "$SCRIPT_DIR"/_test_acl_policy.hcl +nomad acl policy apply -description "test acl policy" test-6 "$SCRIPT_DIR"/_test_acl_policy.hcl +nomad acl policy apply -description "test acl policy" test-7 "$SCRIPT_DIR"/_test_acl_policy.hcl +nomad acl policy apply -description "test acl policy" test-8 "$SCRIPT_DIR"/_test_acl_policy.hcl +nomad acl policy apply -description "test acl policy" test-9 "$SCRIPT_DIR"/_test_acl_policy.hcl + +# Create client ACL tokens. +nomad acl token create -name="test client acl token" -policy=test-1 -type=client +nomad acl token create -name="test client acl token" -policy=test-2 -type=client +nomad acl token create -name="test client acl token" -policy=test-3 -type=client +nomad acl token create -name="test client acl token" -policy=test-4 -type=client +nomad acl token create -name="test client acl token" -policy=test-5 -type=client +nomad acl token create -name="test client acl token" -policy=test-6 -type=client +nomad acl token create -name="test client acl token" -policy=test-7 -type=client +nomad acl token create -name="test client acl token" -policy=test-8 -type=client +nomad acl token create -name="test client acl token" -policy=test-9 -type=client + +# Create management ACL tokens. +nomad acl token create -name="test management acl token" -type=management +nomad acl token create -name="test management acl token" -type=management +nomad acl token create -name="test management acl token" -type=management +nomad acl token create -name="test management acl token" -type=management +nomad acl token create -name="test management acl token" -type=management +nomad acl token create -name="test management acl token" -type=management +nomad acl token create -name="test management acl token" -type=management +nomad acl token create -name="test management acl token" -type=management +nomad acl token create -name="test management acl token" -type=management From b8b4639b15c47c2a3ff6cae6878d6c5ec2deb944 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Thu, 5 Dec 2024 11:07:36 +0100 Subject: [PATCH 06/28] wip findPreferredNode --- scheduler/generic_sched.go | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index f05293784fc..128f55edb22 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -900,8 +900,6 @@ func (s *GenericScheduler) findPreferredNode(place placementResult) (*structs.No return nil, nil } if place.TaskGroup().EphemeralDisk.Sticky || place.TaskGroup().EphemeralDisk.Migrate { - // TODO: this could be a good place where we'd stick the sticky volume - // logic, to find the node that must have the right vol id var preferredNode *structs.Node ws := memdb.NewWatchSet() preferredNode, err := s.state.NodeByID(ws, prev.NodeID) @@ -913,6 +911,27 @@ func (s *GenericScheduler) findPreferredNode(place placementResult) (*structs.No return preferredNode, nil } } + + for _, vol := range place.TaskGroup().Volumes { + if !vol.Sticky { + continue + } + + var preferredNode 
*structs.Node + ws := memdb.NewWatchSet() + preferredNode, err := s.state.NodeByID(ws, prev.NodeID) + if err != nil { + return nil, err + } + + // s.state.CSIVolumesByNodeID(ws, ) + + if preferredNode != nil && preferredNode.Ready() { + return preferredNode, nil + } + + } + return nil, nil } From b610f5e15a3118a42369d986adfc66c1042c7898 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Thu, 5 Dec 2024 13:10:32 +0100 Subject: [PATCH 07/28] CSI vols can be sticky too --- nomad/structs/volumes.go | 4 ---- nomad/structs/volumes_test.go | 13 ------------- 2 files changed, 17 deletions(-) diff --git a/nomad/structs/volumes.go b/nomad/structs/volumes.go index 29609742162..0c08219e76c 100644 --- a/nomad/structs/volumes.go +++ b/nomad/structs/volumes.go @@ -180,10 +180,6 @@ func (v *VolumeRequest) Validate(jobType string, taskGroupCount, canaries int) e } case VolumeTypeCSI: - if v.Sticky { - addErr("CSI volumes cannot be set to sticky") - } - switch v.AttachmentMode { case CSIVolumeAttachmentModeUnknown: addErr("CSI volumes must have an attachment mode") diff --git a/nomad/structs/volumes_test.go b/nomad/structs/volumes_test.go index 1b7efc01d7b..58585932d7c 100644 --- a/nomad/structs/volumes_test.go +++ b/nomad/structs/volumes_test.go @@ -85,19 +85,6 @@ func TestVolumeRequest_Validate(t *testing.T) { PerAlloc: true, }, }, - { - name: "Sticky CSI", - expected: []string{ - "CSI volumes cannot be set to sticky", - }, - req: &VolumeRequest{ - Source: "source", - Type: VolumeTypeCSI, - Sticky: true, - AttachmentMode: CSIVolumeAttachmentModeBlockDevice, - AccessMode: CSIVolumeAccessModeMultiNodeMultiWriter, - }, - }, } for _, tc := range testCases { From ba82ddc424eddce6bbb0710757dbb4b7f06ead6e Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Thu, 5 Dec 2024 15:31:42 +0100 Subject: [PATCH 08/28] refactor hasVolumes --- scheduler/feasible.go | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/scheduler/feasible.go b/scheduler/feasible.go index a6d21998d2d..60442f92e7f 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -20,7 +20,6 @@ import ( const ( FilterConstraintHostVolumes = "missing compatible host volumes" - FilterConstraintHostVolumesLookupFailed = "host volume lookup failed" FilterConstraintCSIPluginTemplate = "CSI plugin %s is missing from client %s" FilterConstraintCSIPluginUnhealthyTemplate = "CSI plugin %s is unhealthy on client %s" FilterConstraintCSIPluginMaxVolumesTemplate = "CSI plugin %s has the maximum number of volumes on client %s" @@ -173,12 +172,11 @@ func (h *HostVolumeChecker) SetVolumes(allocName string, ns string, volumes map[ } func (h *HostVolumeChecker) Feasible(candidate *structs.Node) bool { - ok, failure := h.hasVolumes(candidate) - if ok { + if h.hasVolumes(candidate) { return true } - h.ctx.Metrics().FilterNode(candidate, failure) + h.ctx.Metrics().FilterNode(candidate, FilterConstraintHostVolumes) return false } @@ -192,7 +190,7 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) bool { for _, req := range h.volumeReqs { volCfg, ok := n.HostVolumes[req.Source] if !ok { - return false, FilterConstraintHostVolumes + return false } if volCfg.ID != "" { // dynamic host volume @@ -227,7 +225,7 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) bool { } } - return true, "" + return true } type CSIVolumeChecker struct { From c1a11ff27c754d0b0b7102a4a63b8a6b042e1920 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak 
<470696+pkazmierczak@users.noreply.github.com> Date: Thu, 5 Dec 2024 16:31:01 +0100 Subject: [PATCH 09/28] findPreferredNode --- scheduler/generic_sched.go | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 128f55edb22..11d1111fc17 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -6,6 +6,7 @@ package scheduler import ( "fmt" "runtime/debug" + "slices" "sort" "time" @@ -894,7 +895,6 @@ func updateRescheduleTracker(alloc *structs.Allocation, prev *structs.Allocation // findPreferredNode finds the preferred node for an allocation func (s *GenericScheduler) findPreferredNode(place placementResult) (*structs.Node, error) { - // TODO: is previous allocation set on destrouctive updates? prev := place.PreviousAllocation() if prev == nil { return nil, nil @@ -924,12 +924,15 @@ func (s *GenericScheduler) findPreferredNode(place placementResult) (*structs.No return nil, err } - // s.state.CSIVolumesByNodeID(ws, ) - if preferredNode != nil && preferredNode.Ready() { - return preferredNode, nil + // if this node has at least one of the allocation volumes, it's a + // preferred one + for _, vol := range preferredNode.HostVolumes { + if slices.Contains(prev.VolumeIDs, vol.VolumeID) { + return preferredNode, nil + } + } } - } return nil, nil From 3cfb7ce2aba3fbacc62e158c1b068663797abb1b Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Mon, 9 Dec 2024 16:24:03 +0100 Subject: [PATCH 10/28] separate CSI and host volumes --- api/allocations.go | 2 ++ nomad/structs/structs.go | 7 +++++-- scheduler/feasible.go | 1 + scheduler/generic_sched.go | 2 +- 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/api/allocations.go b/api/allocations.go index b35e338c559..bf8059d32c2 100644 --- a/api/allocations.go +++ b/api/allocations.go @@ -278,6 +278,8 @@ type Allocation struct { Resources *Resources TaskResources map[string]*Resources AllocatedResources *AllocatedResources + HostVolumeIDs []string + CSIVolumeIDs []string Services map[string]string Metrics *AllocationMetric DesiredStatus string diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index de87e373448..574119d8fef 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -11114,9 +11114,12 @@ type Allocation struct { // AllocatedResources is the total resources allocated for the task group. AllocatedResources *AllocatedResources - // VolumeIDs is a list of volume IDs (host or CSI) that this allocation + // HostVolumeIDs is a list of host volume IDs that this allocation // requires. - VolumeIDs []string + HostVolumeIDs []string + + // CSIVolumeIDs is a list of CSI volume IDs that this allocation requires. 
+ CSIVolumeIDs []string // Metrics associated with this allocation Metrics *AllocMetric diff --git a/scheduler/feasible.go b/scheduler/feasible.go index 60442f92e7f..126f06e4556 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -212,6 +212,7 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) bool { capOk = true break } + return false } if !capOk { return false diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 11d1111fc17..373d2f9bf60 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -928,7 +928,7 @@ func (s *GenericScheduler) findPreferredNode(place placementResult) (*structs.No // if this node has at least one of the allocation volumes, it's a // preferred one for _, vol := range preferredNode.HostVolumes { - if slices.Contains(prev.VolumeIDs, vol.VolumeID) { + if slices.Contains(prev.HostVolumeIDs, vol.VolumeID) { return preferredNode, nil } } From 0664e7c1b3733231737b8be1fd45157ac3f33cd0 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Tue, 10 Dec 2024 19:33:38 +0100 Subject: [PATCH 11/28] accidental git snafu --- script.sh | 49 ------------------------------------------------- 1 file changed, 49 deletions(-) delete mode 100755 script.sh diff --git a/script.sh b/script.sh deleted file mode 100755 index 9bcdc3b7711..00000000000 --- a/script.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env bash - -set -o errexit - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) - -# Create test namespaces. -nomad namespace apply -description "test namespace" test-1 -nomad namespace apply -description "test namespace" test-2 -nomad namespace apply -description "test namespace" test-3 -nomad namespace apply -description "test namespace" test-4 -nomad namespace apply -description "test namespace" test-5 -nomad namespace apply -description "test namespace" test-6 -nomad namespace apply -description "test namespace" test-7 -nomad namespace apply -description "test namespace" test-8 -nomad namespace apply -description "test namespace" test-9 - -# Create test ACL policies. -nomad acl policy apply -description "test acl policy" test-1 "$SCRIPT_DIR"/_test_acl_policy.hcl -nomad acl policy apply -description "test acl policy" test-2 "$SCRIPT_DIR"/_test_acl_policy.hcl -nomad acl policy apply -description "test acl policy" test-3 "$SCRIPT_DIR"/_test_acl_policy.hcl -nomad acl policy apply -description "test acl policy" test-4 "$SCRIPT_DIR"/_test_acl_policy.hcl -nomad acl policy apply -description "test acl policy" test-5 "$SCRIPT_DIR"/_test_acl_policy.hcl -nomad acl policy apply -description "test acl policy" test-6 "$SCRIPT_DIR"/_test_acl_policy.hcl -nomad acl policy apply -description "test acl policy" test-7 "$SCRIPT_DIR"/_test_acl_policy.hcl -nomad acl policy apply -description "test acl policy" test-8 "$SCRIPT_DIR"/_test_acl_policy.hcl -nomad acl policy apply -description "test acl policy" test-9 "$SCRIPT_DIR"/_test_acl_policy.hcl - -# Create client ACL tokens. 
-nomad acl token create -name="test client acl token" -policy=test-1 -type=client -nomad acl token create -name="test client acl token" -policy=test-2 -type=client -nomad acl token create -name="test client acl token" -policy=test-3 -type=client -nomad acl token create -name="test client acl token" -policy=test-4 -type=client -nomad acl token create -name="test client acl token" -policy=test-5 -type=client -nomad acl token create -name="test client acl token" -policy=test-6 -type=client -nomad acl token create -name="test client acl token" -policy=test-7 -type=client -nomad acl token create -name="test client acl token" -policy=test-8 -type=client -nomad acl token create -name="test client acl token" -policy=test-9 -type=client - -# Create management ACL tokens. -nomad acl token create -name="test management acl token" -type=management -nomad acl token create -name="test management acl token" -type=management -nomad acl token create -name="test management acl token" -type=management -nomad acl token create -name="test management acl token" -type=management -nomad acl token create -name="test management acl token" -type=management -nomad acl token create -name="test management acl token" -type=management -nomad acl token create -name="test management acl token" -type=management -nomad acl token create -name="test management acl token" -type=management -nomad acl token create -name="test management acl token" -type=management From e0be27e8ba18052fc01784787ad763dc77a856f6 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Tue, 10 Dec 2024 19:37:35 +0100 Subject: [PATCH 12/28] correct findPreferredNode --- scheduler/generic_sched.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 373d2f9bf60..72991d694dd 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -928,7 +928,7 @@ func (s *GenericScheduler) findPreferredNode(place placementResult) (*structs.No // if this node has at least one of the allocation volumes, it's a // preferred one for _, vol := range preferredNode.HostVolumes { - if slices.Contains(prev.HostVolumeIDs, vol.VolumeID) { + if slices.Contains(prev.HostVolumeIDs, vol.ID) { return preferredNode, nil } } From d9dbecf1b82041a8012921078a6e0717dc743f71 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Wed, 11 Dec 2024 13:07:14 +0100 Subject: [PATCH 13/28] Tim's comment --- nomad/structs/volumes.go | 4 +++- scheduler/feasible.go | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/nomad/structs/volumes.go b/nomad/structs/volumes.go index 0c08219e76c..b8c95fc2862 100644 --- a/nomad/structs/volumes.go +++ b/nomad/structs/volumes.go @@ -145,7 +145,8 @@ func (v *VolumeRequest) Equal(o *VolumeRequest) bool { } func (v *VolumeRequest) Validate(jobType string, taskGroupCount, canaries int) error { - if !(v.Type == VolumeTypeHost || v.Type == VolumeTypeCSI) { + if !(v.Type == VolumeTypeHost || + v.Type == VolumeTypeCSI) { return fmt.Errorf("volume has unrecognized type %s", v.Type) } @@ -180,6 +181,7 @@ func (v *VolumeRequest) Validate(jobType string, taskGroupCount, canaries int) e } case VolumeTypeCSI: + switch v.AttachmentMode { case CSIVolumeAttachmentModeUnknown: addErr("CSI volumes must have an attachment mode") diff --git a/scheduler/feasible.go b/scheduler/feasible.go index 126f06e4556..60442f92e7f 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ 
-212,7 +212,6 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) bool { capOk = true break } - return false } if !capOk { return false From 1857bbf8434b44c4ad78fb7fc28f5ccb60b6c603 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Wed, 11 Dec 2024 13:48:54 +0100 Subject: [PATCH 14/28] hasVolumes --- scheduler/feasible.go | 26 +++++++++++++++++++++++++- scheduler/scheduler.go | 3 +++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/scheduler/feasible.go b/scheduler/feasible.go index 60442f92e7f..5ab06dbe727 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -8,6 +8,7 @@ import ( "fmt" "reflect" "regexp" + "slices" "strconv" "strings" @@ -181,7 +182,6 @@ func (h *HostVolumeChecker) Feasible(candidate *structs.Node) bool { } func (h *HostVolumeChecker) hasVolumes(n *structs.Node) bool { - // Fast path: Requested no volumes. No need to check further. if len(h.volumeReqs) == 0 { return true @@ -216,6 +216,30 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) bool { if !capOk { return false } + if req.Sticky { + // FIXME: surely there is a better way to find the right alloc? + allocs, err := h.ctx.ProposedAllocs(n.ID) + if err != nil { + continue + } + + for _, a := range allocs { + if a.TerminalStatus() || a.NodeID != n.ID { + continue + } + + // check if the allocation has any volume IDs attached; if + // not, attach them + if len(a.HostVolumeIDs) == 0 { + a.HostVolumeIDs = []string{vol.ID} + } else { + if !slices.Contains(a.HostVolumeIDs, volCfg.ID) { + return false + } + } + } + } + } else if !req.ReadOnly { // this is a static host volume and can only be mounted ReadOnly, // validate that no requests for it are ReadWrite. diff --git a/scheduler/scheduler.go b/scheduler/scheduler.go index 9d46edf8801..27f87e79745 100644 --- a/scheduler/scheduler.go +++ b/scheduler/scheduler.go @@ -118,8 +118,11 @@ type State interface { // CSIVolumeByID fetch CSI volumes, containing controller jobs CSIVolumesByNodeID(memdb.WatchSet, string, string) (memdb.ResultIterator, error) + // HostVolumeByID fetches host volume by its ID HostVolumeByID(memdb.WatchSet, string, string, bool) (*structs.HostVolume, error) + // HostVolumesByNodeID gets an iterator with all the volumes attached to a + // given node HostVolumesByNodeID(memdb.WatchSet, string, state.SortOption) (memdb.ResultIterator, error) // LatestIndex returns the greatest index value for all indexes. 
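A minimal standalone sketch of the sticky placement rule these patches converge on (illustrative names, not Nomad APIs; the real check in HostVolumeChecker.hasVolumes also validates volume state and requested capabilities):

package main

import (
	"fmt"
	"slices"
)

// stickyFeasible reports whether a node offering the dynamic host volume
// nodeVolID can satisfy a sticky volume request from an allocation that has
// previously claimed claimedIDs (Allocation.HostVolumeIDs).
func stickyFeasible(nodeVolID string, claimedIDs []string) bool {
	// First placement: nothing claimed yet, so any node offering the
	// requested volume is feasible.
	if len(claimedIDs) == 0 {
		return true
	}
	// Reschedule: only nodes holding a previously claimed volume ID pass.
	return slices.Contains(claimedIDs, nodeVolID)
}

func main() {
	fmt.Println(stickyFeasible("vol-abc123", nil))                     // true: first placement
	fmt.Println(stickyFeasible("vol-abc123", []string{"vol-other"}))   // false: node lacks the claimed volume
	fmt.Println(stickyFeasible("vol-abc123", []string{"vol-abc123"})) // true: reschedule onto the claimed volume
}

Carrying the claimed IDs on the Allocation itself, rather than looking them up through ProposedAllocs or a state query as this patch does, is what the later patches in the series settle on, since the scheduler already threads the previous allocation's data through SelectOptions.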
From 955ce28eff736d4eb4503baadd300bcf56fb73fe Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Wed, 11 Dec 2024 13:50:09 +0100 Subject: [PATCH 15/28] simplify --- scheduler/feasible.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scheduler/feasible.go b/scheduler/feasible.go index 5ab06dbe727..769d7293bb6 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -233,9 +233,7 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) bool { if len(a.HostVolumeIDs) == 0 { a.HostVolumeIDs = []string{vol.ID} } else { - if !slices.Contains(a.HostVolumeIDs, volCfg.ID) { - return false - } + return slices.Contains(a.HostVolumeIDs, volCfg.ID) } } } From 347287cc4611dd70abab13f1346c368f0f597d84 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:10:54 +0100 Subject: [PATCH 16/28] hasVolumes and tests --- scheduler/feasible.go | 15 ++++--- scheduler/feasible_test.go | 80 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 8 deletions(-) diff --git a/scheduler/feasible.go b/scheduler/feasible.go index 769d7293bb6..866c586d9e1 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -217,7 +217,12 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) bool { return false } if req.Sticky { - // FIXME: surely there is a better way to find the right alloc? + // NOTE: surely there is a better way to find the right alloc? + // Should we perhaps search for allocs by job? Could there be a + // situation in which there are non-terminal allocations + // belonging to the job in question that are sticky, have the + // volume IDs that match what the node offers and should not + // end up in this check? 
allocs, err := h.ctx.ProposedAllocs(n.ID) if err != nil { continue @@ -228,13 +233,7 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) bool { continue } - // check if the allocation has any volume IDs attached; if - // not, attach them - if len(a.HostVolumeIDs) == 0 { - a.HostVolumeIDs = []string{vol.ID} - } else { - return slices.Contains(a.HostVolumeIDs, volCfg.ID) - } + return slices.Contains(a.HostVolumeIDs, volCfg.ID) } } diff --git a/scheduler/feasible_test.go b/scheduler/feasible_test.go index 9c5a9aaf1a7..23311ec620f 100644 --- a/scheduler/feasible_test.go +++ b/scheduler/feasible_test.go @@ -366,6 +366,86 @@ func TestHostVolumeChecker_ReadOnly(t *testing.T) { } } +func TestHostVolumeChecker_Sticky(t *testing.T) { + ci.Parallel(t) + + store, ctx := testContext(t) + + nodes := []*structs.Node{ + mock.Node(), + mock.Node(), + } + + hostVolCapsReadWrite := []*structs.HostVolumeCapability{ + { + AttachmentMode: structs.HostVolumeAttachmentModeFilesystem, + AccessMode: structs.HostVolumeAccessModeSingleNodeReader, + }, + { + AttachmentMode: structs.HostVolumeAttachmentModeFilesystem, + AccessMode: structs.HostVolumeAccessModeSingleNodeWriter, + }, + } + + dhv := &structs.HostVolume{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Name: "foo", + NodeID: nodes[1].ID, + RequestedCapabilities: hostVolCapsReadWrite, + State: structs.HostVolumeStateReady, + } + + nodes[0].HostVolumes = map[string]*structs.ClientHostVolumeConfig{} + nodes[1].HostVolumes = map[string]*structs.ClientHostVolumeConfig{"foo": {ID: dhv.ID}} + + for _, node := range nodes { + must.NoError(t, store.UpsertNode(structs.MsgTypeTestSetup, 1000, node)) + } + must.NoError(t, store.UpsertHostVolume(1000, dhv)) + + stickyRequest := map[string]*structs.VolumeRequest{ + "foo": { + Type: "host", + Source: "foo", + Sticky: true, + AccessMode: structs.CSIVolumeAccessModeSingleNodeWriter, + AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem, + }, + } + + checker := NewHostVolumeChecker(ctx) + + alloc := mock.Alloc() + alloc.NodeID = nodes[1].ID + alloc.HostVolumeIDs = []string{dhv.ID} + + cases := []struct { + name string + node *structs.Node + expect bool + }{ + { + "alloc asking for a sticky volume on an infeasible node", + nodes[0], + false, + }, + { + "alloc asking for a sticky volume on a feasible node", + nodes[1], + true, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + checker.SetVolumes(alloc.Name, structs.DefaultNamespace, stickyRequest) + actual := checker.Feasible(tc.node) + must.Eq(t, tc.expect, actual) + }) + } +} + func TestCSIVolumeChecker(t *testing.T) { ci.Parallel(t) state, ctx := testContext(t) From f5d3edaecaee036505e22f13c66c75102230ecf7 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:40:46 +0100 Subject: [PATCH 17/28] Update nomad/structs/structs.go Co-authored-by: Tim Gross --- nomad/structs/structs.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 574119d8fef..622af2d1325 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -11115,10 +11115,10 @@ type Allocation struct { AllocatedResources *AllocatedResources // HostVolumeIDs is a list of host volume IDs that this allocation - // requires. + // has claimed. HostVolumeIDs []string - // CSIVolumeIDs is a list of CSI volume IDs that this allocation requires. 
+ // CSIVolumeIDs is a list of CSI volume IDs that this allocation has claimed. CSIVolumeIDs []string // Metrics associated with this allocation From 4ae311f21d4fa39e0154ec3edc66b5041d6b837e Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Wed, 11 Dec 2024 17:09:25 +0100 Subject: [PATCH 18/28] don't return too early --- scheduler/feasible.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scheduler/feasible.go b/scheduler/feasible.go index 866c586d9e1..852e806695c 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -233,7 +233,9 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) bool { continue } - return slices.Contains(a.HostVolumeIDs, volCfg.ID) + if !slices.Contains(a.HostVolumeIDs, volCfg.ID) { + return false + } } } From 7db247eb94b8d97050fafa1c6f6ea4fb4ea70fa4 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Thu, 12 Dec 2024 18:06:21 +0100 Subject: [PATCH 19/28] make alloc ID available to the host volume checker --- scheduler/feasible.go | 79 ++++++++++++++++++++----------------- scheduler/feasible_test.go | 41 ++++++++++++++++--- scheduler/reconcile.go | 2 + scheduler/reconcile_util.go | 2 + scheduler/stack.go | 5 ++- 5 files changed, 84 insertions(+), 45 deletions(-) diff --git a/scheduler/feasible.go b/scheduler/feasible.go index 852e806695c..99d235d9fbd 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -21,6 +21,7 @@ import ( const ( FilterConstraintHostVolumes = "missing compatible host volumes" + FilterConstraintHostVolumesAllocLookupFailed = "sticky host volume allocation lookup failed" FilterConstraintCSIPluginTemplate = "CSI plugin %s is missing from client %s" FilterConstraintCSIPluginUnhealthyTemplate = "CSI plugin %s is unhealthy on client %s" FilterConstraintCSIPluginMaxVolumesTemplate = "CSI plugin %s has the maximum number of volumes on client %s" @@ -139,22 +140,28 @@ func NewRandomIterator(ctx Context, nodes []*structs.Node) *StaticIterator { // the host volumes necessary to schedule a task group. type HostVolumeChecker struct { ctx Context - volumeReqs []*structs.VolumeRequest + volumeReqs []*allocVolumeRequest namespace string } +// allocVolumeRequest associates allocation ID with the volume request +type allocVolumeRequest struct { + allocID string + volumeReq *structs.VolumeRequest +} + // NewHostVolumeChecker creates a HostVolumeChecker from a set of volumes func NewHostVolumeChecker(ctx Context) *HostVolumeChecker { return &HostVolumeChecker{ ctx: ctx, - volumeReqs: []*structs.VolumeRequest{}, + volumeReqs: []*allocVolumeRequest{}, } } // SetVolumes takes the volumes required by a task group and updates the checker. 
-func (h *HostVolumeChecker) SetVolumes(allocName string, ns string, volumes map[string]*structs.VolumeRequest) { +func (h *HostVolumeChecker) SetVolumes(allocName, allocID string, ns string, volumes map[string]*structs.VolumeRequest) { h.namespace = ns - h.volumeReqs = []*structs.VolumeRequest{} + h.volumeReqs = []*allocVolumeRequest{} for _, req := range volumes { if req.Type != structs.VolumeTypeHost { continue // filter CSI volumes @@ -164,33 +171,35 @@ func (h *HostVolumeChecker) SetVolumes(allocName string, ns string, volumes map[ // provide a unique volume source per allocation copied := req.Copy() copied.Source = copied.Source + structs.AllocSuffix(allocName) - h.volumeReqs = append(h.volumeReqs, copied) + h.volumeReqs = append(h.volumeReqs, &allocVolumeRequest{allocID, copied}) } else { - h.volumeReqs = append(h.volumeReqs, req) + h.volumeReqs = append(h.volumeReqs, &allocVolumeRequest{allocID, req}) } } } func (h *HostVolumeChecker) Feasible(candidate *structs.Node) bool { - if h.hasVolumes(candidate) { + feasible, failure := h.hasVolumes(candidate) + if feasible { return true } - h.ctx.Metrics().FilterNode(candidate, FilterConstraintHostVolumes) + h.ctx.Metrics().FilterNode(candidate, failure) return false } -func (h *HostVolumeChecker) hasVolumes(n *structs.Node) bool { +func (h *HostVolumeChecker) hasVolumes(n *structs.Node) (bool, string) { // Fast path: Requested no volumes. No need to check further. if len(h.volumeReqs) == 0 { - return true + return true, "" } + ws := memdb.NewWatchSet() for _, req := range h.volumeReqs { - volCfg, ok := n.HostVolumes[req.Source] + volCfg, ok := n.HostVolumes[req.volumeReq.Source] if !ok { - return false + return false, FilterConstraintHostVolumes } if volCfg.ID != "" { // dynamic host volume @@ -200,55 +209,51 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) bool { // state store; this is only possible if the batched fingerprint // update from a delete RPC is written before the delete RPC's // raft entry completes - return false + return false, FilterConstraintHostVolumes } if vol.State != structs.HostVolumeStateReady { - return false + return false, FilterConstraintHostVolumes } var capOk bool for _, cap := range vol.RequestedCapabilities { - if req.AccessMode == structs.CSIVolumeAccessMode(cap.AccessMode) && - req.AttachmentMode == structs.CSIVolumeAttachmentMode(cap.AttachmentMode) { + if req.volumeReq.AccessMode == structs.CSIVolumeAccessMode(cap.AccessMode) && + req.volumeReq.AttachmentMode == structs.CSIVolumeAttachmentMode(cap.AttachmentMode) { capOk = true break } } if !capOk { - return false + return false, FilterConstraintHostVolumes } - if req.Sticky { - // NOTE: surely there is a better way to find the right alloc? - // Should we perhaps search for allocs by job? Could there be a - // situation in which there are non-terminal allocations - // belonging to the job in question that are sticky, have the - // volume IDs that match what the node offers and should not - // end up in this check? 
- allocs, err := h.ctx.ProposedAllocs(n.ID) + + if req.volumeReq.Sticky { + allocation, err := h.ctx.State().AllocByID(ws, req.allocID) if err != nil { - continue + return false, FilterConstraintHostVolumesAllocLookupFailed + } + if slices.Contains(allocation.HostVolumeIDs, vol.ID) { + return true, "" } - for _, a := range allocs { - if a.TerminalStatus() || a.NodeID != n.ID { - continue - } - - if !slices.Contains(a.HostVolumeIDs, volCfg.ID) { - return false - } + // if an allocation doesn't have a volume ID associated with + // it, update it + if len(allocation.HostVolumeIDs) == 0 { + allocation.HostVolumeIDs = []string{vol.ID} + // TODO: figure out how to update allocation. Should we + // have a new RPC endpoint for this? } } - } else if !req.ReadOnly { + } else if !req.volumeReq.ReadOnly { // this is a static host volume and can only be mounted ReadOnly, // validate that no requests for it are ReadWrite. if volCfg.ReadOnly { - return false + return false, FilterConstraintHostVolumes } } } - return true + return true, "" } type CSIVolumeChecker struct { diff --git a/scheduler/feasible_test.go b/scheduler/feasible_test.go index 23311ec620f..1528baa4712 100644 --- a/scheduler/feasible_test.go +++ b/scheduler/feasible_test.go @@ -177,7 +177,7 @@ func TestHostVolumeChecker(t *testing.T) { alloc.NodeID = nodes[2].ID for i, c := range cases { - checker.SetVolumes(alloc.Name, structs.DefaultNamespace, c.RequestedVolumes) + checker.SetVolumes(alloc.Name, alloc.ID, structs.DefaultNamespace, c.RequestedVolumes) if act := checker.Feasible(c.Node); act != c.Result { t.Fatalf("case(%d) failed: got %v; want %v", i, act, c.Result) } @@ -359,7 +359,7 @@ func TestHostVolumeChecker_ReadOnly(t *testing.T) { for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { - checker.SetVolumes(alloc.Name, structs.DefaultNamespace, tc.requestedVolumes) + checker.SetVolumes(alloc.Name, alloc.ID, structs.DefaultNamespace, tc.requestedVolumes) actual := checker.Feasible(tc.node) must.Eq(t, tc.expect, actual) }) @@ -416,30 +416,59 @@ func TestHostVolumeChecker_Sticky(t *testing.T) { checker := NewHostVolumeChecker(ctx) - alloc := mock.Alloc() - alloc.NodeID = nodes[1].ID - alloc.HostVolumeIDs = []string{dhv.ID} + // alloc0 wants a previously registered volume ID that's available on node1 + alloc0 := mock.Alloc() + alloc0.NodeID = nodes[1].ID + alloc0.HostVolumeIDs = []string{dhv.ID} + + // alloc1 wants a volume ID that's available on node1 but hasn't used it + // before + alloc1 := mock.Alloc() + alloc1.NodeID = nodes[1].ID + + // alloc2 wants a volume ID that's unrelated + alloc2 := mock.Alloc() + alloc2.NodeID = nodes[1].ID + alloc2.HostVolumeIDs = []string{uuid.Generate()} + + // insert all the allocs into the state + must.NoError(t, store.UpsertAllocs(structs.MsgTypeTestSetup, 1000, []*structs.Allocation{alloc0, alloc1, alloc2})) cases := []struct { name string node *structs.Node + alloc *structs.Allocation expect bool }{ { "alloc asking for a sticky volume on an infeasible node", nodes[0], + alloc0, false, }, { "alloc asking for a sticky volume on a feasible node", nodes[1], + alloc0, true, }, + { + "alloc asking for a sticky volume on a feasible node for the first time", + nodes[1], + alloc1, + true, + }, + { + "alloc asking for an unrelated volume", + nodes[1], + alloc2, + false, + }, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { - checker.SetVolumes(alloc.Name, structs.DefaultNamespace, stickyRequest) + checker.SetVolumes(tc.alloc.Name, tc.alloc.ID, structs.DefaultNamespace, 
stickyRequest) actual := checker.Feasible(tc.node) must.Eq(t, tc.expect, actual) }) diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index bf9241797c2..25dff2cfd65 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -805,6 +805,7 @@ func (a *allocReconciler) computePlacements(group *structs.TaskGroup, for _, alloc := range reschedule { place = append(place, allocPlaceResult{ name: alloc.Name, + id: alloc.ID, taskGroup: group, previousAlloc: alloc, reschedule: true, @@ -830,6 +831,7 @@ func (a *allocReconciler) computePlacements(group *structs.TaskGroup, existing++ place = append(place, allocPlaceResult{ name: alloc.Name, + id: alloc.ID, taskGroup: group, previousAlloc: alloc, reschedule: false, diff --git a/scheduler/reconcile_util.go b/scheduler/reconcile_util.go index 41a56503c7e..0ca74f75f97 100644 --- a/scheduler/reconcile_util.go +++ b/scheduler/reconcile_util.go @@ -66,6 +66,7 @@ type allocStopResult struct { // allocation type allocPlaceResult struct { name string + id string canary bool taskGroup *structs.TaskGroup previousAlloc *structs.Allocation @@ -78,6 +79,7 @@ type allocPlaceResult struct { func (a allocPlaceResult) TaskGroup() *structs.TaskGroup { return a.taskGroup } func (a allocPlaceResult) Name() string { return a.name } +func (a allocPlaceResult) ID() string { return a.id } func (a allocPlaceResult) Canary() bool { return a.canary } func (a allocPlaceResult) PreviousAllocation() *structs.Allocation { return a.previousAlloc } func (a allocPlaceResult) IsRescheduling() bool { return a.reschedule } diff --git a/scheduler/stack.go b/scheduler/stack.go index 1f2b6586886..39c8a8d0ef4 100644 --- a/scheduler/stack.go +++ b/scheduler/stack.go @@ -39,6 +39,7 @@ type SelectOptions struct { PreferredNodes []*structs.Node Preempt bool AllocName string + AllocID string } // GenericStack is the Stack used for the Generic scheduler. 
It is @@ -156,7 +157,7 @@ func (s *GenericStack) Select(tg *structs.TaskGroup, options *SelectOptions) *Ra s.taskGroupDrivers.SetDrivers(tgConstr.drivers) s.taskGroupConstraint.SetConstraints(tgConstr.constraints) s.taskGroupDevices.SetTaskGroup(tg) - s.taskGroupHostVolumes.SetVolumes(options.AllocName, s.jobNamespace, tg.Volumes) + s.taskGroupHostVolumes.SetVolumes(options.AllocName, options.AllocID, s.jobNamespace, tg.Volumes) s.taskGroupCSIVolumes.SetVolumes(options.AllocName, tg.Volumes) if len(tg.Networks) > 0 { s.taskGroupNetwork.SetNetwork(tg.Networks[0]) @@ -349,7 +350,7 @@ func (s *SystemStack) Select(tg *structs.TaskGroup, options *SelectOptions) *Ran s.taskGroupDrivers.SetDrivers(tgConstr.drivers) s.taskGroupConstraint.SetConstraints(tgConstr.constraints) s.taskGroupDevices.SetTaskGroup(tg) - s.taskGroupHostVolumes.SetVolumes(options.AllocName, s.jobNamespace, tg.Volumes) + s.taskGroupHostVolumes.SetVolumes(options.AllocName, options.AllocID, s.jobNamespace, tg.Volumes) s.taskGroupCSIVolumes.SetVolumes(options.AllocName, tg.Volumes) if len(tg.Networks) > 0 { s.taskGroupNetwork.SetNetwork(tg.Networks[0]) From a0ec4c8bb9f0a7ce5595e763ae557c1f4f7e93f3 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Thu, 12 Dec 2024 18:11:06 +0100 Subject: [PATCH 20/28] fix returns --- scheduler/feasible.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scheduler/feasible.go b/scheduler/feasible.go index 99d235d9fbd..29bf00a703a 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -241,7 +241,10 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) (bool, string) { allocation.HostVolumeIDs = []string{vol.ID} // TODO: figure out how to update allocation. Should we // have a new RPC endpoint for this? + return true, "" } + + return false, FilterConstraintHostVolumes } } else if !req.volumeReq.ReadOnly { From 26d7cd586417611d20e5936584dd2900277ec059 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Fri, 13 Dec 2024 16:08:12 +0100 Subject: [PATCH 21/28] adjust computePlacements --- scheduler/feasible.go | 11 +---------- scheduler/generic_sched.go | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/scheduler/feasible.go b/scheduler/feasible.go index 29bf00a703a..c339dd09127 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -231,16 +231,7 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) (bool, string) { if err != nil { return false, FilterConstraintHostVolumesAllocLookupFailed } - if slices.Contains(allocation.HostVolumeIDs, vol.ID) { - return true, "" - } - - // if an allocation doesn't have a volume ID associated with - // it, update it - if len(allocation.HostVolumeIDs) == 0 { - allocation.HostVolumeIDs = []string{vol.ID} - // TODO: figure out how to update allocation. Should we - // have a new RPC endpoint for this? + if slices.Contains(allocation.HostVolumeIDs, vol.ID) || len(allocation.HostVolumeIDs) == 0 { return true, "" } diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 72991d694dd..33e5d7dfb4c 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -658,6 +658,21 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul "old_alloc_name", oldAllocName, "new_alloc_name", newAllocName) } + // Are there sticky volumes requested by the task group for the first time? 
If + // yes, make sure the allocation stores their IDs for future reschedules. + var newHostVolumeIDs []string + for _, v := range tg.Volumes { + if v.Sticky { + if missing.PreviousAllocation() != nil && len(missing.PreviousAllocation().HostVolumeIDs) > 0 { + continue + } + vol, ok := option.Node.HostVolumes[v.Source] + if ok { + newHostVolumeIDs = append(newHostVolumeIDs, vol.ID) + } + } + } + // Create an allocation for this alloc := &structs.Allocation{ ID: uuid.Generate(), @@ -682,6 +697,10 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul }, } + if len(newHostVolumeIDs) > 0 { + alloc.HostVolumeIDs = newHostVolumeIDs + } + // If the new allocation is replacing an older allocation then we // set the record the older allocation id so that they are chained if prevAllocation != nil { From d3bece05c313066e4a2d7a543edd7dbc344c511c Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Mon, 16 Dec 2024 17:32:35 +0100 Subject: [PATCH 22/28] Tim's comments --- scheduler/feasible.go | 22 ++++++++++------------ scheduler/feasible_test.go | 6 +++--- scheduler/generic_sched.go | 11 +++++++---- scheduler/stack.go | 15 ++++++++------- 4 files changed, 28 insertions(+), 26 deletions(-) diff --git a/scheduler/feasible.go b/scheduler/feasible.go index c339dd09127..c9ae37a594d 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -144,10 +144,11 @@ type HostVolumeChecker struct { namespace string } -// allocVolumeRequest associates allocation ID with the volume request +// allocVolumeRequest associates allocation volume IDs with the volume request type allocVolumeRequest struct { - allocID string - volumeReq *structs.VolumeRequest + hostVolumeIDs []string + cniVolumeIDs []string + volumeReq *structs.VolumeRequest } // NewHostVolumeChecker creates a HostVolumeChecker from a set of volumes @@ -159,7 +160,9 @@ func NewHostVolumeChecker(ctx Context) *HostVolumeChecker { } // SetVolumes takes the volumes required by a task group and updates the checker. 
-func (h *HostVolumeChecker) SetVolumes(allocName, allocID string, ns string, volumes map[string]*structs.VolumeRequest) { +func (h *HostVolumeChecker) SetVolumes( + allocName, ns string, volumes map[string]*structs.VolumeRequest, allocHostVolumeIDs []string, +) { h.namespace = ns h.volumeReqs = []*allocVolumeRequest{} for _, req := range volumes { @@ -171,10 +174,10 @@ func (h *HostVolumeChecker) SetVolumes(allocName, allocID string, ns string, vol // provide a unique volume source per allocation copied := req.Copy() copied.Source = copied.Source + structs.AllocSuffix(allocName) - h.volumeReqs = append(h.volumeReqs, &allocVolumeRequest{allocID, copied}) + h.volumeReqs = append(h.volumeReqs, &allocVolumeRequest{volumeReq: copied}) } else { - h.volumeReqs = append(h.volumeReqs, &allocVolumeRequest{allocID, req}) + h.volumeReqs = append(h.volumeReqs, &allocVolumeRequest{hostVolumeIDs: allocHostVolumeIDs, volumeReq: req}) } } } @@ -195,7 +198,6 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) (bool, string) { return true, "" } - ws := memdb.NewWatchSet() for _, req := range h.volumeReqs { volCfg, ok := n.HostVolumes[req.volumeReq.Source] if !ok { @@ -227,11 +229,7 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) (bool, string) { } if req.volumeReq.Sticky { - allocation, err := h.ctx.State().AllocByID(ws, req.allocID) - if err != nil { - return false, FilterConstraintHostVolumesAllocLookupFailed - } - if slices.Contains(allocation.HostVolumeIDs, vol.ID) || len(allocation.HostVolumeIDs) == 0 { + if slices.Contains(req.hostVolumeIDs, vol.ID) || len(req.hostVolumeIDs) == 0 { return true, "" } diff --git a/scheduler/feasible_test.go b/scheduler/feasible_test.go index 1528baa4712..3351210c2ee 100644 --- a/scheduler/feasible_test.go +++ b/scheduler/feasible_test.go @@ -177,7 +177,7 @@ func TestHostVolumeChecker(t *testing.T) { alloc.NodeID = nodes[2].ID for i, c := range cases { - checker.SetVolumes(alloc.Name, alloc.ID, structs.DefaultNamespace, c.RequestedVolumes) + checker.SetVolumes(alloc.Name, structs.DefaultNamespace, c.RequestedVolumes, alloc.HostVolumeIDs) if act := checker.Feasible(c.Node); act != c.Result { t.Fatalf("case(%d) failed: got %v; want %v", i, act, c.Result) } @@ -359,7 +359,7 @@ func TestHostVolumeChecker_ReadOnly(t *testing.T) { for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { - checker.SetVolumes(alloc.Name, alloc.ID, structs.DefaultNamespace, tc.requestedVolumes) + checker.SetVolumes(alloc.Name, structs.DefaultNamespace, tc.requestedVolumes, alloc.HostVolumeIDs) actual := checker.Feasible(tc.node) must.Eq(t, tc.expect, actual) }) @@ -468,7 +468,7 @@ func TestHostVolumeChecker_Sticky(t *testing.T) { for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { - checker.SetVolumes(tc.alloc.Name, tc.alloc.ID, structs.DefaultNamespace, stickyRequest) + checker.SetVolumes(tc.alloc.Name, structs.DefaultNamespace, stickyRequest, tc.alloc.HostVolumeIDs) actual := checker.Feasible(tc.node) must.Eq(t, tc.expect, actual) }) diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 33e5d7dfb4c..81d4a18c64d 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -666,10 +666,7 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul if missing.PreviousAllocation() != nil && len(missing.PreviousAllocation().HostVolumeIDs) > 0 { continue } - vol, ok := option.Node.HostVolumes[v.Source] - if ok { - newHostVolumeIDs = append(newHostVolumeIDs, vol.ID) - } + newHostVolumeIDs = 
append(newHostVolumeIDs, option.Node.HostVolumes[v.Source].ID) } } @@ -699,6 +696,8 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul if len(newHostVolumeIDs) > 0 { alloc.HostVolumeIDs = newHostVolumeIDs + } else { + alloc.HostVolumeIDs = prevAllocation.HostVolumeIDs } // If the new allocation is replacing an older allocation then we @@ -858,6 +857,10 @@ func getSelectOptions(prevAllocation *structs.Allocation, preferredNode *structs } } selectOptions.PenaltyNodeIDs = penaltyNodes + + if prevAllocation.HostVolumeIDs != nil { + selectOptions.AllocationHostVolumeIDs = prevAllocation.HostVolumeIDs + } } if preferredNode != nil { selectOptions.PreferredNodes = []*structs.Node{preferredNode} diff --git a/scheduler/stack.go b/scheduler/stack.go index 39c8a8d0ef4..87375892b71 100644 --- a/scheduler/stack.go +++ b/scheduler/stack.go @@ -35,11 +35,12 @@ type Stack interface { } type SelectOptions struct { - PenaltyNodeIDs map[string]struct{} - PreferredNodes []*structs.Node - Preempt bool - AllocName string - AllocID string + PenaltyNodeIDs map[string]struct{} + PreferredNodes []*structs.Node + Preempt bool + AllocName string + AllocID string + AllocationHostVolumeIDs []string } // GenericStack is the Stack used for the Generic scheduler. It is @@ -157,7 +158,7 @@ func (s *GenericStack) Select(tg *structs.TaskGroup, options *SelectOptions) *Ra s.taskGroupDrivers.SetDrivers(tgConstr.drivers) s.taskGroupConstraint.SetConstraints(tgConstr.constraints) s.taskGroupDevices.SetTaskGroup(tg) - s.taskGroupHostVolumes.SetVolumes(options.AllocName, options.AllocID, s.jobNamespace, tg.Volumes) + s.taskGroupHostVolumes.SetVolumes(options.AllocName, s.jobNamespace, tg.Volumes, options.AllocationHostVolumeIDs) s.taskGroupCSIVolumes.SetVolumes(options.AllocName, tg.Volumes) if len(tg.Networks) > 0 { s.taskGroupNetwork.SetNetwork(tg.Networks[0]) @@ -350,7 +351,7 @@ func (s *SystemStack) Select(tg *structs.TaskGroup, options *SelectOptions) *Ran s.taskGroupDrivers.SetDrivers(tgConstr.drivers) s.taskGroupConstraint.SetConstraints(tgConstr.constraints) s.taskGroupDevices.SetTaskGroup(tg) - s.taskGroupHostVolumes.SetVolumes(options.AllocName, options.AllocID, s.jobNamespace, tg.Volumes) + s.taskGroupHostVolumes.SetVolumes(options.AllocName, s.jobNamespace, tg.Volumes, options.AllocationHostVolumeIDs) s.taskGroupCSIVolumes.SetVolumes(options.AllocName, tg.Volumes) if len(tg.Networks) > 0 { s.taskGroupNetwork.SetNetwork(tg.Networks[0]) From f9caa314c36b19ddaedbe0fcf96da12b24c611db Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Mon, 16 Dec 2024 17:36:58 +0100 Subject: [PATCH 23/28] cleanup feasible.go --- scheduler/feasible.go | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/scheduler/feasible.go b/scheduler/feasible.go index c9ae37a594d..af8638949ad 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -21,7 +21,6 @@ import ( const ( FilterConstraintHostVolumes = "missing compatible host volumes" - FilterConstraintHostVolumesAllocLookupFailed = "sticky host volume allocation lookup failed" FilterConstraintCSIPluginTemplate = "CSI plugin %s is missing from client %s" FilterConstraintCSIPluginUnhealthyTemplate = "CSI plugin %s is unhealthy on client %s" FilterConstraintCSIPluginMaxVolumesTemplate = "CSI plugin %s has the maximum number of volumes on client %s" @@ -183,25 +182,24 @@ func (h *HostVolumeChecker) SetVolumes( } func (h *HostVolumeChecker) Feasible(candidate 
*structs.Node) bool { - feasible, failure := h.hasVolumes(candidate) - if feasible { + if h.hasVolumes(candidate) { return true } - h.ctx.Metrics().FilterNode(candidate, failure) + h.ctx.Metrics().FilterNode(candidate, FilterConstraintHostVolumes) return false } -func (h *HostVolumeChecker) hasVolumes(n *structs.Node) (bool, string) { +func (h *HostVolumeChecker) hasVolumes(n *structs.Node) bool { // Fast path: Requested no volumes. No need to check further. if len(h.volumeReqs) == 0 { - return true, "" + return true } for _, req := range h.volumeReqs { volCfg, ok := n.HostVolumes[req.volumeReq.Source] if !ok { - return false, FilterConstraintHostVolumes + return false } if volCfg.ID != "" { // dynamic host volume @@ -211,10 +209,10 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) (bool, string) { // state store; this is only possible if the batched fingerprint // update from a delete RPC is written before the delete RPC's // raft entry completes - return false, FilterConstraintHostVolumes + return false } if vol.State != structs.HostVolumeStateReady { - return false, FilterConstraintHostVolumes + return false } var capOk bool for _, cap := range vol.RequestedCapabilities { @@ -225,27 +223,27 @@ func (h *HostVolumeChecker) hasVolumes(n *structs.Node) (bool, string) { } } if !capOk { - return false, FilterConstraintHostVolumes + return false } if req.volumeReq.Sticky { if slices.Contains(req.hostVolumeIDs, vol.ID) || len(req.hostVolumeIDs) == 0 { - return true, "" + return true } - return false, FilterConstraintHostVolumes + return false } } else if !req.volumeReq.ReadOnly { // this is a static host volume and can only be mounted ReadOnly, // validate that no requests for it are ReadWrite. if volCfg.ReadOnly { - return false, FilterConstraintHostVolumes + return false } } } - return true, "" + return true } type CSIVolumeChecker struct { From cdff86bb91859f3de0b18ca2969903bb58604305 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Mon, 16 Dec 2024 17:39:10 +0100 Subject: [PATCH 24/28] clean up reconciler --- scheduler/reconcile.go | 2 -- scheduler/reconcile_util.go | 2 -- 2 files changed, 4 deletions(-) diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index 25dff2cfd65..bf9241797c2 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -805,7 +805,6 @@ func (a *allocReconciler) computePlacements(group *structs.TaskGroup, for _, alloc := range reschedule { place = append(place, allocPlaceResult{ name: alloc.Name, - id: alloc.ID, taskGroup: group, previousAlloc: alloc, reschedule: true, @@ -831,7 +830,6 @@ func (a *allocReconciler) computePlacements(group *structs.TaskGroup, existing++ place = append(place, allocPlaceResult{ name: alloc.Name, - id: alloc.ID, taskGroup: group, previousAlloc: alloc, reschedule: false, diff --git a/scheduler/reconcile_util.go b/scheduler/reconcile_util.go index 0ca74f75f97..41a56503c7e 100644 --- a/scheduler/reconcile_util.go +++ b/scheduler/reconcile_util.go @@ -66,7 +66,6 @@ type allocStopResult struct { // allocation type allocPlaceResult struct { name string - id string canary bool taskGroup *structs.TaskGroup previousAlloc *structs.Allocation @@ -79,7 +78,6 @@ type allocPlaceResult struct { func (a allocPlaceResult) TaskGroup() *structs.TaskGroup { return a.taskGroup } func (a allocPlaceResult) Name() string { return a.name } -func (a allocPlaceResult) ID() string { return a.id } func (a allocPlaceResult) Canary() bool { return a.canary } func (a allocPlaceResult) 
PreviousAllocation() *structs.Allocation { return a.previousAlloc } func (a allocPlaceResult) IsRescheduling() bool { return a.reschedule } From 4dd3ada87fbaee9e50e4a8b2f9dbda34804a73dd Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Mon, 16 Dec 2024 19:18:33 +0100 Subject: [PATCH 25/28] test --- scheduler/generic_sched.go | 4 +- scheduler/generic_sched_test.go | 118 ++++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+), 2 deletions(-) diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 81d4a18c64d..256c6272a83 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -696,8 +696,6 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul if len(newHostVolumeIDs) > 0 { alloc.HostVolumeIDs = newHostVolumeIDs - } else { - alloc.HostVolumeIDs = prevAllocation.HostVolumeIDs } // If the new allocation is replacing an older allocation then we @@ -708,6 +706,8 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul updateRescheduleTracker(alloc, prevAllocation, now) } + alloc.HostVolumeIDs = prevAllocation.HostVolumeIDs + // If the allocation has task handles, // copy them to the new allocation propagateTaskState(alloc, prevAllocation, missing.PreviousLost()) diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index adda5e2cb2a..371b75999d1 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -218,6 +218,124 @@ func TestServiceSched_JobRegister_StickyAllocs(t *testing.T) { } } +func TestServiceSched_JobRegister_StickyVolumes(t *testing.T) { + ci.Parallel(t) + + h := NewHarness(t) + + nodes := []*structs.Node{ + mock.Node(), + mock.Node(), + } + + hostVolCapsReadWrite := []*structs.HostVolumeCapability{ + { + AttachmentMode: structs.HostVolumeAttachmentModeFilesystem, + AccessMode: structs.HostVolumeAccessModeSingleNodeReader, + }, + { + AttachmentMode: structs.HostVolumeAttachmentModeFilesystem, + AccessMode: structs.HostVolumeAccessModeSingleNodeWriter, + }, + } + + dhv := &structs.HostVolume{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Name: "foo", + NodeID: nodes[1].ID, + RequestedCapabilities: hostVolCapsReadWrite, + State: structs.HostVolumeStateReady, + } + + nodes[0].HostVolumes = map[string]*structs.ClientHostVolumeConfig{} + nodes[1].HostVolumes = map[string]*structs.ClientHostVolumeConfig{"foo": {ID: dhv.ID}} + + for _, node := range nodes { + must.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, 1000, node)) + } + must.NoError(t, h.State.UpsertHostVolume(1000, dhv)) + + stickyRequest := map[string]*structs.VolumeRequest{ + "foo": { + Type: "host", + Source: "foo", + Sticky: true, + AccessMode: structs.CSIVolumeAccessModeSingleNodeWriter, + AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem, + }, + } + + // Create a job + job := mock.Job() + job.TaskGroups[0].Volumes = stickyRequest + must.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), nil, job)) + + // Create a mock evaluation to register the job + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: job.Priority, + TriggeredBy: structs.EvalTriggerJobRegister, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + must.NoError(t, h.Process(NewServiceScheduler, eval)) + + 
// Ensure the plan allocated + plan := h.Plans[0] + planned := make(map[string]*structs.Allocation) + for _, allocList := range plan.NodeAllocation { + for _, alloc := range allocList { + planned[alloc.ID] = alloc + } + } + must.MapLen(t, 10, planned) + + // Ensure that the allocations got the host volume ID added + for _, p := range planned { + must.Eq(t, p.PreviousAllocation, "") + must.Eq(t, p.HostVolumeIDs[0], dhv.ID) + } + + // Update the job to force a rolling upgrade + updated := job.Copy() + updated.TaskGroups[0].Tasks[0].Resources.CPU += 10 + must.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), nil, updated)) + + // Create a mock evaluation to handle the update + eval = &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: job.Priority, + TriggeredBy: structs.EvalTriggerNodeUpdate, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + h1 := NewHarnessWithState(t, h.State) + must.NoError(t, h1.Process(NewServiceScheduler, eval)) + + // Ensure we have created only one new allocation + // Ensure a single plan + must.SliceLen(t, 1, h1.Plans) + plan = h1.Plans[0] + var newPlanned []*structs.Allocation + for _, allocList := range plan.NodeAllocation { + newPlanned = append(newPlanned, allocList...) + } + must.SliceLen(t, 10, newPlanned) + + // Ensure that the new allocations retain the host volume ID + for _, new := range newPlanned { + must.NotEq(t, new.PreviousAllocation, "") + must.Eq(t, new.HostVolumeIDs[0], dhv.ID) + } +} + func TestServiceSched_JobRegister_DiskConstraints(t *testing.T) { ci.Parallel(t) From 324e17af35a4643eacdfbfff62fd53dc67787976 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Mon, 16 Dec 2024 19:20:30 +0100 Subject: [PATCH 26/28] don't need CNI here --- scheduler/feasible.go | 1 - 1 file changed, 1 deletion(-) diff --git a/scheduler/feasible.go b/scheduler/feasible.go index af8638949ad..b7fe2bc56b6 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -146,7 +146,6 @@ type HostVolumeChecker struct { // allocVolumeRequest associates allocation volume IDs with the volume request type allocVolumeRequest struct { hostVolumeIDs []string - cniVolumeIDs []string volumeReq *structs.VolumeRequest } From bfd9fbdfa631d22e1a92aa09db344e3756afd185 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Mon, 16 Dec 2024 19:23:33 +0100 Subject: [PATCH 27/28] extra check --- scheduler/generic_sched.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 256c6272a83..014752355d8 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -706,7 +706,9 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul updateRescheduleTracker(alloc, prevAllocation, now) } - alloc.HostVolumeIDs = prevAllocation.HostVolumeIDs + if len(prevAllocation.HostVolumeIDs) > 0 { + alloc.HostVolumeIDs = prevAllocation.HostVolumeIDs + } // If the allocation has task handles, // copy them to the new allocation From 24ab1498d5d0b094da2300557a006f30a33079e1 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Mon, 16 Dec 2024 21:00:51 +0100 Subject: [PATCH 28/28] Tim's comments --- scheduler/generic_sched.go | 3 +-- scheduler/generic_sched_test.go | 9 
+++------ scheduler/stack.go | 1 - 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 014752355d8..60b4f7f1eed 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -942,8 +942,7 @@ func (s *GenericScheduler) findPreferredNode(place placementResult) (*structs.No } var preferredNode *structs.Node - ws := memdb.NewWatchSet() - preferredNode, err := s.state.NodeByID(ws, prev.NodeID) + preferredNode, err := s.state.NodeByID(nil, prev.NodeID) if err != nil { return nil, err } diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index 371b75999d1..5d471423136 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -316,13 +316,11 @@ func TestServiceSched_JobRegister_StickyVolumes(t *testing.T) { Status: structs.EvalStatusPending, } must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) - h1 := NewHarnessWithState(t, h.State) - must.NoError(t, h1.Process(NewServiceScheduler, eval)) + must.NoError(t, h.Process(NewServiceScheduler, eval)) // Ensure we have created only one new allocation - // Ensure a single plan - must.SliceLen(t, 1, h1.Plans) - plan = h1.Plans[0] + must.SliceLen(t, 2, h.Plans) + plan = h.Plans[0] var newPlanned []*structs.Allocation for _, allocList := range plan.NodeAllocation { newPlanned = append(newPlanned, allocList...) @@ -331,7 +329,6 @@ func TestServiceSched_JobRegister_StickyVolumes(t *testing.T) { // Ensure that the new allocations retain the host volume ID for _, new := range newPlanned { - must.NotEq(t, new.PreviousAllocation, "") must.Eq(t, new.HostVolumeIDs[0], dhv.ID) } } diff --git a/scheduler/stack.go b/scheduler/stack.go index 87375892b71..f978c753f68 100644 --- a/scheduler/stack.go +++ b/scheduler/stack.go @@ -39,7 +39,6 @@ type SelectOptions struct { PreferredNodes []*structs.Node Preempt bool AllocName string - AllocID string AllocationHostVolumeIDs []string }
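
Illustrative note, not part of the patch series above: the sticky-volume feasibility rule the later patches converge on reduces to a small predicate over the node's fingerprinted volumes and the IDs already recorded on the allocation. The sketch below is a simplified stand-in; the names (nodeVolume, stickyFeasible, claimedIDs) and the Ready flag are invented for illustration and do not match the real structs in nomad/structs or scheduler/feasible.go.

package main

import (
	"fmt"
	"slices"
)

// nodeVolume stands in for the per-node host volume fingerprint; the real
// scheduler reads richer structs from the node and the state store.
type nodeVolume struct {
	ID    string
	Ready bool
}

// stickyFeasible captures the shape of the sticky branch of the check: the
// requested source must exist and be ready on the node, and either nothing
// has been claimed yet (first placement) or the node exposes one of the
// volume IDs already recorded on the allocation.
func stickyFeasible(nodeVols map[string]nodeVolume, source string, claimedIDs []string) bool {
	vol, ok := nodeVols[source]
	if !ok || !vol.Ready {
		return false
	}
	return len(claimedIDs) == 0 || slices.Contains(claimedIDs, vol.ID)
}

func main() {
	nodeA := map[string]nodeVolume{"data": {ID: "vol-1", Ready: true}}
	nodeB := map[string]nodeVolume{"data": {ID: "vol-2", Ready: true}}

	// First placement: nothing claimed yet, both nodes pass.
	fmt.Println(stickyFeasible(nodeA, "data", nil), stickyFeasible(nodeB, "data", nil))

	// After placing on nodeA the allocation carries "vol-1", so a reschedule
	// only passes on the node that still exposes that ID.
	claimed := []string{"vol-1"}
	fmt.Println(stickyFeasible(nodeA, "data", claimed), stickyFeasible(nodeB, "data", claimed))
}

The same shape is visible in the series itself: the allocation's claimed IDs are threaded through SelectOptions rather than looked up per node via the state store, which keeps the per-node check a plain slice lookup.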
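
A second, equally simplified sketch of how those IDs travel between placements, following the computePlacements and getSelectOptions changes above: a first placement claims the ID of the volume the chosen node exposes, and a reschedule copies the previous allocation's IDs forward. All names here (placeSticky, allocation, node) are invented for illustration and collapse several steps of the real flow.

package main

import "fmt"

// allocation and node are minimal stand-ins for the scheduler's structs.
type allocation struct {
	NodeID        string
	HostVolumeIDs []string
}

type node struct {
	ID        string
	VolumeIDs map[string]string // volume source -> host volume ID on this node
}

// placeSticky is a collapsed version of the placement flow: reschedules
// inherit the previous allocation's claimed IDs, first placements claim the
// ID exposed by the node that was selected.
func placeSticky(prev *allocation, chosen node, source string) allocation {
	next := allocation{NodeID: chosen.ID}
	if prev != nil && len(prev.HostVolumeIDs) > 0 {
		next.HostVolumeIDs = prev.HostVolumeIDs
		return next
	}
	if id, ok := chosen.VolumeIDs[source]; ok {
		next.HostVolumeIDs = []string{id}
	}
	return next
}

func main() {
	n := node{ID: "node-1", VolumeIDs: map[string]string{"foo": "vol-1"}}
	first := placeSticky(nil, n, "foo")
	rescheduled := placeSticky(&first, n, "foo")
	fmt.Println(first.HostVolumeIDs, rescheduled.HostVolumeIDs)
}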