Move slurm_resume configuration in a sub-recipe
Also factor out the creation of some directories and add a
ScalingStrategy parameter to the slurm_resume config.

Signed-off-by: Nicola Sirena <[email protected]>
NSsirena committed Oct 5, 2023
1 parent efd3921 commit fe3c0a0
Showing 5 changed files with 89 additions and 66 deletions.
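Taken together, the diff threads the cluster-level ScalingStrategy setting through the new config_slurm_resume sub-recipe and into the resume program's configuration file. A condensed sketch of that flow, with an illustrative value; only the ScalingStrategy-related pieces of the template resource are shown, and the full resource appears in the new recipe below:

# Illustrative sketch: how ScalingStrategy reaches parallelcluster_slurm_resume.conf
scaling_strategy = node['cluster']['config'].dig(:Scheduling, :ScalingStrategy) # e.g. "test-strategy"

template "#{node['cluster']['slurm_plugin_dir']}/parallelcluster_slurm_resume.conf" do
  source 'slurm/parallelcluster_slurm_resume.conf.erb'
  variables(scaling_strategy: scaling_strategy) # rendered by the template as: scaling_strategy = <%= @scaling_strategy %>
end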
20 changes: 20 additions & 0 deletions cookbooks/aws-parallelcluster-slurm/kitchen.slurm-config.yml
@@ -214,3 +214,23 @@ suites:
Uri: test-slurm-database.cluster-abcdefghijkl.us-east-1.rds.amazonaws.com:3306
UserName: clusteradmin
PasswordSecretArn: arn:aws:secretsmanager:us-east-1:123456789012:secret:TestSecret:abcdefghijkl-ABCDEF
  - name: config_slurm_resume
    run_list:
      - recipe[aws-parallelcluster-tests::setup]
      - recipe[aws-parallelcluster-slurm::config_slurm_resume]
    verifier:
      controls:
        - config_slurm_resume
    attributes:
      cluster:
        config:
          Scheduling:
            ScalingStrategy: test-strategy
      dependencies:
        - resource:nfs # This is required only for mock_slurm_dir that makes a fake export
        - recipe:aws-parallelcluster-platform::directories
        - recipe:aws-parallelcluster-platform::users
        - recipe:aws-parallelcluster-slurm::slurm_users
        - recipe:aws-parallelcluster-slurm::config_head_node_directories
        - recipe:aws-parallelcluster-slurm::mock_slurm_dir
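The config_slurm_resume verifier control referenced above is not part of this commit. A hypothetical InSpec sketch of what such a control might assert, based only on attributes and paths visible in this diff (the file name and the exact checks are assumptions):

# controls/config_slurm_resume_spec.rb (hypothetical)
control 'config_slurm_resume' do
  title 'slurm_resume configuration is rendered with the scaling strategy'

  # Rendered by the new sub-recipe; 'test-strategy' comes from the suite attributes above
  describe file('/etc/parallelcluster/slurm_plugin/parallelcluster_slurm_resume.conf') do
    it { should exist }
    its('content') { should match(/^scaling_strategy = test-strategy$/) }
  end

  # Log file created by the sub-recipe
  describe file('/var/log/parallelcluster/slurm_resume.log') do
    its('mode') { should cmp '0644' }
  end
end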

@@ -24,26 +24,7 @@
  options ['no_root_squash']
end unless on_docker?

# Ensure config directory is in place
directory "#{node['cluster']['slurm']['install_dir']}" do
user 'root'
group 'root'
mode '0755'
end if redhat_on_docker? # we skip slurm setup on Docker UBI because we don't install python

# Ensure config directory is in place
directory "#{node['cluster']['slurm']['install_dir']}/etc" do
user 'root'
group 'root'
mode '0755'
end

# Create directory configured as StateSaveLocation
directory '/var/spool/slurm.state' do
user node['cluster']['slurm']['user']
group node['cluster']['slurm']['group']
mode '0700'
end
include_recipe 'aws-parallelcluster-slurm::config_head_node_directories'

template "#{node['cluster']['slurm']['install_dir']}/etc/slurm.conf" do
source 'slurm/slurm.conf.erb'
@@ -59,14 +40,6 @@
  mode '0644'
end

# Copy pcluster config generator and templates
remote_directory "#{node['cluster']['scripts_dir']}/slurm" do
  source 'head_node_slurm/slurm'
  mode '0755'
  action :create
  recursive true
end

unless on_docker?
  # Generate pcluster specific configs
  no_gpu = nvidia_installed? ? "" : "--no-gpu"
@@ -159,44 +132,7 @@
  mode '0644'
end

template "#{node['cluster']['scripts_dir']}/slurm/slurm_resume" do
source 'slurm/resume_program.erb'
owner node['cluster']['slurm']['user']
group node['cluster']['slurm']['group']
mode '0744'
end

file "/var/log/parallelcluster/slurm_resume.log" do
owner node['cluster']['cluster_admin_user']
group node['cluster']['cluster_admin_group']
mode '0644'
end

file "/var/log/parallelcluster/slurm_resume.events" do
owner node['cluster']['cluster_admin_user']
group node['cluster']['cluster_admin_group']
mode '0644'
end

template "#{node['cluster']['slurm_plugin_dir']}/parallelcluster_slurm_resume.conf" do
source 'slurm/parallelcluster_slurm_resume.conf.erb'
owner node['cluster']['cluster_admin_user']
group node['cluster']['cluster_admin_group']
mode '0644'
variables(
cluster_name: node['cluster']['stack_name'],
region: node['cluster']['region'],
proxy: node['cluster']['proxy'],
dynamodb_table: node['cluster']['slurm_ddb_table'],
hosted_zone: node['cluster']['hosted_zone'],
dns_domain: node['cluster']['dns_domain'],
use_private_hostname: node['cluster']['use_private_hostname'],
head_node_private_ip: on_docker? ? 'local_ipv4' : node['ec2']['local_ipv4'],
head_node_hostname: on_docker? ? 'local_hostname' : node['ec2']['local_hostname'],
clustermgtd_heartbeat_file_path: "#{node['cluster']['slurm']['install_dir']}/etc/pcluster/.slurm_plugin/clustermgtd_heartbeat",
instance_id: on_docker? ? 'instance_id' : node['ec2']['instance_id']
)
end
include_recipe 'aws-parallelcluster-slurm::config_slurm_resume'

template "#{node['cluster']['scripts_dir']}/slurm/slurm_suspend" do
  source 'slurm/suspend_program.erb'
@@ -0,0 +1,56 @@
# frozen_string_literal: true

#
# Cookbook:: aws-parallelcluster-slurm
# Recipe:: config_slurm_resume
#
# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

template "#{node['cluster']['scripts_dir']}/slurm/slurm_resume" do
source 'slurm/resume_program.erb'
owner node['cluster']['slurm']['user']
group node['cluster']['slurm']['group']
mode '0744'
end

file "/var/log/parallelcluster/slurm_resume.log" do
owner node['cluster']['cluster_admin_user']
group node['cluster']['cluster_admin_group']
mode '0644'
end

file "/var/log/parallelcluster/slurm_resume.events" do
owner node['cluster']['cluster_admin_user']
group node['cluster']['cluster_admin_group']
mode '0644'
end

template "#{node['cluster']['slurm_plugin_dir']}/parallelcluster_slurm_resume.conf" do
source 'slurm/parallelcluster_slurm_resume.conf.erb'
owner node['cluster']['cluster_admin_user']
group node['cluster']['cluster_admin_group']
mode '0644'
variables(
cluster_name: node['cluster']['stack_name'],
region: node['cluster']['region'],
proxy: node['cluster']['proxy'],
dynamodb_table: node['cluster']['slurm_ddb_table'],
hosted_zone: node['cluster']['hosted_zone'],
dns_domain: node['cluster']['dns_domain'],
use_private_hostname: node['cluster']['use_private_hostname'],
head_node_private_ip: on_docker? ? 'local_ipv4' : node['ec2']['local_ipv4'],
head_node_hostname: on_docker? ? 'local_hostname' : node['ec2']['local_hostname'],
clustermgtd_heartbeat_file_path: "#{node['cluster']['slurm']['install_dir']}/etc/pcluster/.slurm_plugin/clustermgtd_heartbeat",
instance_id: on_docker? ? 'instance_id' : node['ec2']['instance_id'],
scaling_strategy: node['cluster']['config'].dig(:Scheduling, :ScalingStrategy)
)
end
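The ScalingStrategy lookup in the variables block above uses Hash#dig, so a cluster config without a Scheduling section yields nil instead of raising. A quick illustration with a plain Ruby hash (values are illustrative; the recipe performs the same call on the node's config attribute):

config = { Scheduling: { ScalingStrategy: 'test-strategy' } }
config.dig(:Scheduling, :ScalingStrategy) # => "test-strategy"

{}.dig(:Scheduling, :ScalingStrategy)     # => nil, no error when the keys are absent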
10 changes: 10 additions & 0 deletions cookbooks/aws-parallelcluster-slurm/recipes/test/mock_slurm_dir.rb
@@ -1,6 +1,7 @@
return if on_docker?

slurm_install_dir = '/opt/slurm'
slurm_plugin_dir = '/etc/parallelcluster/slurm_plugin'
directory slurm_install_dir do
  mode '1777'
end
@@ -10,3 +11,12 @@
  writeable true
  options ['no_root_squash']
end

# Ensure slurm plugin directory is in place for tests that require it
directory slurm_plugin_dir do
  user 'root'
  group 'root'
  mode '0755'
  action :create
  recursive true
end
@@ -10,3 +10,4 @@ head_node_private_ip = <%= @head_node_private_ip %>
head_node_hostname = <%= @head_node_hostname %>
clustermgtd_heartbeat_file_path = <%= @clustermgtd_heartbeat_file_path %>
instance_id = <%= @instance_id %>
scaling_strategy = <%= @scaling_strategy %>
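For reference, the new line renders like any other entry in this ERB template; a minimal standalone rendering sketch with an illustrative value (Chef's template resource exposes the variables hash as instance variables such as @scaling_strategy, while this sketch uses a local variable for simplicity):

require 'erb'

scaling_strategy = 'test-strategy' # illustrative value, matching the kitchen suite above
puts ERB.new('scaling_strategy = <%= scaling_strategy %>').result(binding)
# prints: scaling_strategy = test-strategy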
