[develop] Move slurm_resume configuration into a sub-recipe #2483

Merged 4 commits on Oct 6, 2023
20 changes: 20 additions & 0 deletions cookbooks/aws-parallelcluster-slurm/kitchen.slurm-config.yml
@@ -215,3 +215,23 @@ suites:
Uri: test-slurm-database.cluster-abcdefghijkl.us-east-1.rds.amazonaws.com:3306
UserName: clusteradmin
PasswordSecretArn: arn:aws:secretsmanager:us-east-1:123456789012:secret:TestSecret:abcdefghijkl-ABCDEF
- name: config_slurm_resume
run_list:
- recipe[aws-parallelcluster-tests::setup]
- recipe[aws-parallelcluster-slurm::config_slurm_resume]
verifier:
controls:
- config_slurm_resume
attributes:
cluster:
config:
Scheduling:
ScalingStrategy: test-strategy
dependencies:
- resource:nfs # Required only for mock_slurm_dir, which creates a fake export
- recipe:aws-parallelcluster-platform::directories
- recipe:aws-parallelcluster-platform::users
- recipe:aws-parallelcluster-slurm::slurm_users
- recipe:aws-parallelcluster-slurm::config_head_node_directories
- recipe:aws-parallelcluster-slurm::mock_slurm_dir

@@ -25,26 +25,7 @@
only_if { node['cluster']['internal_shared_storage_type'] == 'ebs' }
end unless on_docker?

# Ensure config directory is in place
directory "#{node['cluster']['slurm']['install_dir']}" do
user 'root'
group 'root'
mode '0755'
end if redhat_on_docker? # we skip slurm setup on Docker UBI because we don't install python

# Ensure config directory is in place
directory "#{node['cluster']['slurm']['install_dir']}/etc" do
user 'root'
group 'root'
mode '0755'
end

# Create directory configured as StateSaveLocation
directory '/var/spool/slurm.state' do
user node['cluster']['slurm']['user']
group node['cluster']['slurm']['group']
mode '0700'
end
include_recipe 'aws-parallelcluster-slurm::config_head_node_directories'

template "#{node['cluster']['slurm']['install_dir']}/etc/slurm.conf" do
source 'slurm/slurm.conf.erb'
@@ -60,14 +41,6 @@
mode '0644'
end

# Copy pcluster config generator and templates
remote_directory "#{node['cluster']['scripts_dir']}/slurm" do
source 'head_node_slurm/slurm'
mode '0755'
action :create
recursive true
end

unless on_docker?
# Generate pcluster specific configs
no_gpu = nvidia_installed? ? "" : "--no-gpu"
Expand Down Expand Up @@ -160,44 +133,7 @@
mode '0644'
end

template "#{node['cluster']['scripts_dir']}/slurm/slurm_resume" do
source 'slurm/resume_program.erb'
owner node['cluster']['slurm']['user']
group node['cluster']['slurm']['group']
mode '0744'
end

file "/var/log/parallelcluster/slurm_resume.log" do
owner node['cluster']['cluster_admin_user']
group node['cluster']['cluster_admin_group']
mode '0644'
end

file "/var/log/parallelcluster/slurm_resume.events" do
owner node['cluster']['cluster_admin_user']
group node['cluster']['cluster_admin_group']
mode '0644'
end

template "#{node['cluster']['slurm_plugin_dir']}/parallelcluster_slurm_resume.conf" do
source 'slurm/parallelcluster_slurm_resume.conf.erb'
owner node['cluster']['cluster_admin_user']
group node['cluster']['cluster_admin_group']
mode '0644'
variables(
cluster_name: node['cluster']['stack_name'],
region: node['cluster']['region'],
proxy: node['cluster']['proxy'],
dynamodb_table: node['cluster']['slurm_ddb_table'],
hosted_zone: node['cluster']['hosted_zone'],
dns_domain: node['cluster']['dns_domain'],
use_private_hostname: node['cluster']['use_private_hostname'],
head_node_private_ip: on_docker? ? 'local_ipv4' : node['ec2']['local_ipv4'],
head_node_hostname: on_docker? ? 'local_hostname' : node['ec2']['local_hostname'],
clustermgtd_heartbeat_file_path: "#{node['cluster']['slurm']['install_dir']}/etc/pcluster/.slurm_plugin/clustermgtd_heartbeat",
instance_id: on_docker? ? 'instance_id' : node['ec2']['instance_id']
)
end
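
# include_recipe evaluates the sub-recipe inline at this point, so the
# extracted slurm_resume resources keep the ordering they had before the
# refactor; Chef skips a recipe that was already included earlier in the run.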
include_recipe 'aws-parallelcluster-slurm::config_slurm_resume'

template "#{node['cluster']['scripts_dir']}/slurm/slurm_suspend" do
source 'slurm/suspend_program.erb'
@@ -0,0 +1,45 @@
# frozen_string_literal: true

#
# Cookbook:: aws-parallelcluster-slurm
# Recipe:: config_head_node_directories
#
# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

# Ensure config directory is in place
directory "#{node['cluster']['slurm']['install_dir']}" do
user 'root'
group 'root'
mode '0755'
end

# Ensure config directory is in place
directory "#{node['cluster']['slurm']['install_dir']}/etc" do
user 'root'
group 'root'
mode '0755'
end

# Create directory configured as StateSaveLocation
directory '/var/spool/slurm.state' do
user node['cluster']['slurm']['user']
group node['cluster']['slurm']['group']
mode '0700'
end

# Copy pcluster config generator and templates
remote_directory "#{node['cluster']['scripts_dir']}/slurm" do
source 'head_node_slurm/slurm'
mode '0755'
action :create
recursive true
end
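
A minimal ChefSpec sketch (hypothetical spec file; it assumes the cookbook's default attributes resolve the install dir and the slurm user/group, which a real spec may need to stub) that would pin down the extracted recipe's behavior:

require 'chefspec'

describe 'aws-parallelcluster-slurm::config_head_node_directories' do
  let(:chef_run) { ChefSpec::SoloRunner.new.converge(described_recipe) }

  it 'creates the StateSaveLocation directory with restrictive permissions' do
    expect(chef_run).to create_directory('/var/spool/slurm.state').with(mode: '0700')
  end
end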
@@ -0,0 +1,56 @@
# frozen_string_literal: true

#
# Cookbook:: aws-parallelcluster-slurm
# Recipe:: config_slurm_resume
#
# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

template "#{node['cluster']['scripts_dir']}/slurm/slurm_resume" do
source 'slurm/resume_program.erb'
owner node['cluster']['slurm']['user']
group node['cluster']['slurm']['group']
mode '0744'
end

file "/var/log/parallelcluster/slurm_resume.log" do
owner node['cluster']['cluster_admin_user']
group node['cluster']['cluster_admin_group']
mode '0644'
end

file "/var/log/parallelcluster/slurm_resume.events" do
owner node['cluster']['cluster_admin_user']
group node['cluster']['cluster_admin_group']
mode '0644'
end

template "#{node['cluster']['slurm_plugin_dir']}/parallelcluster_slurm_resume.conf" do
source 'slurm/parallelcluster_slurm_resume.conf.erb'
owner node['cluster']['cluster_admin_user']
group node['cluster']['cluster_admin_group']
mode '0644'
variables(
cluster_name: node['cluster']['stack_name'],
region: node['cluster']['region'],
proxy: node['cluster']['proxy'],
dynamodb_table: node['cluster']['slurm_ddb_table'],
hosted_zone: node['cluster']['hosted_zone'],
dns_domain: node['cluster']['dns_domain'],
use_private_hostname: node['cluster']['use_private_hostname'],
head_node_private_ip: on_docker? ? 'local_ipv4' : node['ec2']['local_ipv4'],
head_node_hostname: on_docker? ? 'local_hostname' : node['ec2']['local_hostname'],
clustermgtd_heartbeat_file_path: "#{node['cluster']['slurm']['install_dir']}/etc/pcluster/.slurm_plugin/clustermgtd_heartbeat",
instance_id: on_docker? ? 'instance_id' : node['ec2']['instance_id'],
scaling_strategy: node['cluster']['config'].dig(:Scheduling, :ScalingStrategy)
)
end
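
Hash#dig makes the new scaling_strategy variable nil-safe: it walks the nested keys and returns nil instead of raising when Scheduling or ScalingStrategy is absent, so a cluster config without the setting still renders the template. A standalone sketch (illustrative values):

config = { Scheduling: { ScalingStrategy: 'test-strategy' } }
config.dig(:Scheduling, :ScalingStrategy)  # => "test-strategy"
{}.dig(:Scheduling, :ScalingStrategy)      # => nil, no KeyError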
@@ -1,6 +1,18 @@
slurm_install_dir = '/opt/slurm'
slurm_plugin_dir = '/etc/parallelcluster/slurm_plugin'

# Ensure slurm plugin directory is in place for tests that require it
directory slurm_plugin_dir do
user 'root'
group 'root'
mode '0755'
action :create
recursive true
end

# Skip the fake export on Docker
return if on_docker?

slurm_install_dir = '/opt/slurm'
directory slurm_install_dir do
mode '1777'
end
@@ -10,3 +10,4 @@ head_node_private_ip = <%= @head_node_private_ip %>
head_node_hostname = <%= @head_node_hostname %>
clustermgtd_heartbeat_file_path = <%= @clustermgtd_heartbeat_file_path %>
instance_id = <%= @instance_id %>
scaling_strategy = <%= @scaling_strategy %>
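
The appended line renders from the @scaling_strategy variable passed in by the recipe above; a self-contained rendering sketch (plain ERB outside Chef, illustrative value):

require 'erb'

@scaling_strategy = 'test-strategy'
puts ERB.new('scaling_strategy = <%= @scaling_strategy %>').result(binding)
# prints: scaling_strategy = test-strategy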
@@ -0,0 +1,45 @@
# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.

control 'config_slurm_resume' do
title 'Check slurm_resume program configuration is created'

only_if { !(instance.compute_node? or instance.login_node?) }

describe file('/etc/parallelcluster/slurm_plugin/parallelcluster_slurm_resume.conf') do
it { should exist }
its('mode') { should cmp '0644' }
its('owner') { should eq 'pcluster-admin' }
its('group') { should eq 'pcluster-admin' }
its('content') { should match 'scaling_strategy = test-strategy' }
end

describe file('/opt/parallelcluster/scripts/slurm/slurm_resume') do
it { should exist }
its('mode') { should cmp '0744' }
its('owner') { should eq 'slurm' }
its('group') { should eq 'slurm' }
end

describe file('/var/log/parallelcluster/slurm_resume.events') do
it { should exist }
its('mode') { should cmp '0644' }
its('owner') { should eq 'pcluster-admin' }
its('group') { should eq 'pcluster-admin' }
end

describe file('/var/log/parallelcluster/slurm_resume.log') do
it { should exist }
its('mode') { should cmp '0644' }
its('owner') { should eq 'pcluster-admin' }
its('group') { should eq 'pcluster-admin' }
end
end