Skip to content

Commit

Permalink
Reuse the rotation script in config and update process
Browse files Browse the repository at this point in the history
Avoid code duplication by reusing the rotation script inside the munge resource when retrieving the custom munge key from Secrets Manager

Add an input flag `-d` whose value sets DISABLE_RESOURCE_CHECK.

Modify the logic inside the rotation script so that it can be used by both the config and the update process.
  • Loading branch information
hehe7318 authored Oct 12, 2023
1 parent 07f517c commit 02fa532
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 95 deletions.
35 changes: 0 additions & 35 deletions cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -65,25 +65,13 @@ def enable_munge_service
end
end

# Restarts the munge daemon so a newly installed or rotated key takes effect.
# Retries up to 5 times with a 10 second delay to ride out transient
# service-manager failures.
def restart_munge_service
  service "munge" do
    supports restart: true
    action :restart
    retries 5
    retry_delay 10
  end
end

# Sets up munge on the head node: provisions the munge key via the
# munge_key_manager custom resource (which either generates a key locally or
# fetches it from Secrets Manager, depending on whether a secret ARN is
# configured), then enables the munge service and shares the key with the
# cluster's shared directories.
def setup_munge_head_node
  # Generate munge key or get its value from secrets manager.
  # lazy: the cluster config is read at converge time, not compile time.
  munge_key_manager 'manage_munge_key' do
    munge_key_secret_arn lazy {
      node['cluster']['config'].dig(:DevSettings, :MungeKeySettings, :MungeKeySecretArn)
    }
  end

  enable_munge_service
  share_munge_head_node
end

def update_munge_head_node
Expand All @@ -92,29 +80,6 @@ def update_munge_head_node
action :update_munge_key
only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_custom_munge_key_updated? }
end

restart_munge_service
share_munge_head_node
end

# Copies /etc/munge/munge.key into <shared_dir>/.munge/.munge.key so that
# nodes mounting the shared directory can install the same key.
# Permissions are tightened (0700 on the dir, 0600 on the key) because munge
# requires the key not to be broadly readable.
def share_munge_key_to_dir(shared_dir)
  bash 'share_munge_key' do
    user 'root'
    group 'root'
    code <<-SHARE_MUNGE_KEY
set -e
mkdir -p #{shared_dir}/.munge
# Copy key to shared dir
cp /etc/munge/munge.key #{shared_dir}/.munge/.munge.key
chmod 0700 #{shared_dir}/.munge
chmod 0600 #{shared_dir}/.munge/.munge.key
    SHARE_MUNGE_KEY
  end
end

# Publishes the head node's munge key to both shared locations: the directory
# mounted by compute nodes and the one mounted by login nodes.
# NOTE(review): other code in this file reads 'shared_dir_login_nodes';
# confirm whether 'shared_dir_login' here is intentional or a stale key.
def share_munge_head_node
  share_munge_key_to_dir(node['cluster']['shared_dir'])
  share_munge_key_to_dir(node['cluster']['shared_dir_login'])
end

def setup_munge_key(shared_dir)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,26 @@
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

# Copy pcluster config generator and templates
include_recipe 'aws-parallelcluster-slurm::config_head_node_directories'

include_recipe 'aws-parallelcluster-slurm::config_check_login_stopped_script'

# Render the munge key rotation script on the head node.
# The script is reused both by the config process (via the munge_key_manager
# resource, with -d) and by the update process.
# Mode 0700: root-only, since the script replaces /etc/munge/munge.key.
template "#{node['cluster']['scripts_dir']}/slurm/update_munge_key.sh" do
  source 'slurm/head_node/update_munge_key.sh.erb'
  owner 'root'
  group 'root'
  mode '0700'
  variables(
    # lazy: MungeKeySecretArn is read from the cluster config at converge time
    munge_key_secret_arn: lazy { node['cluster']['config'].dig(:DevSettings, :MungeKeySettings, :MungeKeySecretArn) },
    region: node['cluster']['region'],
    munge_user: node['cluster']['munge']['user'],
    munge_group: node['cluster']['munge']['group'],
    shared_directory_compute: node['cluster']['shared_dir'],
    shared_directory_login: node['cluster']['shared_dir_login_nodes']
  )
end

include_recipe 'aws-parallelcluster-slurm::config_munge_key'

# Export /opt/slurm
Expand All @@ -25,8 +45,6 @@
only_if { node['cluster']['internal_shared_storage_type'] == 'ebs' }
end unless on_docker?

include_recipe 'aws-parallelcluster-slurm::config_head_node_directories'

template "#{node['cluster']['slurm']['install_dir']}/etc/slurm.conf" do
source 'slurm/slurm.conf.erb'
owner 'root'
Expand Down Expand Up @@ -218,20 +236,3 @@
retries 5
retry_delay 2
end unless redhat_on_docker?

include_recipe 'aws-parallelcluster-slurm::config_check_login_stopped_script'

# Render the munge key rotation script on the head node.
# Mode 0700: root-only, since the script replaces /etc/munge/munge.key.
template "#{node['cluster']['scripts_dir']}/slurm/update_munge_key.sh" do
  source 'slurm/head_node/update_munge_key.sh.erb'
  owner 'root'
  group 'root'
  mode '0700'
  variables(
    # lazy: MungeKeySecretArn is read from the cluster config at converge time
    munge_key_secret_arn: lazy { node['cluster']['config'].dig(:DevSettings, :MungeKeySettings, :MungeKeySecretArn) },
    region: node['cluster']['region'],
    munge_user: node['cluster']['munge']['user'],
    munge_group: node['cluster']['munge']['group'],
    shared_directory_compute: node['cluster']['shared_dir'],
    # Fixed for consistency: the other template resources rendering this same
    # script read node['cluster']['shared_dir_login_nodes'];
    # 'shared_dir_login' here was inconsistent with them.
    shared_directory_login: node['cluster']['shared_dir_login_nodes']
  )
end
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ def update_nodes_in_queue(strategy, queues)
munge_user: node['cluster']['munge']['user'],
munge_group: node['cluster']['munge']['group'],
shared_directory_compute: node['cluster']['shared_dir'],
shared_directory_login: node['cluster']['shared_dir_login']
shared_directory_login: node['cluster']['shared_dir_login_nodes']
)
only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_custom_munge_key_updated? }
end
Expand Down
67 changes: 41 additions & 26 deletions cookbooks/aws-parallelcluster-slurm/resources/munge_key_manager.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,35 +23,41 @@

default_action :setup_munge_key

def fetch_and_decode_munge_key(munge_key_secret_arn)
declare_resource(:bash, 'fetch_and_decode_munge_key') do
# Restarts the munge daemon so a newly installed or rotated key takes effect.
# declare_resource is used because this helper runs inside a custom resource;
# retries tolerate transient service-manager failures.
def restart_munge_service
  declare_resource(:service, "munge") do
    supports restart: true
    action :restart
    retries 5
    retry_delay 10
  end
end

# Copies /etc/munge/munge.key into <shared_dir>/.munge/.munge.key so that
# nodes mounting the shared directory can install the same key.
# Permissions are tightened (0700 on the dir, 0600 on the key) because munge
# requires the key not to be broadly readable.
# Fixed: this span contained interleaved leftovers of the removed
# fetch-and-decode shell body (cwd '/tmp', the Secrets Manager fetch and the
# FETCH_AND_DECODE heredoc opener); resolved to the coherent
# share-key implementation.
def share_munge_key_to_dir(shared_dir)
  declare_resource(:bash, 'share_munge_key') do
    user 'root'
    group 'root'
    code <<-SHARE_MUNGE_KEY
set -e
mkdir -p #{shared_dir}/.munge
# Copy key to shared dir
cp /etc/munge/munge.key #{shared_dir}/.munge/.munge.key
chmod 0700 #{shared_dir}/.munge
chmod 0600 #{shared_dir}/.munge/.munge.key
    SHARE_MUNGE_KEY
  end
end

echo "$decoded_key" > /etc/munge/munge.key
# Shares the munge key with both shared locations: the directory mounted by
# compute nodes and the one mounted by login nodes.
def share_munge
  share_munge_key_to_dir(node['cluster']['shared_dir'])
  share_munge_key_to_dir(node['cluster']['shared_dir_login_nodes'])
end

# Set ownership on the key
chown #{node['cluster']['munge']['user']}:#{node['cluster']['munge']['group']} /etc/munge/munge.key
# Enforce correct permission on the key
chmod 0600 /etc/munge/munge.key
FETCH_AND_DECODE
# TODO: Consider renaming 'generate_munge_key' and 'fetch_and_decode_munge_key' to more descriptive names that better convey their functionalities.
# Retrieves the custom munge key from Secrets Manager by delegating to the
# rendered rotation script (update_munge_key.sh), avoiding duplicated shell
# logic in this resource. The script itself restarts munge and re-shares the
# key, so no extra steps are needed here.
# -d disables the stopped-fleet/login-node safety checks inside the script,
# which is required when this runs during initial config.
def fetch_and_decode_munge_key
  declare_resource(:execute, 'fetch_and_decode_munge_key') do
    user 'root'
    group 'root'
    # Fixed: dropped the stray leading '/' before the interpolation —
    # scripts_dir is used elsewhere in this file as an absolute prefix with
    # no added slash, so the old string produced a '//'-prefixed path.
    command "#{node['cluster']['scripts_dir']}/slurm/update_munge_key.sh -d"
  end
end

Expand All @@ -66,12 +72,21 @@ def generate_munge_key
chmod 0600 /etc/munge/munge.key
GENERATE_KEY
end

# This function randomly generates a new munge key.
# After generating the key, it is essential to restart the munge service so that it starts using the new key.
# Moreover, the new key has to be shared across relevant directories to ensure consistent authentication across the cluster.
# We're restarting the munge service and sharing the munge key here within the `generate_munge_key` method,
# and not within the `fetch_and_decode_munge_key` method because the `update_munge_key.sh` script,
# which is called by `fetch_and_decode_munge_key`, already includes these two operations.
restart_munge_service
share_munge
end

action :setup_munge_key do
if new_resource.munge_key_secret_arn
# This block will fetch the munge key from Secrets Manager
fetch_and_decode_munge_key(new_resource.munge_key_secret_arn)
fetch_and_decode_munge_key
else
# This block will randomly generate a munge key
generate_munge_key
Expand All @@ -81,7 +96,7 @@ def generate_munge_key
action :update_munge_key do
if new_resource.munge_key_secret_arn
# This block will fetch the munge key from Secrets Manager and replace the previous munge key
fetch_and_decode_munge_key(new_resource.munge_key_secret_arn)
fetch_and_decode_munge_key
else
# This block will randomly generate a munge key and replace the previous munge key
generate_munge_key
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,37 @@ MUNGE_USER="<%= @munge_user %>"
MUNGE_GROUP="<%= @munge_group %>"
SHARED_DIRECTORY_COMPUTE="<%= @shared_directory_compute %>"
SHARED_DIRECTORY_LOGIN="<%= @shared_directory_login %>"
CHECK_LOGIN_NODES_SCRIPT_PATH="<%= node['cluster']['scripts_dir'] %>/slurm/check_login_nodes_stopped.sh"

# Check if the script exists
if [ -f "$CHECK_LOGIN_NODES_SCRIPT_PATH" ]; then
# Check if login nodes are running
if ! $CHECK_LOGIN_NODES_SCRIPT_PATH; then
exit 1
fi
fi

# Check compute fleet status
compute_fleet_status=$(get-compute-fleet-status.sh)
if ! echo "$compute_fleet_status" | grep -q '"status": "STOPPED"'; then
echo "Compute fleet is not stopped. Please stop it before updating the munge key."
exit 1
# Whether to skip the "cluster must be stopped" safety checks below.
# Set to true via -d when this script is reused during initial cluster
# config, where the fleet and login nodes are not running yet.
DISABLE_RESOURCE_CHECK=false

# Parse options: only -d (disable resource checks) is supported.
while getopts "d" opt; do
  case $opt in
    d) DISABLE_RESOURCE_CHECK=true;;
    *)
      echo "Usage: $0 [-d]" >&2
      exit 1
      ;;
  esac
done

if ! $DISABLE_RESOURCE_CHECK; then
  # Check compute fleet status; refuse to rotate the key while compute
  # nodes that authenticate with the current key are still running.
  compute_fleet_status=$(get-compute-fleet-status.sh)
  if ! echo "$compute_fleet_status" | grep -q '"status": "STOPPED"'; then
    echo "Compute fleet is not stopped. Please stop it before updating the munge key."
    exit 1
  fi

  # Check LoginNodes status
  CHECK_LOGIN_NODES_SCRIPT_PATH="<%= node['cluster']['scripts_dir'] %>/slurm/check_login_nodes_stopped.sh"

  # Check if the script exists (clusters without login nodes don't have it)
  if [ -f "$CHECK_LOGIN_NODES_SCRIPT_PATH" ]; then
    # Check if login nodes are running; the helper exits non-zero if so
    if ! $CHECK_LOGIN_NODES_SCRIPT_PATH; then
      exit 1
    fi
  fi
fi

# If SECRET_ARN is provided, fetch the munge key from Secrets Manager
Expand Down

0 comments on commit 02fa532

Please sign in to comment.