From 02fa532c7ca02c7df4f480baf7e370538c658e5d Mon Sep 17 00:00:00 2001
From: Xuanqi He <93849823+hehe7318@users.noreply.github.com>
Date: Thu, 12 Oct 2023 12:56:45 -0400
Subject: [PATCH] Reuse the rotation script in the config and update processes

Avoid code duplication by reusing the rotation script inside the munge
resource when retrieving the custom munge key from Secrets Manager.

Add a -d input flag that sets DISABLE_RESOURCE_CHECK.

Modify the logic inside the rotation script so that it can be used by
both the config and update processes.
---
 .../libraries/helpers.rb                    | 35 ----------
 .../recipes/config/config_head_node.rb      | 39 +++++------
 .../recipes/update/update_head_node.rb      |  2 +-
 .../resources/munge_key_manager.rb          | 67 ++++++++++++-------
 .../slurm/head_node/update_munge_key.sh.erb | 44 ++++++++----
 5 files changed, 92 insertions(+), 95 deletions(-)

diff --git a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb
index b4bf24216..67ee86050 100644
--- a/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb
+++ b/cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb
@@ -65,15 +65,6 @@ def enable_munge_service
   end
 end
 
-def restart_munge_service
-  service "munge" do
-    supports restart: true
-    action :restart
-    retries 5
-    retry_delay 10
-  end
-end
-
 def setup_munge_head_node
   # Generate munge key or get it's value from secrets manager
   munge_key_manager 'manage_munge_key' do
@@ -81,9 +72,6 @@ def setup_munge_head_node
       node['cluster']['config'].dig(:DevSettings, :MungeKeySettings, :MungeKeySecretArn)
     }
   end
-
-  enable_munge_service
-  share_munge_head_node
 end
 
 def update_munge_head_node
@@ -92,29 +80,6 @@ def update_munge_head_node
     action :update_munge_key
     only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_custom_munge_key_updated? }
   end
-
-  restart_munge_service
-  share_munge_head_node
-end
-
-def share_munge_key_to_dir(shared_dir)
-  bash 'share_munge_key' do
-    user 'root'
-    group 'root'
-    code <<-SHARE_MUNGE_KEY
-      set -e
-      mkdir -p #{shared_dir}/.munge
-      # Copy key to shared dir
-      cp /etc/munge/munge.key #{shared_dir}/.munge/.munge.key
-      chmod 0700 #{shared_dir}/.munge
-      chmod 0600 #{shared_dir}/.munge/.munge.key
-    SHARE_MUNGE_KEY
-  end
-end
-
-def share_munge_head_node
-  share_munge_key_to_dir(node['cluster']['shared_dir'])
-  share_munge_key_to_dir(node['cluster']['shared_dir_login'])
 end
 
 def setup_munge_key(shared_dir)
diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb
index 69c3e5316..dd30a2c7b 100644
--- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb
+++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb
@@ -15,6 +15,26 @@
 # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Copy pcluster config generator and templates
+include_recipe 'aws-parallelcluster-slurm::config_head_node_directories'
+
+include_recipe 'aws-parallelcluster-slurm::config_check_login_stopped_script'
+
+template "#{node['cluster']['scripts_dir']}/slurm/update_munge_key.sh" do
+  source 'slurm/head_node/update_munge_key.sh.erb'
+  owner 'root'
+  group 'root'
+  mode '0700'
+  variables(
+    munge_key_secret_arn: lazy { node['cluster']['config'].dig(:DevSettings, :MungeKeySettings, :MungeKeySecretArn) },
+    region: node['cluster']['region'],
+    munge_user: node['cluster']['munge']['user'],
+    munge_group: node['cluster']['munge']['group'],
+    shared_directory_compute: node['cluster']['shared_dir'],
+    shared_directory_login: node['cluster']['shared_dir_login_nodes']
+  )
+end
+
 include_recipe 'aws-parallelcluster-slurm::config_munge_key'
 
 # Export /opt/slurm
@@ -25,8 +45,6 @@
   only_if { node['cluster']['internal_shared_storage_type'] == 'ebs' }
 end unless on_docker?
-include_recipe 'aws-parallelcluster-slurm::config_head_node_directories'
-
 template "#{node['cluster']['slurm']['install_dir']}/etc/slurm.conf" do
   source 'slurm/slurm.conf.erb'
   owner 'root'
   group 'root'
@@ -218,20 +236,3 @@
   retries 5
   retry_delay 2
 end unless redhat_on_docker?
-
-include_recipe 'aws-parallelcluster-slurm::config_check_login_stopped_script'
-
-template "#{node['cluster']['scripts_dir']}/slurm/update_munge_key.sh" do
-  source 'slurm/head_node/update_munge_key.sh.erb'
-  owner 'root'
-  group 'root'
-  mode '0700'
-  variables(
-    munge_key_secret_arn: lazy { node['cluster']['config'].dig(:DevSettings, :MungeKeySettings, :MungeKeySecretArn) },
-    region: node['cluster']['region'],
-    munge_user: node['cluster']['munge']['user'],
-    munge_group: node['cluster']['munge']['group'],
-    shared_directory_compute: node['cluster']['shared_dir'],
-    shared_directory_login: node['cluster']['shared_dir_login']
-  )
-end
diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb
index 729406dac..a8ffc5aaf 100644
--- a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb
+++ b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb
@@ -225,7 +225,7 @@ def update_nodes_in_queue(strategy, queues)
     munge_user: node['cluster']['munge']['user'],
     munge_group: node['cluster']['munge']['group'],
     shared_directory_compute: node['cluster']['shared_dir'],
-    shared_directory_login: node['cluster']['shared_dir_login']
+    shared_directory_login: node['cluster']['shared_dir_login_nodes']
   )
   only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_custom_munge_key_updated? }
 end
diff --git a/cookbooks/aws-parallelcluster-slurm/resources/munge_key_manager.rb b/cookbooks/aws-parallelcluster-slurm/resources/munge_key_manager.rb
index a5f030369..968e323b9 100644
--- a/cookbooks/aws-parallelcluster-slurm/resources/munge_key_manager.rb
+++ b/cookbooks/aws-parallelcluster-slurm/resources/munge_key_manager.rb
@@ -23,35 +23,41 @@
 default_action :setup_munge_key
 
-def fetch_and_decode_munge_key(munge_key_secret_arn)
-  declare_resource(:bash, 'fetch_and_decode_munge_key') do
+def restart_munge_service
+  declare_resource(:service, "munge") do
+    supports restart: true
+    action :restart
+    retries 5
+    retry_delay 10
+  end
+end
+
+def share_munge_key_to_dir(shared_dir)
+  declare_resource(:bash, 'share_munge_key') do
     user 'root'
     group 'root'
-    cwd '/tmp'
-    code <<-FETCH_AND_DECODE
+    code <<-SHARE_MUNGE_KEY
       set -e
-      # Get encoded munge key from secrets manager
-      encoded_key=$(aws secretsmanager get-secret-value --secret-id #{munge_key_secret_arn} --query 'SecretString' --output text --region #{node['cluster']['region']})
-      # If encoded_key doesn't have a value, error and exit
-      if [ -z "$encoded_key" ]; then
-        echo "Error fetching munge key from Secrets Manager or the key is empty"
-        exit 1
-      fi
-
-      # Decode munge key and write to /etc/munge/munge.key
-      decoded_key=$(echo $encoded_key | base64 -d)
-      if [ $? -ne 0 ]; then
-        echo "Error decoding the munge key with base64"
-        exit 1
-      fi
+      mkdir -p #{shared_dir}/.munge
+      # Copy key to shared dir
+      cp /etc/munge/munge.key #{shared_dir}/.munge/.munge.key
+      chmod 0700 #{shared_dir}/.munge
+      chmod 0600 #{shared_dir}/.munge/.munge.key
+    SHARE_MUNGE_KEY
+  end
+end
 
-      echo "$decoded_key" > /etc/munge/munge.key
+def share_munge
+  share_munge_key_to_dir(node['cluster']['shared_dir'])
+  share_munge_key_to_dir(node['cluster']['shared_dir_login_nodes'])
 end
 
-      # Set ownership on the key
-      chown #{node['cluster']['munge']['user']}:#{node['cluster']['munge']['group']} /etc/munge/munge.key
-      # Enforce correct permission on the key
-      chmod 0600 /etc/munge/munge.key
-    FETCH_AND_DECODE
+# TODO: Consider renaming 'generate_munge_key' and 'fetch_and_decode_munge_key' to more descriptive names that better convey their functionalities.
+def fetch_and_decode_munge_key
+  declare_resource(:execute, 'fetch_and_decode_munge_key') do
+    user 'root'
+    group 'root'
+    command "#{node['cluster']['scripts_dir']}/slurm/update_munge_key.sh -d"
   end
 end
 
@@ -66,12 +72,21 @@ def generate_munge_key
       chmod 0600 /etc/munge/munge.key
     GENERATE_KEY
   end
+
+  # This function randomly generates a new munge key.
+  # After generating the key, it is essential to restart the munge service so that it starts using the new key.
+  # Moreover, the new key has to be shared across the relevant directories to ensure consistent authentication across the cluster.
+  # We restart the munge service and share the munge key here within the `generate_munge_key` method,
+  # and not within the `fetch_and_decode_munge_key` method, because the `update_munge_key.sh` script,
+  # which is called by `fetch_and_decode_munge_key`, already includes these two operations.
+  restart_munge_service
+  share_munge
 end
 
 action :setup_munge_key do
   if new_resource.munge_key_secret_arn
     # This block will fetch the munge key from Secrets Manager
-    fetch_and_decode_munge_key(new_resource.munge_key_secret_arn)
+    fetch_and_decode_munge_key
   else
     # This block will randomly generate a munge key
     generate_munge_key
@@ -81,7 +96,7 @@ def generate_munge_key
 action :update_munge_key do
   if new_resource.munge_key_secret_arn
     # This block will fetch the munge key from Secrets Manager and replace the previous munge key
-    fetch_and_decode_munge_key(new_resource.munge_key_secret_arn)
+    fetch_and_decode_munge_key
   else
     # This block will randomly generate a munge key and replace the previous munge key
     generate_munge_key
diff --git a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_munge_key.sh.erb b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_munge_key.sh.erb
index 044819e52..1fa9ffee6 100644
--- a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_munge_key.sh.erb
+++ b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/update_munge_key.sh.erb
@@ -15,21 +15,37 @@
 MUNGE_USER="<%= @munge_user %>"
 MUNGE_GROUP="<%= @munge_group %>"
 SHARED_DIRECTORY_COMPUTE="<%= @shared_directory_compute %>"
 SHARED_DIRECTORY_LOGIN="<%= @shared_directory_login %>"
-CHECK_LOGIN_NODES_SCRIPT_PATH="<%= node['cluster']['scripts_dir'] %>/slurm/check_login_nodes_stopped.sh"
-
-# Check if the script exists
-if [ -f "$CHECK_LOGIN_NODES_SCRIPT_PATH" ]; then
-  # Check if login nodes are running
-  if ! $CHECK_LOGIN_NODES_SCRIPT_PATH; then
-    exit 1
-  fi
-fi
-# Check compute fleet status
-compute_fleet_status=$(get-compute-fleet-status.sh)
-if ! echo "$compute_fleet_status" | grep -q '"status": "STOPPED"'; then
-  echo "Compute fleet is not stopped. Please stop it before updating the munge key."
-  exit 1
+DISABLE_RESOURCE_CHECK=false
+
+while getopts "d" opt; do
+  case $opt in
+    d) DISABLE_RESOURCE_CHECK=true;;
+    *)
+      echo "Usage: $0 [-d]" >&2
+      exit 1
+      ;;
+  esac
+done
+
+if ! $DISABLE_RESOURCE_CHECK; then
+  # Check compute fleet status
+  compute_fleet_status=$(get-compute-fleet-status.sh)
+  if ! echo "$compute_fleet_status" | grep -q '"status": "STOPPED"'; then
+    echo "Compute fleet is not stopped. Please stop it before updating the munge key."
+    exit 1
+  fi
+
+  # Check LoginNodes status
+  CHECK_LOGIN_NODES_SCRIPT_PATH="<%= node['cluster']['scripts_dir'] %>/slurm/check_login_nodes_stopped.sh"
+
+  # Check if the script exists
+  if [ -f "$CHECK_LOGIN_NODES_SCRIPT_PATH" ]; then
+    # Check if login nodes are running
+    if ! $CHECK_LOGIN_NODES_SCRIPT_PATH; then
+      exit 1
+    fi
+  fi
 fi
 
 # If SECRET_ARN is provided, fetch the munge key from Secrets Manager
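
Note: after this change, update_munge_key.sh is the single code path for munge key rotation; the recipes simply invoke it with -d, while administrators can still run it directly. A minimal usage sketch (assuming the default layout, where node['cluster']['scripts_dir'] resolves to /opt/parallelcluster/scripts; adjust the path if your install differs):

    # Manual rotation: the script first verifies that the compute fleet
    # and the login nodes are stopped, and aborts otherwise.
    sudo /opt/parallelcluster/scripts/slurm/update_munge_key.sh

    # As invoked by the config and update processes: -d sets
    # DISABLE_RESOURCE_CHECK=true, skipping both status checks, since
    # those code paths already guarantee a safe cluster state.
    sudo /opt/parallelcluster/scripts/slurm/update_munge_key.sh -d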