Add a script to check if all LoginNodes are stopped
This script is used by the update_munge_key.sh script.

Customers must stop all LoginNodes by setting the pool count to 0 before updating the munge key.

Create a sub-recipe for the configuration of the check-login-nodes-stopped script.

Add logic to update.rb to check whether the LoginNodes section has been removed.

The script is not created if the LoginNodes section is not specified.
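For reference, a minimal sketch of that precondition: the login node pool scaled to zero in the cluster configuration, loaded the same way update.rb reads it. The pool name, instance type, and the Count/InstanceType keys are illustrative assumptions rather than values taken from this commit; the dig call only mirrors the LoginNodes/Pools access pattern used below.

require 'yaml'

# Hypothetical cluster configuration after the customer scales the login node pool to zero.
stopped_pool_config = YAML.safe_load(<<~YAML)
  LoginNodes:
    Pools:
      - Name: login
        Count: 0
        InstanceType: t3.micro
YAML

puts stopped_pool_config.dig("LoginNodes", "Pools", 0, "Count") # => 0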
hehe7318 authored Oct 10, 2023
1 parent 1fc1e2c commit ea23363
Showing 6 changed files with 139 additions and 1 deletion.
12 changes: 12 additions & 0 deletions cookbooks/aws-parallelcluster-slurm/libraries/update.rb
@@ -11,6 +11,7 @@
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.
# rubocop:disable Style/SingleArgumentDig

# Helpers functions used by update recipe steps.
require 'chef/mixin/shell_out'
@@ -83,3 +84,14 @@ def execute_command(command, user = "root", timeout = 300, raise_on_error = true
def is_custom_munge_key_updated?
config_parameter_changed?(%w(DevSettings MungeKeySettings MungeKeySecretArn))
end

def is_login_nodes_pool_name_updated?
config_parameter_changed?(['LoginNodes', 'Pools', 0, 'Name'])
end

def is_login_nodes_removed?
require 'yaml'
config = YAML.safe_load(File.read(node['cluster']['cluster_config_path']))
previous_config = YAML.safe_load(File.read(node['cluster']['previous_cluster_config_path']))
previous_config.dig("LoginNodes") and !config.dig("LoginNodes")
end
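As a quick illustration of the removal check above (made-up configurations, not part of this commit): dig returns the LoginNodes subtree when the section is present and nil otherwise, so the expression is truthy only when the previous configuration had the section and the updated one no longer does.

require 'yaml'

previous_config = YAML.safe_load("LoginNodes:\n  Pools:\n    - Name: login\n")
updated_config  = YAML.safe_load("Scheduling:\n  Scheduler: slurm\n")

# Same expression as is_login_nodes_removed?, evaluated against the sample hashes.
puts previous_config.dig("LoginNodes") && !updated_config.dig("LoginNodes") # => true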
@@ -0,0 +1,27 @@
# frozen_string_literal: true

# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.
# rubocop:disable Style/SingleArgumentDig
template "#{node['cluster']['scripts_dir']}/slurm/check_login_nodes_stopped.sh" do
source 'slurm/head_node/check_login_nodes_stopped.sh.erb'
owner 'root'
group 'root'
mode '0700'
variables(
cluster_name: node['cluster']['cluster_name'] || node['cluster']['stack_name'],
login_nodes_pool_name: lazy { node['cluster']['config'].dig(:LoginNodes, :Pools, 0, :Name) },
region: node['cluster']['region']
)
only_if do
node['cluster']['config'].dig(:LoginNodes)
end
end
@@ -219,6 +219,8 @@
retry_delay 2
end unless redhat_on_docker?

include_recipe 'aws-parallelcluster-slurm::config_check_login_stopped_script'

template "#{node['cluster']['scripts_dir']}/slurm/update_munge_key.sh" do
source 'slurm/head_node/update_munge_key.sh.erb'
owner 'root'
@@ -201,7 +201,19 @@ def update_nodes_in_queue(strategy, queues)
only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_slurm_database_updated? }
end unless on_docker?

# Update rotation script to update secret arn
# Cover the following two scenarios:
# - a cluster without login nodes is updated to have login nodes;
# - a cluster with login nodes is updated to use another pool name.
if ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_login_nodes_pool_name_updated?
include_recipe 'aws-parallelcluster-slurm::config_check_login_stopped_script'
end

file "#{node['cluster']['scripts_dir']}/slurm/check_login_nodes_stopped.sh" do
action :delete
only_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && is_login_nodes_removed? }
end

# Update munge key rotation script to update secret arn
template "#{node['cluster']['scripts_dir']}/slurm/update_munge_key.sh" do
source 'slurm/head_node/update_munge_key.sh.erb'
owner 'root'
@@ -0,0 +1,76 @@
#!/bin/bash
# This script checks whether there are running login nodes in a specified AWS ParallelCluster stack and login nodes pool.
# It first retrieves the ARN of the Load Balancer associated with the specified stack and login nodes pool.
# If a Load Balancer is found, it then retrieves the ARN of the Target Group associated with the Load Balancer.
# Lastly, it checks the health of the targets in the Target Group to determine the number of healthy and unhealthy login nodes.
# If there are any healthy or unhealthy nodes found, it concludes that there are running login nodes.
#
# Usage: ./check_login_nodes_stopped.sh

set -e

CLUSTER_NAME="<%= @cluster_name %>"
LOGIN_NODES_POOL_NAME="<%= @login_nodes_pool_name %>"
REGION="<%= @region %>"

# List all Load Balancers
load_balancers=$(aws elbv2 describe-load-balancers --region ${REGION})

# Iterate over Load Balancers to find the one with matching tags
load_balancer_arn=''
for arn in $(echo "${load_balancers}" | jq -r '.LoadBalancers[].LoadBalancerArn'); do
# Get tags for the current Load Balancer
tags=$(aws elbv2 describe-tags --resource-arns "${arn}" --region ${REGION})

# Check if the tags match the desired stack name and login nodes pool name
cluster_name_match=$(echo "${tags}" | jq -r --arg key "parallelcluster:cluster-name" --arg value "${CLUSTER_NAME}" '.TagDescriptions[] | select(.Tags[]? | (.Key == $key and .Value == $value))')
login_nodes_pool_name_match=$(echo "${tags}" | jq -r --arg key "parallelcluster:login-nodes-pool" --arg value "${LOGIN_NODES_POOL_NAME}" '.TagDescriptions[] | select(.Tags[]? | (.Key == $key and .Value == $value))')

# If both tags are found, store the ARN and break the loop
# For now, there's only one pool of login nodes per cluster.
if [[ -n "${cluster_name_match}" && -n "${login_nodes_pool_name_match}" ]]; then
load_balancer_arn="${arn}"
break
fi
done

# Output result
if [[ -n "${load_balancer_arn}" ]]; then
echo "Load Balancer ARN found: ${load_balancer_arn}"
else
echo "No Load Balancer found for the cluster ${CLUSTER_NAME} and login nodes pool ${LOGIN_NODES_POOL_NAME}."
exit 1
fi

# Get Target Group ARN associated with the Load Balancer
target_group_arn=$(aws elbv2 describe-target-groups \
--load-balancer-arn $load_balancer_arn \
--query "TargetGroups[0].TargetGroupArn" \
--output text \
--region ${REGION})

# Exit if Target Group is not found
if [[ -n "${target_group_arn}" ]]; then
echo "TargetGroup ARN found: ${target_group_arn}"
else
echo "No Target Group found for the specified Load Balancer ${load_balancer_arn}."
exit 1
fi

# Get the number of healthy and unhealthy targets
target_healths=$(aws elbv2 describe-target-health \
--target-group-arn $target_group_arn \
--region ${REGION})

healthy_count=$(echo $target_healths | jq -r '.TargetHealthDescriptions[] | select(.TargetHealth.State == "healthy") | .Target.Id' | wc -l)
unhealthy_count=$(echo $target_healths | jq -r '.TargetHealthDescriptions[] | select(.TargetHealth.State != "healthy") | .Target.Id' | wc -l)

# Check if there are running login nodes
total_nodes=$((healthy_count + unhealthy_count))
if [[ $total_nodes -gt 0 ]]; then
echo "Login nodes are running. Please stop them before updating the munge key."
exit 1
fi

echo "Login nodes are stopped."
exit 0
@@ -15,6 +15,15 @@ MUNGE_USER="<%= @munge_user %>"
MUNGE_GROUP="<%= @munge_group %>"
SHARED_DIRECTORY_COMPUTE="<%= @shared_directory_compute %>"
SHARED_DIRECTORY_LOGIN="<%= @shared_directory_login %>"
CHECK_LOGIN_NODES_SCRIPT_PATH="<%= node['cluster']['scripts_dir'] %>/slurm/check_login_nodes_stopped.sh"

# Check if the script exists
if [ -f "$CHECK_LOGIN_NODES_SCRIPT_PATH" ]; then
# Check if login nodes are running
if ! $CHECK_LOGIN_NODES_SCRIPT_PATH; then
exit 1
fi
fi

# Check compute fleet status
compute_fleet_status=$(get-compute-fleet-status.sh)
