From d42caf8ce4ea499cb024d33f0d792558a147fdc5 Mon Sep 17 00:00:00 2001 From: Ryan Anderson Date: Thu, 5 Oct 2023 16:31:25 -0400 Subject: [PATCH] [Develop] Add support for an external shared /home Users can now specify `/home` as a mount point in SharedStorage. This was previously a reserved directory. When a user specifies this, the data in `/home` is transferred to the external filesystem without replacing files that already exist there. This means that in order to share the `/home` directory across clusters, users must specify the same security credentials when creating the clusters. --- CHANGELOG.md | 1 + .../recipes/config.rb | 19 +-- .../recipes/config/ebs.rb | 19 +-- .../recipes/config/efs.rb | 37 +++--- .../recipes/config/export_home.rb | 5 + .../recipes/config/fsx.rb | 42 +++++++ .../recipes/config/mount_home.rb | 119 ++++++++++++++++-- .../recipes/config/raid.rb | 2 +- .../recipes/init.rb | 3 +- .../recipes/init/backup_home_shared_data.rb | 30 +++++ .../recipes/init/restore_home_shared_data.rb | 31 +++++ 11 files changed, 255 insertions(+), 53 deletions(-) create mode 100644 cookbooks/aws-parallelcluster-environment/recipes/config/fsx.rb create mode 100644 cookbooks/aws-parallelcluster-environment/recipes/init/backup_home_shared_data.rb create mode 100644 cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b8f7214fa..f9c0c552ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Install [Spack](https://spack.io) by default in cluster user's home directory. - Add support for `Scheduling/SlurmSettings/Database/DatabaseName` parameter to render `StorageLoc` in the slurmdbd configuration generated by ParallelCluster. - Add the option to use EFS storage instead of NFS exports from the head node root volume for intra-cluster shared ParallelCluster, Intel, Slurm, and login node data.
+- Allow for mounting `home` as an EFS or FSx external shared storage via the `SharedStorage` section of the config file. **CHANGES** diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config.rb b/cookbooks/aws-parallelcluster-environment/recipes/config.rb index a6e57c7894..cdff07bc91 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/config.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/config.rb @@ -18,9 +18,9 @@ action :configure end include_recipe 'aws-parallelcluster-environment::ephemeral_drives' -# update_fs_mapping generates the shared storages mapping file so must be executed before shared storages recipes +# update_fs_mapping generates the shared storage mapping file, so it must be executed before shared storage recipes include_recipe 'aws-parallelcluster-environment::update_fs_mapping' -# Export home dir from the head node +# Export the home dir from the head node when using ebs include_recipe 'aws-parallelcluster-environment::export_home' if node['cluster']['internal_shared_storage_type'] == 'ebs' @@ -32,16 +32,5 @@ include_recipe 'aws-parallelcluster-environment::ebs' include_recipe 'aws-parallelcluster-environment::raid' -include_recipe "aws-parallelcluster-environment::efs" - -# Mount FSx directory with manage_fsx resource -lustre "mount fsx" do - fsx_fs_id_array node['cluster']['fsx_fs_ids'].split(',') - fsx_fs_type_array node['cluster']['fsx_fs_types'].split(',') - fsx_shared_dir_array node['cluster']['fsx_shared_dirs'].split(',') - fsx_dns_name_array node['cluster']['fsx_dns_names'].split(',') - fsx_mount_name_array node['cluster']['fsx_mount_names'].split(',') - fsx_volume_junction_path_array node['cluster']['fsx_volume_junction_paths'].split(',') - action :mount - not_if { node['cluster']['fsx_fs_ids'].split(',').empty? 
} -end +include_recipe 'aws-parallelcluster-environment::efs' +include_recipe 'aws-parallelcluster-environment::fsx' diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/ebs.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/ebs.rb index 08745e1caa..9dfa3b82a0 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/ebs.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/config/ebs.rb @@ -11,22 +11,27 @@ # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. +shared_dirs = node['cluster']['ebs_shared_dirs'].split(',') +volumes = node['cluster']['volume'].split(',') +shared_dirs.each_with_index do |dir, index| + next unless dir == '/home' || dir == 'home' + shared_dirs.delete(dir) + volumes.delete_at(index) + break +end case node['cluster']['node_type'] when 'HeadNode' manage_ebs "add ebs" do - shared_dir_array node['cluster']['ebs_shared_dirs'].split(',') - vol_array node['cluster']['volume'].split(',') + shared_dir_array shared_dirs + vol_array volumes action %i(mount export) - not_if { node['cluster']['ebs_shared_dirs'].split(',').empty? } + not_if { shared_dirs.empty? } end unless on_docker? 
when 'ComputeFleet', 'LoginNode' - # Parse shared directory info and turn into an array - shared_dir_array = node['cluster']['ebs_shared_dirs'].split(',') - # Mount each volume with NFS - shared_dir_array.each do |dir| + shared_dirs.each do |dir| volume "mount volume #{dir}" do action :mount shared_dir dir diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/efs.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/efs.rb index a967ececea..1a6d10b690 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/efs.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/config/efs.rb @@ -11,31 +11,26 @@ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. return if on_docker? -efs_shared_dir_array = node['cluster']['efs_shared_dirs'].split(',') -efs_fs_id_array = node['cluster']['efs_fs_ids'].split(',') -efs_encryption_in_transit_array = node['cluster']['efs_encryption_in_transits'].split(',') -efs_iam_authorization_array = node['cluster']['efs_iam_authorizations'].split(',') +shared_dir_array = node['cluster']['efs_shared_dirs'].split(',') +id_array = node['cluster']['efs_fs_ids'].split(',') +encryption_array = node['cluster']['efs_encryption_in_transits'].split(',') +iam_array = node['cluster']['efs_iam_authorizations'].split(',') -cx_shared_dir_array = [] -cx_efs_fs_id_array = [] -cx_efs_encryption_array = [] -cx_efs_iam_array = [] - -# Identify the customer use filesystems and store their data in arrays for the EFS resource -efs_shared_dir_array.each_with_index do |dir, index| - next if node['cluster']['internal_shared_dirs'].include?(dir) - cx_shared_dir_array.push(dir) - cx_efs_fs_id_array.push(efs_fs_id_array[index]) - cx_efs_encryption_array.push(efs_encryption_in_transit_array[index]) - cx_efs_iam_array.push(efs_iam_authorization_array[index]) +# Identify the previously mounted filesystems and remove them from the set of 
filesystems to mount +shared_dir_array.each_with_index.reverse_each do |dir, index| # reverse order: deleting must not shift indices still to be visited + next unless node['cluster']['internal_shared_dirs'].include?(dir) || %w(/home home).include?(dir) + shared_dir_array.delete_at(index) + id_array.delete_at(index) + encryption_array.delete_at(index) + iam_array.delete_at(index) end # Mount EFS directories with the efs resource efs "mount efs" do - shared_dir_array cx_shared_dir_array - efs_fs_id_array cx_efs_fs_id_array - efs_encryption_in_transit_array cx_efs_encryption_array - efs_iam_authorization_array cx_efs_iam_array + shared_dir_array shared_dir_array + efs_fs_id_array id_array + efs_encryption_in_transit_array encryption_array + efs_iam_authorization_array iam_array action :mount - not_if { cx_shared_dir_array.empty? } + not_if { shared_dir_array.empty? } end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/export_home.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/export_home.rb index aa1e1e9dbc..88ffd76888 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/export_home.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/config/export_home.rb @@ -14,6 +14,11 @@ return if on_docker? +# Check if home is a shared filesystem and return if it is because there is nothing to export +shared_storage = node['cluster']['efs_shared_dirs'].split(',') + node['cluster']['fsx_shared_dirs'].split(',') + + node['cluster']['ebs_shared_dirs'].split(',') + node['cluster']['raid_shared_dir'].split(',') +return if shared_storage.include?('/home') || shared_storage.include?('home') + case node['cluster']['node_type'] when 'HeadNode' volume "export /home" do diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/fsx.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/fsx.rb new file mode 100644 index 0000000000..368152e942 --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/config/fsx.rb @@ -0,0 +1,42 @@ +# frozen_string_literal: true + +# Copyright:: 2023 Amazon.com, Inc.
or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. +return if on_docker? +id_array = node['cluster']['fsx_fs_ids'].split(',') +type_array = node['cluster']['fsx_fs_types'].split(',') +shared_dir_array = node['cluster']['fsx_shared_dirs'].split(',') +dns_name_array = node['cluster']['fsx_dns_names'].split(',') +mount_name_array = node['cluster']['fsx_mount_names'].split(',') +volume_junction_path_array = node['cluster']['fsx_volume_junction_paths'].split(',') + +# Drop the home filesystem (handled by the mount_home recipe); walk in reverse so deletions do not shift pending indices +shared_dir_array.each_with_index.reverse_each do |dir, index| + next unless %w(/home home).include?(dir) + id_array.delete_at(index) + type_array.delete_at(index) + shared_dir_array.delete_at(index) + dns_name_array.delete_at(index) + mount_name_array.delete_at(index) + volume_junction_path_array.delete_at(index) +end + +# Mount FSx shared directories with the lustre resource +lustre "mount fsx" do + fsx_fs_id_array id_array + fsx_fs_type_array type_array + fsx_shared_dir_array shared_dir_array + fsx_dns_name_array dns_name_array + fsx_mount_name_array mount_name_array + fsx_volume_junction_path_array volume_junction_path_array + action :mount + not_if { id_array.empty?
} +end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/mount_home.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/mount_home.rb index c2c60662f2..3b59fcb9df 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/mount_home.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/config/mount_home.rb @@ -11,12 +11,115 @@ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. -volume "mount /home" do - action :mount - shared_dir '/home' - device(lazy { "#{node['cluster']['head_node_private_ip']}:#{node['cluster']['head_node_home_path']}" }) - fstype 'nfs' - options node['cluster']['nfs']['hard_mount_options'] - retries 10 - retry_delay 6 +return if on_docker? + +shared_storage = { 'efs' => node['cluster']['efs_shared_dirs'].split(','), + 'fsx' => node['cluster']['fsx_shared_dirs'].split(','), + 'ebs' => node['cluster']['ebs_shared_dirs'].split(','), + # Making sure they are all the same object type, even though raid is just a string + 'raid' => node['cluster']['raid_shared_dir'].split(','), +} + +# Check if home is a shared filesystem +shared_home = 'none' +shared_storage.each do |type, dirs| + next unless dirs.include?('/home') || dirs.include?('home') + shared_home = type + break +end + +if shared_home == 'none' + # Mount the NFS export to compute and login nodes, the head node will export /home later + case node['cluster']['node_type'] + when 'ComputeFleet', 'LoginNode' + volume "mount /home" do + action :mount + shared_dir '/home' + device(lazy { "#{node['cluster']['head_node_private_ip']}:#{node['cluster']['head_node_home_path']}" }) + fstype 'nfs' + options node['cluster']['nfs']['hard_mount_options'] + retries 10 + retry_delay 6 + end + when 'HeadNode' + Chef::Log.info("Do not mount NFS shares on the HeadNode") + else + raise "node_type must be ComputeFleet, LoginNode or HeadNode" + end +else + # Identify the 
filesystem that is shared and mount it + include_recipe "aws-parallelcluster-environment::update_fs_mapping" + include_recipe "aws-parallelcluster-environment::backup_home_shared_data" + case shared_home + when 'efs' + shared_storage['efs'].each_with_index do |dir, index| + next unless dir == "/home" || dir == 'home' + efs "mount shared efs home" do + shared_dir_array [dir] + efs_fs_id_array [node['cluster']['efs_fs_ids'].split(',')[index]] + efs_encryption_in_transit_array [node['cluster']['efs_encryption_in_transits'].split(',')[index]] + efs_iam_authorization_array [node['cluster']['efs_iam_authorizations'].split(',')[index]] + action :mount + end + break + end + when 'fsx' + shared_storage['fsx'].each_with_index do |dir, index| + next unless dir == "/home" || dir == 'home' + lustre "mount shared fsx home" do + fsx_fs_id_array [node['cluster']['fsx_fs_ids'].split(',')[index]] + fsx_fs_type_array [node['cluster']['fsx_fs_types'].split(',')[index]] + fsx_shared_dir_array [dir] + fsx_dns_name_array [node['cluster']['fsx_dns_names'].split(',')[index]] + fsx_mount_name_array [node['cluster']['fsx_mount_names'].split(',')[index]] + fsx_volume_junction_path_array [node['cluster']['fsx_volume_junction_paths'].split(',')[index]] + action :mount + end + break + end + when 'ebs' + case node['cluster']['node_type'] + when 'HeadNode' + shared_storage['ebs'].each_with_index do |dir, index| + next unless dir == "/home" || dir == 'home' + manage_ebs "add ebs /home" do + shared_dir_array [dir] + vol_array [node['cluster']['volume'].split(',')[index]] + action %i(mount export) + end + break + end + when 'ComputeFleet', 'LoginNode' + volume "mount /home" do + action :mount + shared_dir '/home' + device(lazy { "#{node['cluster']['head_node_private_ip']}:#{format_directory('/home')}" }) + fstype 'nfs' + options node['cluster']['nfs']['hard_mount_options'] + retries 10 + retry_delay 6 + end + end + when 'raid' + case node['cluster']['node_type'] + when 'HeadNode' + raid "add raid 
/home" do + raid_shared_dir '/home' + raid_type node['cluster']['raid_type'] + raid_vol_array node['cluster']['raid_vol_ids'].split(',') + action %i(mount export) + end + when 'ComputeFleet', 'LoginNode' + volume "mount raid /home volume over NFS" do + action :mount + shared_dir '/home' + device(lazy { "#{node['cluster']['head_node_private_ip']}:/home" }) + fstype 'nfs' + options node['cluster']['nfs']['hard_mount_options'] + retries 10 + retry_delay 6 + end + end + end + include_recipe "aws-parallelcluster-environment::restore_home_shared_data" end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/config/raid.rb b/cookbooks/aws-parallelcluster-environment/recipes/config/raid.rb index b422182c54..6b053b9569 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/config/raid.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/config/raid.rb @@ -14,7 +14,7 @@ # Parse and get RAID shared directory info and turn into an array raid_shared_dir = node['cluster']['raid_shared_dir'] -return if raid_shared_dir.empty? +return if raid_shared_dir.empty? || raid_shared_dir == '/home' || raid_shared_dir == 'home' case node['cluster']['node_type'] when 'HeadNode' diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init.rb b/cookbooks/aws-parallelcluster-environment/recipes/init.rb index c5ff6a66fe..e2275e1fb9 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init.rb @@ -26,7 +26,8 @@ raise "internal_shared_storage_type must be ebs or efs" end -include_recipe "aws-parallelcluster-environment::mount_home" if %w(ComputeFleet LoginNode).include? 
node['cluster']['node_type'] +# Mount the home directory to all nodes if it is shared, otherwise mount the NFS share to compute and login nodes +include_recipe "aws-parallelcluster-environment::mount_home" include_recipe "aws-parallelcluster-environment::network_interfaces" include_recipe 'aws-parallelcluster-environment::imds' diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/backup_home_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/backup_home_shared_data.rb new file mode 100644 index 0000000000..132eec932f --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/backup_home_shared_data.rb @@ -0,0 +1,30 @@ +# frozen_string_literal: true + +# +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +return if on_docker? 
+ +if node['cluster']['node_type'] == 'HeadNode' + # For each, backup the data to a temp location + # This is necessary to preserve any data in these directories that was + # generated during the build of ParallelCluster AMIs after converting to + # shared storage + bash "Backup /home" do + user 'root' + group 'root' + code <<-EOH + mkdir -p /tmp/home + rsync -a /home/ /tmp/home + EOH + end +end diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb new file mode 100644 index 0000000000..b75c04cb8c --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb @@ -0,0 +1,31 @@ +# frozen_string_literal: true + +# +# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +return if on_docker? + +if node['cluster']['node_type'] == 'HeadNode' + # Restore the shared storage home data if it doesn't already exist + # This is necessary to preserve any data in these directories that was + # generated during the build of ParallelCluster AMIs after converting to + # shared storage and backed up to a temporary location previously + # Remove the backup after the copy is done + bash "Restore /home" do + user 'root' + group 'root' + code <<-EOH + rsync -a --ignore-existing /tmp/home/ /home + rm -rf /tmp/home/ + EOH + end +end