Skip to content

Commit

Permalink
[Develop] Add support for an external shared /home (#2486)
Browse files Browse the repository at this point in the history
Users can now specify `/home` as a mount point in SharedStorage.  This was previously a reserved directory
When a user specifies this, the data in `/home` is transferred without replacement to the external filesystem.
This means that in order to share the `/home` directory across clusters, users must specify the same security
credentials when creating the clusters.
  • Loading branch information
dreambeyondorange authored Oct 10, 2023
1 parent ea23363 commit f905257
Show file tree
Hide file tree
Showing 11 changed files with 255 additions and 53 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
- Install [Spack](https://spack.io) by default in cluster user's home directory.
- Add support for `Scheduling/SlurmSettings/Database/DatabaseName` parameter to render `StorageLoc` in the slurmdbd configuration generated by ParallelCluster.
- Add the option to use EFS storage instead of NFS exports from the head node root volume for intra-cluster shared ParallelCluster, Intel, Slurm, and login node data.
- Allow for mounting `home` as an EFS or FSx external shared storage via the `SharedStorage` section of the config file.

**CHANGES**

Expand Down
19 changes: 4 additions & 15 deletions cookbooks/aws-parallelcluster-environment/recipes/config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@
action :configure
end
include_recipe 'aws-parallelcluster-environment::ephemeral_drives'
# update_fs_mapping generates the shared storages mapping file so must be executed before shared storages recipes
# update_fs_mapping generates the shared storage mapping file, so it must be executed before shared storage recipes
include_recipe 'aws-parallelcluster-environment::update_fs_mapping'
# Export home dir from the head node
# Export the home dir from the head node when using ebs
include_recipe 'aws-parallelcluster-environment::export_home'

if node['cluster']['internal_shared_storage_type'] == 'ebs'
Expand All @@ -32,16 +32,5 @@

include_recipe 'aws-parallelcluster-environment::ebs'
include_recipe 'aws-parallelcluster-environment::raid'
include_recipe "aws-parallelcluster-environment::efs"

# Mount FSx directory with manage_fsx resource
lustre "mount fsx" do
fsx_fs_id_array node['cluster']['fsx_fs_ids'].split(',')
fsx_fs_type_array node['cluster']['fsx_fs_types'].split(',')
fsx_shared_dir_array node['cluster']['fsx_shared_dirs'].split(',')
fsx_dns_name_array node['cluster']['fsx_dns_names'].split(',')
fsx_mount_name_array node['cluster']['fsx_mount_names'].split(',')
fsx_volume_junction_path_array node['cluster']['fsx_volume_junction_paths'].split(',')
action :mount
not_if { node['cluster']['fsx_fs_ids'].split(',').empty? }
end
include_recipe 'aws-parallelcluster-environment::efs'
include_recipe 'aws-parallelcluster-environment::fsx'
19 changes: 12 additions & 7 deletions cookbooks/aws-parallelcluster-environment/recipes/config/ebs.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,27 @@
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.
shared_dirs = node['cluster']['ebs_shared_dirs'].split(',')
volumes = node['cluster']['volume'].split(',')
shared_dirs.each_with_index do |dir, index|
next unless dir == '/home' || dir == 'home'
shared_dirs.delete(dir)
volumes.delete_at(index)
break
end

case node['cluster']['node_type']
when 'HeadNode'
manage_ebs "add ebs" do
shared_dir_array node['cluster']['ebs_shared_dirs'].split(',')
vol_array node['cluster']['volume'].split(',')
shared_dir_array shared_dirs
vol_array volumes
action %i(mount export)
not_if { node['cluster']['ebs_shared_dirs'].split(',').empty? }
not_if { shared_dirs.empty? }
end unless on_docker?

when 'ComputeFleet', 'LoginNode'
# Parse shared directory info and turn into an array
shared_dir_array = node['cluster']['ebs_shared_dirs'].split(',')

# Mount each volume with NFS
shared_dir_array.each do |dir|
shared_dirs.each do |dir|
volume "mount volume #{dir}" do
action :mount
shared_dir dir
Expand Down
37 changes: 16 additions & 21 deletions cookbooks/aws-parallelcluster-environment/recipes/config/efs.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,31 +11,26 @@
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.
return if on_docker?
efs_shared_dir_array = node['cluster']['efs_shared_dirs'].split(',')
efs_fs_id_array = node['cluster']['efs_fs_ids'].split(',')
efs_encryption_in_transit_array = node['cluster']['efs_encryption_in_transits'].split(',')
efs_iam_authorization_array = node['cluster']['efs_iam_authorizations'].split(',')
shared_dir_array = node['cluster']['efs_shared_dirs'].split(',')
id_array = node['cluster']['efs_fs_ids'].split(',')
encryption_array = node['cluster']['efs_encryption_in_transits'].split(',')
iam_array = node['cluster']['efs_iam_authorizations'].split(',')

cx_shared_dir_array = []
cx_efs_fs_id_array = []
cx_efs_encryption_array = []
cx_efs_iam_array = []

# Identify the customer use filesystems and store their data in arrays for the EFS resource
efs_shared_dir_array.each_with_index do |dir, index|
next if node['cluster']['internal_shared_dirs'].include?(dir)
cx_shared_dir_array.push(dir)
cx_efs_fs_id_array.push(efs_fs_id_array[index])
cx_efs_encryption_array.push(efs_encryption_in_transit_array[index])
cx_efs_iam_array.push(efs_iam_authorization_array[index])
# Identify the previously mounted filesystems and remove them from the set of filesystems to mount
shared_dir_array.each_with_index do |dir, index|
next unless node['cluster']['internal_shared_dirs'].include?(dir) || dir == "/home" || dir == "home"
shared_dir_array.delete(dir)
id_array.delete_at(index)
encryption_array.delete_at(index)
iam_array.delete_at(index)
end

# Mount EFS directories with the efs resource
efs "mount efs" do
shared_dir_array cx_shared_dir_array
efs_fs_id_array cx_efs_fs_id_array
efs_encryption_in_transit_array cx_efs_encryption_array
efs_iam_authorization_array cx_efs_iam_array
shared_dir_array shared_dir_array
efs_fs_id_array id_array
efs_encryption_in_transit_array encryption_array
efs_iam_authorization_array iam_array
action :mount
not_if { cx_shared_dir_array.empty? }
not_if { shared_dir_array.empty? }
end
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@

return if on_docker?

# Check if home is a shared filesystem and return if it is because there is nothing to export
shared_storage = node['cluster']['efs_shared_dirs'].split(',') + node['cluster']['fsx_shared_dirs'].split(',') +
node['cluster']['ebs_shared_dirs'].split(',') + node['cluster']['raid_shared_dir'].split(',')
return if shared_storage.include?('/home') || shared_storage.include?('home')

case node['cluster']['node_type']
when 'HeadNode'
volume "export /home" do
Expand Down
42 changes: 42 additions & 0 deletions cookbooks/aws-parallelcluster-environment/recipes/config/fsx.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# frozen_string_literal: true

# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.
return if on_docker?
id_array = node['cluster']['fsx_fs_ids'].split(',')
type_array = node['cluster']['fsx_fs_types'].split(',')
shared_dir_array = node['cluster']['fsx_shared_dirs'].split(',')
dns_name_array = node['cluster']['fsx_dns_names'].split(',')
mount_name_array = node['cluster']['fsx_mount_names'].split(',')
volume_junction_path_array = node['cluster']['fsx_volume_junction_paths'].split(',')

# Identify the previously mounted filesystems and remove them from the set of filesystems to mount
shared_dir_array.each_with_index do |dir, index|
next unless dir == "/home" || dir == 'home'
id_array.delete_at(index)
type_array.delete_at(index)
shared_dir_array.delete(dir)
dns_name_array.delete_at(index)
mount_name_array.delete_at(index)
volume_junction_path_array.delete_at(index)
end

# Mount FSx shared directories with the lustre resource
lustre "mount fsx" do
fsx_fs_id_array id_array
fsx_fs_type_array type_array
fsx_shared_dir_array shared_dir_array
fsx_dns_name_array dns_name_array
fsx_mount_name_array mount_name_array
fsx_volume_junction_path_array volume_junction_path_array
action :mount
not_if { id_array.empty? }
end
119 changes: 111 additions & 8 deletions cookbooks/aws-parallelcluster-environment/recipes/config/mount_home.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,115 @@
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

volume "mount /home" do
action :mount
shared_dir '/home'
device(lazy { "#{node['cluster']['head_node_private_ip']}:#{node['cluster']['head_node_home_path']}" })
fstype 'nfs'
options node['cluster']['nfs']['hard_mount_options']
retries 10
retry_delay 6
return if on_docker?

shared_storage = { 'efs' => node['cluster']['efs_shared_dirs'].split(','),
'fsx' => node['cluster']['fsx_shared_dirs'].split(','),
'ebs' => node['cluster']['ebs_shared_dirs'].split(','),
# Making sure they are all the same object type, even though raid is just a string
'raid' => node['cluster']['raid_shared_dir'].split(','),
}

# Check if home is a shared filesystem
shared_home = 'none'
shared_storage.each do |type, dirs|
next unless dirs.include?('/home') || dirs.include?('home')
shared_home = type
break
end

if shared_home == 'none'
# Mount the NFS export to compute and login nodes, the head node will export /home later
case node['cluster']['node_type']
when 'ComputeFleet', 'LoginNode'
volume "mount /home" do
action :mount
shared_dir '/home'
device(lazy { "#{node['cluster']['head_node_private_ip']}:#{node['cluster']['head_node_home_path']}" })
fstype 'nfs'
options node['cluster']['nfs']['hard_mount_options']
retries 10
retry_delay 6
end
when 'HeadNode'
Chef::Log.info("Do not mount NFS shares on the HeadNode")
else
raise "node_type must be ComputeFleet, LoginNode or HeadNode"
end
else
# Identify the filesystem that is shared and mount it
include_recipe "aws-parallelcluster-environment::update_fs_mapping"
include_recipe "aws-parallelcluster-environment::backup_home_shared_data"
case shared_home
when 'efs'
shared_storage['efs'].each_with_index do |dir, index|
next unless dir == "/home" || dir == 'home'
efs "mount shared efs home" do
shared_dir_array [dir]
efs_fs_id_array [node['cluster']['efs_fs_ids'].split(',')[index]]
efs_encryption_in_transit_array [node['cluster']['efs_encryption_in_transits'].split(',')[index]]
efs_iam_authorization_array [node['cluster']['efs_iam_authorizations'].split(',')[index]]
action :mount
end
break
end
when 'fsx'
shared_storage['fsx'].each_with_index do |dir, index|
next unless dir == "/home" || dir == 'home'
lustre "mount shared fsx home" do
fsx_fs_id_array [node['cluster']['fsx_fs_ids'].split(',')[index]]
fsx_fs_type_array [node['cluster']['fsx_fs_types'].split(',')[index]]
fsx_shared_dir_array [dir]
fsx_dns_name_array [node['cluster']['fsx_dns_names'].split(',')[index]]
fsx_mount_name_array [node['cluster']['fsx_mount_names'].split(',')[index]]
fsx_volume_junction_path_array [node['cluster']['fsx_volume_junction_paths'].split(',')[index]]
action :mount
end
break
end
when 'ebs'
case node['cluster']['node_type']
when 'HeadNode'
shared_storage['ebs'].each_with_index do |dir, index|
next unless dir == "/home" || dir == 'home'
manage_ebs "add ebs /home" do
shared_dir_array [dir]
vol_array [node['cluster']['volume'].split(',')[index]]
action %i(mount export)
end
break
end
when 'ComputeFleet', 'LoginNode'
volume "mount /home" do
action :mount
shared_dir '/home'
device(lazy { "#{node['cluster']['head_node_private_ip']}:#{format_directory('/home')}" })
fstype 'nfs'
options node['cluster']['nfs']['hard_mount_options']
retries 10
retry_delay 6
end
end
when 'raid'
case node['cluster']['node_type']
when 'HeadNode'
raid "add raid /home" do
raid_shared_dir '/home'
raid_type node['cluster']['raid_type']
raid_vol_array node['cluster']['raid_vol_ids'].split(',')
action %i(mount export)
end
when 'ComputeFleet', 'LoginNode'
volume "mount raid /home volume over NFS" do
action :mount
shared_dir '/home'
device(lazy { "#{node['cluster']['head_node_private_ip']}:/home" })
fstype 'nfs'
options node['cluster']['nfs']['hard_mount_options']
retries 10
retry_delay 6
end
end
end
include_recipe "aws-parallelcluster-environment::restore_home_shared_data"
end
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

# Parse and get RAID shared directory info and turn into an array
raid_shared_dir = node['cluster']['raid_shared_dir']
return if raid_shared_dir.empty?
return if raid_shared_dir.empty? || raid_shared_dir == '/home' || raid_shared_dir == 'home'

case node['cluster']['node_type']
when 'HeadNode'
Expand Down
3 changes: 2 additions & 1 deletion cookbooks/aws-parallelcluster-environment/recipes/init.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
raise "internal_shared_storage_type must be ebs or efs"
end

include_recipe "aws-parallelcluster-environment::mount_home" if %w(ComputeFleet LoginNode).include? node['cluster']['node_type']
# Mount the home directory to all nodes if it is shared, otherwise mount the NFS share to compute and login nodes
include_recipe "aws-parallelcluster-environment::mount_home"

include_recipe "aws-parallelcluster-environment::network_interfaces"
include_recipe 'aws-parallelcluster-environment::imds'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# frozen_string_literal: true

#
# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

return if on_docker?

if node['cluster']['node_type'] == 'HeadNode'
# For each, backup the data to a temp location
# This is necessary to preserve any data in these directories that was
# generated during the build of ParallelCluster AMIs after converting to
# shared storage
bash "Backup /home" do
user 'root'
group 'root'
code <<-EOH
mkdir -p /tmp/home
rsync -a /home/ /tmp/home
EOH
end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# frozen_string_literal: true

#
# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

return if on_docker?

if node['cluster']['node_type'] == 'HeadNode'
# Restore the shared storage home data if it doesn't already exist
# This is necessary to preserve any data in these directories that was
# generated during the build of ParallelCluster AMIs after converting to
# shared storage and backed up to a temporary location previously
# Remove the backup after the copy is done
bash "Restore /home" do
user 'root'
group 'root'
code <<-EOH
rsync -a --ignore-existing /tmp/home/ /home
rm -rf /tmp/home/
EOH
end
end

0 comments on commit f905257

Please sign in to comment.