From b434bdab149ebe24510df2d4a55dd9ba9746cd40 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Thu, 7 Mar 2024 10:31:00 +0000 Subject: [PATCH 1/3] fix filebeat container pull PR#351 (#373) --- ansible/roles/filebeat/tasks/install.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/roles/filebeat/tasks/install.yml b/ansible/roles/filebeat/tasks/install.yml index 8e64722ec..6514e3028 100644 --- a/ansible/roles/filebeat/tasks/install.yml +++ b/ansible/roles/filebeat/tasks/install.yml @@ -15,3 +15,4 @@ - name: Reload filebeat unit file command: systemctl daemon-reload when: _filebeat_unit.changed + become: true From 17f2b2dcb6b2255db587c74e86bba475165c8195 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Thu, 7 Mar 2024 10:31:14 +0000 Subject: [PATCH 2/3] add hpctests_pre_cmd option (#375) --- ansible/roles/hpctests/README.md | 1 + ansible/roles/hpctests/defaults/main.yml | 1 + ansible/roles/hpctests/templates/hpl-build.sh.j2 | 2 +- ansible/roles/hpctests/templates/hpl-solo.sh.j2 | 2 +- ansible/roles/hpctests/templates/pingmatrix.sh.j2 | 1 + ansible/roles/hpctests/templates/pingpong.sh.j2 | 1 + 6 files changed, 6 insertions(+), 2 deletions(-) diff --git a/ansible/roles/hpctests/README.md b/ansible/roles/hpctests/README.md index ee37791ec..5c5415158 100644 --- a/ansible/roles/hpctests/README.md +++ b/ansible/roles/hpctests/README.md @@ -33,6 +33,7 @@ Role Variables - `hpctests_hpl_arch`: Optional, default 'linux64'. Arbitrary architecture name for HPL build. HPL is compiled on the first compute node of those selected (see `hpctests_nodes`), so this can be used to create different builds for different types of compute node. The following variables should not generally be changed: +- `hpctests_pre_cmd`: Optional. Command(s) to include in sbatch templates before module load commands. - `hpctests_pingmatrix_modules`: Optional. List of modules to load for pingmatrix test. Defaults are suitable for OpenHPC 2.x cluster using the required packages. - `hpctests_pingpong_modules`: As above but for pingpong test. - `hpctests_pingpong_plot`: Whether to plot pingpong results. Default `yes`. diff --git a/ansible/roles/hpctests/defaults/main.yml b/ansible/roles/hpctests/defaults/main.yml index 280fd454e..30ddd8952 100644 --- a/ansible/roles/hpctests/defaults/main.yml +++ b/ansible/roles/hpctests/defaults/main.yml @@ -1,5 +1,6 @@ --- hpctests_rootdir: +hpctests_pre_cmd: '' hpctests_pingmatrix_modules: [gnu12 openmpi4] hpctests_pingpong_modules: [gnu12 openmpi4 imb] hpctests_pingpong_plot: yes diff --git a/ansible/roles/hpctests/templates/hpl-build.sh.j2 b/ansible/roles/hpctests/templates/hpl-build.sh.j2 index f243a08f7..9300231e8 100644 --- a/ansible/roles/hpctests/templates/hpl-build.sh.j2 +++ b/ansible/roles/hpctests/templates/hpl-build.sh.j2 @@ -8,7 +8,7 @@ {%if hpctests_nodes is defined %}#SBATCH --nodelist={{ hpctests_computes.stdout_lines[0] }}{% endif %} echo HPL arch: {{ hpctests_hpl_arch }} - +{{ hpctests_pre_cmd }} module load {{ hpctests_hpl_modules | join(' ' ) }} make arch={{ hpctests_hpl_arch }} clean_arch_all make arch={{ hpctests_hpl_arch }} diff --git a/ansible/roles/hpctests/templates/hpl-solo.sh.j2 b/ansible/roles/hpctests/templates/hpl-solo.sh.j2 index cc7d2b4dd..6178cd60e 100644 --- a/ansible/roles/hpctests/templates/hpl-solo.sh.j2 +++ b/ansible/roles/hpctests/templates/hpl-solo.sh.j2 @@ -15,6 +15,6 @@ echo SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST echo SLURM_JOB_ID: $SLURM_JOB_ID echo UCX_NET_DEVICES: $UCX_NET_DEVICES echo HPL arch: {{ hpctests_hpl_arch }} - +{{ hpctests_pre_cmd }} module load {{ hpctests_hpl_modules | join(' ' ) }} mpirun ./xhpl-{{ hpctests_hpl_arch }} diff --git a/ansible/roles/hpctests/templates/pingmatrix.sh.j2 b/ansible/roles/hpctests/templates/pingmatrix.sh.j2 index 17fb3fd6a..d886e9ac8 100644 --- a/ansible/roles/hpctests/templates/pingmatrix.sh.j2 +++ b/ansible/roles/hpctests/templates/pingmatrix.sh.j2 @@ -12,6 +12,7 @@ export UCX_NET_DEVICES={{ hpctests_ucx_net_devices }} echo SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST echo SLURM_JOB_ID: $SLURM_JOB_ID echo UCX_NET_DEVICES: $UCX_NET_DEVICES +{{ hpctests_pre_cmd }} module load {{ hpctests_pingmatrix_modules | join(' ' ) }} mpicc -o nxnlatbw mpi_nxnlatbw.c diff --git a/ansible/roles/hpctests/templates/pingpong.sh.j2 b/ansible/roles/hpctests/templates/pingpong.sh.j2 index e74e52539..4dc2eebd5 100644 --- a/ansible/roles/hpctests/templates/pingpong.sh.j2 +++ b/ansible/roles/hpctests/templates/pingpong.sh.j2 @@ -12,6 +12,7 @@ export UCX_NET_DEVICES={{ hpctests_ucx_net_devices }} echo SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST echo SLURM_JOB_ID: $SLURM_JOB_ID echo UCX_NET_DEVICES: $UCX_NET_DEVICES +{{ hpctests_pre_cmd }} module load {{ hpctests_pingpong_modules | join(' ' ) }} #srun --mpi=pmi2 IMB-MPI1 pingpong # doesn't work in ohpc v2.1 From 6ca97fede4fa8bd13139bd7e3ddebb55f457e290 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Thu, 7 Mar 2024 10:35:53 +0000 Subject: [PATCH 3/3] Improve leafcloud CI support (#374) * use proper leafcloud jumphost user * use cheaper leafcloud flavor * add leafcloud packer vars file --- environments/.stackhpc/LEAFCLOUD.pkrvars.hcl | 11 +++++++++++ .../.stackhpc/inventory/group_vars/all/bastion.yml | 2 +- environments/.stackhpc/terraform/LEAFCLOUD.tfvars | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) create mode 100644 environments/.stackhpc/LEAFCLOUD.pkrvars.hcl diff --git a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl new file mode 100644 index 000000000..2f325d754 --- /dev/null +++ b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl @@ -0,0 +1,11 @@ +flavor = "en1.xsmall" +use_blockstorage_volume = true +volume_size = 12 # GB. Compatible with SMS-lab's general.v1.tiny +image_disk_format = "qcow2" +networks = ["909e49e8-6911-473a-bf88-0495ca63853c"] # slurmapp-ci +source_image_name = "openhpc-230804-1754-80b8d714" # https://github.com/stackhpc/ansible-slurm-appliance/pull/298 +fatimage_source_image_name = "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2" +ssh_keypair_name = "slurm-app-ci" +ssh_private_key_file = "~/.ssh/id_ed25519" +security_groups = ["default", "SSH"] +floating_ip_network = "external" diff --git a/environments/.stackhpc/inventory/group_vars/all/bastion.yml b/environments/.stackhpc/inventory/group_vars/all/bastion.yml index 626f9ff5c..94287827c 100644 --- a/environments/.stackhpc/inventory/group_vars/all/bastion.yml +++ b/environments/.stackhpc/inventory/group_vars/all/bastion.yml @@ -4,7 +4,7 @@ bastion_config: user: slurm-app-ci ip: 128.232.222.183 LEAFCLOUD: - user: rocky + user: slurm-app-ci ip: 195.114.30.222 # NB: The bastion_{user,ip} variables are used directly in the CI workflow too bastion_user: "{{ bastion_config[ci_cloud].user }}" diff --git a/environments/.stackhpc/terraform/LEAFCLOUD.tfvars b/environments/.stackhpc/terraform/LEAFCLOUD.tfvars index ec152ca42..5e73896c8 100644 --- a/environments/.stackhpc/terraform/LEAFCLOUD.tfvars +++ b/environments/.stackhpc/terraform/LEAFCLOUD.tfvars @@ -1,6 +1,6 @@ cluster_net = "slurmapp-ci" cluster_subnet = "slurmapp-ci" control_node_flavor = "ec1.medium" # small ran out of memory, medium gets down to ~100Mi mem free on deployment -other_node_flavor = "ec1.small" +other_node_flavor = "en1.xsmall" state_volume_type = "unencrypted" home_volume_type = "unencrypted"