From 81395b4ab06887b9625b0272e6dd5e1df0e862bc Mon Sep 17 00:00:00 2001
From: Jeff Ohrstrom
Date: Fri, 8 Nov 2024 11:10:59 -0500
Subject: [PATCH] simplify and fix node choice

---
 form.yml       | 83 ++++++++------------------------------------------
 submit.yml.erb | 32 ++-----------------
 2 files changed, 16 insertions(+), 99 deletions(-)

diff --git a/form.yml b/form.yml
index 9f43582..b50e764 100644
--- a/form.yml
+++ b/form.yml
@@ -18,88 +18,31 @@ attributes:
     help: |
       Number of cores on node type (4 GB per core unless requesting whole node).
       Leave blank if requesting full node.
-    min: 0
-    max: 28
+    min: 1
+    max: 96
     step: 1
   node_type:
     widget: select
     label: "Node type"
     help: |
-      - **Standard Compute**<br>
-        These are standard HPC machines. Owens has 648 of these nodes with 40
-        cores and 128 GB of memory. Pitzer has 224 of these nodes with 40 cores and
-        340 of these nodes with 48 cores. All pitzer nodes have 192 GB of RAM. Chosing "any" as the node type will decrease
-        your wait time.
-      - **GPU Enabled**<br>
-        These are HPC machines with GPUs. Owens has 160 nodes with 1 [NVIDIA Tesla P100 GPU]
-        and Pitzer has 74 nodes with 2 [NVIDIA Tesla V100 GPUs]. They have the same
-        CPU and memory characteristics of standard compute. However, Pitzer's 40 core machines
-        have 2 GPUs with 16 GB of RAM; and Pitzer's 48 core machines have 2 GPUs with 32 GB of RAM.
-        Dense GPU types have 4 GPUs with 16 GB of RAM.
-      - **Large Memory**<br>
-        These are HPC machines with very large amounts of memory. Owens has 16 hugemem nodes
-        with 48 cores and 1.5 TB of RAM. Pitzer has 4 hugemem nodes with 3 TB of RAM and 80 cores.
-        Pitzer also has 12 Largmem nodes which have 48 cores with 768 GB of RAM.
-
-      [NVIDIA Tesla P100 GPU]: http://www.nvidia.com/object/tesla-p100.html
-      [NVIDIA Tesla V100 GPUs]: https://www.nvidia.com/en-us/data-center/v100/
+      - **any** - (*96 cores*) Use any available Cardinal node. This reduces the
+        wait time as there are no node requirements.
+      - **gpu** - (*96 cores*) Use a Cardinal node that has an [NVIDIA H100
+        GPU](https://www.nvidia.com/en-us/data-center/h100/). There are 32
+        of these nodes on Cardinal.
+      - **hugemem** - (*96 cores*) Use a Cardinal node that has 2 TB of
+        available RAM as well as 96 cores. There are 16 of these nodes on
+        Cardinal. Requesting these nodes always reserves the entire node.
     options:
       - [
          "any", "any",
-         data-max-num-cores-for-cluster-owens: 28,
-         data-max-num-cores-for-cluster-pitzer: 48,
-         data-max-num-cores-for-cluster-cardinal: 96,
-        ]
-      - [
-         "48 core", "any-48core",
-         data-max-num-cores-for-cluster-pitzer: 48,
-         data-option-for-cluster-owens: false,
-         data-option-for-cluster-cardinal: false,
-        ]
-      - [
-         "40 core", "any-40core",
-         data-max-num-cores-for-cluster-pitzer: 40,
-         data-option-for-cluster-owens: false,
-         data-option-for-cluster-cardinal: false,
+         data-min-num-cores: 1,
        ]
       - [
          "any gpu", "gpu",
-         data-max-num-cores-for-cluster-owens: 28,
-         data-max-num-cores-for-cluster-pitzer: 48,
-         data-max-num-cores-for-cluster-cardinal: 96,
-        ]
-      - [
-         "40 core gpu", "gpu-40core",
-         data-max-num-cores-for-cluster-pitzer: 40,
-         data-option-for-cluster-owens: false,
-         data-option-for-cluster-cardinal: false,
-        ]
-      - [
-         "48 core gpu", "gpu-48core",
-         data-max-num-cores-for-cluster-pitzer: 48,
-         data-option-for-cluster-owens: false,
-         data-option-for-cluster-cardinal: false,
-        ]
-      - [
-         "largemem", "largemem",
-         data-min-num-cores-for-cluster-pitzer: 24,
-         data-max-num-cores-for-cluster-pitzer: 48,
-         data-option-for-cluster-owens: false,
-         data-option-for-cluster-cardinal: false,
+         data-min-num-cores: 1,
        ]
       - [
          "hugemem", "hugemem",
-         data-min-num-cores-for-cluster-owens: 4,
-         data-max-num-cores-for-cluster-owens: 48,
-         data-min-num-cores-for-cluster-pitzer: 20,
-         data-max-num-cores-for-cluster-pitzer: 80,
-         data-option-for-cluster-cardinal: false,
-        ]
-      - [
-         "debug", "debug",
-         data-max-num-cores-for-cluster-owens: 28,
-         data-max-num-cores-for-cluster-pitzer: 48,
-         data-option-for-cluster-owens: false,
-         data-option-for-cluster-pitzer: false,
-         data-option-for-cluster-cardinal: false,
+         data-min-num-cores: 47,
        ]
diff --git a/submit.yml.erb b/submit.yml.erb
index f953a5a..885f8b5 100644
--- a/submit.yml.erb
+++ b/submit.yml.erb
@@ -4,39 +4,13 @@
   raise(StandardError, err_msg) unless CurrentUser.group_names.include?('matlab')
 
   nodes = 1
-
-  cores_lookup = {
-    "hugemem" => {"pitzer" => "80", "owens" => "48"},
-    "largemem" => {"pitzer" => "48", "owens" => "28"},
-
-    "any" => {"pitzer" => "40", "owens" => "28"},
-    "gpu" => {"pitzer" => "48", "owens" => "28"},
-
-    "any-48core" => {"pitzer" => "48", "owens" => "28"},
-    "gpu-48core" => {"pitzer" => "48", "owens" => "28"},
-
-    "any-40core" => {"pitzer" => "40", "owens" => "28"},
-    "gpu-40core" => {"pitzer" => "40", "owens" => "28"},
-  }
-
-  max_cores = cores_lookup[node_type][cluster]
-  ppn = num_cores.blank? ? max_cores : num_cores.to_i
-
+  ppn = num_cores.blank? ? 96 : num_cores.to_i
 
   case node_type
-  when "hugemem"
-    partition = bc_num_slots.to_i > 1 ? "hugemem-parallel" : "hugemem"
-    slurm_args = [ "--nodes", "#{nodes}", "--ntasks-per-node", "#{ppn}", "--partition", partition ]
+  when "hugemem"
+    slurm_args = [ "--nodes", "#{nodes}", "--ntasks-per-node", "#{ppn}", "--partition", "hugemem" ]
   when "gpu"
     slurm_args = [ "--nodes", "#{nodes}", "--ntasks-per-node", "#{ppn}", "--gpus-per-node", "1" ]
-  when "any40-core"
-    slurm_args = [ "--nodes", "#{nodes}", "--ntasks-per-node", "#{ppn}", "--contstraint", "48core" ]
-  when "any48-core"
-    slurm_args = [ "--nodes", "#{nodes}", "--ntasks-per-node", "#{ppn}", "--contstraint", "48core" ]
-  when "gpu-48core"
-    slurm_args = [ "--nodes", "#{nodes}", "--ntasks-per-node", "#{ppn}", "--gpus-per-node", "1", "--constraint", "48core" ]
-  when "gpu-40core"
-    slurm_args = [ "--nodes", "#{nodes}", "--ntasks-per-node", "#{ppn}", "--gpus-per-node", "1", "--constraint", "40core" ]
   else
     slurm_args = [ "--nodes", "#{nodes}", "--ntasks-per-node", "#{ppn}" ]
   end
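
For a quick sanity check of the simplified branching above, here is a minimal standalone Ruby sketch of the logic the new submit.yml.erb encodes. It is only an approximation: `build_slurm_args` is a hypothetical helper name, `num_cores` is treated as a plain (possibly empty) string, and `blank?` is approximated with `to_s.strip.empty?` since ActiveSupport is not assumed outside the OnDemand template.

```ruby
# Standalone approximation of the branching added to submit.yml.erb.
# The node type values and the 96-core default come from the patch;
# the helper name and everything else here are assumptions.
def build_slurm_args(node_type, num_cores, nodes: 1, max_cores: 96)
  # Mirrors `num_cores.blank? ? 96 : num_cores.to_i` without ActiveSupport.
  ppn = num_cores.to_s.strip.empty? ? max_cores : num_cores.to_i

  base = ["--nodes", nodes.to_s, "--ntasks-per-node", ppn.to_s]

  case node_type
  when "hugemem"
    # Huge-memory jobs are routed to the hugemem partition.
    base + ["--partition", "hugemem"]
  when "gpu"
    # GPU jobs request a single GPU per node.
    base + ["--gpus-per-node", "1"]
  else
    # "any": no extra scheduling constraints, shortest queue wait.
    base
  end
end

# Example: arguments produced for each node type the form now offers.
%w[any gpu hugemem].each do |type|
  puts "#{type}: #{build_slurm_args(type, '').join(' ')}"
end
```

With a blank `num_cores`, all three node types request 96 tasks per node, matching the new form help text; only `gpu` and `hugemem` add extra flags.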