From 786fc6c1bed2c7650ed9ed32ed754d8a8eae1135 Mon Sep 17 00:00:00 2001 From: Miguel Caballer Date: Wed, 25 Oct 2023 08:54:33 +0200 Subject: [PATCH 1/5] Ser drmaa version and test new slurm versions --- defaults/main.yml | 1 + tasks/drmaa.yml | 16 +++++++++++----- tests/test.yml | 2 ++ 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/defaults/main.yml b/defaults/main.yml index e539e2a..b3f8fb5 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -34,3 +34,4 @@ max_number_of_nodes: 3 slurm_vnode_prefix: wn # Install DRMAA library drmaa_lib_install: false +drmaa_lib_version: 1.0.7 diff --git a/tasks/drmaa.yml b/tasks/drmaa.yml index 39abe49..9336584 100644 --- a/tasks/drmaa.yml +++ b/tasks/drmaa.yml @@ -1,10 +1,5 @@ --- # Install DRMAA Library for slurm - - set_fact: - INSTALL_PATH: /opt - ID_FILE: "9" - FILENAME: "slurm-drmaa-1.0.7" - - name: Check if lib exists stat: path=/usr/local/lib/libdrmaa.so register: libstat @@ -14,10 +9,21 @@ - name: Install GCC package: name=gcc state=present + - set_fact: + INSTALL_PATH: /opt + FILENAME: "slurm-drmaa-{{ drmaa_lib_version }}" + - name: Download lib get_url: url: http://apps.man.poznan.pl/trac/slurm-drmaa/downloads/9 dest: "{{ INSTALL_PATH }}/{{ FILENAME }}.tgz" + when: drmaa_lib_version == "1.0.7" + + - name: Download lib + get_url: + url: https://github.com/natefoo/slurm-drmaa/releases/download/{{ drmaa_lib_version }}/slurm-drmaa-{{ drmaa_lib_version }}.tar.gz + dest: "{{ INSTALL_PATH }}/{{ FILENAME }}.tgz" + when: drmaa_lib_version != "1.0.7" - name: unarchive lib unarchive: diff --git a/tests/test.yml b/tests/test.yml index 2533bde..f4a58a7 100644 --- a/tests/test.yml +++ b/tests/test.yml @@ -9,3 +9,5 @@ slurm_wn_ips: ["127.0.0.1"] slurm_vnode_prefix: vnode- max_number_of_nodes: 1 + drmaa_lib_version: 1.1.4 + slurm_version: 22.05.10 From 330ed69affbd931ccfbff67bdb040ee21a7d8315 Mon Sep 17 00:00:00 2001 From: Miguel Caballer Date: Wed, 25 Oct 2023 09:03:44 +0200 Subject: [PATCH 2/5] Ser drmaa version and test new slurm versions --- tests/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test.yml b/tests/test.yml index f4a58a7..c754f8e 100644 --- a/tests/test.yml +++ b/tests/test.yml @@ -10,4 +10,4 @@ slurm_vnode_prefix: vnode- max_number_of_nodes: 1 drmaa_lib_version: 1.1.4 - slurm_version: 22.05.10 + slurm_version: 21.08.8 From 6f887145eb2d2fcd4262da824b5ed3e13e1cb401 Mon Sep 17 00:00:00 2001 From: Miguel Caballer Date: Wed, 25 Oct 2023 09:12:09 +0200 Subject: [PATCH 3/5] Fix slurm conf --- templates/slurm.conf.j2 | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/templates/slurm.conf.j2 b/templates/slurm.conf.j2 index 7e173c4..501c3b7 100644 --- a/templates/slurm.conf.j2 +++ b/templates/slurm.conf.j2 @@ -8,7 +8,7 @@ ControlMachine={{slurm_server_name}} #BackupAddr= # AuthType=auth/munge -CacheGroups=0 +#CacheGroups=0 {% if 'blcr' in templates %} CheckpointType=checkpoint/blcr @@ -101,7 +101,7 @@ FastSchedule=1 #SchedulerRootFilter=1 #SchedulerTimeSlice=30 SchedulerType=sched/backfill -SchedulerPort=7321 +#SchedulerPort=7321 SelectType=select/linear #SelectTypeParameters= # @@ -128,7 +128,9 @@ SelectType=select/linear #AccountingStoragePort= AccountingStorageType=accounting_storage/none #AccountingStorageUser= -AccountingStoreJobComment=YES + +#AccountingStoreJobComment=YES + ClusterName=cluster #DebugFlags= #JobCompHost= From bc0f28b368c421f7778692f5da9b68846773313e Mon Sep 17 00:00:00 2001 From: Miguel Caballer Date: Wed, 25 Oct 2023 09:36:30 +0200 Subject: [PATCH 4/5] Add slurm_conf_options var --- README.md | 40 ++++++++++ defaults/main.yml | 39 ++++++++++ tasks/main.yml | 9 +++ templates/slurm.conf.j2 | 157 +--------------------------------------- tests/test.yml | 1 + 5 files changed, 92 insertions(+), 154 deletions(-) diff --git a/README.md b/README.md index 7490e4e..1d98e12 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,46 @@ The variables that can be passed to this role and a brief description about them user: user1 # Install DRMAA library drmaa_lib_install: false + drmaa_lib_version: 1.0.7 + # SLURM default configuration options + slurm_default_conf_options: + AuthType: auth/munge + CryptoType: crypto/munge + FirstJobId: 1 + JobRequeue: 0 + JobSubmitPlugins: all_partitions + ProctrackType: proctrack/pgid + ReturnToService: 2 + SlurmctldPidFile: /var/run/slurmctld.pid + SlurmctldPort: 6817 + SlurmdPidFile: /var/run/slurmctld.pid + SlurmdPort: 6818 + SlurmdSpoolDir: /var/spool/slurm + SlurmUser: slurm + StateSaveLocation: /var/slurm/checkpoint + SwitchType: switch/none + TaskPlugin: task/none + InactiveLimit: 0 + KillWait: 30 + MessageTimeout: 30 + MinJobAge: 300 + SlurmctldTimeout: 30 + SlurmdTimeout: 40 + Waittime: 0 + FastSchedule: 1 + SchedulerType: sched/backfill + SelectType: select/linear + AccountingStorageType: accounting_storage/none + ClusterName: cluster + JobCompType: jobcomp/none + JobAcctGatherFrequency: 30 + JobAcctGatherType: jobacct_gather/none + SlurmctldDebug: debug5 + SlurmctldLogFile: /var/log/slurm/slurmctld.log + SlurmdDebug: debug5 + SlurmdLogFile: /var/log/slurm/slurmd.log + # SLURM user configuration options + slurm_conf_options: {} Example Playbook ---------------- diff --git a/defaults/main.yml b/defaults/main.yml index b3f8fb5..3bf7cb3 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -35,3 +35,42 @@ slurm_vnode_prefix: wn # Install DRMAA library drmaa_lib_install: false drmaa_lib_version: 1.0.7 +# SLURM default configuration options +slurm_default_conf_options: + AuthType: auth/munge + CryptoType: crypto/munge + FirstJobId: 1 + JobRequeue: 0 + JobSubmitPlugins: all_partitions + ProctrackType: proctrack/pgid + ReturnToService: 2 + SlurmctldPidFile: /var/run/slurmctld.pid + SlurmctldPort: 6817 + SlurmdPidFile: /var/run/slurmctld.pid + SlurmdPort: 6818 + SlurmdSpoolDir: /var/spool/slurm + SlurmUser: slurm + StateSaveLocation: /var/slurm/checkpoint + SwitchType: switch/none + TaskPlugin: task/none + InactiveLimit: 0 + KillWait: 30 + MessageTimeout: 30 + MinJobAge: 300 + SlurmctldTimeout: 30 + SlurmdTimeout: 40 + Waittime: 0 + FastSchedule: 1 + SchedulerType: sched/backfill + SelectType: select/linear + AccountingStorageType: accounting_storage/none + ClusterName: cluster + JobCompType: jobcomp/none + JobAcctGatherFrequency: 30 + JobAcctGatherType: jobacct_gather/none + SlurmctldDebug: debug5 + SlurmctldLogFile: /var/log/slurm/slurmctld.log + SlurmdDebug: debug5 + SlurmdLogFile: /var/log/slurm/slurmd.log +# SLURM user configuration options +slurm_conf_options: {} \ No newline at end of file diff --git a/tasks/main.yml b/tasks/main.yml index 8ec2744..83434bf 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -58,6 +58,15 @@ - /var/slurm/checkpoint - /etc/slurm + - name: Use BLCR as checkpointing type + set_fact: + slurm_conf_options: "{{ slurm_default_conf_options | combine({'CheckpointType': 'checkpoint/blcr'}) }}" + when : 'blcr' in templates + + - name: Update default options with user options + set_fact: + slurm_conf_options: "{{ slurm_default_conf_options | combine(slurm_conf_options) }}" + - name: Include "{{slurm_type_of_node}}" SLURM recipe include_tasks: "{{slurm_type_of_node}}.yaml" diff --git a/templates/slurm.conf.j2 b/templates/slurm.conf.j2 index 501c3b7..a808ec9 100644 --- a/templates/slurm.conf.j2 +++ b/templates/slurm.conf.j2 @@ -3,162 +3,11 @@ # See the slurm.conf man page for more information. # ControlMachine={{slurm_server_name}} -#ControlAddr= -#BackupController= -#BackupAddr= -# -AuthType=auth/munge -#CacheGroups=0 - -{% if 'blcr' in templates %} -CheckpointType=checkpoint/blcr -{% endif %} - -CryptoType=crypto/munge -#DisableRootJobs=NO -#EnforcePartLimits=NO -#Epilog=/usr/local/slurm/epilog_controller -#EpilogSlurmctld=/usr/local/slurm/epilog_controller -FirstJobId=1 -#MaxJobId=999999 -#GresTypes= -#GroupUpdateForce=0 -#GroupUpdateTime=600 -#JobCheckpointDir=/var/slurm/checkpoint -#JobCredentialPrivateKey= -#JobCredentialPublicCertificate= -#JobFileAppend=0 -JobRequeue=0 -JobSubmitPlugins=all_partitions -#KillOnBadExit=0 -#Licenses=foo*4,bar -#MailProg=/bin/mail -#MaxJobCount=5000 -#MaxStepCount=40000 -#MaxTasksPerNode=128 -#MpiDefault=openmpi -#MpiParams=ports=12000-12999 -#PluginDir= -#PlugStackConfig= -#PrivateData=jobs -ProctrackType=proctrack/pgid -#Prolog= -#PrologSlurmctld= -#PropagatePrioProcess=0 -#PropagateResourceLimits= -#PropagateResourceLimitsExcept= -ReturnToService=2 -#SallocDefaultCommand= -SlurmctldPidFile=/var/run/slurmctld.pid -SlurmctldPort=6817 -SlurmdPidFile=/var/run/slurmctld.pid -SlurmdPort=6818 -SlurmdSpoolDir=/var/spool/slurm -SlurmUser=slurm -#SlurmdUser=root -#SrunEpilog= -#SrunProlog= -StateSaveLocation=/var/slurm/checkpoint -SwitchType=switch/none -#TaskEpilog= -TaskPlugin=task/none -#TaskPluginParam= -#TaskProlog= -#TopologyPlugin=topology/tree -#TmpFs=/tmp -#TrackWCKey=no -#TreeWidth= -#UnkillableStepProgram= -#UsePAM=0 -# -# -# TIMERS -#BatchStartTimeout=10 -#CompleteWait=0 -#EpilogMsgTime=2000 -#GetEnvTimeout=2 -#HealthCheckInterval=0 -#HealthCheckProgram= -InactiveLimit=0 -KillWait=30 -MessageTimeout=30 -#ResvOverRun=0 -MinJobAge=300 -#OverTimeLimit=0 -SlurmctldTimeout=30 -SlurmdTimeout=40 -#SlurmctldTimeout=120 -#SlurmdTimeout=300 -#UnkillableStepTimeout=60 -#VSizeFactor=0 -Waittime=0 -# -# -# SCHEDULING -#DefMemPerCPU=0 -FastSchedule=1 -#MaxMemPerCPU=0 -#SchedulerRootFilter=1 -#SchedulerTimeSlice=30 -SchedulerType=sched/backfill -#SchedulerPort=7321 -SelectType=select/linear -#SelectTypeParameters= -# -# -# JOB PRIORITY -#PriorityType=priority/basic -#PriorityDecayHalfLife= -#PriorityCalcPeriod= -#PriorityFavorSmall= -#PriorityMaxAge= -#PriorityUsageResetPeriod= -#PriorityWeightAge= -#PriorityWeightFairshare= -#PriorityWeightJobSize= -#PriorityWeightPartition= -#PriorityWeightQOS= -# -# -# LOGGING AND ACCOUNTING -#AccountingStorageEnforce=0 -#AccountingStorageHost= -#AccountingStorageLoc= -#AccountingStoragePass= -#AccountingStoragePort= -AccountingStorageType=accounting_storage/none -#AccountingStorageUser= -#AccountingStoreJobComment=YES +{% for key, value in slurm_conf_options.items() %} +{{key}}={{value}} +{% endfor %} -ClusterName=cluster -#DebugFlags= -#JobCompHost= -#JobCompLoc= -#JobCompPass= -#JobCompPort= -JobCompType=jobcomp/none -#JobCompUser= -JobAcctGatherFrequency=30 -JobAcctGatherType=jobacct_gather/none -SlurmctldDebug=debug5 -SlurmctldLogFile=/var/log/slurm/slurmctld.log -SlurmdDebug=debug5 -SlurmdLogFile=/var/log/slurm/slurmd.log -#SlurmSchedLogFile= -#SlurmSchedLogLevel= -# -# -# POWER SAVE SUPPORT FOR IDLE NODES (optional) -#SuspendProgram= -#ResumeProgram= -#SuspendTimeout= -#ResumeTimeout= -#ResumeRate= -#SuspendExcNodes= -#SuspendExcParts= -#SuspendRate= -#SuspendTime= # # # COMPUTE NODES diff --git a/tests/test.yml b/tests/test.yml index c754f8e..a9fae57 100644 --- a/tests/test.yml +++ b/tests/test.yml @@ -9,5 +9,6 @@ slurm_wn_ips: ["127.0.0.1"] slurm_vnode_prefix: vnode- max_number_of_nodes: 1 + drmaa_lib_install: true drmaa_lib_version: 1.1.4 slurm_version: 21.08.8 From ed66b0cec06b77e03556636f218e68c89c4dbed8 Mon Sep 17 00:00:00 2001 From: Miguel Caballer Date: Wed, 25 Oct 2023 09:41:08 +0200 Subject: [PATCH 5/5] fix typo --- tasks/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/main.yml b/tasks/main.yml index 83434bf..609a81b 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -61,7 +61,7 @@ - name: Use BLCR as checkpointing type set_fact: slurm_conf_options: "{{ slurm_default_conf_options | combine({'CheckpointType': 'checkpoint/blcr'}) }}" - when : 'blcr' in templates + when : '"blcr" in templates' - name: Update default options with user options set_fact: