Skip to content

Commit

Permalink
Have systemd restart slurm processes a few times if they fail to start
Browse files Browse the repository at this point in the history
 - currently enabled by default only for slurmd; it can optionally be
   enabled for slurmdbd and slurmctld as well
 - #73
  • Loading branch information
martbhell committed Jan 18, 2017
1 parent c3d9f57 commit d3f2b3f
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 0 deletions.
7 changes: 7 additions & 0 deletions defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,13 @@ siteName: "io"
# nodeBase takes its value from siteName; nodeGpuBase is the literal "gpu".
# NOTE(review): presumably these are node-name prefixes for regular and GPU
# nodes - confirm against where they are used.
nodeBase: "{{ siteName }}"
nodeGpuBase: "gpu"

# systemd drop-in overrides for the slurm daemons.
# Each slurm_systemd_override_<daemon> switch decides whether the role
# installs the override file (systemd_override.conf.j2) for that daemon's
# unit. Only slurmd has it switched on by default.
slurm_systemd_override_slurmd: true
slurm_systemd_override_slurmdbd: false
slurm_systemd_override_slurmctld: false
# Rendered as Restart= and RestartSec= in the [Service] section of the
# override file.
slurm_systemd_override_restart: 'on-failure'
slurm_systemd_override_sec: '20'

# When slurm_munge_key_from_nfs is True, munge.key is copied from slurm_munge_key_nfs.
# When it is False (the default), munge.key is copied from files/ on the host where ansible runs.
# With ansible-pull, set slurm_munge_key_from_nfs to True.
Expand Down
4 changes: 4 additions & 0 deletions systemd_override.conf.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{#
  systemd drop-in shared by the slurmd, slurmdbd and slurmctld override tasks.
  It only sets the [Service] restart policy (Restart=) and the delay before a
  restart (RestartSec=), taken from slurm_systemd_override_restart and
  slurm_systemd_override_sec.
  This comment is whitespace-trimmed, so it leaves no trace in the rendered file.
-#}
# {{ ansible_managed }}
[Service]
Restart={{ slurm_systemd_override_restart }}
RestartSec={{ slurm_systemd_override_sec }}
8 changes: 8 additions & 0 deletions tasks/compute.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,13 @@
service: name=slurm enabled=no
when: ansible_os_family == "RedHat" and ansible_distribution_major_version == "7"

# Install the systemd drop-in for slurmd (RedHat family, major version 7 only).
# "| bool" makes the toggle safe when it arrives as a string (e.g. from
# --extra-vars), where a plain "False" would otherwise count as true.
- name: create systemd override directories for slurmd
  file:
    path: /etc/systemd/system/slurmd.service.d
    state: directory
    owner: root
    mode: "0755"
  when:
    - slurm_systemd_override_slurmd | bool
    - ansible_os_family == "RedHat"
    - ansible_distribution_major_version == "7"

- name: template in systemd override file for slurmd
  template:
    src: systemd_override.conf.j2
    dest: /etc/systemd/system/slurmd.service.d/slurmd_override.conf
    backup: false
    owner: root
    mode: "0644"
  register: slurmd_systemd_override_conf
  when:
    - slurm_systemd_override_slurmd | bool
    - ansible_os_family == "RedHat"
    - ansible_distribution_major_version == "7"

# systemd only picks up new or changed drop-in files when its configuration
# is reloaded, so reload it whenever the override file was (re)written.
# A skipped template task registers changed=false, so this is skipped too.
- name: reload systemd after changing the slurmd override
  command: systemctl daemon-reload
  when: slurmd_systemd_override_conf.changed | default(false)

# Ensure the service named by slurmd_service is running and enabled.
- name: start and enable slurmd
  service:
    name: "{{ slurmd_service }}"
    state: started
    enabled: true
9 changes: 9 additions & 0 deletions tasks/dbd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,12 @@

# Render dump-all-databases.sh into /usr/local/sbin, owned by root with
# mode 0750; no backup copy of a previous version is kept.
- name: template in dump-all-databases.sh
  template:
    src: dump-all-databases.sh.j2
    dest: /usr/local/sbin/dump-all-databases.sh
    owner: root
    mode: "0750"
    backup: false

# Install the systemd drop-in for slurmdbd (RedHat family, major version 7
# only). "| bool" makes the toggle safe when it arrives as a string (e.g. from
# --extra-vars), where a plain "False" would otherwise count as true.
- name: create systemd override directories for slurmdbd
  file:
    path: /etc/systemd/system/slurmdbd.service.d
    state: directory
    owner: root
    mode: "0755"
  when:
    - slurm_systemd_override_slurmdbd | bool
    - ansible_os_family == "RedHat"
    - ansible_distribution_major_version == "7"

- name: template in systemd override file for slurmdbd
  template:
    src: systemd_override.conf.j2
    dest: /etc/systemd/system/slurmdbd.service.d/slurmdbd_override.conf
    backup: false
    owner: root
    mode: "0644"
  register: slurmdbd_systemd_override_conf
  when:
    - slurm_systemd_override_slurmdbd | bool
    - ansible_os_family == "RedHat"
    - ansible_distribution_major_version == "7"

# systemd only picks up new or changed drop-in files when its configuration
# is reloaded, so reload it whenever the override file was (re)written.
# A skipped template task registers changed=false, so this is skipped too.
- name: reload systemd after changing the slurmdbd override
  command: systemctl daemon-reload
  when: slurmdbd_systemd_override_conf.changed | default(false)

9 changes: 9 additions & 0 deletions tasks/service.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,5 +115,14 @@
service: name=slurm enabled=no
when: ansible_os_family == "RedHat" and ansible_distribution_major_version == "7"

# Install the systemd drop-in for slurmctld (RedHat family, major version 7
# only). "| bool" makes the toggle safe when it arrives as a string (e.g. from
# --extra-vars), where a plain "False" would otherwise count as true.
- name: create systemd override directories for slurmctld
  file:
    path: /etc/systemd/system/slurmctld.service.d
    state: directory
    owner: root
    mode: "0755"
  when:
    - slurm_systemd_override_slurmctld | bool
    - ansible_os_family == "RedHat"
    - ansible_distribution_major_version == "7"

- name: template in systemd override file for slurmctld
  template:
    src: systemd_override.conf.j2
    dest: /etc/systemd/system/slurmctld.service.d/slurmctld_override.conf
    backup: false
    owner: root
    mode: "0644"
  register: slurmctld_systemd_override_conf
  when:
    - slurm_systemd_override_slurmctld | bool
    - ansible_os_family == "RedHat"
    - ansible_distribution_major_version == "7"

# systemd only picks up new or changed drop-in files when its configuration
# is reloaded, so reload it whenever the override file was (re)written.
# A skipped template task registers changed=false, so this is skipped too.
- name: reload systemd after changing the slurmctld override
  command: systemctl daemon-reload
  when: slurmctld_systemd_override_conf.changed | default(false)


# Ensure the service named by slurmctld_service is running and enabled.
- name: start and enable slurmctld
  service:
    name: "{{ slurmctld_service }}"
    state: started
    enabled: true

0 comments on commit d3f2b3f

Please sign in to comment.