# Patch summary: optional systemd drop-in overrides for the Slurm daemons.
#
# NOTE(review): in this copy of the patch the original line breaks and
# indentation have been collapsed into single spaces, so it cannot be fed to
# `git apply` / `patch` as-is. Regenerate it from the repository before use;
# the context-line whitespace cannot be reconstructed from this text alone.
#
# Files touched (from the `diff --git` headers below):
#   defaults/main.yml         - five new variables:
#                                 slurm_systemd_override_slurmd    (True)
#                                 slurm_systemd_override_slurmdbd  (False)
#                                 slurm_systemd_override_slurmctld (False)
#                                 slurm_systemd_override_restart   ("on-failure")
#                                 slurm_systemd_override_sec       ("20")
#   systemd_override.conf.j2  - new template: a [Service] section that sets
#                               Restart= and RestartSec= from the two
#                               variables above.
#   tasks/compute.yml         - slurmd:    create slurmd.service.d and template
#                               slurmd_override.conf into it.
#   tasks/dbd.yml             - slurmdbd:  same pair of tasks for
#                               slurmdbd.service.d/slurmdbd_override.conf.
#   tasks/service.yml         - slurmctld: same pair of tasks for
#                               slurmctld.service.d/slurmctld_override.conf.
#
# Every added task is guarded by its per-daemon flag AND
# ansible_os_family == "RedHat" AND ansible_distribution_major_version == "7".
# Directories are created under /etc/systemd/system with owner=root mode=0755;
# override files are written with owner=root mode=0644 and backup=no.
#
# In compute.yml and service.yml the added tasks appear ahead of the
# "start and enable ..." task text; in dbd.yml they follow the
# dump-all-databases.sh task.
#
# NOTE(review): no `systemctl daemon-reload` (or handler) is visible in this
# patch. systemd normally needs a reload to pick up a new or changed drop-in
# for a unit it has already loaded - confirm this is handled elsewhere.
# NOTE(review): the template is added at the repository root rather than under
# templates/ - confirm the role's template lookup finds it there.
# NOTE(review): the defaults comment says "if they fail to start"; presumably
# Restart=on-failure also covers later unclean exits - confirm the wording.
#
diff --git a/defaults/main.yml b/defaults/main.yml index 82f5d54..9eb2f5d 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -43,6 +43,13 @@ siteName: "io" nodeBase: "{{ siteName }}" nodeGpuBase: "gpu" +# Have systemd restart slurm daemons if they fail to start +slurm_systemd_override_slurmd: True +slurm_systemd_override_slurmdbd: False +slurm_systemd_override_slurmctld: False +slurm_systemd_override_restart: "on-failure" +slurm_systemd_override_sec: "20" + # By setting slurm_munge_key_from_nfs to True we copy the munge.key from slurm_munge_key_nfs # If it's False then we copy it from files/ where ansible runs # This is the default. So with ansible-pull we set slurm_munge_key_nfs to True diff --git a/systemd_override.conf.j2 b/systemd_override.conf.j2 new file mode 100644 index 0000000..07425fb --- /dev/null +++ b/systemd_override.conf.j2 @@ -0,0 +1,4 @@ +# {{ ansible_managed }} +[Service] +Restart={{ slurm_systemd_override_restart }} +RestartSec={{ slurm_systemd_override_sec }} diff --git a/tasks/compute.yml b/tasks/compute.yml index 8201312..6a05047 100644 --- a/tasks/compute.yml +++ b/tasks/compute.yml @@ -37,5 +37,13 @@ service: name=slurm enabled=no when: ansible_os_family == "RedHat" and ansible_distribution_major_version == "7" + - name: create systemd override directories for slurmd + file: path="/etc/systemd/system/slurmd.service.d" state=directory owner=root mode=0755 + when: slurm_systemd_override_slurmd and ansible_os_family == "RedHat" and ansible_distribution_major_version == "7" + + - name: template in systemd override file for slurmd + template: src=systemd_override.conf.j2 dest=/etc/systemd/system/slurmd.service.d/slurmd_override.conf backup=no owner=root mode=0644 + when: slurm_systemd_override_slurmd and ansible_os_family == "RedHat" and ansible_distribution_major_version == "7" + + - name: start and enable slurmd service: name={{ slurmd_service }} state=started enabled=yes diff --git a/tasks/dbd.yml b/tasks/dbd.yml index 2d69959..7e990b8 
100644 --- a/tasks/dbd.yml +++ b/tasks/dbd.yml @@ -92,3 +92,12 @@ - name: template in dump-all-databases.sh template: src=dump-all-databases.sh.j2 dest=/usr/local/sbin/dump-all-databases.sh owner=root mode=0750 backup=no + + - name: create systemd override directories for slurmdbd + file: path="/etc/systemd/system/slurmdbd.service.d" state=directory owner=root mode=0755 + when: slurm_systemd_override_slurmdbd and ansible_os_family == "RedHat" and ansible_distribution_major_version == "7" + + - name: template in systemd override file for slurmdbd + template: src=systemd_override.conf.j2 dest=/etc/systemd/system/slurmdbd.service.d/slurmdbd_override.conf backup=no owner=root mode=0644 + when: slurm_systemd_override_slurmdbd and ansible_os_family == "RedHat" and ansible_distribution_major_version == "7" + diff --git a/tasks/service.yml b/tasks/service.yml index c20096c..66a9237 100644 --- a/tasks/service.yml +++ b/tasks/service.yml @@ -115,5 +115,14 @@ service: name=slurm enabled=no when: ansible_os_family == "RedHat" and ansible_distribution_major_version == "7" + - name: create systemd override directories for slurmctld + file: path="/etc/systemd/system/slurmctld.service.d" state=directory owner=root mode=0755 + when: slurm_systemd_override_slurmctld and ansible_os_family == "RedHat" and ansible_distribution_major_version == "7" + + - name: template in systemd override file for slurmctld + template: src=systemd_override.conf.j2 dest=/etc/systemd/system/slurmctld.service.d/slurmctld_override.conf backup=no owner=root mode=0644 + when: slurm_systemd_override_slurmctld and ansible_os_family == "RedHat" and ansible_distribution_major_version == "7" + + + - name: start and enable slurmctld service: name={{ slurmctld_service }} state=started enabled=yes