Skip to content

Commit

Permalink
Fix/1898 (#1899)
Browse files Browse the repository at this point in the history
* move update packages in scheduler's roles

* move update package role to main slurm task

* retry apt update

* restart service after update

* check pbs connection

* stop and start pbs services

* typo in sbatch instruction
  • Loading branch information
xpillons authored Apr 26, 2024
1 parent 0acf6e7 commit dab1e1b
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#SBATCH -p hpc
#SBATCH -t 5
#SBATCH --export=NONE
#SBACTH --exclusive
#SBATCH --exclusive

source /etc/profile.d/modules.sh
module use /usr/share/Modules/modulefiles
Expand Down
2 changes: 1 addition & 1 deletion playbooks/ood.yml
Original file line number Diff line number Diff line change
Expand Up @@ -628,4 +628,4 @@
apply:
become: true
vars:
packages_to_exclude_from_upgrade: "{{ (['ondemand','amlfs', 'jetpack8'] if ( lustre.create | default(false)) else ['ondemand', 'jetpack8']) }}"
packages_to_exclude_from_upgrade: "{{ (['ondemand','amlfs'] if ( lustre.create | default(false)) else ['ondemand']) }}"
34 changes: 29 additions & 5 deletions playbooks/roles/pbsserver/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,37 @@
args:
chdir: /opt/cycle/pbspro

# Unconditionally bounce the `pbs` service at this point in the role;
# `state: restarted` restarts it whether or not it was already running.
- name: Restart pbs-server
  ansible.builtin.service:
    state: restarted
    name: pbs

# Install a daily cron entry that finds regular files under
# /var/spool/pbs/server_logs matching `-mtime +90`, prints each path and
# removes it.
# NOTE(review): the entry name says "accounting files" while the job targets
# the server_logs directory -- confirm which directory is intended.
- name: create cron entry to remove old accounting files
  ansible.builtin.cron:
    name: "remove PBS accounting files older than 90 days"
    job: "find /var/spool/pbs/server_logs -mtime +90 -type f -print -exec rm {} +"
    special_time: daily

# Pull in the pkg_update role from within the PBS server role. `apply`
# attaches `become: true` to the tasks of the included role.
# NOTE(review): no packages_to_exclude_from_upgrade is passed here, unlike the
# pkg_update include in playbooks/roles/slurm/tasks/main.yml -- confirm that
# is intended.
- name: Update Packages
  ansible.builtin.include_role:
    name: pkg_update
    apply:
      become: true

# Full stop/start cycle of both services following the package update:
# postgresql and pbs are stopped (in that order), then started again in the
# same order.
# NOTE(review): pbs is stopped after postgresql; confirm this ordering is
# deliberate if pbs depends on the database being up while it shuts down.
- name: stop postgresql
  ansible.builtin.service:
    state: stopped
    name: postgresql

- name: stop pbs-server
  ansible.builtin.service:
    state: stopped
    name: pbs

- name: start postgresql
  ansible.builtin.service:
    state: started
    name: postgresql

- name: start pbs-server
  ansible.builtin.service:
    state: started
    name: pbs

# Smoke test: run `qstat` to confirm the PBS server answers client requests
# after the services were started above.
# - The server may need a moment before it accepts connections, so the
#   command is retried a bounded number of times instead of failing the play
#   on the first attempt. The task still fails if every attempt fails.
# - qstat only queries state, so the task never reports "changed".
- name: check pbs connection
  ansible.builtin.command: qstat
  register: pbs_qstat
  until: pbs_qstat is not failed
  retries: 6
  delay: 5
  changed_when: false
13 changes: 13 additions & 0 deletions playbooks/roles/pkg_update/tasks/Ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,19 @@
ansible.builtin.apt:
name: "*"
state: latest
# https://github.com/ansible/ansible/issues/51663
# There has been an intermittent issue with this task where it would fail and print the error:
#
# Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), is another process
# using it?
#
# The reason for this is unclear. It's not from unattended-upgrades as that has already been
# uninstalled when creating the base image. The workaround for now is to simply retry this task
# several times in the event that it fails, with a small delay between each attempt.
register: result
until: result is not failed
retries: 5
delay: 15

- name: Check if reboot is required
stat:
Expand Down
8 changes: 8 additions & 0 deletions playbooks/roles/slurm/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,11 @@

# Statically import the task file named after `slurm_role` (defined outside
# this excerpt) and run the imported tasks with privilege escalation.
- ansible.builtin.import_tasks: '{{slurm_role}}.yml'
  become: true

# Run the pkg_update role with privilege escalation, keeping `jetpack8` out
# of the upgrade.
# The exclusion list is passed as a native YAML list rather than the quoted
# string "['jetpack8']": a quoted value is a plain string, so any filter or
# loop in the role that reads the variable directly would iterate over its
# characters instead of package names. playbooks/ood.yml already hands this
# variable a list, so the role accepts that shape.
- name: Update Packages
  ansible.builtin.include_role:
    name: pkg_update
    apply:
      become: true
  vars:
    packages_to_exclude_from_upgrade:
      - jetpack8
1 change: 1 addition & 0 deletions playbooks/roles/slurm/tasks/slurmserver.yml
Original file line number Diff line number Diff line change
Expand Up @@ -129,3 +129,4 @@
# Statically import pyxis.yml; the imported tasks run with privilege
# escalation and carry the `pyxis` tag so they can be selected or skipped
# with --tags / --skip-tags.
- ansible.builtin.import_tasks: pyxis.yml
  become: true
  tags:
    - pyxis

7 changes: 0 additions & 7 deletions playbooks/scheduler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,3 @@
cc_webserverpath: '{{cyclecloud.web_server_path | default("")}}'
when: ( queue_manager is defined and queue_manager == "slurm" )

# Run the pkg_update role with privilege escalation, keeping `jetpack8` out
# of the upgrade.
# The exclusion list is a native YAML list rather than the quoted string
# "['jetpack8']", so consumers inside the role always see a list of package
# names and never a string that merely looks like one.
- name: Update Packages
  ansible.builtin.include_role:
    name: pkg_update
    apply:
      become: true
  vars:
    packages_to_exclude_from_upgrade:
      - jetpack8

0 comments on commit dab1e1b

Please sign in to comment.