Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Fix issues when performing OFED/CUDA/Lustre extra builds #489

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ansible/roles/cuda/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Default variables for the cuda role (ansible/roles/cuda/defaults/main.yml).
# NOTE(review): this is a flattened diff hunk ("1 addition & 1 deletion"), so
# cuda_driver_stream appears twice; the first line is presumably the removed
# value and the second the added one -- confirm against the PR.
# Distro tag built from the host's major version, e.g. "rhel" + major version.
cuda_distro: "rhel{{ ansible_distribution_major_version }}"
# URL of the NVIDIA CUDA .repo file for that distro tag (x86_64 only).
cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo"
cuda_driver_stream: default
cuda_driver_stream: 560-open
# 'latest' means the cuda_packages entry below gets no "-<version>" suffix.
cuda_package_version: 'latest'
cuda_packages:
- "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}"
Expand Down
2 changes: 1 addition & 1 deletion ansible/roles/cuda/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
register: _cuda_driver_module_enabled

# Enables a stream of the nvidia-driver dnf module via the dnf CLI.
# NOTE(review): flattened diff hunk ("1 addition & 1 deletion") -- the two
# ansible.builtin.command lines are presumably the removed version (hard-coded
# open-dkms stream) and the added version (stream taken from
# cuda_driver_stream); confirm against the PR.
- name: Enable nvidia driver module
ansible.builtin.command: "dnf module enable -y nvidia-driver:open-dkms"
ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ cuda_driver_stream }}"
register: _cuda_driver_module_enable
# Runs only when the task that registered _cuda_driver_module_enabled (not
# fully visible here) printed 'No matching Modules to list' on stderr.
when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr"
# Reported as changed unless dnf's stdout contains 'Nothing to do'.
changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout"
Expand Down
25 changes: 2 additions & 23 deletions ansible/roles/lustre/tasks/install.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,29 +42,8 @@
name: "{{ _lustre_find_rpms.files | map(attribute='path')}}"
disable_gpg_check: yes

# Post-build cleanup for the lustre role (ansible/roles/lustre/tasks/install.yml).
# NOTE(review): flattened diff hunk ("2 additions & 23 deletions"). The block:
# wrapper, the "Remove lustre build prerequisites" task and the first
# "Delete lustre build dir" / file: pair look like the removed lines; the second
# "- name: Delete lustre build dir" / file: pair looks like the added lines.
# Confirm against the PR before treating any of this as live YAML.
- block:
- name: Remove lustre build prerequisites
# NB Only remove ones this role installed which weren't upgrades
ansible.builtin.dnf:
name: "{{ _new_pkgs }}"
state: absent
# _new_pkgs = package names from "Installed:" result lines minus package names
# from "Removed:" result lines; the regex keeps the text before the first "-<digit>".
vars:
_installed_pkgs: |
{{
_lustre_dnf_build_packages.results |
select('match', 'Installed:') |
map('regex_replace', '^Installed: (.+?)-[0-9].*$', '\1')
}}
_removed_pkgs: |
{{
_lustre_dnf_build_packages.results |
select('match', 'Removed:') |
map('regex_replace', '^Removed: (.+?)-[0-9].*$', '\1')
}}
_new_pkgs: "{{ _installed_pkgs | difference(_removed_pkgs) }}"

- name: Delete lustre build dir
file:
- name: Delete lustre build dir
file:
path: "{{ lustre_build_dir }}"
state: absent
# Cleanup is skipped unless lustre_build_cleanup evaluates to true.
when: lustre_build_cleanup | bool
22 changes: 18 additions & 4 deletions ansible/roles/ofed/tasks/install.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@

# Updates the listed kernel packages to the newest version available to dnf
# (state: latest), before the installed/running kernel checks that follow.
# NOTE(review): presumably so the OFED build targets the newest kernel -- confirm.
- name: Install latest kernel packages
ansible.builtin.dnf:
name:
- kernel
- kernel-core
- kernel-tools
- kernel-tools-libs
state: latest

- name: Get installed kernels
command: dnf list --installed kernel
register: _ofed_dnf_kernels
Expand All @@ -8,16 +18,20 @@
register: _ofed_loaded_kernel
changed_when: false

# NOTE(review): flattened diff hunk -- the "Check current kernel is newest
# installed" task (assert:, that:, fail_msg:) looks like the removed lines and
# the "Reboot into new kernel if not on latest" task (ansible.builtin.reboot:,
# when:) the added lines; both share the vars: block below. Confirm against the PR.
- name: Check current kernel is newest installed
assert:
that: _ofed_kernel_current == _ofed_dnf_kernels_newest
fail_msg: "Kernel {{ _ofed_loaded_kernel.stdout }} is loaded but newer {{ _ofed_dnf_kernels_newest }} is installed: consider rebooting?"
- name: Reboot into new kernel if not on latest
ansible.builtin.reboot:
vars:
# Loaded kernel string with its last two dot-separated components removed
# (the regex strips the final ".xxx" suffix and is applied twice).
_ofed_kernel_current: >-
{{ _ofed_loaded_kernel.stdout | regex_replace('\.(?:.(?!\.))+$', '') | regex_replace('\.(?:.(?!\.))+$', '') }}
# Newest installed kernel: skip the first stdout line, take the second
# whitespace-separated column of each remaining line, strip its final ".xxx"
# suffix, version-sort, and keep the last (highest) entry.
_ofed_dnf_kernels_newest: >-
{{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last }}
# dnf line format e.g. "kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos "
# Condition for the reboot task: only when the two normalised versions differ.
when: _ofed_kernel_current != _ofed_dnf_kernels_newest

# Runs `uname -r` and stores the result in _ofed_loaded_kernel, overwriting the
# value registered under the same name earlier in this file.
# changed_when: false keeps this read-only query from ever reporting a change.
- name: Get new running kernel
command: uname -r
register: _ofed_loaded_kernel
changed_when: false

- name: Enable epel
dnf:
Expand Down
Loading