Skip to content

Commit

Permalink
Different NVIDIA driver installation method for ohpc system
Browse files Browse the repository at this point in the history
  • Loading branch information
simo-tuomisto committed Apr 7, 2021
1 parent db1509a commit fbe9a83
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 12 deletions.
3 changes: 2 additions & 1 deletion molecule/default/molecule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ provisioner:
group_vars:
all:
gpu: True
cuda_driver_kernel_version: 3.10.0-1160.21.1.el7.x86_64 # The kernel to check kernel modules against
host_vars:
centos7_cuda_repo:
cuda_packages:
Expand All @@ -33,7 +34,7 @@ provisioner:
cuda_init_restart_service: False
centos7_cuda_run:
cuda_use_runfile: True
# Docker's running kernel differs from the image's kernel-headers, so the
# driver is built against cuda_driver_kernel_version instead of `uname -r`.
cuda_runfile_driver: True
cuda_runfile_remove: False # Keep the installer in /tmp/centos7_cuda_run for multiple runs
cuda_restart_node_on_install: False
cuda_init: False
Expand Down
12 changes: 11 additions & 1 deletion molecule/default/verify.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
---
# This is an example playbook to execute Ansible tests.

- name: Verify
- name: Verify CUDA toolkit installation
hosts: all
tasks:
- name: Check that CUDA has been installed
Expand All @@ -11,3 +11,13 @@
- name: Verify that CUDA folder exists
assert:
that: cuda_path_check.stat.exists
# Play limited to the centos7_cuda_run host: checks that nvidia.ko exists in
# the module tree of the kernel named by cuda_driver_kernel_version.
- name: Verify NVIDIA driver kernel modules
  hosts: centos7_cuda_run
  tasks:
    # Record file metadata for the expected module location.
    - name: Check that NVIDIA kernel module has been installed
      stat:
        path: "/lib/modules/{{ cuda_driver_kernel_version }}/video/nvidia.ko"
      register: nvidia_module_file

    # Fail the verification run when the module file is missing.
    - name: Verify that kernel module exists
      assert:
        that:
          - nvidia_module_file.stat.exists
73 changes: 63 additions & 10 deletions tasks/install_runfile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,21 +23,74 @@
set_fact:
cuda_runfile_sh: "{{ cuda_runfile_url | basename }}"

# Record the release string of the kernel the host is currently running.
# The result is registered as cuda_driver_kernel_running (read via .stdout).
- name: 'Determine running kernel'
  command: uname -r
  register: cuda_driver_kernel_running
  # uname only reads state; never report this task as a change so that
  # repeated runs stay idempotent.
  changed_when: false

# Keep a configured cuda_driver_kernel_version; when it is undefined or
# falsy, fall back to the stdout of the registered `uname -r` result.
- name: 'Determine kernel version'
  set_fact:
    cuda_driver_kernel_version: >-
      {{ cuda_driver_kernel_version | default(cuda_driver_kernel_running.stdout, true) }}

# Stat the nvidia.ko path inside the selected kernel's module directory;
# the result is registered as cuda_driver_kernel_module.
- name: 'Check NVIDIA kernel module'
  stat:
    path: "/lib/modules/{{ cuda_driver_kernel_version }}/video/nvidia.ko"
  register: cuda_driver_kernel_module

# Stat the toolkit location; the result is registered as cuda_toolkit_path.
- name: "Check CUDA toolkit path"
  stat:
    path: '/usr/local/cuda'
  register: cuda_toolkit_path

# Turn the two stat results into facts that later `when:` conditions read.
- name: 'Determine if driver and toolkit are installed'
  set_fact:
    cuda_toolkit_installed: "{{ cuda_toolkit_path.stat.exists }}"
    cuda_driver_installed: "{{ cuda_driver_kernel_module.stat.exists }}"

# Working directory that holds the downloaded runfile and its extracted
# contents.
- name: "Create temporary directory for runfile"
  file:
    state: directory
    path: '/tmp/cuda_runfile'

# Fetch the CUDA runfile into /tmp/cuda_runfile, but only when at least one
# requested component (toolkit or driver) is not installed yet.
- name: "Download runfile"
  get_url:
    url: "{{ cuda_runfile_url }}"
    dest: "/tmp/cuda_runfile/{{ cuda_runfile_sh }}"
  when: >-
    (cuda_runfile_toolkit and not cuda_toolkit_installed) or
    (cuda_runfile_driver and not cuda_driver_installed)

# Install only the toolkit from the downloaded runfile. Skipped when the
# toolkit is not requested or cuda_toolkit_installed is already true. The
# driver is handled separately by the 'Install driver' block.
- name: "Run installer for toolkit"
  command: bash /tmp/cuda_runfile/{{ cuda_runfile_sh }} --silent --toolkit
  register: cuda_toolkit_install_out
  when: cuda_runfile_toolkit and not cuda_toolkit_installed

# Extract the standalone NVIDIA driver installer from the CUDA runfile and run
# it against cuda_driver_kernel_version (which may differ from the running
# kernel). Skipped when the driver is not requested or cuda_driver_installed
# is already true.
- name: 'Install driver'
  when: cuda_runfile_driver and not cuda_driver_installed
  block:

    - name: 'Extract installer for driver installation'
      command: bash /tmp/cuda_runfile/{{ cuda_runfile_sh }} --extract=/tmp/cuda_runfile

    - name: 'Find NVIDIA runtime'
      find:
        paths: /tmp/cuda_runfile
        patterns: 'NVIDIA*.run'
      register: cuda_driver_runfile_find

    # Fail with a readable message instead of an index error on files[0]
    # when the extraction produced no driver installer.
    - name: 'Check that an NVIDIA driver runfile was found'
      assert:
        that:
          - cuda_driver_runfile_find.matched > 0
        msg: "No NVIDIA*.run file found in /tmp/cuda_runfile after extracting {{ cuda_runfile_sh }}"

    - name: 'Set NVIDIA runfile path'
      set_fact:
        cuda_driver_runfile: "{{ cuda_driver_runfile_find.files[0].path }}"

    - name: 'Print variables of interest'
      debug:
        msg: "{{ cuda_driver_runfile }} {{ cuda_driver_kernel_version }} {{ cuda_driver_kernel_running.stdout }}"

    # --kernel-name / --kernel-source-path point the installer at the selected
    # kernel and its sources under /usr/src/kernels.
    - name: 'Install driver'
      command: bash {{ cuda_driver_runfile }} --silent --kernel-name={{ cuda_driver_kernel_version }} --kernel-source-path=/usr/src/kernels/{{ cuda_driver_kernel_version }}

# Delete the whole runfile working directory (downloaded runfile plus any
# extracted files) when cuda_runfile_remove is enabled.
- name: 'Remove installer'
  file:
    path: /tmp/cuda_runfile
    state: absent
  when: cuda_runfile_remove

0 comments on commit fbe9a83

Please sign in to comment.