From fbe9a83b68c07d10ee5b61062f2b3576a0251b9d Mon Sep 17 00:00:00 2001
From: Simo Tuomisto
Date: Wed, 7 Apr 2021 19:37:41 +0300
Subject: [PATCH] Different NVIDIA driver installation method for ohpc system

Split the runfile installation into separate toolkit and driver steps.

The toolkit is installed with "--silent --toolkit". The driver installer
(NVIDIA*.run) is extracted from the CUDA runfile and run with
--kernel-name / --kernel-source-path, so the kernel modules are built
for cuda_driver_kernel_version, which defaults to the output of
"uname -r" when it is not set.

Each step is skipped when its marker already exists: /usr/local/cuda for
the toolkit, /lib/modules/<kernel>/video/nvidia.ko for the driver. The
runfile is downloaded only when at least one step will run.

The installer directory /tmp/cuda_runfile is removed when
cuda_runfile_remove is set. No return-code check is needed there: a
failed installer command aborts the play before the cleanup task, and
the registered results of skipped tasks carry no "rc" attribute.

The molecule scenario pins the kernel version and verifies that the
kernel module file has been installed.
---
 molecule/default/molecule.yml |  3 +-
 molecule/default/verify.yml   | 12 +++++-
 tasks/install_runfile.yml     | 73 ++++++++++++++++++++++++++++++-----
 3 files changed, 76 insertions(+), 12 deletions(-)

diff --git a/molecule/default/molecule.yml b/molecule/default/molecule.yml
index c8f99cd..9b4ecda 100644
--- a/molecule/default/molecule.yml
+++ b/molecule/default/molecule.yml
@@ -24,6 +24,7 @@ provisioner:
     group_vars:
       all:
         gpu: True
+        cuda_driver_kernel_version: 3.10.0-1160.21.1.el7.x86_64 # The kernel to check kernel modules against
     host_vars:
       centos7_cuda_repo:
         cuda_packages:
@@ -33,7 +34,7 @@ provisioner:
         cuda_init_restart_service: False
       centos7_cuda_run:
         cuda_use_runfile: True
-        cuda_runfile_driver: False # Docker has different kernel than images kernel-headers
+        cuda_runfile_driver: True # Docker has different kernel than images kernel-headers
         cuda_runfile_remove: False # Keep the installer in /tmp/centos7_cuda_run for multiple runs
         cuda_restart_node_on_install: False
         cuda_init: False
diff --git a/molecule/default/verify.yml b/molecule/default/verify.yml
index 0bc0c18..9521d16 100644
--- a/molecule/default/verify.yml
+++ b/molecule/default/verify.yml
@@ -1,7 +1,7 @@
 ---
 # This is an example playbook to execute Ansible tests.
 
-- name: Verify
+- name: Verify CUDA toolkit installation
   hosts: all
   tasks:
   - name: Check that CUDA has been installed
@@ -11,3 +11,13 @@
   - name: Verify that CUDA folder exists
     assert:
       that: cuda_path_check.stat.exists
+- name: Verify NVIDIA driver kernel modules
+  hosts: centos7_cuda_run
+  tasks:
+  - name: Check that NVIDIA kernel module has been installed
+    stat:
+      path: /lib/modules/{{ cuda_driver_kernel_version }}/video/nvidia.ko
+    register: nvidia_module_file
+  - name: Verify that kernel module exists
+    assert:
+      that: nvidia_module_file.stat.exists
diff --git a/tasks/install_runfile.yml b/tasks/install_runfile.yml
index dd9b692..c095ba5 100644
--- a/tasks/install_runfile.yml
+++ b/tasks/install_runfile.yml
@@ -23,21 +23,74 @@
   set_fact:
     cuda_runfile_sh: "{{ cuda_runfile_url | basename }}"
 
+- name: 'Determine running kernel'
+  command: uname -r
+  register: cuda_driver_kernel_running
+
+- name: 'Determine kernel version'
+  set_fact:
+    cuda_driver_kernel_version: "{{ cuda_driver_kernel_version | default(cuda_driver_kernel_running.stdout, true) }}"
+
+- name: 'Check NVIDIA kernel module'
+  stat:
+    path: /lib/modules/{{ cuda_driver_kernel_version }}/video/nvidia.ko
+  register: cuda_driver_kernel_module
+
+- name: "Check CUDA toolkit path"
+  stat:
+    path: /usr/local/cuda
+  register: cuda_toolkit_path
+
+- name: 'Determine if driver and toolkit are installed'
+  set_fact:
+    cuda_driver_installed: "{{ cuda_driver_kernel_module.stat.exists }}"
+    cuda_toolkit_installed: "{{ cuda_toolkit_path.stat.exists }}"
+
+- name: "Create temporary directory for runfile"
+  file:
+    path: /tmp/cuda_runfile
+    state: directory
+
 - name: "Download runfile"
   get_url:
     url: "{{ cuda_runfile_url }}"
-    dest: "/tmp/{{ cuda_runfile_sh }}"
+    dest: "/tmp/cuda_runfile/{{ cuda_runfile_sh }}"
+  when: (cuda_runfile_toolkit and not cuda_toolkit_installed) or
+        (cuda_runfile_driver and not cuda_driver_installed)
 
-- name: 'Setting runfile arguments'
-  set_fact:
-    runfile_args: "--silent {{ '--driver' if cuda_runfile_driver else '' }} {{ '--toolkit' if cuda_runfile_toolkit else '' }}"
 
-- name: "Run installer"
-  command: bash /tmp/{{ cuda_runfile_sh }} {{ runfile_args }}
-  register: cuda_install_out
+- name: "Run installer for toolkit"
+  command: bash /tmp/cuda_runfile/{{ cuda_runfile_sh }} --silent --toolkit
+  register: cuda_toolkit_install_out
+  when: cuda_runfile_toolkit and not cuda_toolkit_installed
+
+- name: 'Install driver'
+  block:
+
+    - name: 'Extract installer for driver installation'
+      command: bash /tmp/cuda_runfile/{{ cuda_runfile_sh }} --extract=/tmp/cuda_runfile
+
+    - name: 'Find NVIDIA runtime'
+      find:
+        paths: /tmp/cuda_runfile
+        patterns: 'NVIDIA*.run'
+      register: cuda_driver_runfile_find
+
+    - name: 'Set NVIDIA runfile path'
+      set_fact:
+        cuda_driver_runfile: "{{ cuda_driver_runfile_find.files[0].path }}"
+
+    - name: 'Print variables of interest'
+      debug:
+        msg: "{{ cuda_driver_runfile }} {{ cuda_driver_kernel_version }} {{ cuda_driver_kernel_running.stdout }}"
+
+    - name: 'Install driver'
+      command: bash {{ cuda_driver_runfile }} --silent --kernel-name={{ cuda_driver_kernel_version }} --kernel-source-path=/usr/src/kernels/{{ cuda_driver_kernel_version }}
+
+  when: cuda_runfile_driver and not cuda_driver_installed
 
-- name: 'Remove installer after successful install'
+- name: 'Remove installer'
   file:
-    path: /tmp/{{ cuda_runfile_sh }}
+    path: /tmp/cuda_runfile
     state: absent
-  when: cuda_install_out.rc == 0 and cuda_runfile_remove
+  when: cuda_runfile_remove