Skip to content

Commit

Permalink
Merge pull request #37 from fgci-org/cuda-run-installation
Browse files Browse the repository at this point in the history
Cuda runfile installation
  • Loading branch information
VilleS1 authored May 3, 2021
2 parents 68d0f31 + f1fefb7 commit c61e442
Show file tree
Hide file tree
Showing 11 changed files with 295 additions and 3 deletions.
33 changes: 33 additions & 0 deletions .yamllint
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
---
# yamllint configuration for this role, based on the ansible-lint config.
extends: default

rules:
  # Flow collections: at most one space of padding inside {} and [].
  braces:
    level: error
    max-spaces-inside: 1
  brackets:
    level: error
    max-spaces-inside: 1

  # No upper bound on the spaces that may follow ':' and ','.
  colons:
    level: error
    max-spaces-after: -1
  commas:
    level: error
    max-spaces-after: -1

  # Checks that stay active.
  empty-lines:
    level: error
    max: 3
  hyphens:
    level: error
  key-duplicates: enable
  new-lines:
    type: unix

  # Checks switched off for this repository.
  comments: disable
  comments-indentation: disable
  document-start: disable
  indentation: disable
  line-length: disable
  new-line-at-end-of-file: disable
  trailing-spaces: disable
  truthy: disable
7 changes: 7 additions & 0 deletions defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@ cuda_repo_url: "http://developer.download.nvidia.com/compute/cuda/repos/"
cuda_rpm_key_path: /etc/rpm/nvidia_packaging_key.asc
cuda_packages:
- cuda
# Install CUDA with the runfile installer (tasks/install_runfile.yml) instead
# of installing the cuda_packages list through the package manager.
cuda_use_runfile: False
# Where the runfile is downloaded from; its basename is also used as the local
# installer file name.
cuda_runfile_url: "https://developer.download.nvidia.com/compute/cuda/11.2.2/local_installers/cuda_11.2.2_460.32.03_linux.run"
# Install the NVIDIA driver from the runfile (skipped when nvidia.ko is found).
cuda_runfile_driver: True
# Install the CUDA toolkit from the runfile (skipped when /usr/local/cuda exists).
cuda_runfile_toolkit: True
# True: download the runfile from cuda_runfile_url.
# False: copy a pre-downloaded runfile with the same file name instead.
cuda_runfile_download: True
# Delete /tmp/cuda_runfile at the end of the runfile installation tasks.
cuda_runfile_remove: True
# Pass --no-drm to the NVIDIA driver installer.
cuda_runfile_disable_nvidia_drm: False
cuda_restart_node_on_install: True
cuda_init: True
cuda_init_restart_service: True
Expand Down
2 changes: 2 additions & 0 deletions files/blacklist-nouveau.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copied to /etc/modprobe.d/blacklist-nouveau.conf by the role's
# "Disable nouveau" task, ahead of the NVIDIA driver installation.
blacklist nouveau
# Set the nouveau module's "modeset" option to 0.
options nouveau modeset=0
14 changes: 14 additions & 0 deletions files/nvidia-persistenced.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# systemd unit for the NVIDIA persistence daemon (nvidia-persistenced).
[Unit]
Description=NVIDIA Persistence Daemon
After=syslog.target

[Service]
Type=forking
PIDFile=/var/run/nvidia-persistenced/nvidia-persistenced.pid
Restart=always
ExecStart=/usr/bin/nvidia-persistenced --verbose
# systemd does not pass Exec* command lines through a shell, so a trailing
# "/*" would reach rm as a literal file name and remove nothing. Remove the
# runtime directory itself instead.
ExecStopPost=/bin/rm -rf /var/run/nvidia-persistenced
TimeoutSec=300

[Install]
WantedBy=multi-user.target
22 changes: 22 additions & 0 deletions molecule/default/INSTALL.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
********************************
Docker driver installation guide
********************************

Requirements
============

* Docker Engine

Install
=======

Please refer to the `Virtual environment`_ documentation for installation best
practices. If not using a virtual environment, please consider passing the
widely recommended `'--user' flag`_ when invoking ``pip``.

.. _Virtual environment: https://virtualenv.pypa.io/en/latest/
.. _'--user' flag: https://packaging.python.org/tutorials/installing-packages/#installing-to-the-user-site

.. code-block:: bash

    $ python3 -m pip install 'molecule[docker]'
7 changes: 7 additions & 0 deletions molecule/default/converge.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# Molecule converge playbook: applies the role under test to every instance.
- name: Converge ansible-role-cuda
  hosts: all
  tasks:
    - name: Include ansible-role-cuda
      include_role:
        name: ansible-role-cuda
49 changes: 49 additions & 0 deletions molecule/default/molecule.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
---
# Molecule scenario: two CentOS 7 containers, one per installation method
# (package repository and runfile).
dependency:
  name: galaxy

driver:
  name: docker

platforms:
  - name: centos7_cuda_repo
    image: docker.io/pycontribs/centos:7
    pre_build_image: true
    tmpfs:
      - /run
    volumes:
      - "/tmp/centos7_cuda_repo:/tmp:rw"

  - name: centos7_cuda_run
    image: docker.io/pycontribs/centos:7
    pre_build_image: true
    tmpfs:
      - /run
    volumes:
      - "/tmp/centos7_cuda_run:/tmp:rw"

provisioner:
  name: ansible
  inventory:
    group_vars:
      all:
        gpu: true
        # The kernel to check kernel modules against
        cuda_driver_kernel_version: "3.10.0-1160.21.1.el7.x86_64"
    host_vars:
      centos7_cuda_repo:
        cuda_packages:
          - cuda-libraries-11-2
        cuda_restart_node_on_install: false
        cuda_init: false
        cuda_init_restart_service: false
      centos7_cuda_run:
        cuda_use_runfile: true
        # Docker has different kernel than images kernel-headers
        cuda_runfile_driver: true
        # Keep the installer in /tmp/centos7_cuda_run for multiple runs
        cuda_runfile_remove: false
        cuda_restart_node_on_install: false
        cuda_init: false
        cuda_init_restart_service: false

verifier:
  name: ansible

lint: |
  set -e
  yamllint .
  ansible-lint
  flake8
24 changes: 24 additions & 0 deletions molecule/default/verify.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
---
# Molecule verify playbook: checks what the role left behind on the test
# instances.

# Every instance must end up with the CUDA 11.2 directory.
- name: Verify CUDA toolkit installation
  hosts: all
  tasks:
    - name: Check that CUDA has been installed
      stat:
        path: /usr/local/cuda-11.2
      register: cuda_path_check

    - name: Verify that CUDA folder exists
      assert:
        that: cuda_path_check.stat.exists

# Only the runfile instance builds the driver, so only it is checked for the
# kernel module.
- name: Verify NVIDIA driver kernel modules
  hosts: centos7_cuda_run
  tasks:
    - name: Check that NVIDIA kernel module has been installed
      find:
        paths: "/lib/modules/{{ cuda_driver_kernel_version }}"
        patterns: nvidia.ko
        # Search subdirectories too, matching the check the role itself runs
        # in tasks/install_runfile.yml. Without recursion only the top level
        # of the module tree is inspected.
        recurse: true
      register: nvidia_module_find

    - name: Verify that kernel module exists
      assert:
        that: nvidia_module_find.matched > 0
124 changes: 124 additions & 0 deletions tasks/install_runfile.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
---

# Build prerequisites on RPM-based hosts; the package list is supplied by the
# cuda_runfile_packages variable.
- name: "Ensure kernel headers are installed (yum)"
  when: ansible_pkg_mgr in ["yum", "dnf"]
  yum:
    state: present
    name: "{{ cuda_runfile_packages }}"

# Build prerequisites on hosts whose package manager is apt.
# This task is guarded by ansible_pkg_mgr == "apt", so it must use the apt
# module; the yum module cannot install packages on such hosts.
- name: "Ensure kernel headers are installed (apt)"
  apt:
    name:
      - linux-headers-generic
      - build-essential
    state: present
  when: ansible_pkg_mgr == "apt"

# Install the modprobe configuration shipped in the role's files/ directory.
# Ownership and mode are set explicitly so the result does not depend on the
# umask of the connection user.
- name: "Disable nouveau"
  copy:
    src: blacklist-nouveau.conf
    dest: /etc/modprobe.d/blacklist-nouveau.conf
    owner: root
    group: root
    mode: "0644"

# The local installer file name is the last path component of the URL.
- name: "Register installer name"
  set_fact:
    cuda_runfile_sh: "{{ cuda_runfile_url | basename }}"

# Read-only query: it never changes the host, so it must not report "changed".
# It also has to run in check mode, because the next task reads its stdout.
- name: "Determine running kernel"
  command: uname -r
  register: cuda_driver_kernel_running
  changed_when: false
  check_mode: false

# Keep a caller-supplied cuda_driver_kernel_version; fall back to the running
# kernel when the variable is undefined or empty.
- name: "Determine kernel version"
  set_fact:
    cuda_driver_kernel_version: "{{ cuda_driver_kernel_version | default(cuda_driver_kernel_running.stdout, true) }}"

# Look for an already built nvidia.ko anywhere below the module tree of the
# selected kernel.
- name: "Check NVIDIA kernel module"
  register: cuda_driver_kernel_module_find
  find:
    recurse: true
    patterns: nvidia.ko
    path: "/lib/modules/{{ cuda_driver_kernel_version }}"

# The toolkit counts as installed when /usr/local/cuda exists.
- name: "Check CUDA toolkit path"
  register: cuda_toolkit_path
  stat:
    path: /usr/local/cuda

# Condense both probes into booleans used by the install conditions below.
- name: "Determine if driver and toolkit are installed"
  set_fact:
    cuda_toolkit_installed: "{{ cuda_toolkit_path.stat.exists }}"
    cuda_driver_installed: "{{ cuda_driver_kernel_module_find.matched > 0 }}"

- name: "Print information about installed features"
  debug:
    msg:
      - "Driver installed: {{ cuda_driver_installed }}"
      - "Toolkit installed: {{ cuda_toolkit_installed }}"

# The working directory is only needed when something is going to be
# installed. Creating it unconditionally made a fully installed host report a
# change on every run (create here, delete in "Remove installer").
- name: "Create temporary directory for runfile"
  file:
    path: /tmp/cuda_runfile
    state: directory
  when: >-
    (cuda_runfile_toolkit and not cuda_toolkit_installed) or
    (cuda_runfile_driver and not cuda_driver_installed)

# Fetch the installer into /tmp/cuda_runfile, either from the controller
# (pre-downloaded file) or from cuda_runfile_url.
- name: "Obtain runfile"
  when: >-
    (cuda_runfile_toolkit and not cuda_toolkit_installed) or
    (cuda_runfile_driver and not cuda_driver_installed)
  block:

    - name: "Copy pre-downloaded runfile"
      copy:
        src: "{{ cuda_runfile_sh }}"
        dest: /tmp/cuda_runfile
      when: not cuda_runfile_download

    - name: "Download runfile"
      get_url:
        url: "{{ cuda_runfile_url }}"
        dest: "/tmp/cuda_runfile/{{ cuda_runfile_sh }}"
      when: cuda_runfile_download

# Unattended toolkit-only installation; skipped once /usr/local/cuda exists.
- name: "Run installer for toolkit"
  when: cuda_runfile_toolkit and not cuda_toolkit_installed
  register: cuda_toolkit_install_out
  command: "bash /tmp/cuda_runfile/{{ cuda_runfile_sh }} --silent --toolkit"

# Build and install the NVIDIA kernel driver from the installer bundled in the
# CUDA runfile. Skipped when the driver is not requested or nvidia.ko was
# already found for the selected kernel.
- name: "Install driver"
  when: cuda_runfile_driver and not cuda_driver_installed
  block:

    - name: "Extract installer for driver installation"
      command: bash /tmp/cuda_runfile/{{ cuda_runfile_sh }} --extract=/tmp/cuda_runfile

    - name: "Find NVIDIA runtime"
      find:
        paths: /tmp/cuda_runfile
        patterns: "NVIDIA*.run"
      register: cuda_driver_runfile_find

    # Fail with a readable message instead of a templating error on files[0]
    # when the extraction did not produce a driver installer.
    - name: "Verify that the NVIDIA driver installer was extracted"
      assert:
        that: cuda_driver_runfile_find.matched > 0
        msg: "No NVIDIA*.run driver installer found in /tmp/cuda_runfile"

    - name: "Set NVIDIA runfile path"
      set_fact:
        cuda_driver_runfile: "{{ cuda_driver_runfile_find.files[0].path }}"

    - name: "Print information about driver"
      debug:
        msg: "Building driver {{ cuda_driver_runfile }} for kernel {{ cuda_driver_kernel_version }}"

    # The kernel name and source path are passed explicitly so the module is
    # built for cuda_driver_kernel_version.
    - name: "Install driver"
      command: >
        bash {{ cuda_driver_runfile }} --silent
        --kernel-name={{ cuda_driver_kernel_version }}
        --kernel-source-path=/usr/src/kernels/{{ cuda_driver_kernel_version }}
        {{ "--no-drm" if cuda_runfile_disable_nvidia_drm else "" }}

    # src is given relative to the role's files/ directory, like the
    # blacklist-nouveau.conf copy earlier in this file.
    - name: "Install nvidia-persistenced systemd-file"
      copy:
        src: nvidia-persistenced.service
        dest: /etc/systemd/system/nvidia-persistenced.service
        owner: root
        group: root
        mode: "0644"
      when: cuda_init_persistence_mode | bool

# Clean up the working directory unless the caller asked to keep it.
- name: "Remove installer"
  when: cuda_runfile_remove
  file:
    state: absent
    path: /tmp/cuda_runfile
8 changes: 6 additions & 2 deletions tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,21 @@

- block:
- include_tasks: configure_yum.yml
when: ansible_pkg_mgr in ['yum', 'dnf']
when: ansible_pkg_mgr in ['yum', 'dnf'] and not cuda_use_runfile

- include_tasks: configure_apt.yml
when: ansible_pkg_mgr == 'apt'
when: ansible_pkg_mgr == 'apt' and not cuda_use_runfile

- include_tasks: install_runfile.yml
when: cuda_use_runfile

- name: Install CUDA packages (1.5-2GB download, also restarts if cuda_restart_node_on_install is set to True)
package:
name: "{{ item }}"
state: present
with_items: "{{ cuda_packages }}"
register: cuda_packages_installation
when: not cuda_use_runfile
notify:
- ZZ CUDA Restart server
- ZZ CUDA Wait for server to restart
Expand Down
8 changes: 7 additions & 1 deletion vars/centos-7.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
---
cuda_repo_subfolder: rhel7

# vim:ft=ansible:
# Packages installed by the "Ensure kernel headers are installed (yum)" task
# in tasks/install_runfile.yml before the runfile installer is used.
cuda_runfile_packages:
- kernel-devel
- "@Development tools"
- which


# vim:ft=ansible:

0 comments on commit c61e442

Please sign in to comment.