diff --git a/.yamllint b/.yamllint
new file mode 100644
index 0000000..8827676
--- /dev/null
+++ b/.yamllint
@@ -0,0 +1,33 @@
+---
+# Based on ansible-lint config
+extends: default
+
+rules:
+  braces:
+    max-spaces-inside: 1
+    level: error
+  brackets:
+    max-spaces-inside: 1
+    level: error
+  colons:
+    max-spaces-after: -1
+    level: error
+  commas:
+    max-spaces-after: -1
+    level: error
+  comments: disable
+  comments-indentation: disable
+  document-start: disable
+  empty-lines:
+    max: 3
+    level: error
+  hyphens:
+    level: error
+  indentation: disable
+  key-duplicates: enable
+  line-length: disable
+  new-line-at-end-of-file: disable
+  new-lines:
+    type: unix
+  trailing-spaces: disable
+  truthy: disable
diff --git a/defaults/main.yml b/defaults/main.yml
index 95d35a3..9d6f6bb 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -7,6 +7,13 @@ cuda_repo_url: "http://developer.download.nvidia.com/compute/cuda/repos/"
 cuda_rpm_key_path: /etc/rpm/nvidia_packaging_key.asc
 cuda_packages:
   - cuda
+cuda_use_runfile: False  # True: install from the NVIDIA runfile instead of the package repository
+cuda_runfile_url: "https://developer.download.nvidia.com/compute/cuda/11.2.2/local_installers/cuda_11.2.2_460.32.03_linux.run"
+cuda_runfile_driver: True  # Install the driver from the runfile when no nvidia.ko is found
+cuda_runfile_toolkit: True  # Install the toolkit from the runfile when /usr/local/cuda is missing
+cuda_runfile_download: True  # False: copy a pre-downloaded runfile instead of fetching cuda_runfile_url
+cuda_runfile_remove: True  # Delete /tmp/cuda_runfile at the end of the run
+cuda_runfile_disable_nvidia_drm: False  # True: pass --no-drm to the driver installer
 cuda_restart_node_on_install: True
 cuda_init: True
 cuda_init_restart_service: True
diff --git a/files/blacklist-nouveau.conf b/files/blacklist-nouveau.conf
new file mode 100644
index 0000000..c9b9bfc
--- /dev/null
+++ b/files/blacklist-nouveau.conf
@@ -0,0 +1,2 @@
+blacklist nouveau
+options nouveau modeset=0
diff --git a/files/nvidia-persistenced.service b/files/nvidia-persistenced.service
new file mode 100644
index 0000000..8a461ed
--- /dev/null
+++ b/files/nvidia-persistenced.service
@@ -0,0 +1,14 @@
+[Unit]
+Description=NVIDIA Persistence Daemon
+After=syslog.target
+
+[Service]
+Type=forking
+PIDFile=/var/run/nvidia-persistenced/nvidia-persistenced.pid
+Restart=always
+ExecStart=/usr/bin/nvidia-persistenced --verbose
+ExecStopPost=/bin/rm -rf /var/run/nvidia-persistenced
+TimeoutSec=300
+
+[Install]
+WantedBy=multi-user.target
diff --git a/molecule/default/INSTALL.rst b/molecule/default/INSTALL.rst
new file mode 100644
index 0000000..d926ca2
--- /dev/null
+++ b/molecule/default/INSTALL.rst
@@ -0,0 +1,22 @@
+********************************
+Docker driver installation guide
+********************************
+
+Requirements
+============
+
+* Docker Engine
+
+Install
+=======
+
+Please refer to the `Virtual environment`_ documentation for installation best
+practices. If not using a virtual environment, please consider passing the
+widely recommended `'--user' flag`_ when invoking ``pip``.
+
+.. _Virtual environment: https://virtualenv.pypa.io/en/latest/
+.. _'--user' flag: https://packaging.python.org/tutorials/installing-packages/#installing-to-the-user-site
+
+.. code-block:: bash
+
+    $ python3 -m pip install 'molecule[docker]'
diff --git a/molecule/default/converge.yml b/molecule/default/converge.yml
new file mode 100644
index 0000000..a878209
--- /dev/null
+++ b/molecule/default/converge.yml
@@ -0,0 +1,7 @@
+---
+- name: Converge ansible-role-cuda
+  hosts: all
+  tasks:
+    - name: "Include ansible-role-cuda"
+      include_role:
+        name: "ansible-role-cuda"
diff --git a/molecule/default/molecule.yml b/molecule/default/molecule.yml
new file mode 100644
index 0000000..9b4ecda
--- /dev/null
+++ b/molecule/default/molecule.yml
@@ -0,0 +1,49 @@
+---
+dependency:
+  name: galaxy
+driver:
+  name: docker
+platforms:
+  - name: centos7_cuda_repo
+    image: docker.io/pycontribs/centos:7
+    pre_build_image: true
+    tmpfs:
+      - /run
+    volumes:
+      - /tmp/centos7_cuda_repo:/tmp:rw
+  - name: centos7_cuda_run
+    image: docker.io/pycontribs/centos:7
+    pre_build_image: true
+    tmpfs:
+      - /run
+    volumes:
+      - /tmp/centos7_cuda_run:/tmp:rw
+provisioner:
+  name: ansible
+  inventory:
+    group_vars:
+      all:
+        gpu: True
+        cuda_driver_kernel_version: 3.10.0-1160.21.1.el7.x86_64 # The kernel to check kernel modules against
+    host_vars:
+      centos7_cuda_repo:
+        cuda_packages:
+          - cuda-libraries-11-2
+        cuda_restart_node_on_install: False
+        cuda_init: False
+        cuda_init_restart_service: False
+      centos7_cuda_run:
+        cuda_use_runfile: True
+        cuda_runfile_driver: True # Docker has different kernel than images kernel-headers
+        cuda_runfile_remove: False # Keep the installer in /tmp/centos7_cuda_run for multiple runs
+        cuda_restart_node_on_install: False
+        cuda_init: False
+        cuda_init_restart_service: False
+
+verifier:
+  name: ansible
+lint: |
+  set -e
+  yamllint .
+  ansible-lint
+  flake8
diff --git a/molecule/default/verify.yml b/molecule/default/verify.yml
new file mode 100644
index 0000000..e321c63
--- /dev/null
+++ b/molecule/default/verify.yml
@@ -0,0 +1,25 @@
+---
+# This is an example playbook to execute Ansible tests.
+
+- name: Verify CUDA toolkit installation
+  hosts: all
+  tasks:
+    - name: Check that CUDA has been installed
+      stat:
+        path: /usr/local/cuda-11.2
+      register: cuda_path_check
+    - name: Verify that CUDA folder exists
+      assert:
+        that: cuda_path_check.stat.exists
+- name: Verify NVIDIA driver kernel modules
+  hosts: centos7_cuda_run
+  tasks:
+    - name: Check that NVIDIA kernel module has been installed
+      find:
+        path: "/lib/modules/{{ cuda_driver_kernel_version }}"
+        patterns: nvidia.ko
+        recurse: true  # same recursive lookup the role uses in tasks/install_runfile.yml
+      register: nvidia_module_find
+    - name: Verify that kernel module exists
+      assert:
+        that: nvidia_module_find.matched > 0
diff --git a/tasks/install_runfile.yml b/tasks/install_runfile.yml
new file mode 100644
index 0000000..4ede7fc
--- /dev/null
+++ b/tasks/install_runfile.yml
@@ -0,0 +1,132 @@
+---
+# Install the NVIDIA driver and/or the CUDA toolkit from the runfile installer.
+# Included from tasks/main.yml when cuda_use_runfile is true.
+
+- name: "Ensure kernel headers are installed (yum)"
+  yum:
+    name: "{{ cuda_runfile_packages }}"
+    state: present
+  when: ansible_pkg_mgr in ["yum", "dnf"]
+
+- name: "Ensure kernel headers are installed (apt)"
+  apt:
+    name:
+      - linux-headers-generic
+      - build-essential
+    state: present
+  when: ansible_pkg_mgr == "apt"
+
+- name: "Disable nouveau"
+  copy:
+    src: blacklist-nouveau.conf
+    dest: /etc/modprobe.d/blacklist-nouveau.conf
+
+- name: "Register installer name"
+  set_fact:
+    cuda_runfile_sh: "{{ cuda_runfile_url | basename }}"
+
+- name: "Determine running kernel"
+  command: uname -r
+  register: cuda_driver_kernel_running
+  changed_when: false  # read-only query, must not be reported as a change
+
+# An explicitly set cuda_driver_kernel_version wins; when it is undefined or
+# empty the running kernel reported by uname is used instead.
+- name: "Determine kernel version"
+  set_fact:
+    cuda_driver_kernel_version: "{{ cuda_driver_kernel_version | default(cuda_driver_kernel_running.stdout, true) }}"
+
+- name: "Check NVIDIA kernel module"
+  find:
+    path: "/lib/modules/{{ cuda_driver_kernel_version }}"
+    patterns: nvidia.ko
+    recurse: true
+  register: cuda_driver_kernel_module_find
+
+- name: "Check CUDA toolkit path"
+  stat:
+    path: /usr/local/cuda
+  register: cuda_toolkit_path
+
+# These two facts gate every download/install step below.
+- name: "Determine if driver and toolkit are installed"
+  set_fact:
+    cuda_driver_installed: "{{ cuda_driver_kernel_module_find.matched > 0 }}"
+    cuda_toolkit_installed: "{{ cuda_toolkit_path.stat.exists }}"
+
+- name: "Print information about installed features"
+  debug:
+    msg:
+      - "Driver installed: {{ cuda_driver_installed }}"
+      - "Toolkit installed: {{ cuda_toolkit_installed }}"
+
+- name: "Create temporary directory for runfile"
+  file:
+    path: /tmp/cuda_runfile
+    state: directory
+
+- name: "Obtain runfile"
+  block:
+
+    # Relative src: the runfile is taken from the copy module's file search path on the controller.
+    - name: "Copy pre-downloaded runfile"
+      copy:
+        src: "{{ cuda_runfile_sh }}"
+        dest: /tmp/cuda_runfile
+      when: not cuda_runfile_download
+
+    - name: "Download runfile"
+      get_url:
+        url: "{{ cuda_runfile_url }}"
+        dest: "/tmp/cuda_runfile/{{ cuda_runfile_sh }}"
+      when: cuda_runfile_download
+
+  when: (cuda_runfile_toolkit and not cuda_toolkit_installed) or
+        (cuda_runfile_driver and not cuda_driver_installed)
+
+- name: "Run installer for toolkit"
+  command: bash /tmp/cuda_runfile/{{ cuda_runfile_sh }} --silent --toolkit
+  register: cuda_toolkit_install_out
+  when: cuda_runfile_toolkit and not cuda_toolkit_installed
+
+# The driver installer (NVIDIA*.run) is extracted from the runfile and run for cuda_driver_kernel_version.
+- name: "Install driver"
+  block:
+
+    - name: "Extract installer for driver installation"
+      command: bash /tmp/cuda_runfile/{{ cuda_runfile_sh }} --extract=/tmp/cuda_runfile
+
+    - name: "Find NVIDIA runtime"
+      find:
+        paths: /tmp/cuda_runfile
+        patterns: "NVIDIA*.run"
+      register: cuda_driver_runfile_find
+
+    - name: "Set NVIDIA runfile path"
+      set_fact:
+        cuda_driver_runfile: "{{ cuda_driver_runfile_find.files[0].path }}"
+
+    - name: "Print information about driver"
+      debug:
+        msg: "Building driver {{ cuda_driver_runfile }} for kernel {{ cuda_driver_kernel_version }}"
+
+    - name: "Install driver"
+      command: >
+        bash {{ cuda_driver_runfile }} --silent
+        --kernel-name={{ cuda_driver_kernel_version }}
+        --kernel-source-path=/usr/src/kernels/{{ cuda_driver_kernel_version }}
+        {{ "--no-drm" if cuda_runfile_disable_nvidia_drm else "" }}
+
+    - name: "Install nvidia-persistenced systemd-file"
+      copy:
+        src: files/nvidia-persistenced.service
+        dest: /etc/systemd/system/nvidia-persistenced.service
+      when: cuda_init_persistence_mode | bool
+
+  when: cuda_runfile_driver and not cuda_driver_installed
+
+- name: "Remove installer"
+  file:
+    path: /tmp/cuda_runfile
+    state: absent
+  when: cuda_runfile_remove
diff --git a/tasks/main.yml b/tasks/main.yml
index 66760f6..436dbf7 100644
--- a/tasks/main.yml
+++ b/tasks/main.yml
@@ -10,10 +10,13 @@
 
 - block:
   - include_tasks: configure_yum.yml
-    when: ansible_pkg_mgr in ['yum', 'dnf']
+    when: ansible_pkg_mgr in ['yum', 'dnf'] and not cuda_use_runfile
 
   - include_tasks: configure_apt.yml
-    when: ansible_pkg_mgr == 'apt'
+    when: ansible_pkg_mgr == 'apt' and not cuda_use_runfile
+
+  - include_tasks: install_runfile.yml
+    when: cuda_use_runfile
 
   - name: Install CUDA packages (1.5-2GB download, also restarts if cuda_restart_node_on_install is set to True)
     package:
@@ -21,6 +24,7 @@
       state: present
     with_items: "{{ cuda_packages }}"
     register: cuda_packages_installation
+    when: not cuda_use_runfile
     notify:
       - ZZ CUDA Restart server
       - ZZ CUDA Wait for server to restart
diff --git a/vars/centos-7.yml b/vars/centos-7.yml
index b331a96..5618557 100644
--- a/vars/centos-7.yml
+++ b/vars/centos-7.yml
@@ -1,4 +1,10 @@
 ---
 cuda_repo_subfolder: rhel7
 
-# vim:ft=ansible:
\ No newline at end of file
+cuda_runfile_packages:
+  - kernel-devel
+  - "@Development tools"
+  - which
+
+
+# vim:ft=ansible: