NVIDIA-Merlin · oliverholworthy · Jun 30, 2023 · Jun 30, 2023 · Jun 30, 2023 · Jun 30, 2023
diff --git a/.github/workflows/gpu.yml b/.github/workflows/gpu.yml
@@ -1,71 +1,146 @@
-name: GPU CI
+name: gpu-ci
 
 on:
   workflow_dispatch:
   push:
     branches:
       - main
-      - "pull-request/[0-9]+"
+      - pull-request/*  # any branch directly under pull-request/ ('*' does not match '/')
     tags:
       - "v[0-9]+.[0-9]+.[0-9]+"
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}  # one group per workflow and ref
+  cancel-in-progress: true  # a newer run in the same group cancels the one in progress
+
 jobs:
   gpu-ci:
     runs-on: linux-amd64-gpu-p100-latest-1
+    strategy:  # run this job once per TensorFlow container image listed below
+      matrix:
+        image:
+          [
+            "nvcr.io/nvidia/tensorflow:23.02-tf2-py3",
+            "nvcr.io/nvidia/tensorflow:23.04-tf2-py3",
+            "nvcr.io/nvidia/tensorflow:23.06-tf2-py3",
+          ]
     container:
-      image: nvcr.io/nvstaging/merlin/merlin-ci-runner:latest
+      image: ${{ matrix.image }}
       env:
         NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
-      options: --shm-size=1G
-      credentials:
-        username: $oauthtoken
-        password: ${{ secrets.NGC_TOKEN }}
-
     steps:
       - uses: actions/checkout@v3
         with:
           fetch-depth: 0
+      - name: Install and upgrade python packages
+        run: |
+          python -m pip install --upgrade pip tox
+      - uses: actions/cache@v3  # cache the tox environments directory between runs
+        with:
+          path: .tox
+          key: tox-${{ matrix.image }}-${{ hashFiles('requirements/*.txt') }}  # separate cache per image and requirements hash
+      - name: Get Branch name
+        id: get-branch-name  # read in "Run tests" as steps.get-branch-name.outputs.branch
+        uses: NVIDIA-Merlin/.github/actions/branch-name@6f0539fba24f60da2aee63c5925bee7cee3206e3
       - name: Run tests
         run: |
-          nvidia-smi
-          pip install tox
-          ref_type=${{ github.ref_type }}
-          branch=main
-          if [[ $ref_type == "tag"* ]]
-          then
-            git -c protocol.version=2 fetch --no-tags --prune --progress --no-recurse-submodules --depth=1 origin +refs/heads/release*:refs/remotes/origin/release*
-            branch=$(git branch -r --contains ${{ github.ref_name }} --list '*release*' --format "%(refname:short)" | sed -e 's/^origin\///')
-          fi
-          if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then
+          if [ "${{ github.ref }}" != 'refs/heads/main' ]; then
               extra_pytest_markers="and changed"
           fi
-          PYTEST_MARKERS="unit and not (examples or integration or notebook) and (singlegpu or not multigpu) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e gpu
+          merlin_branch="${{ steps.get-branch-name.outputs.branch }}"
+          MERLIN_BRANCH=$merlin_branch COMPARE_BRANCH=$merlin_branch \
+            PYTEST_MARKERS="unit and not (examples or integration or notebook) $extra_pytest_markers" \
+            tox -e gpu
+
+  # gpu-cu11:
+  #   runs-on: linux-amd64-gpu-p100-latest-1
+  #   env:
+  #     IMAGE: "nvidia/cuda:11.8.0-runtime-ubuntu22.04"
+  #   container:
+  #     image: ${{ env.IMAGE }}
+  #     env:
+  #       NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
+  #   strategy:
+  #     matrix:
+  #       version: [{ rapids: "23.04", python: "3.8" }]
+  #   steps:
+  #     - name: Install Ubuntu packages
+  #       run: |
+  #         apt-get update -y
+  #         apt-get install -y \
+  #           git \
+  #           'libcudnn8=*cuda11.8' `# tensorflow GPU support` \
+  #           cuda-nvcc-11-8 `# required for numba`
+  #     - uses: actions/checkout@v3
+  #       with:
+  #         fetch-depth: 0
+  #     - name: Set up Python ${{ matrix.version.python }}
+  #       id: setup-python
+  #       uses: actions/setup-python@v4
+  #       with:
+  #         python-version: ${{ matrix.version.python }}
+  #     - uses: actions/cache@v3
+  #       with:
+  #         path: .tox
+  #         key: tox-${{ env.IMAGE }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('requirements/*.txt') }}
+  #     - name: Install and upgrade python packages
+  #       run: |
+  #         python -m pip install --upgrade pip tox
+  #     - name: Get Branch name
+  #       id: get-branch-name
+  #       uses: NVIDIA-Merlin/.github/actions/branch-name@6f0539fba24f60da2aee63c5925bee7cee3206e3
+  #     - name: Run tests
+  #       run: |
+  #         if [ "${{ github.ref }}" != 'refs/heads/main' ]; then
+  #             extra_pytest_markers="and changed"
+  #         fi
+  #         merlin_branch="${{ steps.get-branch-name.outputs.branch }}"
+  #         RAPIDS_VERSION=${{ matrix.version.rapids }} MERLIN_BRANCH=$merlin_branch COMPARE_BRANCH=$merlin_branch \
+  #           PYTEST_MARKERS="unit and not (examples or integration or notebook) $extra_pytest_markers" \
+  #           tox -e gpu-cu11
 
-  gpu-ci-examples:
+  tests-examples:
     runs-on: linux-amd64-gpu-p100-latest-1
     container:
-      image: nvcr.io/nvstaging/merlin/merlin-ci-runner:latest
+      image: "nvidia/cuda:11.8.0-runtime-ubuntu22.04"  # base CUDA runtime image; git, cuDNN, nvcc and Python are added by the steps below
       env:
         NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
-      options: --shm-size=1G
-      credentials:
-        username: $oauthtoken
-        password: ${{ secrets.NGC_TOKEN }}
+    strategy:
+      matrix:
+        version: [{ rapids: "23.04", python: "3.8" }]  # each entry pairs a RAPIDS version with a Python version
     steps:
+      - name: Install Ubuntu packages
+        run: |
+          apt-get update -y
+          # libcudnn8 installed for tensorflow GPU support
+          apt-get install -y \
+            git \
+            'libcudnn8=*cuda11.8' `# tensorflow GPU support` \
+            cuda-nvcc-11-8 `# required for numba`
       - uses: actions/checkout@v3
         with:
           fetch-depth: 0
+      - name: Set up Python ${{ matrix.version.python }}
+        id: setup-python  # its python-version output is part of the cache key below
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.version.python }}
+      - uses: actions/cache@v3  # cache the tox environments directory between runs
+        with:
+          path: .tox
+          key: tox-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('requirements/*.txt') }}  # separate cache per Python version and requirements hash
+      - name: Install and upgrade python packages
+        run: |
+          python -m pip install --upgrade pip tox
+      - name: Get Branch name
+        id: get-branch-name  # read in "Run tests" as steps.get-branch-name.outputs.branch
+        uses: NVIDIA-Merlin/.github/actions/branch-name@6f0539fba24f60da2aee63c5925bee7cee3206e3
       - name: Run tests
         run: |
-          pip install tox
-          ref_type=${{ github.ref_type }}
-          branch=main
-          if [[ $ref_type == "tag"* ]]
-          then
-            git -c protocol.version=2 fetch --no-tags --prune --progress --no-recurse-submodules --depth=1 origin +refs/heads/release*:refs/remotes/origin/release*
-            branch=$(git branch -r --contains ${{ github.ref_name }} --list '*release*' --format "%(refname:short)" | sed -e 's/^origin\///')
-          fi
-          if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then
+          if [ "${{ github.ref }}" != 'refs/heads/main' ]; then
               extra_pytest_markers="and changed"
           fi
-          PYTEST_MARKERS="(examples or notebook) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e gpu
+          merlin_branch="${{ steps.get-branch-name.outputs.branch }}"
+          RAPIDS_VERSION=${{ matrix.version.rapids }} MERLIN_BRANCH=$merlin_branch COMPARE_BRANCH=$merlin_branch \
+            PYTEST_MARKERS="(examples or notebook) $extra_pytest_markers" \
+            tox -e gpu-cu11
diff --git a/requirements/test.txt b/requirements/test.txt
@@ -1,5 +1,6 @@
 -r dev.txt
 -r pytorch.txt
 -r tensorflow.txt
+-r transformers.txt
 
 numpy<1.24
diff --git a/tox.ini b/tox.ini
@@ -2,18 +2,42 @@
 ; .github/workflows/cpu-ci.yml for the workflow definition.
 
 [tox]
-envlist = gpu,multi-gpu,horovod-cpu,nvtabular-cpu,systems-cpu,transformers4rec-cpu,docs,docs-multi
+envlist = gpu,gpu-cu11,multi-gpu,horovod-cpu,nvtabular-cpu,systems-cpu,transformers4rec-cpu,docs,docs-multi
 
 [testenv]
 commands =
     pip install --upgrade pip
     pip install -e .[all]
 
+[testenv:gpu-cu11]
+; Runs in: GitHub Actions (the tests-examples job in .github/workflows/gpu.yml calls `tox -e gpu-cu11`)
+; Runs GPU-based tests; pytest exit code 5 (no tests collected) is treated as success.
+setenv =
+    TF_GPU_ALLOCATOR=cuda_malloc_async
+    PIP_EXTRA_INDEX_URL=https://pypi.nvidia.com
+    PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+allowlist_externals =
+    bash
+passenv =
+    CUDA_VISIBLE_DEVICES
+deps =
+    -rrequirements/test.txt
+    git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH}
+    git+https://github.com/NVIDIA-Merlin/dataloader.git@{env:MERLIN_BRANCH}
+    git+https://github.com/NVIDIA-Merlin/NVTabular.git@{env:MERLIN_BRANCH}
+    git+https://github.com/NVIDIA-Merlin/systems.git@{env:MERLIN_BRANCH}
+    nvidia-cudnn-cu11~=8.6.0
+    cudf-cu11=={env:RAPIDS_VERSION}
+    dask-cudf-cu11=={env:RAPIDS_VERSION}
+commands =
+    bash -c 'python -m pytest --cov-report term --cov merlin -m "{env:PYTEST_MARKERS}" -rxs {posargs:tests} || ([ $? = 5 ] && exit 0 || exit $?)'
+
 [testenv:gpu]
 ; Runs in: Github Actions
 ; Runs GPU-based tests.
 allowlist_externals =
     bash
+    cp
 deps =
     -rrequirements/test.txt
     git+https://github.com/NVIDIA-Merlin/core.git@{env:MERLIN_BRANCH}
@@ -26,6 +50,8 @@ setenv =
     TF_GPU_ALLOCATOR=cuda_malloc_async
 sitepackages=true
 commands =
+    ; copy shared libraries (*.so*) from the base Python prefix's lib directory into the virtualenv's lib directory (e.g. for XGBoost)
+    bash -c 'cp $(python -c "import sys; print(sys.base_prefix)")/lib/*.so* $(python -c "import sys; print(sys.prefix)")/lib'
     bash -c 'python -m pytest --cov-report term --cov merlin -m "{env:PYTEST_MARKERS}" -rxs {posargs:tests} || ([ $? = 5 ] && exit 0 || exit $?)'
 
 [testenv:horovod-gpu]