diff --git a/.github/workflows/pages-html.yml b/.github/workflows/pages-html.yml new file mode 100644 index 0000000..6db3d0a --- /dev/null +++ b/.github/workflows/pages-html.yml @@ -0,0 +1,25 @@ +name: Deploy HTML slides to Pages + +on: + # Runs on pushes targeting the default branch + push: + branches: + - "main" + paths: + - "docs/**" + - ".github/workflows/pages.yml" + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +jobs: + pages-html: + uses: ./.github/workflows/pages.yml + with: + include_pdf: false diff --git a/.github/workflows/pages-pdf.yml b/.github/workflows/pages-pdf.yml new file mode 100644 index 0000000..9244020 --- /dev/null +++ b/.github/workflows/pages-pdf.yml @@ -0,0 +1,17 @@ +name: Deploy HTML and PDF slides to Pages + +on: + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +jobs: + pages-pdf: + uses: ./.github/workflows/pages.yml + with: + include_pdf: true diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml new file mode 100644 index 0000000..8230bae --- /dev/null +++ b/.github/workflows/pages.yml @@ -0,0 +1,62 @@ +# Script based on examples in https://github.com/actions/starter-workflows/tree/main/pages +name: Deploy slides to Pages + +on: + workflow_call: + inputs: + include_pdf: + required: true + type: boolean + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 
+concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + build: + timeout-minutes: 30 + runs-on: ubuntu-latest + container: + image: ghcr.io/csc-training/slidefactory:3.1.0-beta.6 + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Pages + id: pages + uses: actions/configure-pages@v4 + - name: Build slides + env: + INCLUDE_PDF: ${{ inputs.include_pdf }} + run: | + git config --global --add safe.directory $PWD + GIT_SHORT_SHA=$(git rev-parse --short $GITHUB_SHA) + GIT_DATE=$(git show -s --format=%ci $GITHUB_SHA) + + ARGS="" + [[ "$INCLUDE_PDF" == "true" ]] && ARGS="--with-pdf" + + slidefactory pages about.yml build --info_content "Updated for [$GIT_SHORT_SHA]($GITHUB_SERVER_URL/$GITHUB_REPOSITORY/commit/$GITHUB_SHA) ($GIT_DATE)" $ARGS + + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + path: ./build + + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.gitignore b/.gitignore index e5b559e..b41c394 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ a.out *.qdrep *.sqlite heat +build/ diff --git a/Exercises_Instructions.md b/Exercises_Instructions.md index 8200b9e..71a70f1 100644 --- a/Exercises_Instructions.md +++ b/Exercises_Instructions.md @@ -38,21 +38,21 @@ The Intel DevCloud can be acces via the [web interface](https://console.cloud.in ### Disk area -The (computing and storage) resources can be accessed on on supercomputers via project-based allocation system, where users are granted access based on the specific needs and goals of their projects. Running applications and storage area are directly linked ot this projects. For this event we have been granted access to the training `project_2008874` on Mahti and `project_462000456` on LUMI. +The (computing and storage) resources can be accessed on on supercomputers via project-based allocation system, where users are granted access based on the specific needs and goals of their projects. Running applications and storage area are directly linked ot this projects. For this event we have been granted access to the training `project_2012125` on Mahti and `project_462000456` on LUMI. All the exercises in the supercomputers have to be carried out in the **scratch** disk area. The name of the scratch directory can be queried with the commands `csc-workspaces` on Mahti and `lumi-workspaces` onLUMI. As the base directory is shared between members of the project, you should create your own directory: on Mahti ``` -cd /scratch/project_2008874 +cd /scratch/project_2012125 mkdir -p $USER cd $USER ``` on LUMI ``` -cd /scratch/project_462000456 +cd /scratch/project_462000752 mkdir -p $USER cd $USER ``` @@ -122,19 +122,18 @@ In order to use the intel SYCL compiler one has to set the environment varibles on Mahti: ``` -. /projappl/project_2008874/intel/oneapi/setvars.sh --include-intel-llvm +. /projappl/project_2012125/intel/oneapi/setvars.sh --include-intel-llvm module load cuda # This is needed for compiling sycl code for nvidia gpus module load openmpi/4.1.2-cuda # This is neeeded for using CUDA aware MPI ``` on LUMI: ``` -. /projappl/project_462000456/intel/oneapi/setvars.sh --include-intel-llvm +. 
/projappl/project_462000752/intel/oneapi/setvars.sh --include-intel-llvm -module load LUMI/22.08 +module load LUMI module load partition/G -module load rocm/5.3.3 -module load cce/16.0.1 +module load rocm/6.0.3 export MPICH_GPU_SUPPORT_ENABLED=1 # Needed for GPU aware MPI ``` After this one can load other modules that might be needed for compiling the codes. With the environment set-up we can compile and run the SYCL codes. @@ -158,21 +157,20 @@ module load cuda # This is needed for compiling sycl code for nvidia gpus module load openmpi/4.1.2-cuda # This is neeeded for using CUDA aware MPI ``` ``` -/projappl/project_2008874/AdaptiveCpp/bin/acpp -fuse-ld=lld -O3 -L/appl/spack/v017/install-tree/gcc-8.5.0/gcc-11.2.0-zshp2k/lib64 .cpp +/projappl/project_2012125/AdaptiveCpp/bin/acpp -fuse-ld=lld -O3 -L/appl/spack/v017/install-tree/gcc-8.5.0/gcc-11.2.0-zshp2k/lib64 .cpp ``` on LUMI: ``` -module load LUMI/22.08 +module load LUMI module load partition/G -module load rocm/5.3.3 -module load cce/16.0.1 +module load rocm/6.0.3 export MPICH_GPU_SUPPORT_ENABLED=1 -export LD_LIBRARY_PATH=/appl/lumi/SW/LUMI-22.08/G/EB/Boost/1.79.0-cpeCray-22.08/lib:$LD_LIBRARY_PATH -export LD_PRELOAD=/pfs/lustrep4/appl/lumi/SW/LUMI-22.08/G/EB/rocm/5.3.3/llvm/lib/libomp.so +#export LD_LIBRARY_PATH=/appl/lumi/SW/LUMI-22.08/G/EB/Boost/1.79.0-cpeCray-22.08/lib:$LD_LIBRARY_PATH ??? +#export LD_PRELOAD=/pfs/lustrep4/appl/lumi/SW/LUMI-22.08/G/EB/rocm/5.3.3/llvm/lib/libomp.so ?????? ``` ``` - /projappl/project_462000456/AdaptiveCpp/bin/acpp -O3 .cpp + /projappl/project_462000752/AdaptiveCpp/bin/acpp -O3 .cpp ``` In general one can set specific targets via the `--acpp-targets` flag, but we set-up AdaptiveCpp so that on Mahti the `acpp` compiler will automatically generate code for CPU and Nvidia GPUs, while on LUMI for CPU and AMD GPUs. @@ -198,17 +196,16 @@ icpx -fuse-ld=lld -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backen ``` or ``` -/projappl/project_2008874/AdaptiveCpp/bin/acpp -fuse-ld=lld -O3 -L/appl/spack/v017/install-tree/gcc-8.5.0/gcc-11.2.0-zshp2k/lib64 `mpicxx --showme:compile` `mpicxx --showme:link` .cpp +/projappl/project_2012125/AdaptiveCpp/bin/acpp -fuse-ld=lld -O3 -L/appl/spack/v017/install-tree/gcc-8.5.0/gcc-11.2.0-zshp2k/lib64 `mpicxx --showme:compile` `mpicxx --showme:link` .cpp ``` Similarly on LUMI. First we set up the envinronment and load the modules as indicated above ``` -. /projappl/project_462000456/intel/oneapi/setvars.sh --include-intel-llvm +. 
/projappl/project_462000752/intel/oneapi/setvars.sh --include-intel-llvm -module load LUMI/22.08 +module load LUMI module load partition/G -module load rocm/5.3.3 -module load cce/16.0.1 +module load rocm/6.0.3 export MPICH_GPU_SUPPORT_ENABLED=1 ``` Now compile with intel compilers: @@ -218,8 +215,8 @@ icpx -fsycl -fsycl-targets=amdgcn-amd-amdhsa,spir64_x86_64 -Xsycl-target-backend ``` Or with AdaptiveCpp: ``` -export LD_PRELOAD=/pfs/lustrep4/appl/lumi/SW/LUMI-22.08/G/EB/rocm/5.3.3/llvm/lib/libomp.so -/projappl/project_462000456/AdaptiveCpp/bin/acpp -O3 `CC --cray-print-opts=cflags` .cpp `CC --cray-print-opts=libs` +#export LD_PRELOAD=/pfs/lustrep4/appl/lumi/SW/LUMI-22.08/G/EB/rocm/5.3.3/llvm/lib/libomp.so +/projappl/project_462000752/AdaptiveCpp/bin/acpp -O3 `CC --cray-print-opts=cflags` .cpp `CC --cray-print-opts=libs` ``` ## Running applications in supercomputers @@ -231,9 +228,9 @@ The `job.sh` file contains all the necessary information (number of nodes, tasks ### Useful environment variables -Use [`SYCL_PI_TRACE`](https://intel.github.io/llvm-docs/EnvironmentVariables.html#sycl-pi-trace-options) to enable runtime tracing (e.g. device discovery): +Use [`SYCL_UR_TRACE`](https://intel.github.io/llvm-docs/EnvironmentVariables.html#sycl-pi-trace-options) to enable runtime tracing (e.g. device discovery): - export SYCL_PI_TRACE=1 + export SYCL_UR_TRACE=1 ### Running on Mahti @@ -242,7 +239,7 @@ Use [`SYCL_PI_TRACE`](https://intel.github.io/llvm-docs/EnvironmentVariables.htm ``` #!/bin/bash #SBATCH --job-name=example -#SBATCH --account=project_2008874 +#SBATCH --account=project_2012125 #SBATCH --partition=medium #SBATCH --reservation=hlgp-cpu-f2024 #SBATCH --time=00:05:00 @@ -258,7 +255,7 @@ The output of job will be in file `slurm-xxxxx.out`. You can check the status of `scancel JOBID`. The reservation `hlgp-cpu-f2024` for partition `medium` is available during the training days and it -is accessible only if the users are part of `project_2008874`. +is accessible only if the users are part of `project_2012125`. Some applications use MPI, in this case the number of node and number of tasks per node will have to be adjusted accordingly. @@ -270,9 +267,9 @@ single GPU with single MPI task and a single thread use: ``` #!/bin/bash #SBATCH --job-name=example -#SBATCH --account=project_2008874 +#SBATCH --account=project_2012125 #SBATCH --partition=gpusmall -#SBATCH --reservation=hlgp-gpu-f2024-thu +#SBATCH --reservation=hlgp-gpu-f2024-thu ????????? #SBATCH --nodes=1 #SBATCH --ntasks=1 #SBATCH --time=00:05:00 @@ -282,7 +279,7 @@ srun my_gpu_exe ``` The reservation `hlgp-gpu-f2024-wed` is valid on Wednesday, 15:00 to 17:00. On Thursday we will use `hlgp-gpu-f2024-thu` , while on Friday `hlgp-gpu-f2024-fri`. Outside the course hours, you can use gputest partition instead without the reservation argument, ie, ``` -srun --account=project_2008874 --nodes=1 --partition=gputest --gres=gpu:a100:1 --time=00:05:00 ./my_gpu_exe +srun --account=project_2012125 --nodes=1 --partition=gputest --gres=gpu:a100:1 --time=00:05:00 ./my_gpu_exe ``` @@ -295,9 +292,9 @@ LUMI is similar to Mahti. 
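Outside the course reservation, short interactive GPU tests can also be run on LUMI's `dev-g` partition. As a sketch, following the `srun` options used in `README_setup.md` (adjust the resources and time limit to your needs):
```
srun --account=project_462000752 --partition=dev-g --nodes=1 --ntasks-per-node=1 --gpus-per-node=1 --time=00:05:00 ./my_gpu_exe
```
The batch scripts below show the corresponding settings for regular batch jobs.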
``` #!/bin/bash #SBATCH --job-name=example -#SBATCH --account=project_462000456 +#SBATCH --account=project_462000752 #SBATCH --partition=standard -##SBATCH --reservation=hlgp-cpu-f2024 # The reservation does not work +##SBATCH --reservation=hlgp-cpu-f2024 ??????# The reservation does not work #SBATCH --time=00:05:00 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 @@ -312,9 +309,9 @@ srun my_cpu_exe ``` #!/bin/bash #SBATCH --job-name=example -#SBATCH --account=project_462000456 +#SBATCH --account=project_462000752 #SBATCH --partition=standard-g -#SBATCH --reservation=hlgp-gpu-f2024 +#SBATCH --reservation=hlgp-gpu-f2024 ?????? #SBATCH --time=00:05:00 #SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 diff --git a/docs/List-of-topics.md b/List-of-topics.md similarity index 100% rename from docs/List-of-topics.md rename to List-of-topics.md diff --git a/README_setup.md b/README_setup.md new file mode 100644 index 0000000..8318203 --- /dev/null +++ b/README_setup.md @@ -0,0 +1,111 @@ +# Usage + +## OneAPI on Mahti + +Set up the environment: + + source /projappl/project_2012125/intel/oneapi/setvars.sh --include-intel-llvm + ml cuda/11.5.0 openmpi/4.1.2-cuda + +Compile for nvidia and cpu targets: + + icpx -fuse-ld=lld -std=c++20 -O3 -fsycl -fsycl-targets=nvptx64-nvidia-cuda,spir64_x86_64 -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_80 sycl_code.cpp + +Run as an usual gpu program: + + srun -A project_2012125 -p gputest --nodes=1 --ntasks-per-node=1 --cpus-per-task=1 --gres=gpu:a100:1 --time=00:15:00 ./a.out + +## OneAPI on LUMI + +Set up the environment: + + source /projappl/project_462000752/intel/oneapi/setvars.sh --include-intel-llvm + ml rocm/6.0.3 + export MPICH_GPU_SUPPORT_ENABLED=1 + +Compile for amd and cpu targets: + + icpx -fuse-ld=lld -std=c++20 -O3 -fsycl -fsycl-targets=amdgcn-amd-amdhsa,spir64_x86_64 -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx90a sycl_code.cpp + +Run as an usual gpu program: + + srun -A project_462000752 -p dev-g --nodes=1 --ntasks-per-node=1 --cpus-per-task=1 --gpus-per-node=1 --time=00:15:00 ./a.out + +## AdaptiveCpp on LUMI + +Set up the environment: + + module load LUMI/24.03 + module load partition/G + module load rocm/6.0.3 + export PATH=/projappl/project_462000752/ACPP/bin/:$PATH + export LD_LIBRARY_PATH=/appl/lumi/SW/LUMI-24.03/G/EB/Boost/1.83.0-cpeGNU-24.03/lib64/:$LD_LIBRARY_PATH + export LD_PRELOAD=/opt/rocm-6.0.3/llvm/lib/libomp.so + +Compile for amd and cpu targets: + + acpp -O2 --acpp-targets="omp.accelerated;hip:gfx90a" .cpp + +Run as an usual gpu program: + + srun -A project_462000752 -p dev-g --nodes=1 --ntasks-per-node=1 --cpus-per-task=1 --gpus-per-node=1 --time=00:15:00 ./a.out + +# Installations + +*Here are instructions how the modules used above were installed.* + +## OneAPI on Mahti + +Download [Intel oneAPI base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html?packages=oneapi-toolkit&oneapi-toolkit-os=linux&oneapi-lin=offline): + + wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/96aa5993-5b22-4a9b-91ab-da679f422594/intel-oneapi-base-toolkit-2025.0.0.885_offline.sh + +Install: + + sh ./intel-oneapi-base-toolkit-2025.0.0.885_offline.sh -a --silent --cli --eula accept --download-cache $SCRATCH/$USER/oneapi_tmp --install-dir $PROJAPPL/intel/oneapi + +Get [Codeplay oneAPI for NVIDIA GPUs](https://developer.codeplay.com/products/oneapi/nvidia/2025.0.0/guides/get-started-guide-nvidia#installation): + + curl -LOJ 
"https://developer.codeplay.com/api/v1/products/download?product=oneapi&variant=nvidia&version=2025.0.0&filters[]=12.0&filters[]=linux" + +Install: + + sh ./oneapi-for-nvidia-gpus-2025.0.0-cuda-12.0-linux.sh -y --extract-folder $SCRATCH/$USER/oneapi_tmp --install-dir $PROJAPPL/intel/oneapi + +## OneAPI on LUMI + +Download [Intel oneAPI base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html?packages=oneapi-toolkit&oneapi-toolkit-os=linux&oneapi-lin=offline): + + wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/96aa5993-5b22-4a9b-91ab-da679f422594/intel-oneapi-base-toolkit-2025.0.0.885_offline.sh + +Install: + + sh ./intel-oneapi-base-toolkit-2025.0.0.885_offline.sh -a --silent --cli --eula accept --download-cache $SCRATCH/$USER/oneapi_tmp --install-dir $PROJAPPL/intel/oneapi + +Get [Codeplay oneAPI for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/2025.0.0/guides/get-started-guide-amd#installation): + + curl -LOJ "https://developer.codeplay.com/api/v1/products/download?product=oneapi&variant=amd&version=2025.0.0&filters[]=6.0.2&filters[]=linux" + +Install: + + sh ./oneapi-for-amd-gpus-2025.0.0-rocm-6.0.2-linux.sh -y --extract-folder $SCRATCH/$USER/oneapi_tmp --install-dir $PROJAPPL/intel/oneapi + +## AdaptiveCpp on LUMI + + +Load the modules needed: +``` +module load LUMI/24.03 +module load partition/G +module load rocm/6.0.3 +``` +Clone repository and Compile with both cpu and rocm support: +``` +git clone https://github.com/AdaptiveCpp/AdaptiveCpp.git +cd AdaptiveCpp +mkdir build +cd build +cmake -DCMAKE_INSTALL_PREFIX=/projappl/project_462000752/ACPP/ -DROCM_PATH=$ROCM_PATH -DWITH_CPU_BACKEND=ON -DWITH_CUDA_BACKEND=OFF -DWITH_ROCM_BACKEND=ON -DACPP_TARGETS="gfx90a" -DWITH_ACCELERATED_CPU=ON -DWITH_SSCP_COMPILER=OFF -DWITH_OPENCL_BACKEND=OFF -DWITH_LEVEL_ZERO_BACKEND=OFF -DBOOST_ROOT=/appl/lumi/SW/LUMI-24.03/G/EB/Boost/1.83.0-cpeGNU-24.03/ .. +make -j 64 +make install +``` diff --git a/about.yml b/about.yml new file mode 100644 index 0000000..f0b9fb7 --- /dev/null +++ b/about.yml @@ -0,0 +1,3 @@ +# This file is used in the generation of the web page +title: High-Level GPU Programming +slidesdir: docs diff --git a/exercises/computing_platform/Install_AdaptiveCpp_LUMI.md b/exercises/computing_platform/Install_AdaptiveCpp_LUMI.md index 79dd941..6d2c9a4 100644 --- a/exercises/computing_platform/Install_AdaptiveCpp_LUMI.md +++ b/exercises/computing_platform/Install_AdaptiveCpp_LUMI.md @@ -1,24 +1,19 @@ ## Install from source: -Clone and switch to the appropriate version: -``` -git clone https://github.com/AdaptiveCpp/AdaptiveCpp.git -# git switch --detach v0.9.4 # use this only if there are problems with the latest version -``` Load the modules needed: ``` -module load LUMI/22.08 +module load LUMI/24.03 module load partition/G -module load Boost/1.79.0-cpeCray-22.08 -module load rocm/5.3.3 -module load cce/16.0.1 +module load rocm/6.0.3 ``` Compile with both cpu and gpu (mi250x) acceleration: ``` -cd /scratch/project_462000456/AdaptiveCpp +git clone https://github.com/AdaptiveCpp/AdaptiveCpp.git +cd AdaptiveCpp mkdir build cd build -cmake -DCMAKE_INSTALL_PREFIX=//AdaptiveCpp/ -DROCM_PATH=$ROCM_PATH -DWITH_CPU_BACKEND=ON -DWITH_CUDA_BACKEND=OFF -DWITH_ROCM_BACKEND=ON -DDEFAULT_GPU_ARCH=gfx90a -DWITH_ACCELERATED_CPU=ON -DWITH_SSCP_COMPILER=OFF -DWITH_OPENCL_BACKEND=OFF -DWITH_LEVEL_ZERO_BACKEND=OFF .. 
+cmake -DCMAKE_INSTALL_PREFIX=/projappl/project_462000752/ACPP/ -DROCM_PATH=$ROCM_PATH -DWITH_CPU_BACKEND=ON -DWITH_CUDA_BACKEND=OFF -DWITH_ROCM_BACKEND=ON -DACPP_TARGETS="gfx90a" -DWITH_ACCELERATED_CPU=ON -DWITH_SSCP_COMPILER=OFF -DWITH_OPENCL_BACKEND=OFF -DWITH_LEVEL_ZERO_BACKEND=OFF -DBOOST_ROOT=/appl/lumi/SW/LUMI-24.03/G/EB/Boost/1.83.0-cpeGNU-24.03/ .. +make -j 64 make install ``` @@ -26,11 +21,11 @@ make install Load the modules needed: ``` -module load LUMI/22.08 +module load LUMI/24.03 module load partition/G -module load Boost/1.79.0-cpeCray-22.08 -module load rocm/5.3.3 -module load cce/16.0.1 +module load rocm/6.0.3 +/appl/lumi/SW/LUMI-24.03/G/EB/Boost/1.83.0-cpeGNU-24.03/lib64/ +export LD_PRELOAD=/opt/rocm-6.0.3/llvm/lib/libomp.so ``` Compile `sycl` codes: ``` diff --git a/exercises/computing_platform/Install_oneAPI_LUMI.md b/exercises/computing_platform/Install_oneAPI_LUMI.md deleted file mode 100644 index a433406..0000000 --- a/exercises/computing_platform/Install_oneAPI_LUMI.md +++ /dev/null @@ -1,34 +0,0 @@ - - - -Download and intall the oneapi basekit: - -``` -wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/163da6e4-56eb-4948-aba3-debcec61c064/l_BaseKit_p_2024.0.1.46_offline.sh - -module load LUMI/22.08 -module load partition/G -module load rocm/5.3.3 -module load cce/16.0.1 -chmod +x l_BaseKit_p_2024.0.1.46_offline.sh -./l_BaseKit_p_2024.0.1.46_offline.sh -a -s --eula accept --download-cache /scratch/project_462000456/cristian/tttt/ --install-dir /scratch/project_462000456/intel/oneapi -``` - -Now get the hip plugin (the link below might be changed in the future): - -Get and install the plug-in: -``` -curl -LOJ "https://developer.codeplay.com/api/v1/products/download?product=oneapi&variant=amd&version=2024.0.1&filters[]=5.4.3&filters[]=linux" -sh oneapi-for-amd-gpus-2024.0.1-rocm-5.4.3-linux.sh -y --extract-folder /scratch/project_462000456/tttt/ --install-dir /scratch/project_462000456/intel/oneapi -``` - -## Usage - -``` -. 
/scratch/project_462000456/intel/oneapi/setvars.sh --include-intel-llvm -module load LUMI/22.08 -module load partition/G -module load rocm/5.3.3 -module load cce/16.0.1 -icpx -fsycl -fsycl-targets=amdgcn-amd-amdhsa,spir64_x86_64 -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx90a .cpp -``` diff --git a/exercises/computing_platform/Install_oneAPI_Mahti.md b/exercises/computing_platform/Install_oneAPI_Mahti.md deleted file mode 100644 index 94431c1..0000000 --- a/exercises/computing_platform/Install_oneAPI_Mahti.md +++ /dev/null @@ -1,29 +0,0 @@ -# Installing oneAPI on Mahti - -Load cuda: - - module load cuda - -Download the oneAPI basekit: - - wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/163da6e4-56eb-4948-aba3-debcec61c064/l_BaseKit_p_2024.0.1.46_offline.sh - chmod +x l_BaseKit_p_2024.0.1.46_offline.sh - -Install: - - ./l_BaseKit_p_2024.0.1.46_offline.sh -a -s --eula accept --download-cache /scratch/project_2008874/$USER/oneapi_tmp/ --install-dir /scratch/project_2008874/$USER/intel/oneapi - -Get the cuda plugin (the link below might be changed in the future): - - curl -LOJ "https://developer.codeplay.com/api/v1/products/download?product=oneapi&variant=nvidia&version=2024.0.1&filters[]=12.0&filters[]=linux" - -Install: - - ./oneapi-for-nvidia-gpus-2024.0.1-cuda-12.0-linux.sh -y --extract-folder /scratch/project_2008874/$USER/oneapi_tmp/ --install-dir /scratch/project_2008874/$USER/intel/oneapi - -## References - -* [Intel oneAPI installation guide](https://www.intel.com/content/www/us/en/docs/oneapi/installation-guide-linux/2024-0/install-with-command-line.html) -* [Intel oneAPI download page](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html?operatingsystem=linux&distributions=offline) -* [Codeplay oneAPI for NVIDIA GPUs](https://developer.codeplay.com/products/oneapi/nvidia/2024.0.1/guides/get-started-guide-nvidia) - diff --git a/exercises/computing_platform/Usage_AdaptiveCpp_LUMI.md b/exercises/computing_platform/Usage_AdaptiveCpp_LUMI.md index 5f7e260..9aa9eec 100644 --- a/exercises/computing_platform/Usage_AdaptiveCpp_LUMI.md +++ b/exercises/computing_platform/Usage_AdaptiveCpp_LUMI.md @@ -2,13 +2,14 @@ Load the modules needed: ``` -module load LUMI/22.08 +module load LUMI/24.03 module load partition/G -module load Boost/1.79.0-cpeCray-22.08 -module load rocm/5.3.3 -module load cce/16.0.1 +module load rocm/6.0.3 +export PATH=/projappl/project_462000752/ACPP/bin/:$PATH +export LD_LIBRARY_PATH=/appl/lumi/SW/LUMI-24.03/G/EB/Boost/1.83.0-cpeGNU-24.03/lib64/:$LD_LIBRARY_PATH +export LD_PRELOAD=/opt/rocm-6.0.3/llvm/lib/libomp.so ``` Compile `sycl` codes: ``` -/scratch/project_462000456/AdaptiveCpp/bin/syclcc -O2 --hipsycl-targets="omp.accelerated;hip:gfx90a" .cpp +/projappl/project_462000752/ACPP/bin/acpp -O2 --acpp-targets="omp.accelerated;hip:gfx90a" .cpp ``` diff --git a/exercises/computing_platform/Usage_oneAPI_LUMI.md b/exercises/computing_platform/Usage_oneAPI_LUMI.md deleted file mode 100644 index 5039681..0000000 --- a/exercises/computing_platform/Usage_oneAPI_LUMI.md +++ /dev/null @@ -1,23 +0,0 @@ -## Usage - -``` -module load LUMI/22.08 -module load partition/G -module load rocm/5.3.3 -module load cce/16.0.1 -. 
/scratch/project_462000456//intel/oneapi/setvars.sh --include-intel-llvm -export MPICH_GPU_SUPPORT_ENABLED=1 -``` - -``` -icpx -fsycl -fsycl-targets=amdgcn-amd-amdhsa,spir64_x86_64 -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx90a .cpp -``` - -## Running -CPU -``` -srun -p debug --exclusive -n 1 --cpus-per-task=128 --time=00:05:00 --account=project_462000456 ./a.out -``` -``` -#SBATCH -``` diff --git a/exercises/computing_platform/Usage_oneAPI_Mahti.md b/exercises/computing_platform/Usage_oneAPI_Mahti.md deleted file mode 100644 index 18aad94..0000000 --- a/exercises/computing_platform/Usage_oneAPI_Mahti.md +++ /dev/null @@ -1,23 +0,0 @@ -# Using oneAPI on Mahti - -Set the environments paths: - - . /projappl/project_2008874/intel/oneapi/setvars.sh --include-intel-llvm - ml cuda/11.5.0 openmpi/4.1.2-cuda - -Compile for nvidia and cpu targets: - - clang++ -std=c++17 -O3 -fsycl -fsycl-targets=nvptx64-nvidia-cuda,spir64_x86_64 -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_80 .cpp - -Run as an usual gpu program: - - srun --partition=gputest --account=project_2008874 --nodes=1 --ntasks-per-node=1 --cpus-per-task=1 --gres=gpu:a100:1 --time=00:15:00 ./a.out - - -## The Intel® DPC++ Compatibility Tool - -The Intel® DPC++ Compatibility Tool (syclomatic) is included in the oneAPI basekit. For migrating cuda to sycl use (for example): - - dpct --in-root=./ src/vector_add.cu - -See [the heat equation exercise](sycl/10-heat-equation-from-cuda/) for a complete example. diff --git a/exercises/sycl/02-vector_add/Readme.md b/exercises/sycl/02-vector_add/Readme.md new file mode 100644 index 0000000..8fc753b --- /dev/null +++ b/exercises/sycl/02-vector_add/Readme.md @@ -0,0 +1,156 @@ +# Vector Addition +The task is to compute the element-wise addition of two vectors (C = A + B) in parallel. + +A skeleton code is provided in vector_add_<..>.cpp. You need to complete the missing parts to calculate the result in parallel. Try running the program on both CPU and GPU devices. + +A typical application running on an accelerator follows these steps: + + 1. Initialize data on the host. + 1. Create a queue and associate it with the desired device. + 1. Manage memory on the device by creating necessary constructs. + 1. Launch the kernel. + 1. Retrieve and verify the results on the host. + +In this exercise, we will explore various memory models. + +## I. Memory management using Buffers and Accessors + +Use the skeleton provided in `vector_add_buffer.cpp`. Look for the **//TODO** lines. + +### Step 1: Define a Queue +Start by defining a **queue** and selecting the appropriate device selector. SYCL provides predefined selectors, such as: default, gpu, cpu, accelerator or you can use the procedure from the [previous exercise](../01-info/enumerate_device.cpp). + +### Step 2: Create Buffers +Next, create buffers to encapsulate the data. For a one-dimensional array of integers of length `N`, with pointer `P`, a buffer can be constructed as follows: + +```cpp + sycl::buffer a_buf(P, sycl::range<1>(N)); +``` +### Step 3: Create Accessors +Accessors provide a mechanism to access data inside the buffers. Accessors on the device must be created within command groups. There are two ways to create accessors. 
One option is the `sycl::accessor` class constructor:

```cpp
  sycl::accessor a{a_buf, h, sycl::read_write};
```
the other is the buffer's `.get_access<...>(h)` member function:
```cpp
  auto a = a_buf.get_access<sycl::access::mode::read_write>(h);
```
**Important** Use appropriate access modes for your data:
 - **Input Buffers:** Use `sycl::access::mode::read` to avoid unnecessary device-to-host data transfers.
 - **Output Buffers:** Use `sycl::access::mode::write` to avoid unnecessary host-to-device data transfers.
 - **Input/Output Buffers:** Use `sycl::access::mode::read_write` for variables that are inputs but are also modified during the computation.

### Step 4: Submit the Task
Once the accessors are ready, submit the task to the device using the `.parallel_for()` member function. The basic submission:

```cpp
  h.parallel_for(sycl::range{N}, [=](sycl::id<1> idx) {
    c[idx] = a[idx] + b[idx];
  });
```
Here:
 - `sycl::range{N}` or `sycl::range(N)` specifies the number of work-items to be launched
 - `sycl::id<1>` represents the index used within the kernel.

#### Using **item** class instead of **id**
Modify the lambda function to use the **sycl::item** class instead of the **id** class. In this case the index `idx` is obtained from the `.get_id()` member.

#### Using ND-Range
This basic launch serves our purpose for this simple example; however, it is useful to also test the **ND-RANGE**. In this case we specify to the runtime both the total size of the grid of work-items and the size of a work-group:

```cpp
  h.parallel_for(sycl::nd_range<1>(sycl::range<1>(((N+local_size-1)/local_size)*local_size), sycl::range<1>(local_size)), [=](sycl::nd_item<1> item) {
    auto idx = item.get_global_id(0);
    c[idx] = a[idx] + b[idx];
  });
```
**Note** that **ND-RANGE** requires the total number of work-items to be divisible by the size of the work-group.

### Step 5: Retrieve Data
The final task in this exercise is to move the checking of the results inside the scope of the buffers (before the closing curly bracket) and add the appropriate method to access this data.

By default, buffers are automatically synchronized with the host when they go out of scope. However, if you need to access the data within the buffers' scope, use **host accessors**.

Similar to the device accessors, host accessors can be defined in two ways. By using the accessor class constructor:
```cpp
  host_accessor c{c_buf, sycl::access::mode::read};
```
or by using the `.get_access` member function of the buffer:
```cpp
  auto c = c_buf.get_access<sycl::access::mode::read>();
```

## II. Memory management with Unified Shared Memory

### IIa) **malloc_device**

Use the skeleton provided in `vector_add_usm_device.cpp`. Look for the **//TODO** lines.

### Step 1: Define a Queue
Same as with buffers.

### Step 2: Allocate Memory on the Device Using `malloc_device`
Instead of creating buffers, allocate memory directly on the device using `sycl::malloc_device`. For a one-dimensional array of integers of length `N`, memory can be allocated as follows:

```cpp
  int* a_usm = sycl::malloc_device<int>(N, q);
```
### Step 3: Copy Data to the Device

You need to copy the data from the host to the device memory. Use the queue's `memcpy()` member function to transfer data from host memory to device memory before launching the kernel:
```cpp
  q.memcpy(a_usm, a.data(), N * sizeof(int)).wait();
```

### Step 4: Submit the Task
Same as with buffers.
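If you want something to compare against, a minimal sketch of the USM version of the submission (reusing `q`, `N`, and the `a_usm`, `b_usm`, `c_usm` pointers from the previous steps; the complete program is in the `solution` directory) could look like this:

```cpp
  q.submit([&](sycl::handler& h) {
    // USM device pointers are captured by value in the kernel lambda;
    // no buffers or accessors are needed
    h.parallel_for(sycl::range{N}, [=](sycl::id<1> idx) {
      c_usm[idx] = a_usm[idx] + b_usm[idx];
    });
  });
```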
### Step 5: Retrieve Data

After the kernel execution is complete, you need to copy the result back from the device to the host. Use the queue's `memcpy()` again to transfer the result:
```cpp
  q.memcpy(c.data(), c_usm, N * sizeof(int)).wait();
```
### Step 6: Free Device Memory

Once you're done with the device memory, free the allocated memory using `sycl::free`:

```cpp
  sycl::free(a_usm, q);
```
This ensures that the allocated memory is properly released on the device.


### IIb) **malloc_shared**

Use the skeleton provided in `vector_add_usm_managed.cpp`. Look for the **//TODO** lines.

### Step 1: Define a Queue
Same as before.

### Step 2: Allocate Managed Memory Using `malloc_managed`
Allocate memory that can migrate between host and device using `sycl::malloc_managed`. For a one-dimensional array of integers of length `N`, memory can be allocated as follows:

```cpp
int* a = sycl::malloc_managed<int>(N, q);
```
### Step 3: Initialize Data on the Host

This part is already in the skeleton; it is done using `std::fill`. If you have time, you can replace it with a **for loop**.

### Step 4: Submit the Task
Same as using buffers.

### Step 5: Synchronize and Check Results

Since `malloc_managed` migrates data automatically between the host and device, no explicit memory transfer is required. Ensure the queue has finished execution before accessing the results by calling `q.wait()`.

### Step 6: Free the Memory

Once you're done, free the allocated memory using `sycl::free`:

```cpp
sycl::free(a, q);
```
This ensures that the allocated memory is properly released.
diff --git a/exercises/sycl/02-vector_add/solution/vector_add_buffer.cpp b/exercises/sycl/02-vector_add/solution/vector_add_buffer.cpp
new file mode 100644
index 0000000..fcd26c8
--- /dev/null
+++ b/exercises/sycl/02-vector_add/solution/vector_add_buffer.cpp
@@ -0,0 +1,43 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: MIT
#include <sycl/sycl.hpp>
#include <iostream>
#include <vector>
#include <algorithm>
using namespace sycl;

int main() {
  // Set up queue on any available device
  queue q;

  // Initialize input and output memory on the host
  constexpr size_t N = 256;
  std::vector<int> a(N), b(N), c(N);
  std::fill(a.begin(), a.end(), 1);
  std::fill(b.begin(), b.end(), 2);
  std::fill(c.begin(), c.end(), 0);

  {
    // Create buffers for data and sum
    buffer a_buf(a.data(), range<1>(N));
    buffer b_buf(b.data(), range<1>(N));
    buffer c_buf(c.data(), range<1>(N));

    // Submit the kernel to the queue
    q.submit([&](handler& h) {
      accessor a{a_buf, h};
      accessor b{b_buf, h};
      accessor c{c_buf, h};

      h.parallel_for(range{N}, [=](id<1> idx) {
        c[idx] = a[idx] + b[idx];
      });
    });
  }

  // Check that all outputs match expected value
  bool passed = std::all_of(c.begin(), c.end(),
                            [](int i) { return (i == 3); });
  std::cout << ((passed) ? "SUCCESS" : "FAILURE")
            << std::endl;
  return (passed) ? 0 : 1;
}
diff --git a/exercises/sycl/02-vector_add/solution/vector_add_usm_device.cpp b/exercises/sycl/02-vector_add/solution/vector_add_usm_device.cpp
new file mode 100644
index 0000000..c0505a2
--- /dev/null
+++ b/exercises/sycl/02-vector_add/solution/vector_add_usm_device.cpp
@@ -0,0 +1,53 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: MIT
#include <sycl/sycl.hpp>
#include <iostream>
#include <vector>
#include <algorithm>
using namespace sycl;

int main() {
  // Set up queue on any available device
  queue q;

  // Initialize input and output memory on the host
  constexpr size_t N = 256;
  std::vector<int> a(N), b(N), c(N);
  std::fill(a.begin(), a.end(), 1);
  std::fill(b.begin(), b.end(), 2);
  std::fill(c.begin(), c.end(), 0);

  // Allocate the memory using unified shared memory (USM)
  int* a_usm = malloc_device<int>(N, q);
  int* b_usm = malloc_device<int>(N, q);
  int* c_usm = malloc_device<int>(N, q);

  // Copy data from host to USM
  q.memcpy(a_usm, a.data(), N * sizeof(int)).wait();
  q.memcpy(b_usm, b.data(), N * sizeof(int)).wait();
  q.memcpy(c_usm, c.data(), N * sizeof(int)).wait();

  // Submit the kernel to the queue
  q.submit([&](handler& h) {
    h.parallel_for(range{N}, [=](id<1> idx) {
      c_usm[idx] = a_usm[idx] + b_usm[idx];
    });
  });

  // Wait for the kernel to finish
  q.wait();

  // Copy data from USM to host
  q.memcpy(c.data(), c_usm, N * sizeof(int)).wait();

  // Free USM allocations
  free(a_usm, q);
  free(b_usm, q);
  free(c_usm, q);

  // Check that all outputs match the expected value
  bool passed = std::all_of(c.begin(), c.end(),
                            [](int i) { return (i == 3); });
  std::cout << ((passed) ? "SUCCESS" : "FAILURE") << std::endl;
  return (passed) ? 0 : 1;
}
diff --git a/exercises/sycl/02-vector_add/solution/vector_add_usm_managed.cpp b/exercises/sycl/02-vector_add/solution/vector_add_usm_managed.cpp
new file mode 100644
index 0000000..32fb3b7
--- /dev/null
+++ b/exercises/sycl/02-vector_add/solution/vector_add_usm_managed.cpp
@@ -0,0 +1,47 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: MIT
#include <sycl/sycl.hpp>
#include <iostream>
#include <algorithm>
using namespace sycl;

int main() {
  // Set up queue on any available device
  queue q;

  // Initialize input and output memory on the host
  constexpr size_t N = 256;

  // Allocate memory using malloc_managed
  int* a = malloc_managed<int>(N, q);
  int* b = malloc_managed<int>(N, q);
  int* c = malloc_managed<int>(N, q);

  // Initialize input memory on the host
  std::fill(a, a + N, 1);
  std::fill(b, b + N, 2);
  std::fill(c, c + N, 0);

  // Submit the kernel to the queue
  q.submit([&](handler& h) {
    h.parallel_for(range{N}, [=](id<1> idx) {
      c[idx] = a[idx] + b[idx];
    });
  });

  // Wait for the kernel to finish
  q.wait();

  // Check that all outputs match the expected value
  // (a, b, c are raw USM pointers, so plain pointer ranges are used here)
  bool passed = std::all_of(c, c + N,
                            [](int i) { return (i == 3); });
  std::cout << ((passed) ? "SUCCESS" : "FAILURE") << std::endl;

  // Free managed allocations
  free(a, q);
  free(b, q);
  free(c, q);

  return (passed) ?
0 : 1; +} diff --git a/exercises/sycl/02-vector_add/vector_add_buffer.cpp b/exercises/sycl/02-vector_add/vector_add_buffer.cpp index 94e061a..265907e 100644 --- a/exercises/sycl/02-vector_add/vector_add_buffer.cpp +++ b/exercises/sycl/02-vector_add/vector_add_buffer.cpp @@ -7,33 +7,31 @@ using namespace sycl; int main() { // Set up queue on any available device + //TODO queue q; // Initialize input and output memory on the host constexpr size_t N = 256; - std::vector a(N), b(N), c(N); + std::vector a_host(N), b_host(N), c_host(N); std::fill(a.begin(), a.end(), 1); std::fill(b.begin(), b.end(), 2); std::fill(c.begin(), c.end(), 0); { - // Create buffers for data and sum - buffer a_buf(a.data(), range<1>(N)); - buffer b_buf(b.data(), range<1>(N)); - buffer c_buf(c.data(), range<1>(N)); + // Create buffers for the + // TODO // Submit the kernel to the queue q.submit([&](handler& h) { - accessor a{a_buf, h}; - accessor b{b_buf, h}; - accessor c{c_buf, h}; - - // BEGIN CODE SNIP - h.parallel_for(range{N}, [=](id<1> idx) { - c[idx] = a[idx] + b[idx]; - }); - // END CODE SNIP - }); + // Create accessors + //TODO + + h.parallel_for( + //TODO + ); + + //TODO after the submission works + //Checking inside the scope of the buffers } // Check that all outputs match expected value diff --git a/exercises/sycl/02-vector_add/vector_add_usm.cpp b/exercises/sycl/02-vector_add/vector_add_usm.cpp deleted file mode 100644 index 4ba2ca5..0000000 --- a/exercises/sycl/02-vector_add/vector_add_usm.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (C) 2023 Intel Corporation -// SPDX-License-Identifier: MIT -#include -#include -using namespace sycl; - -int main() { - // Set up queue on any available device - queue q; - - // Initialize input and output memory on the host - constexpr size_t N = 256; - std::vector a(N), b(N), c(N); - std::fill(a.begin(), a.end(), 1); - std::fill(b.begin(), b.end(), 2); - std::fill(c.begin(), c.end(), 0); - - { - // Allocate the memory using universal share memory - int* a_usm = malloc_device(N, q); - int* b_usm = malloc_device(N, q); - int* c_usm = malloc_device(N, q); - - // Copy data from host to USM - q.memcpy(a_usm, a.data(), N * sizeof(int)).wait(); - q.memcpy(b_usm, b.data(), N * sizeof(int)).wait(); - q.memcpy(c_usm, c.data(), N * sizeof(int)).wait(); - - // Submit the kernel to the queue - q.submit([&](handler& h) { - // BEGIN CODE SNIP - h.parallel_for(range{N}, [=](id<1> idx) { - c_usm[idx] = a_usm[idx] + b_usm[idx]; - }); - // END CODE SNIP - }); - - // Wait for the kernel to finish - q.wait(); - - // Copy data from USM to host - q.memcpy(c.data(), c_usm, N * sizeof(int)).wait(); - - // Free USM allocations - free(a_usm, q); - free(b_usm, q); - free(c_usm, q); - } - - // Check that all outputs match the expected value - bool passed = std::all_of(c.begin(), c.end(), - [](int i) { return (i == 3); }); - std::cout << ((passed) ? "SUCCESS" : "FAILURE") << std::endl; - return (passed) ? 
0 : 1;
}
diff --git a/exercises/sycl/02-vector_add/vector_add_usm_device.cpp b/exercises/sycl/02-vector_add/vector_add_usm_device.cpp
new file mode 100644
index 0000000..7a78cbe
--- /dev/null
+++ b/exercises/sycl/02-vector_add/vector_add_usm_device.cpp
@@ -0,0 +1,48 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: MIT
#include <sycl/sycl.hpp>
#include <iostream>
#include <vector>
#include <algorithm>
using namespace sycl;

int main() {
  // Set up queue on any available device
  //TODO

  // Initialize input and output memory on the host
  constexpr size_t N = 256;
  std::vector<int> a(N), b(N), c(N);
  std::fill(a.begin(), a.end(), 1);
  std::fill(b.begin(), b.end(), 2);
  std::fill(c.begin(), c.end(), 0);

  // Allocate the memory using malloc_device
  //TODO

  // Copy data from host to USM
  //TODO

  // Submit the kernel to the queue
  q.submit([&](handler& h) {

    h.parallel_for(
    //TODO
    );

  });

  // Wait for the kernel to finish
  q.wait();

  // Copy data from USM to host
  //TODO

  // Free USM allocations
  //TODO


  // Check that all outputs match the expected value
  bool passed = std::all_of(c.begin(), c.end(),
                            [](int i) { return (i == 3); });
  std::cout << ((passed) ? "SUCCESS" : "FAILURE") << std::endl;
  return (passed) ? 0 : 1;
}
diff --git a/exercises/sycl/02-vector_add/vector_add_usm_managed.cpp b/exercises/sycl/02-vector_add/vector_add_usm_managed.cpp
new file mode 100644
index 0000000..4ba158a
--- /dev/null
+++ b/exercises/sycl/02-vector_add/vector_add_usm_managed.cpp
@@ -0,0 +1,42 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: MIT
#include <sycl/sycl.hpp>
#include <iostream>
#include <algorithm>
using namespace sycl;

int main() {
  // Set up queue on any available device
  //TODO

  // Initialize input and output memory on the host
  constexpr size_t N = 256;

  // Allocate memory using malloc_managed
  //TODO

  // Initialize input memory on the host
  std::fill(a, a + N, 1);
  std::fill(b, b + N, 2);
  std::fill(c, c + N, 0);

  // Submit the kernel to the queue
  q.submit([&](handler& h) {

    h.parallel_for(
    //TODO
    );
  });

  // Wait for the kernel to finish
  q.wait();

  // Check that all outputs match the expected value
  // (a, b, c will be raw USM pointers, so pointer ranges are used here)
  bool passed = std::all_of(c, c + N,
                            [](int i) { return (i == 3); });
  std::cout << ((passed) ? "SUCCESS" : "FAILURE") << std::endl;

  // Free the memory
  //TODO

  return (passed) ? 0 : 1;
}