From d7c456b87e0c7995a1cd0f362af468647577aceb Mon Sep 17 00:00:00 2001
From: Cristian-Vasile Achim <66278390+csccva@users.noreply.github.com>
Date: Tue, 26 Nov 2024 15:06:53 +0200
Subject: [PATCH] Create 08.sycl-queus-groups-kernels.md

---
 docs/08.sycl-queus-groups-kernels.md | 295 +++++++++++++++++++++++++++
 1 file changed, 295 insertions(+)
 create mode 100644 docs/08.sycl-queus-groups-kernels.md

diff --git a/docs/08.sycl-queus-groups-kernels.md b/docs/08.sycl-queus-groups-kernels.md
new file mode 100644
index 0000000..b006e88
--- /dev/null
+++ b/docs/08.sycl-queus-groups-kernels.md
@@ -0,0 +1,295 @@
---
title: Queues, Command Groups, Kernels
subtitle: High-Level GPU Programming
author: CSC Training
date: 2024-11
lang: en
---

# GPU Programming Model{.section}


# Anatomy of a SYCL code

```cpp
#include <sycl/sycl.hpp>
using namespace sycl;

template <typename T>
void axpy(queue &q, const T &a, const std::vector<T> &x, std::vector<T> &y) {
  range<1> N{x.size()};
  buffer x_buf(x.data(), N);
  buffer y_buf(y.data(), N);

  q.submit([&](handler &h) {
    auto x = x_buf.template get_access<access::mode::read>(h);       // accessor x(x_buf, h, read_only);
    auto y = y_buf.template get_access<access::mode::read_write>(h); // accessor y(y_buf, h, read_write);

    h.parallel_for(N, [=](id<1> i) {
      y[i] += a * x[i];
    });
  });
  q.wait_and_throw();
}
```

# GPU Programming model

- the program runs on the CPU (host)
- the CPU initializes the GPUs (devices), allocates the memory, and stages the host-GPU transfers
    - **Note!** the CPU can also be a device
- the CPU launches the parallel code (kernel) to be executed on a device by several threads
- code is written from the point of view of a single thread
    - each thread has a unique ID

# Device Discovery

- it is important to know which devices are available
- SYCL provides methods for querying:
    - `platform::get_platforms()` gives a list of the available platforms
    - `.get_devices()` gives a list of the devices present in a specific platform
    - `.get_info<>()` gives info about a specific property
        - can be applied to various classes
        - wide `sycl::info` namespace

# SYCL Queue

- SYCL class responsible for submitting commands
- bridge between the host and the target device (**only one**)
- associated with a SYCL device and a SYCL context
- enables asynchronous execution
- has an error-handling mechanism via an optional `exception_handler`
- queues are **out-of-order** (default) or **in-order** (`{property::queue::in_order()}`)
- encapsulates operations (e.g., kernel execution or memory transfers) using **command groups**

# Choosing the Device

- `queue q;` targets the best device
- pre-configured classes of devices:
    - `queue q(default_selector_v);` targets the best device
    - `queue q(cpu_selector_v);` targets the best CPU
    - `queue q(gpu_selector_v);` targets the best GPU
    - `queue q(accelerator_selector_v);` targets the best accelerator
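A minimal sketch of selection with a fallback (not part of the original slides; with the SYCL 2020 selectors, constructing a queue throws a `sycl::exception` when no matching device is found):

```cpp
#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  sycl::queue q;  // best device, as with default_selector_v
  try {
    q = sycl::queue{sycl::gpu_selector_v};  // prefer a GPU if one exists
  } catch (const sycl::exception &) {
    std::cout << "No GPU available, staying on the default device\n";
  }
  std::cout << "Running on: "
            << q.get_device().get_info<sycl::info::device::name>() << std::endl;
  return 0;
}
```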
# Custom Selector

```cpp
using namespace sycl;

class custom_selector : public device_selector
{
public:
  int operator()(const device &dev) const override
  {
    int score = -1;
    if (dev.is_gpu()) {
      auto vendor = dev.get_info<info::device::vendor>();
      if (vendor.find("NVIDIA") != std::string::npos) score += 75;
      if (vendor.find("Intel") != std::string::npos) score += 50;
      if (vendor.find("AMD") != std::string::npos) score += 100;
    }
    if (dev.is_host()) score += 25; // give the host device points so it is used if no GPU is available

    return score;
  }
};
```
```cpp
auto Q = queue{ custom_selector{} };

std::cout << "we are running on: "
          << Q.get_device().get_info<info::device::vendor>() << " "
          << Q.get_device().get_info<info::device::name>() << std::endl;
```

# Explicit Way

- using `get_platforms()` and/or `get_devices()`

```cpp
std::cout << "\tChecking for GPUs\n" << std::endl;

auto gpu_devices = sycl::device::get_devices(sycl::info::device_type::gpu);
auto n_gpus = size(gpu_devices);

std::cout << "\t\t There are " << n_gpus << " GPUs\n" << std::endl;
if (n_gpus > 0) {
  queue q{gpu_devices[my_rank]};
}
else {
  std::cout << "\t\t There are no GPUs found \n Exiting" << std::endl;
  exit(1);
}
```

# Queue Class Member Functions

- **Enqueue work**: `submit()`, `parallel_for()`, `single_task()`
- **Memory operations**: `memcpy()`, `fill()`, `copy()`, `memset()`
- **Utilities**: `is_empty()`, `get_device()`, `get_context()`, `throw_asynchronous()`
- **Synchronization**: `wait()`, `wait_and_throw()`
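Several of these member functions can be chained into a small program; a hedged sketch (assuming a device with USM support; `malloc_device` and the shortcut member functions are standard SYCL 2020):

```cpp
#include <sycl/sycl.hpp>
#include <iostream>
#include <vector>
using namespace sycl;

int main() {
  queue q{property::queue::in_order()};  // in-order: operations run in submission order
  constexpr size_t N = 1024;
  std::vector<int> host(N);

  int *dev = malloc_device<int>(N, q);                         // USM device allocation
  q.fill(dev, 1, N);                                           // fill()
  q.parallel_for(range<1>(N), [=](id<1> i) { dev[i] += 1; });  // enqueue a kernel directly
  q.memcpy(host.data(), dev, N * sizeof(int));                 // memcpy()
  q.wait();                                                    // wait() for all of the above
  sycl::free(dev, q);

  std::cout << "host[0] = " << host[0] << '\n';  // prints 2
  return 0;
}
```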
# Command Groups{.section}

# Command Groups

- created via the `.submit()` member function
- containers for operations to be executed
- give more control over execution than the shortcut form:
    - `q.parallel_for(N, [=](id<1> i) { y[i] += a * x[i]; });`
- can have dependencies to ensure the desired order of execution
- are executed *asynchronously* within a specific **context** and **queue**

```cpp
q.submit([&](handler &h) {
  auto x = x_buf.get_access<access::mode::read>(h);
  auto y = y_buf.get_access<access::mode::read_write>(h);

  h.parallel_for(N, [=](id<1> i) {
    y[i] += a * x[i];
  });
});
```
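As mentioned in the bullets above, command groups can have dependencies; an illustrative sketch using events (not from the slides; it assumes a queue `q`, USM pointers `x` and `y`, and `N` and `a` as before):

```cpp
// First command group: initialize y; submit() returns an event.
sycl::event e = q.submit([&](sycl::handler &h) {
  h.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) { y[i] = 0.0f; });
});

// Second command group: depends_on(e) makes it run only after the first completes.
q.submit([&](sycl::handler &h) {
  h.depends_on(e);
  h.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) { y[i] += a * x[i]; });
});
```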
+ +
- function objects (functors):

```cpp
class AXPYFunctor {
public:
  AXPYFunctor(float a, accessor<float> x, accessor<float> y) : a(a), x(x), y(y) {}

  void operator()(id<1> i) {
    y[i] += a * x[i];
  }

private:
  float a;
  accessor<float> x;
  accessor<float> y;
};
```
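One way to launch the functor (a sketch assuming the queue, buffers, `a`, and `N` from the axpy example at the start):

```cpp
// Pass the functor to parallel_for in place of a lambda.
q.submit([&](handler &h) {
  accessor<float> x(x_buf, h);  // default read_write access, matching the functor members
  accessor<float> y(y_buf, h);
  h.parallel_for(range<1>(N), AXPYFunctor(a, x, y));
});
```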
# Launching Kernels{.section}

# Grid of Work-Items

![](img/Grid_threads.png){.center width=37%}
A grid of work-groups executing the same **kernel**
+ +
+ +
![](img/mi100-architecture.png){.center width=53%}
AMD Instinct MI100 architecture (source: AMD)
+
- a grid of work-items is created on a specific device to perform the work
- each work-item executes the same kernel
- each work-item typically processes a different element of the data
- there is no global synchronization or data exchange between work-items

# Basic Parallel Launch with `parallel_for`
- **range** class to prescribe the span of the iterations
- **id** class to index an instance of a kernel
- **item** class gives additional functions
+ +
```cpp
cgh.parallel_for(range<1>(N), [=](id<1> idx) {
  y[idx] += a * x[idx];
});
```

```cpp
cgh.parallel_for(range<1>(N), [=](item<1> item) {
  auto idx = item.get_id();
  auto R = item.get_range();  // the launch range is also available inside the kernel
  y[idx] += a * x[idx];
});
```

- the runtime chooses how to group the work-items
- supports 1D, 2D, and 3D grids
- no control over the size of the groups, no locality within kernels

# Parallel launch with **nd-range** I

![](img/ndrange.jpg){.center width=100%}

https://link.springer.com/book/10.1007/978-1-4842-9691-2

# Parallel launch with **nd-range** II

- enables low-level performance tuning
- **nd_range** sets the global range and the local range
- the iteration space is divided into work-groups
- work-items within a work-group are scheduled on a single compute unit
- **nd_item** enables querying the work-group range and index

```cpp
cgh.parallel_for(nd_range<1>(range<1>(N), range<1>(64)), [=](nd_item<1> item) {
  auto idx = item.get_global_id();
  auto local_id = item.get_local_id();  // index within the work-group
  y[idx] += a * x[idx];
});
```

# Parallel launch with **nd-range** III

- extra functionality:
    - each work-group has work-group *local memory*
        - faster to access than global memory
        - can be used as a programmable cache
    - group-level *barriers* and *fences* to synchronize work-items within a group
        - *barriers* force all work-items to reach a specific point before continuing
        - *fences* ensure writes are visible to all work-items before proceeding
    - group-level collectives, for communication (e.g., broadcasting) or computation (e.g., scans)
        - useful for group-level reductions
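The bullets above can be combined into a work-group reduction; an illustrative sketch (not from the slides; it assumes USM pointers `x` and `partial`, and `N` divisible by the work-group size of 64):

```cpp
q.submit([&](handler &h) {
  local_accessor<float, 1> scratch(range<1>(64), h);  // work-group local memory

  h.parallel_for(nd_range<1>(range<1>(N), range<1>(64)), [=](nd_item<1> item) {
    auto lid = item.get_local_id(0);
    scratch[lid] = x[item.get_global_id(0)];
    group_barrier(item.get_group());        // wait until all loads to local memory are done

    for (size_t s = 32; s > 0; s /= 2) {    // tree reduction within the work-group
      if (lid < s) scratch[lid] += scratch[lid + s];
      group_barrier(item.get_group());
    }
    if (lid == 0) partial[item.get_group(0)] = scratch[0];  // one result per work-group
  });
});
```

In practice, a group collective such as `reduce_over_group(item.get_group(), value, plus<>())` could replace the explicit loop and barriers.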
# Summary

- **queues** are the bridges between the host and the devices
    - each queue maps to one device
- work is enqueued by submitting **command groups**
    - they give lots of flexibility
- the parallel code (kernel) is submitted as a lambda expression or as a function object
- two methods to express the parallelism:
    - basic launching
    - via **nd-range**