From 8eca2da272002f969cf27270188c9e5e1c65c31c Mon Sep 17 00:00:00 2001
From: Bence Parajdi <bence@streamhpc.com>
Date: Wed, 14 Aug 2024 15:06:09 +0200
Subject: [PATCH] Documentation: Add hardware capabilities page

Co-authored-by:   Matthias Knorr <MKKnorr@web.de>
---
 docs/index.md                              |   1 +
 docs/reference/cpp_language_extensions.rst |   7 +-
 docs/reference/hardware_features.rst       | 249 +++++++++++++++++++++
 docs/sphinx/_toc.yml.in                    |   1 +
 4 files changed, 257 insertions(+), 1 deletion(-)
 create mode 100644 docs/reference/hardware_features.rst

diff --git a/docs/index.md b/docs/index.md
index 0ef68cd649..a9a9b26249 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -57,6 +57,7 @@ The HIP documentation is organized into the following categories:
 * [Comparing syntax for different APIs](./reference/terms)
 * [List of deprecated APIs](./reference/deprecated_api_list)
 * [FP8 numbers in HIP](./reference/fp8_numbers)
+* {doc}`./reference/hardware_features`
 
 :::
 
diff --git a/docs/reference/cpp_language_extensions.rst b/docs/reference/cpp_language_extensions.rst
index c0b804c552..243b6ae08e 100644
--- a/docs/reference/cpp_language_extensions.rst
+++ b/docs/reference/cpp_language_extensions.rst
@@ -293,6 +293,7 @@ dimensions to 1.
     dim3(uint32_t _x=1, uint32_t _y=1, uint32_t _z=1) : x(_x), y(_y), z(_z) {};
   };
 
+.. _memory_fence_instructions:
 
 Memory fence instructions
 ====================================================
@@ -306,7 +307,7 @@ HIP supports ``__threadfence()`` and ``__threadfence_block()``. If you're using
     ``hipHostMalloc()``.
   * Remove ``memcpy`` for all allocated fine-grained system memory regions.
 
-.. _synchronization functions:
+.. _synchronization_functions:
 
 Synchronization functions
 ====================================================
@@ -376,6 +377,8 @@ To read a high-resolution timer from the device, HIP provides the following buil
 
   Note that ``clock()`` and ``clock64()`` do not work properly on AMD RDNA3 (GFX11) graphic processors.
 
+.. _atomic functions:
+
 Atomic functions
 ===============================================
 
@@ -734,6 +737,8 @@ will be enabled unconditionally in the next ROCm release. Wherever possible, the
 implementation includes a static assert to check that the program source uses
 the correct type for the mask.
 
+.. _warp_vote_functions:
+
 Warp vote and ballot functions
 -------------------------------------------------------------------------------------------------------------
 
diff --git a/docs/reference/hardware_features.rst b/docs/reference/hardware_features.rst
new file mode 100644
index 0000000000..7ec9ec329e
--- /dev/null
+++ b/docs/reference/hardware_features.rst
@@ -0,0 +1,249 @@
+.. meta::
+  :description: This chapter describes the hardware features of the different hardware architectures.
+  :keywords: AMD, ROCm, HIP, hardware, hardware features, hardware architectures
+
+*******************************************************************************
+Hardware features
+*******************************************************************************
+
+This page gives an overview of the different hardware architectures and the
+features they implement. Hardware features do not imply performance, which
+depends on the specifications found in the :doc:`rocm:reference/gpu-arch-specs`
+page.
+
+  .. list-table::
+      :header-rows: 1
+      :name: hardware-features-table
+
+      *
+        - Hardware feature support
+        - RDNA1
+        - CDNA1
+        - RDNA2
+        - CDNA2
+        - RDNA3
+        - CDNA3
+      *
+        - :ref:`atomic functions` on 32-bit integer values in global and shared memory
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - Atomic functions on 64-bit integer values in global and shared memory
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - Atomic addition on 32-bit floating point values in global and shared memory
+        - ❌
+        - ❌
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - Atomic addition on 64-bit floating point values in global and shared memory
+        - ❌
+        - ❌
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - :ref:`Warp vote functions <warp_vote_functions>`
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - :ref:`Memory fence instructions <memory_fence_instructions>`
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - :ref:`Synchronization functions <synchronization_functions>`
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - :ref:`Surface functions <surface_object_reference>`
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - :ref:`float16 half precision IEEE-conformant floating-point operations <rocm:precision_support_floating_point_types>`
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - :ref:`bfloat16 16-bit floating-point operations <rocm:precision_support_floating_point_types>`
+        - ❌
+        - ✅
+        - ❌
+        - ✅
+        - ✅
+        - ✅
+      *
+        - Support for :ref:`8-bit floating-point types <rocm:precision_support_floating_point_types>`
+        - ❌
+        - ❌
+        - ❌
+        - ❌
+        - ❌
+        - ✅
+      *
+        - Support for :ref:`tensor float32 <rocm:precision_support_floating_point_types>`
+        - ❌
+        - ❌
+        - ❌
+        - ❌
+        - ❌
+        - ✅
+      *
+        - Packed math with 16-bit floating point values
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - Packed math with 32-bit floating point values
+        - ❌
+        - ❌
+        - ❌
+        - ✅
+        - ❌
+        - ✅
+      *
+        - Matrix Cores
+        - ❌
+        - ✅
+        - ❌
+        - ✅
+        - ❌
+        - ✅
+      *
+        - On-Chip Error Correcting Code (ECC)
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+        - ✅
+      *
+        - Maximum dimensionality of grid
+        - 3
+        - 3
+        - 3
+        - 3
+        - 3
+        - 3
+      *
+        - Maximum x-, y- or z-dimension of a grid
+        - :math:`2^{32} - 1`
+        - :math:`2^{32} - 1`
+        - :math:`2^{32} - 1`
+        - :math:`2^{32} - 1`
+        - :math:`2^{32} - 1`
+        - :math:`2^{32} - 1`
+      *
+        - Maximum number of threads per grid
+        - :math:`2^{32} - 1`
+        - :math:`2^{32} - 1`
+        - :math:`2^{32} - 1`
+        - :math:`2^{32} - 1`
+        - :math:`2^{32} - 1`
+        - :math:`2^{32} - 1`
+      *
+        - Maximum x-, y- or z-dimension of a block
+        - :math:`1024`
+        - :math:`1024`
+        - :math:`1024`
+        - :math:`1024`
+        - :math:`1024`
+        - :math:`1024`
+      *
+        - Maximum number of threads per block
+        - :math:`1024`
+        - :math:`1024`
+        - :math:`1024`
+        - :math:`1024`
+        - :math:`1024`
+        - :math:`1024`
+      *
+        - Wavefront size
+        - 32 [1]_
+        - 64
+        - 32 [1]_
+        - 64
+        - 32 [1]_
+        - 64
+      *
+        - Maximum number of resident blocks per compute unit
+        - 40 [1]_
+        - 32
+        - 32 [1]_
+        - 32
+        - 32 [1]_
+        - 32
+      *
+        - Maximum number of resident wavefronts per compute unit
+        - 40 [1]_
+        - 32
+        - 32 [1]_
+        - 32
+        - 32 [1]_
+        - 32
+      *
+        - Maximum number of resident threads per compute unit
+        - 1280 [2]_
+        - 2048
+        - 1024 [2]_
+        - 2048
+        - 1024 [2]_
+        - 2048
+      *
+        - Maximum number of 32-bit vector registers per thread
+        - 256
+        - 256 (vector) + 256 (matrix)
+        - 256
+        - 256 (vector) + 256 (matrix)
+        - 256
+        - 256 (vector) + 256 (matrix)
+      *
+        - Maximum number of 32-bit scalar accumulation registers per thread
+        - 106
+        - 104
+        - 106
+        - 104
+        - 106
+        - 104
+
+.. [1] RDNA architectures have a configurable wavefront size. The native
+   wavefront size is 32, but they can run in "CU mode", which has an effective
+   wavefront size of 64. This affects the number of resident wavefronts and
+   blocks per compute unit.
+.. [2] RDNA architectures expand the concept of the traditional compute unit
+   with the so-called work group processor, which effectively includes two
+   compute units, within which all threads can cooperate.
diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in
index f36efda6cc..60dacd8382 100644
--- a/docs/sphinx/_toc.yml.in
+++ b/docs/sphinx/_toc.yml.in
@@ -114,6 +114,7 @@ subtrees:
     title: List of deprecated APIs
   - file: reference/fp8_numbers
     title: FP8 numbers in HIP
+  - file: reference/hardware_features
 
 - caption: Tutorials
   entries: