From 814f61cf3401230004d0b39dcd480f8ea9109400 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1ty=C3=A1s=20Aradi?= <matyas@streamhpc.com>
Date: Tue, 5 Nov 2024 21:15:29 +0100
Subject: [PATCH] Update based on reviews

---
 docs/how-to/hip_runtime_api/call_stack.rst | 86 +++++++++++-----------
 docs/index.md                              |  1 +
 2 files changed, 46 insertions(+), 41 deletions(-)

diff --git a/docs/how-to/hip_runtime_api/call_stack.rst b/docs/how-to/hip_runtime_api/call_stack.rst
index ae3e0be6cd..ecbe0c6f1a 100644
--- a/docs/how-to/hip_runtime_api/call_stack.rst
+++ b/docs/how-to/hip_runtime_api/call_stack.rst
@@ -1,51 +1,49 @@
 .. meta::
-    :description: This page lists graph safe ROCm libraries.
-    :keywords: AMD, ROCm, HIP, hipGRAPH
+    :description: This page describes call stack concept in HIP
+    :keywords: AMD, ROCm, HIP, call stack
 
-********************************************************************************
+*******************************************************************************
 Call stack
-********************************************************************************
+*******************************************************************************
 
 The call stack and the program counter work together to manage the flow of
 control in a program, especially in the context of function calls, loops, and
 branching.
 
 The call stack is a concept in computer science, representing the stack data
-structure that stores information about the active subroutines or functions.
-Each time a function is called, a new call frame is added to the top of the stack,
-containing information such as local variables, return addresses and function
-parameters. When the function execution completes, the frame is removed from the
-stack. This concept allows the program to return to the calling function and
-continue execution from where it left off.
-
-The program counter keeps track of the current or next instruction to
-be executed, while the call stack maintains the history of function calls,
-return addresses, and local variables. Together, they allow a program to execute
-complex control flow structures, handle function calls and return to the
-appropriate location in the code after a function completes.
-
-The call stack in GPU programming has extra complexity due to parallel execution
-also NVIDIA and AMD GPUs are using different approaches. NVIDIA GPUs have the
-independent thread scheduling feature since Volta GPU, where every thread has
-its own call stack and effective program counter. At AMD GPUs every
-warp/wavefront has its own call stack and program counter.
+structure that used for managing function calls, by saving the state of the
+current function. Each time a function is called, a new call frame is added to
+the top of the stack, containing information such as local variables, return
+addresses and function parameters. When the function execution completes, the
+frame is removed from the stack and loaded back into the corresponding
+registers. This concept allows the program to return to the calling function
+and continue execution from where it left off.
+
+The program counter keeps track of the current or next instruction to be
+executed. The program counter and the call stack concept together allow a
+program to handle function calls and return to the appropriate location in the
+code after a function completes.
 
 The call stack for each thread must track its function calls, local variables,
-and return addresses.
+and return addresses. However, in GPU programming, the memory required to store
+the call stack increases due to the parallelism inherent to the GPUs. NVIDIA
+and AMD GPUs are using different approaches. NVIDIA GPUs have the independent
+thread scheduling feature since Volta GPU, where every thread has its own call
+stack and effective program counter. At AMD GPUs every warp/wavefront has its
+own call stack and program counter.
 
 If a thread or warp exceeds its stack size, a stack overflow occurs, causing
-kernel failure. This can be detected and handled using debuggers or with the
-:doc:`error handling functions <../how-to/hip_runtime_api/error_handling>`.
+kernel failure. This can be detected using debuggers.
 
 Call stack management with HIP
 ================================================================================
 
-Developers can adjust the call stack size for every thread, allowing fine-tuning
-based on specific kernel requirements. This helps prevent stack overflow errors
-by ensuring sufficient stack memory is allocated.
+Developers can adjust the call stack size, allowing fine-tuning based on
+specific kernel requirements. This helps prevent stack overflow errors by
+ensuring sufficient stack memory is allocated.
 
 .. code-block:: cpp
-        
+
     #include <hip/hip_runtime.h>
     #include <iostream>
 
@@ -78,18 +76,23 @@ by ensuring sufficient stack memory is allocated.
         return 0;
     }
 
+Depending on the GPU model, at full occupancy, it can consume a significant
+amount of memory. For instance, an MI300X with 304 computing units (CU) and up
+to 2048 threads per CU would use 304 · 2048 · 8192 bytes = 4.75 GiB.
+
 Handling recursion and deep function calls
 --------------------------------------------------------------------------------
 
 Similar to CPU programming, recursive functions and deeply nested function calls
 are supported. However, developers must ensure that these functions do not
-exceed the available stack memory, considering the limited resources on GPUs.
+exceed the available stack memory, considering a huge amount of memory needed
+for the call stack due to the GPUs inherent parallelism.
 
 .. code-block:: cpp
 
     #include <hip/hip_runtime.h>
     #include <iostream>
-    
+
     #define HIP_CHECK(expression)                \
     {                                            \
         const hipError_t status = expression;    \
@@ -101,32 +104,33 @@ exceed the available stack memory, considering the limited resources on GPUs.
                     << __LINE__ << std::endl;    \
         }                                        \
     }
-    
+
     __device__ unsigned long long factorial(unsigned long long n)
     {
-        if (n == 0 || n == 1) {
+        if (n == 0 || n == 1)
+        {
             return 1;
         }
         return n * factorial(n - 1);
     }
-    
+
     __global__ void kernel(unsigned long long n)
     {
         unsigned long long result = factorial(n);
         const size_t x = threadIdx.x + blockDim.x * blockIdx.x;
-        
-        if(x == 0)
-          printf("%llu! = %llu \n", n, result);
+
+        if (x == 0)
+            printf("%llu! = %llu \n", n, result);
     }
-    
+
     int main()
     {
         kernel<<<1, 1>>>(10);
         HIP_CHECK(hipDeviceSynchronize());
 
-        // With -O0 optimization option hit the stack limit
-        // kernel<<<1, 256>>>(2048);
+        // This hits the stack limit due to excessive recursion
+        // kernel<<<1, 1>>>(2048);
         // HIP_CHECK(hipDeviceSynchronize());
-        
+
         return 0;
     }
diff --git a/docs/index.md b/docs/index.md
index 1194126ae3..8a6d491b35 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -33,6 +33,7 @@ The HIP documentation is organized into the following categories:
   * {doc}`./how-to/hip_runtime_api/memory_management`
   * {doc}`./how-to/hip_runtime_api/cooperative_groups`
   * {doc}`./how-to/hip_runtime_api/hipgraph`
+  * {doc}`./how-to/hip_runtime_api/call_stack`
 * [HIP porting guide](./how-to/hip_porting_guide)
 * [HIP porting: driver API guide](./how-to/hip_porting_driver_api)
 * {doc}`./how-to/hip_rtc`