diff --git a/docs/tutorials/reduction.rst b/docs/tutorials/reduction.rst index b6e3cfcd6a..8af47b7141 100644 --- a/docs/tutorials/reduction.rst +++ b/docs/tutorials/reduction.rst @@ -47,7 +47,7 @@ Reduction on GPUs ================= Implementing reductions on GPUs will require a basic understanding of the -{doc}`/understand/programming_model_reference`. The article explores aspects of low-level +:doc:`/understand/programming_model_reference`. The article explores aspects of low-level optimization best discussed through the :ref:`inherent_thread_model`, and as such will refrain from using Cooperative Groups. @@ -326,76 +326,81 @@ Consider the following code: This compiles to the following binaries: - * - **LLVM Block** - - **GCC** - - **MSVC** - - * - .. code-block:: asm - - main: - push rbx - lea rbx, [rip + .L.str] - mov rdi, rbx - xor esi, esi - xor eax, eax - call printf@PLT - mov rdi, rbx - mov esi, 1 - xor eax, eax - call printf@PLT - mov rdi, rbx - mov esi, 2 - xor eax, eax - call printf@PLT - mov rdi, rbx - mov esi, 3 - xor eax, eax - call printf@PLT - xor eax, eax - pop rbx - ret - .L.str: - .asciz "%d" - - - .. code-block:: asm - - .LC0: - .string "%d" - main: - push rbx - xor ebx, ebx - .L2: - mov esi, ebx - mov edi, OFFSET FLAT:.LC0 - xor eax, eax - add ebx, 1 - call printf - cmp ebx, 4 - jne .L2 - xor eax, eax - pop rbx - ret - - - .. code-block:: asm - - main PROC - $LN12: - push rbx - sub rsp, 32 - xor ebx, ebx - npad 8 - $LL4@main: - mov edx, ebx - lea rcx, OFFSET FLAT:`string' - call printf - inc ebx - cmp ebx, 4 - jl SHORT $LL4@main - xor eax, eax - add rsp, 32 - pop rbx - ret 0 - main ENDP +**LLVM Block** + +.. 
code-block:: + + main: + push rbx + lea rbx, [rip + .L.str] + mov rdi, rbx + xor esi, esi + xor eax, eax + call printf@PLT + mov rdi, rbx + mov esi, 1 + xor eax, eax + call printf@PLT + mov rdi, rbx + mov esi, 2 + xor eax, eax + call printf@PLT + mov rdi, rbx + mov esi, 3 + xor eax, eax + call printf@PLT + xor eax, eax + pop rbx + ret + .L.str: + .asciz "%d" + + +**GCC** + +.. code-block:: asm + + .LC0: + .string "%d" + main: + push rbx + xor ebx, ebx + .L2: + mov esi, ebx + mov edi, OFFSET FLAT:.LC0 + xor eax, eax + add ebx, 1 + call printf + cmp ebx, 4 + jne .L2 + xor eax, eax + pop rbx + ret + + +**MSVC** + +.. code-block:: + + main PROC + $LN12: + push rbx + sub rsp, 32 + xor ebx, ebx + npad 8 + $LL4@main: + mov edx, ebx + lea rcx, OFFSET FLAT:'string' + call printf + inc ebx + cmp ebx, 4 + jl SHORT $LL4@main + xor eax, eax + add rsp, 32 + pop rbx + ret 0 + main ENDP + LLVM unrolls the loop and compiles to a flat series of ``printf`` invocations while GCC and MSVC both agree to keep the loop intact, visible from the compare