From 6d2ae15391e0d60679dbb1ae1cd936e776dd7268 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Wed, 6 Dec 2023 10:09:17 -0800 Subject: [PATCH 01/67] Update _toc.yml.in TOC restructure --- docs/sphinx/_toc.yml.in | 58 +++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index a4becb31dc..c7ccbef1e1 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -1,24 +1,44 @@ +# Anywhere {branch} is used, the branch name will be substituted. +# These comments will also be removed. defaults: numbered: False - maxdepth: 6 + maxdepth: 7 root: index +title: What is MIOpen? subtrees: -- entries: - - file: releasenotes - - file: citation - - file: install - - file: embed - - file: driver - - file: DebugAndLogging - - file: cache - - file: perfdatabase - - file: finddb - - file: find_and_immediate - - file: Getting_Started_FusionAPI - - file: MI200AlternateImplementation - - file: MIOpen_Porting_Guide - - file: apireference -- caption: About - entries: - - file: license +- entries: + - file: find_and_immediate.rst + - file: finddb.rst + - file: cache.rst + - file: perfdatabase.rst + - file: Getting_Started_FusionAPI.rst + - file: DebugAndLogging.rst + - file: MI200AlternateImplementation.rst + - file: MIOpen_Porting_Guide.rst + - file: citation.rst + + title: Quick-start + - file: reference/install.rst + - file: reference/embed.rst + - file: reference/driver.rst + + title: API reference + subtrees: + - entries: + - file: reference/apireference.rst + title: API library + + - file: tutorials/index.rst + title: Tutorials + subtrees: + - entries: + - file: tutorials/install.rst + - file: tutorials/embed.rst + - file: tutorials/driver.rst + title: Installing and building MIOpen + + + + + From 856b2f11e54991859b94c6fc14091fafb761b1f8 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Wed, 6 Dec 2023 12:10:00 -0800 Subject: [PATCH 02/67] Update _toc.yml.in Changed title to caption --- docs/sphinx/_toc.yml.in | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index c7ccbef1e1..e6e68b4e85 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -4,7 +4,7 @@ defaults: numbered: False maxdepth: 7 root: index -title: What is MIOpen? +caption: What is MIOpen? subtrees: - entries: - file: find_and_immediate.rst @@ -17,12 +17,12 @@ subtrees: - file: MIOpen_Porting_Guide.rst - file: citation.rst - title: Quick-start + caption: Quick-start - file: reference/install.rst - file: reference/embed.rst - file: reference/driver.rst - title: API reference + caption: API reference subtrees: - entries: - file: reference/apireference.rst From cae46fd52c793c8ce4cad639ddf72e967053a0a4 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Wed, 6 Dec 2023 12:10:29 -0800 Subject: [PATCH 03/67] Update _toc.yml.in --- docs/sphinx/_toc.yml.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index e6e68b4e85..3603a8c3c6 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -29,7 +29,7 @@ subtrees: title: API library - file: tutorials/index.rst - title: Tutorials + caption: Tutorials subtrees: - entries: - file: tutorials/install.rst From 709164fe393537d1856f39d6614b3c6385660e2a Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Wed, 6 Dec 2023 12:21:11 -0800 Subject: [PATCH 04/67] Update _toc.yml.in Corrected white space for - caption --- docs/sphinx/_toc.yml.in | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 3603a8c3c6..461dcf7417 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -4,7 +4,7 @@ defaults: numbered: False maxdepth: 7 root: index -caption: What is MIOpen? +- caption: What is MIOpen? subtrees: - entries: - file: find_and_immediate.rst @@ -17,19 +17,19 @@ subtrees: - file: MIOpen_Porting_Guide.rst - file: citation.rst - caption: Quick-start + - caption: Quick-start - file: reference/install.rst - file: reference/embed.rst - file: reference/driver.rst - caption: API reference + - caption: API reference subtrees: - entries: - file: reference/apireference.rst title: API library - file: tutorials/index.rst - caption: Tutorials + - caption: Tutorials subtrees: - entries: - file: tutorials/install.rst From 0832e538783acc74d1b70973f6864650271de41b Mon Sep 17 00:00:00 2001 From: Sam Wu Date: Wed, 6 Dec 2023 13:25:52 -0700 Subject: [PATCH 05/67] Update _toc.yml.in --- docs/sphinx/_toc.yml.in | 51 ++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 461dcf7417..a535b17f98 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -7,35 +7,34 @@ root: index - caption: What is MIOpen? subtrees: - entries: - - file: find_and_immediate.rst - - file: finddb.rst - - file: cache.rst - - file: perfdatabase.rst - - file: Getting_Started_FusionAPI.rst - - file: DebugAndLogging.rst - - file: MI200AlternateImplementation.rst - - file: MIOpen_Porting_Guide.rst - - file: citation.rst + - file: find_and_immediate.rst + - file: finddb.rst + - file: cache.rst + - file: perfdatabase.rst + - file: Getting_Started_FusionAPI.rst + - file: DebugAndLogging.rst + - file: MI200AlternateImplementation.rst + - file: MIOpen_Porting_Guide.rst + - file: citation.rst - - caption: Quick-start - - file: reference/install.rst - - file: reference/embed.rst - - file: reference/driver.rst +- caption: Quick-start + entries: + - file: reference/install.rst + - file: reference/embed.rst + - file: reference/driver.rst - - caption: API reference - subtrees: - - entries: - - file: reference/apireference.rst - title: API library - +- caption: API reference + entries: + - file: reference/apireference.rst + title: API library - file: tutorials/index.rst - - caption: Tutorials - subtrees: - - entries: - - file: tutorials/install.rst - - file: tutorials/embed.rst - - file: tutorials/driver.rst - title: Installing and building MIOpen + +- caption: Tutorials + entries: + - file: tutorials/install.rst + - file: tutorials/embed.rst + - file: tutorials/driver.rst + title: Installing and building MIOpen From cac7a8872707c117d88cb56993470c7f371809cf Mon Sep 17 00:00:00 2001 From: Sam Wu Date: Wed, 6 Dec 2023 13:58:35 -0700 Subject: [PATCH 06/67] Update _toc.yml.in --- docs/sphinx/_toc.yml.in | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index a535b17f98..dee7af39a7 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -4,9 +4,9 @@ defaults: numbered: False maxdepth: 7 root: index -- caption: What is MIOpen? subtrees: -- entries: +- caption: What is MIOpen? + entries: - file: find_and_immediate.rst - file: finddb.rst - file: cache.rst @@ -16,28 +16,19 @@ subtrees: - file: MI200AlternateImplementation.rst - file: MIOpen_Porting_Guide.rst - file: citation.rst - - caption: Quick-start entries: - file: reference/install.rst - file: reference/embed.rst - file: reference/driver.rst - - caption: API reference entries: - file: reference/apireference.rst title: API library - file: tutorials/index.rst - - caption: Tutorials entries: - file: tutorials/install.rst - file: tutorials/embed.rst - file: tutorials/driver.rst title: Installing and building MIOpen - - - - - - From 79970e08e676dc4a8c4d0009fea19508bebebc96 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Fri, 8 Dec 2023 12:00:43 -0800 Subject: [PATCH 07/67] Update _toc.yml.in Removed citation.rst based on customer request https://ontrack-internal.amd.com/browse/SWDEV-416591 --- docs/sphinx/_toc.yml.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index dee7af39a7..a2a6c68f5f 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -15,7 +15,7 @@ subtrees: - file: DebugAndLogging.rst - file: MI200AlternateImplementation.rst - file: MIOpen_Porting_Guide.rst - - file: citation.rst + - caption: Quick-start entries: - file: reference/install.rst From f7a771b903b46056f6ba564cd8db58324fe041d1 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 17:57:36 -0800 Subject: [PATCH 08/67] Update _toc.yml.in Updates --- docs/sphinx/_toc.yml.in | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index a2a6c68f5f..82713fdde8 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -1,3 +1,15 @@ +.. meta:: + :description: MIOpen documentation and API reference library + :keywords: MIOpen, ROCm, API, documentation + +******************************************************************** +MIOpen documentation +******************************************************************** + +Welcome to the MIOpen docs home page! To learn more, see :ref:`what-is-MIOpen` + +Our documentation is structured as follows: + # Anywhere {branch} is used, the branch name will be substituted. # These comments will also be removed. defaults: @@ -32,3 +44,17 @@ subtrees: - file: tutorials/embed.rst - file: tutorials/driver.rst title: Installing and building MIOpen + +Contributing to this documentation +======================================================= + +We welcome collaboration! If you'd like to contribute to our documentation, you can find instructions +in our `Contributing to ROCm `_ section, which discusses: + +* `Document structure `_ +* `Toolchains `_ +* `Documentation builds `_ +* `How to provide feedback `_ + +Licensing information for all ROCm components is listed on our +`Licensing `_ page. From 3c5e83c1d95f7fd7e47d906c816dcdc1bff3bd91 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:00:40 -0800 Subject: [PATCH 09/67] Create reference Folder for MIOpen reference --- docs/reference | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/reference diff --git a/docs/reference b/docs/reference new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/docs/reference @@ -0,0 +1 @@ + From 4068655513b6fe4d65fe67b353eba0df5480604b Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:02:38 -0800 Subject: [PATCH 10/67] Delete docs/reference --- docs/reference | 1 - 1 file changed, 1 deletion(-) delete mode 100644 docs/reference diff --git a/docs/reference b/docs/reference deleted file mode 100644 index 8b13789179..0000000000 --- a/docs/reference +++ /dev/null @@ -1 +0,0 @@ - From d0bc0f0c478c292da83ad1b7faa14f2620cf0d01 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:05:36 -0800 Subject: [PATCH 11/67] Create quick-start.rst --- docs/tutorials/quick-start.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/tutorials/quick-start.rst diff --git a/docs/tutorials/quick-start.rst b/docs/tutorials/quick-start.rst new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/docs/tutorials/quick-start.rst @@ -0,0 +1 @@ + From 4252d7c76f5374f9d0d8e44a15eb9408023b89b2 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:07:48 -0800 Subject: [PATCH 12/67] Create what-is-MIOpen.rst Adding What is MIOpen.rst --- docs/tutorials/what-is-MIOpen.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/tutorials/what-is-MIOpen.rst diff --git a/docs/tutorials/what-is-MIOpen.rst b/docs/tutorials/what-is-MIOpen.rst new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/docs/tutorials/what-is-MIOpen.rst @@ -0,0 +1 @@ + From ad34523c3fa1b2b6fe31d1a709beda9f62d3dec7 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:08:01 -0800 Subject: [PATCH 13/67] Delete docs/tutorials/quick-start.rst --- docs/tutorials/quick-start.rst | 1 - 1 file changed, 1 deletion(-) delete mode 100644 docs/tutorials/quick-start.rst diff --git a/docs/tutorials/quick-start.rst b/docs/tutorials/quick-start.rst deleted file mode 100644 index 8b13789179..0000000000 --- a/docs/tutorials/quick-start.rst +++ /dev/null @@ -1 +0,0 @@ - From be715eb8f6522d1f9f21cc1c3e31ba4e6e48f4a8 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:11:08 -0800 Subject: [PATCH 14/67] Update what-is-MIOpen.rst --- docs/tutorials/what-is-MIOpen.rst | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docs/tutorials/what-is-MIOpen.rst b/docs/tutorials/what-is-MIOpen.rst index 8b13789179..80e5d4e9f4 100644 --- a/docs/tutorials/what-is-MIOpen.rst +++ b/docs/tutorials/what-is-MIOpen.rst @@ -1 +1,17 @@ +.. meta:: +:description: MIOpen is AMD’s deep learning primitives library, which provides highly optimized, and hand-tuned implementations of +different operators such as convolution, batch normalization, pooling, softmax, activation and layers for Recurrent Neural +Networks (RNNs), used in both training and inference [9]. + :keywords: MIOpen, ROCm, library, API +.. _what-is-MIOpen: + +********************* +What is MIOpen? +********************* + +MIOpen is AMD’s deep learning primitives library, which provides highly optimized, and hand-tuned implementations of +different operators such as convolution, batch normalization, pooling, softmax, activation and layers for Recurrent Neural +Networks (RNNs), used in both training and inference. Moreover, MIOpen is fully open-source including all its +GPU kernels; complementing AMD’s open-source ROCm stack [10]. MIOpen is the first to extend the open-source +advantage into GPU vendor libraries thereby, continuing to embark on the same ethos as the deep learning community. From ef2a85b64a8fb872e9d615358d69089421490ae5 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:12:17 -0800 Subject: [PATCH 15/67] Update what-is-MIOpen.rst --- docs/tutorials/what-is-MIOpen.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/tutorials/what-is-MIOpen.rst b/docs/tutorials/what-is-MIOpen.rst index 80e5d4e9f4..b31c022f60 100644 --- a/docs/tutorials/what-is-MIOpen.rst +++ b/docs/tutorials/what-is-MIOpen.rst @@ -1,7 +1,7 @@ .. meta:: -:description: MIOpen is AMD’s deep learning primitives library, which provides highly optimized, and hand-tuned implementations of -different operators such as convolution, batch normalization, pooling, softmax, activation and layers for Recurrent Neural -Networks (RNNs), used in both training and inference [9]. + :description: MIOpen is AMD’s deep learning primitives library, which provides highly optimized, and hand-tuned implementations of + different operators such as convolution, batch normalization, pooling, softmax, activation and layers for Recurrent Neural + Networks (RNNs), used in both training and inference [9]. :keywords: MIOpen, ROCm, library, API .. _what-is-MIOpen: From 29005e8ec28477a585cf0bc88068b8eaad20f8c8 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:13:31 -0800 Subject: [PATCH 16/67] Create apireference.rst Adding MIOpen API documentation --- docs/reference/apireference.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/reference/apireference.rst diff --git a/docs/reference/apireference.rst b/docs/reference/apireference.rst new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/docs/reference/apireference.rst @@ -0,0 +1 @@ + From 8473e31dca0c8c24292ef7d5aef7672380a24f2a Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:14:19 -0800 Subject: [PATCH 17/67] Update apireference.rst Add API reference documentation --- docs/reference/apireference.rst | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/docs/reference/apireference.rst b/docs/reference/apireference.rst index 8b13789179..777ea10b0b 100644 --- a/docs/reference/apireference.rst +++ b/docs/reference/apireference.rst @@ -1 +1,25 @@ +API Reference +============= + + +.. toctree:: + :maxdepth: 4 + :caption: Contents: + + datatypes + handle + tensor + activation + convolution + rnn + batchnorm + lrn + pooling + softmax + fusion + loss + dropout + reduction + layernorm + sum From 7acfd67a35e61cee62c190ea4712ce16337a0dad Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:17:49 -0800 Subject: [PATCH 18/67] Create find_and_immediate.md --- docs/tutorials/find_and_immediate.md | 189 +++++++++++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100644 docs/tutorials/find_and_immediate.md diff --git a/docs/tutorials/find_and_immediate.md b/docs/tutorials/find_and_immediate.md new file mode 100644 index 0000000000..313ea9c26e --- /dev/null +++ b/docs/tutorials/find_and_immediate.md @@ -0,0 +1,189 @@ +Find and Immediate Mode +======================= + + + +## Find API + +MIOpen contains several convolution algorithms for each stage of training or inference. Pre-MIOpen version 2.0 users needed to call Find methods in order generate a set of applicable algorithms. + +A typical workflow for the find stage: + +``` +miopenConvolutionForwardGetWorkSpaceSize(handle, + weightTensorDesc, + inputTensorDesc, + convDesc, + outputTensorDesc, + &maxWorkSpaceSize); + +// < allocate workspace > + + +// NOTE: +// miopenFindConvolution*() call is expensive in terms of execution time and required workspace. +// Therefore it is highly recommended to save off the selected algorithm and workspace required so that +// can be reused later within the lifetime of the same MIOpen handle object. +// In this way, there should be is no need to invoke miopenFind*() more than once per application lifetime. + +miopenFindConvolutionForwardAlgorithm(handle, + inputTensorDesc, + input_device_mem, + weightTensorDesc, + weight_device_mem, + convDesc, + outputTensorDesc, + output_device_mem,, + request_algo_count, + &ret_algo_count, + perf_results, + workspace_device_mem, + maxWorkSpaceSize, + 1); + +// < select fastest algorithm > + +// < free previously allocated workspace and allocate workspace required for the selected algorithm> + +miopenConvolutionForward(handle, &alpha, + inputTensorDesc, + input_device_mem, + weightTensorDesc, + weight_device_mem, + convDesc, + perf_results[0].fwd_algo, // use the fastest algo + &beta, + outputTensorDesc, + output_device_mem, + workspace_device_mem, + perf_results[0].memory); //workspace size +``` + + +The results of Find() are returned in an array of `miopenConvAlgoPerf_t` structs in order of performance, with the fastest at index 0. + +This call sequence is executed once per session as it is inherently expensive. Of those, `miopenFindConvolution*()` is the most expensive call. It caches its own results on disk, so the subsequent calls during the same MIOpen session will execute faster. However, it is better to remember results of `miopenFindConvolution*()` in the application, as recommended above. + +Internally MIOpen's Find calls will compile and benchmark a set of `solvers` contained in `miopenConvAlgoPerf_t` this is done in parallel per `miopenConvAlgorithm_t`. The level of parallelism can be controlled using an environment variable. See the debugging section [controlling parallel compilation](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/DebugAndLogging.html#controlling-parallel-compilation) for more details. + + +## Immediate Mode API + +MIOpen v2.0 introduces the immediate which removes the requirement for the `miopenFindConvolution*()` calls and their associated runtime costs. In this mode, the user can query the MIOpen runtime for all the supported _solutions_ for a given convolution configuration. These solutions may either be using the same algorithm or different ones. The sequence of operations for in immediate mode is similar to launching regular convolutions in MIOpen i.e. through the use of the `miopenFindConvolution*()` API. However, in this case the different APIs have much lower runtime cost. A typical convolution call would be similar to the following sequence of calls: + +* The user constructs the MIOpen handle and relevant descriptors such as the convolution descriptor as usual. +* With the above data structures, the user calls `miopenConvolution*GetSolutionCount` to get the **maximum** number of supported solutions for the convolution descriptor in question. +* The count obtained above is used to allocate memory for the `miopenConvSolution_t` structure introduced in MIOpen v2.0 +* The user calls `miopenConvolution*GetSolution` to populate the `miopenConvSolution_t` structures allocated above. The returned list is ordered in the order of best performance, thus the first element would be the fastest. +* While the above structure returns the amount of workspace required for an algorithm, the user may inquire the amount of a workspace required for a known solution id by using the `miopenConvolution*GetSolutionWorkspaceSize` API call. However, this is not a requirement, since the strucure returned by `miopenConvolution*GetSolution` would already have this information. +* Now the user may initiate the convolution operation in _immediate_ mode by calling `miopenConvolution*Immediate`. Which would populate the output tensor descriptor with the respective convolution result. However, the first call to `miopenConvolution*Immediate` may consume more time since the kernel may not be present in the kernel cache and may need to be compiled. +* Optionally, the user may compile the solution of choice by calling `miopenConvolution*CompileSolution` which would ensure that the kernel represented by the chosen solution is populated in the kernel cache a priori, removing the necessity for compiling the kernel in question. + + +``` +miopenConvolutionForwardGetSolutionCount(handle, + weightTensorDesc, + inputTensorDesc, + convDesc, + outputTensorDesc, + &solutionCount); + + +// < allocate an array of miopenConvSolution_t of size solutionCount > + + +miopenConvolutionForwardGetSolution(handle, + weightTensorDesc, + inputTensorDesc, + convDesc, + outputTensorDesc, + solutionCount, + &actualCount, + solutions); + +// < select a solution from solutions array > + +miopenConvolutionForwardGetSolutionWorkspaceSize(handle, + weightTensorDesc, + inputTensorDesc, + convDesc, + outputTensorDesc, + selected->solution_id, + &ws_size); + +// < allocate solution workspace of size ws_size > + + +// This stage is optional +miopenConvolutionForwardCompileSolution(handle, + weightTensorDesc, + inputTensorDesc, + convDesc, + outputTensorDesc, + selected->solution_id); + + + + miopenConvolutionForwardImmediate(handle, + weightTensor, + weight_device_mem, + inputTensorDesc, + input_device_mem, + convDesc, + outputTensorDesc, + output_device_mem, + workspace_device_mem, + ws_size, + selected->solution_id); +``` + +## Immediate Mode Fallback + +The immediate mode is underpinned by the [Find-Db](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/finddb.html), however it may not contain every configuration of interest. If Find-Db encounters a database miss it has two fallback paths it can take, depending on whether the cmake variable MIOPEN_ENABLE_AI_IMMED_MODE_FALLBACK is set to ON or OFF. However, if the user requires the best possible performance they should run the Find stage at least once. + +### 1. AI-based Heuristic Fallback (Default) + +If MIOPEN_ENABLE_AI_IMMED_MODE_FALLBACK is set to ON, which it is by default, Immediate Mode's behavior on a database miss is to use an AI-based heurisitic to pick the optimal solution. First, the applicability of the AI-based heuristic for the given configuration is checked. If the heuristic is applicable, it feeds various parameters of the given configuration into a neural network which has been tuned to predict the optimal solution with 90% accuracy. + +### 2. Weighted Throughput Index Based Fallback + +When MIOPEN_ENABLE_AI_IMMED_MODE_FALLBACK is set to OFF, or the AI Heuristic is not applicable for the given convolution configuration, Immediate mode's behavior on encountering a database miss is to use a Weighted Thoughput Index (WTI) based mechanism to estimate which solution would be optimal based upon parameters of the convolution configuration. + + + +## Limitations of Immediate Mode + +### Architectual Limitations +The system Find-Db has only been populated for the following architectures: + * gfx906 with 64 CUs + * gfx906 with 60 CUs + * gfx900 with 64 CUs + * gfx900 with 56 CUs + +If the user's architecture is not listed above they will need to run the Find API once on their system per application in order to take advantage of immediate mode's more efficient behavior. + + +### Backend Limitations + +OpenCL support for immediate mode via the fallback is limited to fp32 datatypes. This is because this current release's fallback path goes through GEMM which on the OpenCL is serviced through MIOpenGEMM -- which itself only contains support for fp32. The HIP backend uses rocBLAS as its fallback path which contains a richer set of datatypes. + + +### Find Modes + +MIOpen provides a set of Find modes which are used to accelerate the Find calls. The different modes are set by using the environment variable `MIOPEN_FIND_MODE`, and setting it to one of the values: + +- `NORMAL`, or `1`: Normal Find: This is the full Find mode call, which will benchmark all the solvers and return a list. +- `FAST`, or `2`: Fast Find: Checks the [Find-Db](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/finddb.html) for an entry. If there is a Find-Db hit, use that entry. If there is a miss, utilize the Immediate mode fallback. If Start-up times are expected to be faster, but worse GPU performance. +- `HYBRID`, or `3`, or unset `MIOPEN_FIND_MODE`: Hybrid Find: Checks the [Find-Db](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/finddb.html) for an entry. If there is a Find-Db hit, use that entry. If there is a miss, use the existing Find machinery. Slower start-up times than Fast Find, but no GPU performance drop. +- `4`: This value is reserved and should not be used. +- `DYNAMIC_HYBRID`, or `5`: Dynamic Hybrid Find: Checks the [Find-Db](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/finddb.html) for an entry. If there is a Find-Db hit, uses that entry. If there is a miss, uses the existing Find machinery with skipping non-dynamic kernels. Faster start-up times than Hybrid Find, but GPU performance may be a bit worse. + + Currently, the default Find mode is `DYNAMIC_HYBRID`. To run the full `NORMAL` Find mode, set the environment as: + ``` + export MIOPEN_FIND_MODE=NORMAL + ``` + Or, + ``` + export MIOPEN_FIND_MODE=1 + ``` + From b7c73a5038ae40fe0c28f9ceb52799004ff30ddb Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:18:41 -0800 Subject: [PATCH 19/67] Create finddb.rst --- docs/tutorials/finddb.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/tutorials/finddb.rst diff --git a/docs/tutorials/finddb.rst b/docs/tutorials/finddb.rst new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/docs/tutorials/finddb.rst @@ -0,0 +1 @@ + From 7064c40867fe90ebd21e8972e9191dcda878461f Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:19:41 -0800 Subject: [PATCH 20/67] Update finddb.rst --- docs/tutorials/finddb.rst | 44 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/docs/tutorials/finddb.rst b/docs/tutorials/finddb.rst index 8b13789179..df64ed27d8 100644 --- a/docs/tutorials/finddb.rst +++ b/docs/tutorials/finddb.rst @@ -1 +1,45 @@ +Find-Db Database +================ + +Prior to MIOpen 2.0, users utilized calls such as `miopenFindConvolution*Algorithm()` to gather a set of convolution algorithms in the form of an array of `miopenConvSolution_t` structs. This process is time consuming because it requires online benchmarking of competing algorithms. In MIOpen 2.0 an [immediate mode](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/find_and_immediate.html) is introduced. + +Immediate mode is based on a database which contains the results of calls to the legacy Find() stage. This database is called `Find-Db`. It consists of two parts: +- **System Find-Db**, a system-wide storage which holds the pre-run values for the most applicable configurations, +- **User Find-Db**, a per-user storage which is intended to hold results for arbitrary user-run configurations. It also performs double duty as a cache for the Find() stage. + +The User Find-Db **always takes precedence** over System Find-Db. + +By default, System Find-Db resides within MIOpen's install location, while User Find-Db resides in the user's home directory. See [Setting up locations](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/install.html#setting-up-locations) for more information. + + * The System Find-Db is *not* modified upon installation of MIOpen. + * There are separate Find databases for HIP and OpenCL backends. + +### Populating the User Find-Db + +MIOpen collects Find-db information during the following MIOpen API calls: +- `miopenFindConvolutionForwardAlgorithm()` +- `miopenFindConvolutionBackwardDataAlgorithm()` +- `miopenFindConvolutionBackwardWeightsAlgorithm()` + +During the call, find data entries are collected for one _problem configuration_ (implicitly defined by the tensor descriptors and convolution descriptor passed to API function). + + +### Updating MIOpen and the User Find-Db + +When the user installs a new version of MIOpen, the new version of MIOpen will _ignore_ old **User find-db*** files. Thus, the user is _not required_ to move or delete their old User find-db files. However, the user may wish to re-collect the information into their brand new **User find-db**. This should be done in the same way as it was done with the previous version of the library -- _if_ it was done. This would keep Immediate mode optimized. + + +### Disabling Find-Db + +By default MIOpen will use the Find-Db. Users can disable the Find-Db by setting the environmental variable `MIOPEN_DEBUG_DISABLE_FIND_DB` to 1: +``` +export MIOPEN_DEBUG_DISABLE_FIND_DB=1 +``` + +**Note:** The System Find-Db has the ability to be cached into memory and may increase performance dramatically. To disable this option use the cmake configuration flag: +``` +-DMIOPEN_DEBUG_FIND_DB_CACHING=Off +``` + + From 8de1cf50fb27b1e57a16ed0fe5164667311dbdbc Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:20:12 -0800 Subject: [PATCH 21/67] Delete docs/tutorials/finddb.rst --- docs/tutorials/finddb.rst | 45 --------------------------------------- 1 file changed, 45 deletions(-) delete mode 100644 docs/tutorials/finddb.rst diff --git a/docs/tutorials/finddb.rst b/docs/tutorials/finddb.rst deleted file mode 100644 index df64ed27d8..0000000000 --- a/docs/tutorials/finddb.rst +++ /dev/null @@ -1,45 +0,0 @@ -Find-Db Database -================ - -Prior to MIOpen 2.0, users utilized calls such as `miopenFindConvolution*Algorithm()` to gather a set of convolution algorithms in the form of an array of `miopenConvSolution_t` structs. This process is time consuming because it requires online benchmarking of competing algorithms. In MIOpen 2.0 an [immediate mode](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/find_and_immediate.html) is introduced. - -Immediate mode is based on a database which contains the results of calls to the legacy Find() stage. This database is called `Find-Db`. It consists of two parts: -- **System Find-Db**, a system-wide storage which holds the pre-run values for the most applicable configurations, -- **User Find-Db**, a per-user storage which is intended to hold results for arbitrary user-run configurations. It also performs double duty as a cache for the Find() stage. - -The User Find-Db **always takes precedence** over System Find-Db. - -By default, System Find-Db resides within MIOpen's install location, while User Find-Db resides in the user's home directory. See [Setting up locations](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/install.html#setting-up-locations) for more information. - - * The System Find-Db is *not* modified upon installation of MIOpen. - * There are separate Find databases for HIP and OpenCL backends. - -### Populating the User Find-Db - -MIOpen collects Find-db information during the following MIOpen API calls: -- `miopenFindConvolutionForwardAlgorithm()` -- `miopenFindConvolutionBackwardDataAlgorithm()` -- `miopenFindConvolutionBackwardWeightsAlgorithm()` - -During the call, find data entries are collected for one _problem configuration_ (implicitly defined by the tensor descriptors and convolution descriptor passed to API function). - - -### Updating MIOpen and the User Find-Db - -When the user installs a new version of MIOpen, the new version of MIOpen will _ignore_ old **User find-db*** files. Thus, the user is _not required_ to move or delete their old User find-db files. However, the user may wish to re-collect the information into their brand new **User find-db**. This should be done in the same way as it was done with the previous version of the library -- _if_ it was done. This would keep Immediate mode optimized. - - -### Disabling Find-Db - -By default MIOpen will use the Find-Db. Users can disable the Find-Db by setting the environmental variable `MIOPEN_DEBUG_DISABLE_FIND_DB` to 1: -``` -export MIOPEN_DEBUG_DISABLE_FIND_DB=1 -``` - -**Note:** The System Find-Db has the ability to be cached into memory and may increase performance dramatically. To disable this option use the cmake configuration flag: -``` --DMIOPEN_DEBUG_FIND_DB_CACHING=Off -``` - - - From 1ca42e66b4ecef5c0a714be1bda19671014f0abc Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:20:29 -0800 Subject: [PATCH 22/67] Create finddb.md --- docs/tutorials/finddb.md | 44 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 docs/tutorials/finddb.md diff --git a/docs/tutorials/finddb.md b/docs/tutorials/finddb.md new file mode 100644 index 0000000000..e6e21b901b --- /dev/null +++ b/docs/tutorials/finddb.md @@ -0,0 +1,44 @@ +Find-Db Database +================ + +Prior to MIOpen 2.0, users utilized calls such as `miopenFindConvolution*Algorithm()` to gather a set of convolution algorithms in the form of an array of `miopenConvSolution_t` structs. This process is time consuming because it requires online benchmarking of competing algorithms. In MIOpen 2.0 an [immediate mode](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/find_and_immediate.html) is introduced. + +Immediate mode is based on a database which contains the results of calls to the legacy Find() stage. This database is called `Find-Db`. It consists of two parts: +- **System Find-Db**, a system-wide storage which holds the pre-run values for the most applicable configurations, +- **User Find-Db**, a per-user storage which is intended to hold results for arbitrary user-run configurations. It also performs double duty as a cache for the Find() stage. + +The User Find-Db **always takes precedence** over System Find-Db. + +By default, System Find-Db resides within MIOpen's install location, while User Find-Db resides in the user's home directory. See [Setting up locations](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/install.html#setting-up-locations) for more information. + + * The System Find-Db is *not* modified upon installation of MIOpen. + * There are separate Find databases for HIP and OpenCL backends. + +### Populating the User Find-Db + +MIOpen collects Find-db information during the following MIOpen API calls: +- `miopenFindConvolutionForwardAlgorithm()` +- `miopenFindConvolutionBackwardDataAlgorithm()` +- `miopenFindConvolutionBackwardWeightsAlgorithm()` + +During the call, find data entries are collected for one _problem configuration_ (implicitly defined by the tensor descriptors and convolution descriptor passed to API function). + + +### Updating MIOpen and the User Find-Db + +When the user installs a new version of MIOpen, the new version of MIOpen will _ignore_ old **User find-db*** files. Thus, the user is _not required_ to move or delete their old User find-db files. However, the user may wish to re-collect the information into their brand new **User find-db**. This should be done in the same way as it was done with the previous version of the library -- _if_ it was done. This would keep Immediate mode optimized. + + +### Disabling Find-Db + +By default MIOpen will use the Find-Db. Users can disable the Find-Db by setting the environmental variable `MIOPEN_DEBUG_DISABLE_FIND_DB` to 1: +``` +export MIOPEN_DEBUG_DISABLE_FIND_DB=1 +``` + +**Note:** The System Find-Db has the ability to be cached into memory and may increase performance dramatically. To disable this option use the cmake configuration flag: +``` +-DMIOPEN_DEBUG_FIND_DB_CACHING=Off +``` + + From 8e47f793564646efbee552222f0fb74037b2f50b Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:24:27 -0800 Subject: [PATCH 23/67] Create cache.md --- docs/tutorials/cache.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 docs/tutorials/cache.md diff --git a/docs/tutorials/cache.md b/docs/tutorials/cache.md new file mode 100644 index 0000000000..e8a08ff5c7 --- /dev/null +++ b/docs/tutorials/cache.md @@ -0,0 +1,33 @@ +Kernel Cache +============ + +MIOpen will cache binary kernels to disk, so they don't need to be compiled the next time the application is run. This cache is stored by default in `$HOME/.cache/miopen`. This location can be customized at build time by setting the `MIOPEN_CACHE_DIR` cmake variable. + +Clear the cache +--------------- + +The cache can be cleared by simply deleting the cache directory (i.e., `$HOME/.cache/miopen`). This should only be needed for development purposes or to free disk space. The cache does not need to be cleared when upgrading MIOpen. + +Disabling the cache +------------------- + +The are several ways to disable the cache. This is generally useful for development purposes. The cache can be disabled during build by either setting `MIOPEN_CACHE_DIR` to an empty string, or setting `BUILD_DEV=ON` when configuring cmake. The cache can also be disabled at runtime by setting the `MIOPEN_DISABLE_CACHE` environment variable to true. + +Updating MIOpen and removing the cache +-------------------------------------- +For MIOpen version 2.3 and earlier, if the compiler changes, or the user modifies the kernels then the cache must be deleted for the MIOpen version in use; e.g., `rm -rf $HOME/.cache/miopen/`. More information about the cache can be found [here](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/cache.html). + +For MIOpen version 2.4 and later, MIOpen's kernel cache directory is versioned so that users' cached kernels will not collide when upgrading from earlier version. + +Installing pre-compiled kernels +------------------------------- +GPU architecture-specific pre-compiled kernel packages are available in the ROCm package repositories, to reduce the startup latency of MIOpen kernels. In essence, these packages have the kernel cache file mentioned above and install them in the ROCm installation directory along with other MIOpen artifacts. Thus, when launching a kernel, MIOpen will first check for the existence of a kernel in the kernel cache installed in the MIOpen installation directory. If the file does not exist or the required kernel is not found, the kernel is compiled and placed in the user's kernel cache. + +These packages are optional for the functioning of MIOpen and must be separately installed from MIOpen. Users who wish to conserve disk space may choose not to install these packages at the cost of higher startup latency. Users have the flexibility to only install kernel packages for installed device architecture, thus minimizing disk space usage. + +If MIOpen kernels package is not installed, or if we do not deliver the kernels suitable for the user's GPU, then the user will get warning message like this: +> MIOpen(HIP): Warning [SQLiteBase] Missing system database file:gfx906_60.kdb Performance may degrade + +The performance degradation mentioned in the warning only affects the network start-up time (aka "initial iteration time") and thus can be safely ignored. + +Please refer to the MIOpen installation instructions: [installing MIOpen kernels package](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/install.html#installing-miopen-kernels-package) for guidance on installing the MIOpen kernels package. From 07dd95b4185650df58b6f84b9bbe85577f61640f Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:25:06 -0800 Subject: [PATCH 24/67] Create perfdatabase.md --- docs/tutorials/perfdatabase.md | 67 ++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 docs/tutorials/perfdatabase.md diff --git a/docs/tutorials/perfdatabase.md b/docs/tutorials/perfdatabase.md new file mode 100644 index 0000000000..f28671b1dc --- /dev/null +++ b/docs/tutorials/perfdatabase.md @@ -0,0 +1,67 @@ +Performance Database +==================== + +Many of MIOpen kernels have parameters which affect their performance. Setting these parameters to optimal values allows reaching the best possible throughput. These optimal values depend on many things, including network configuration, GPU type, clock frequencies, ROCm version etc. Because of these dependencies and also due to enormous number of possible network configurations, it is virtually impossible to supply all values that users may need together with the library. Instead, MIOpen provides a set of pre-tuned values for the _most applicable_ network configurations, **and** also means for expanding the set of optimized values. MIOpen's performance database contains these pre-tuned parameter values as well as optimized parameters tuned by users. + +The performance database consists of two parts: +- **System Performance Database**, a system-wide storage which holds the pre-tuned values for the most applicable configurations, +- **User Performance Database**, a per-user storage which is intended to hold optimized values for arbitrary configurations. + +User PerfDb **always takes precedence** over System PerfDb. + +MIOpen also has auto-tuning functionality, which is able to find optimized kernel parameter values for a specific configuration. The auto-tune process may take a substantial amount of time, however, once the optimized values are found, they are stored in the User PerfDb. MIOpen then will automatically read and use these parameter values when needed again instead of running the expensive auto-tuning search. + +By default, System PerfDb resides within MIOpen's install location, while User PerfDb resides in the user's home directory. See [Setting up locations](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/install.html#setting-up-locations) for more information. + +The System PerfDb is not modified upon installation of MIOpen. + +## Auto-tuning the kernels. + +MIOpen performs auto-tuning during the following MIOpen API calls: +- `miopenFindConvolutionForwardAlgorithm()` +- `miopenFindConvolutionBackwardDataAlgorithm()` +- `miopenFindConvolutionBackwardWeightsAlgorithm()` + +During the call, auto-tuning is performed only for one _problem configuration_ (implicitly defined by the tensor descriptors passed to API function). + +The following conditions must be met for the auto-tune to begin: +- The applicable kernel(s) has tuning parameters. +- The passed value of `exhaustiveSearch` parameter is `true`, and +- Both System and User PerfDb do not yet contain values for the relevant _problem configuration_. + +The latter two conditions may be overridden by _enforcing_ the search by means of the following environment variable: +- `MIOPEN_FIND_ENFORCE` + +This variable may also be used for _removing_ values from User PerfDb, see below. + +### MIOPEN_FIND_ENFORCE + +Both symbolic (case-insensitive) and numeric values are supported. + +**NONE (1)** + +Setting the value to "NONE", or "1" will have no change in the default behavior. + +**DB_UPDATE (2)** + +Auto-tune will not be skipped even if PerfDb already contains optimized values. If auto-tune is requested via API, then MIOpen will perform it and update PerfDb. + +This mode can be used for fine-tuning the MIOpen installation on the user's system. When MIOpen is in this mode, the applications that use it may take quite long to finish. + +**SEARCH (3)** + +MIOpen will perform auto-tune even if not requested via MIOpen API. In other words, the library will behave as if `exhaustiveSearch` parameter set to `true` even this is not really so. If optimized values already reside in PerfDb, then auto-tune will not be performed. + +This mode allows for tuning the apps that do not anticipate means for getting the best performance from MIOpen. When MIOpen is in this mode, the first run of the user's app may take substantially longer time than expected. + +**SEARCH_DB_UPDATE (4)** + +A combination of SEARCH and DB_UPDATE. MIOpen performs auto-tune (and updates User PerfDb) on each `miopenFindConvolution*()` call. It is not recommended to use this mode except for debugging purposes. + +**DB_CLEAN (5)** + +Use with care. MIOpen **removes** optimized values related to given _problem configuration_ from the User PerfDb. Auto-tune is blocked, even if it is explicitly requested. System PerfDb left intact. + +### Updating MIOpen and the User Db + +It is important to note that if the user installs a new version of MIOpen, it is recommended that the user move, or delete their old user performance database file. This will prevent older database entries from poluting the configurations shipped with the newer system database. The user perf db is named `miopen.udb` and is located at the user perf db path. From 523dd5ce574a10022dbbeeb395df069571dd1341 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:25:58 -0800 Subject: [PATCH 25/67] Create getting_started_fusionAPI.md --- docs/tutorials/getting_started_fusionAPI.md | 209 ++++++++++++++++++++ 1 file changed, 209 insertions(+) create mode 100644 docs/tutorials/getting_started_fusionAPI.md diff --git a/docs/tutorials/getting_started_fusionAPI.md b/docs/tutorials/getting_started_fusionAPI.md new file mode 100644 index 0000000000..ed437ea4ee --- /dev/null +++ b/docs/tutorials/getting_started_fusionAPI.md @@ -0,0 +1,209 @@ +Fusion API: Getting Started +=========================== +## Introduction +Increasing depth of deep learning networks necessitate the need for novel mechanisms to improve performance on GPUs. One mechanism to achieve higher efficiency is to _fuse_ separate kernels into a single kernel to reduce off-chip memory access and avoid kernel launch overhead. This document outlines the addition of a Fusion API to the MIOpen library. The fusion API would allow users to specify operators that they wants to fuse in a single kernel, compile it and then launch the kernel. While not all combinations might be supported by the library, the API is flexible enough to allow the specification of many operations in any order from a finite set of supported operations. The API provides a mechanism to report unsupported combinations. + +A complete example of the Fusion API in the context of MIOpen is given [here](https://github.com/ROCmSoftwarePlatform/MIOpenExamples/tree/master/fusion). We will use code from the example project as we go along. The example project creates a fusion plan to merge the convolution, bias and activation operations. For a list of supported fusion operations and associated constraints please refer to the [Supported Fusions](#supported-fusions) section. The example depicts bare-bones code without any error checking or even populating the tensors with meaningful data in the interest of simplicity. + +The following list outlines the steps required + +- Create a fusion plan +- Create and add the convolution, bias and activation operators +- Compile the Fusion Plan +- Set the runtime arguments for each operator +- Execute the fusion plan +- Cleanup + +The above steps assume that an MIOpen handle object has already been initialized. Moreover, the order in which operators are created is important, since it represents the order of operations on the data itself. Therefore a fusion plan with convolution created before activation is a different fusion plan as opposed to if activation was added before convolution. + +The following sections further elaborate the above steps as well as give code examples to make these ideas concrete. + +### Intended Audience +The primary consumers of the fusion API are high level frameworks such as TensorFlow/XLA or PyTorch etc. + +## Create a Fusion Plan +A **Fusion Plan** is the data structure which holds all the metadata about the users fusion intent as well as logic to **Compile** and **Execute** a fusion plan. As mentioned earlier, a fusion plan holds the order in which different opertions would be applied on the data, but it also specifies the _axis_ of fusion as well. Currently only **vertical** (sequential) fusions are supported implying the flow of data between operations is sequential. + +A fusion plan is created using the API call `miopenCreateFusionPlan` with the signature: + +```cpp +miopenStatus_t +miopenCreateFusionPlan(miopenFusionPlanDescriptor_t* fusePlanDesc, +const miopenFusionDirection_t fuseDirection,const miopenTensorDescriptor_t inputDesc); +``` + +The *input tensor descriptor* specifies the geometry of the incoming data. Since the data geometry of the intermediate operations can be derived from the *input tensor descriptor*, therefore only the *input tensor descriptor* is required for the fusion plan and not for the individual operations. In our fusion example the following lines of code accomplish this: +```cpp +miopenCreateFusionPlan(&fusePlanDesc, miopenVerticalFusion, input.desc); +``` +Where `fusePlanDesc` is an object of type `miopenFusionPlanDescriptor_t` and `input.desc` is the `miopenTensorDescriptor_t` object. + +## Create and add Operators +The fusion API introduces the notion of **operators** which represent different operations that are intended to be fused together by the API consumer. Currently, the API supports the following operators: + +* Convolution Forward +* Activation Forward +* BatchNorm Inference +* Bias Forward + +Notice that _Bias_ is a separate operator, although it is typically only used with convolution. This list is expected to grow as support for more operators is added to the API, moreover, operators for backward passes are in the works as well. + +The fusion API provides calls for the creation of the supported operators, here we would describe the process for the convolution operator, details for other operators may be found in the [miopen header file](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/fusion.html) + +Once the fusion plan descriptor is created, two or more operators can be added to it by using the individual operator creation API calls. Creation of an operator might fail if the API does not support the fusion of the operations being added and report back immediately to the user. For our example we need to add the Convolution, Bias and Activation operations to our freshly minted fusion plan. This is done using the following calls for the Convolution, Bias and Activation operations respectively: + +```cpp +miopenStatus_t +miopenCreateOpConvForward(miopenFusionPlanDescriptor_t fusePlanDesc, + miopenFusionOpDescriptor_t* convOp, + miopenConvolutionDescriptor_t convDesc, + const miopenTensorDescriptor_t wDesc); +miopenStatus_t +miopenCreateOpBiasForward(miopenFusionPlanDescriptor_t fusePlanDesc, + miopenFusionOpDescriptor_t* biasOp, + const miopenTensorDescriptor_t bDesc); + +miopenStatus_t +miopenCreateOpActivationForward(miopenFusionPlanDescriptor_t fusePlanDesc, + miopenFusionOpDescriptor_t* activOp, + miopenActivationMode_t mode); +``` + +The following lines in the fusion example project use these API calls to create and insert the operators in the fusion plan: + +```cpp +miopenCreateOpConvForward(fusePlanDesc, &convoOp, conv_desc, weights.desc); +miopenCreateOpBiasForward(fusePlanDesc, &biasOp, bias.desc); +miopenCreateOpActivationForward(fusePlanDesc, &activOp, miopenActivationRELU); +``` + +It may be noted that `conv_desc` is the regular MIOpen Convolution descriptor and is created in the standard way before it is referenced here. For more details on creating and setting the convolution descriptor please refer to the example code as well as the [MIOpen documentation](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/convolution.html). In the above snippet `weights.desc` refers to the `miopenTensorDescriptor_t` for the convolution operations and `bias.desc` refers to the object of the same type for the bias operation. The order of insertion of operators indicates the order in which the operations would be performed on the data. Therefore, the above code implies that the convolution operation would be the first operation to execute on the incoming data, followed by the bias and activation operations. + +During this process, it is important that the returned codes be checked to make sure that the operations as well as their order is supported. The operator insertion might fail for a number of reasons such as unsupported sequence of operations, unsupported dimensions of the input or in case of convolution unsupported dimensions for the filters. In the above example, these aspects are ignored for the sake of simplicity. + +## Compile the Fusion Plan + +Following the operator addition, the user would compile the fusion plan, to populate the MIOpen kernel cache with the fused kernel and make it ready for execution. The API call that accomplishes this is: + +```cpp +miopenStatus_t +miopenCompileFusionPlan(miopenHandle_t handle, miopenFusionPlanDescriptor_t fusePlanDesc); +``` + +The corresponding code snippet in the example is as follows: + +```cpp +auto status = miopenCompileFusionPlan(mio::handle(), fusePlanDesc); +if (status != miopenStatusSuccess) { +return -1; +} +``` +In order to compile the fusion plan, the user is assumed to have acquired an MIOpen handle object, in the example code above this is accomplished using the `mio::handle()` helper function. While a fusion plan itself is not bound to a MIOpen handle object, it would however need to be recompiled for each handle separately. It may be noted that compilation of a fusion plan might fail for a number of reasons, moreover it is not assured that a fused version of the kernel would offer any performance improvement over the separately run kernels. + +Compiling a fusion plan is a costly operation in terms of run-time. Therefore, it is recommended that a fusion plan should only be compiled once and may be reused for execution with different runtime parameters as described in the next section. + +## Set the runtime arguments + +While the underlying MIOpen descriptor of the fusion operator specifies the data geometry and parameters, the fusion plan still needs access to the data to execute a successfully compiled fusion plan. The arguments mechanism in the Fusion API provides such data before a fusion plan may be executed. For example the convolution operator requires *weights* to carry out the convolution computation, a bias operator requires the actual bias values etc. Therefore, before a fusion plan may be executed, arguments required by each fusion operator need to be specified. To begin, we create the `miopenOperatorArgs_t` object using: + +```cpp +miopenStatus_t miopenCreateOperatorArgs(miopenOperatorArgs_t* args); +``` + +Once created, runtime arguments for each operation may be set. In our running example, the forward convolution operator requires the convolution weights argument which is supplied using the API call: + +```cpp +miopenStatus_t +miopenSetOpArgsConvForward(miopenOperatorArgs_t args, + const miopenFusionOpDescriptor_t convOp, + const void* alpha, + const void* beta, + const void* w); +``` + +Similarly the parameters for bias and activation are given by: + +```cpp +miopenStatus_t miopenSetOpArgsBiasForward(miopenOperatorArgs_t args, + const miopenFusionOpDescriptor_t biasOp, + const void* alpha, + const void* beta, + const void* bias); + +miopenStatus_t miopenSetOpArgsActivForward(miopenOperatorArgs_t args, + const miopenFusionOpDescriptor_t activOp, + const void* alpha, + const void* beta, + double activAlpha, + double activBeta, + double activGamma); +``` + +In our example code, we set the arguments for the operations as follows: + +```cpp +miopenSetOpArgsConvForward(fusionArgs, convoOp, &alpha, &beta, weights.data); +miopenSetOpArgsActivForward(fusionArgs, activOp, &alpha, &beta, activ_alpha, + activ_beta, activ_gamma); +miopenSetOpArgsBiasForward(fusionArgs, biasOp, &alpha, &beta, bias.data); +``` + +This separation between the fusion plan and the arguments required by each operator allows better reuse of the fusion plan with different arguments as well as avoids the necessity of recompiling the fusion plan to run the same combination of operators with different arguments. + +As mentioned in the section [Compile the Fusion Plan](#compile-the-fusion-plan) earlier, the compilation step for a fusion plan might be costly, therefore a fusion plan should only be compiled once in its lifetime. A fusion plan needs not be recompiled if the input desciptor or any of the parameters to the `miopenCreateOp*` API calls are different, otherwise a compiled fusion plan may be reused again and again with a different set of arguments. In our example this is demonstrated in lines 77 - 85 of `main.cpp`. + +## Execute a Fusion Plan + +Once the fusion plan has been compiled and arguments set for each operator, it may be executed with the API call given below passing it the actual data to be processed. + +```cpp +miopenStatus_t +miopenExecuteFusionPlan(const miopenHandle_t handle, + const miopenFusionPlanDescriptor_t fusePlanDesc, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + miopenOperatorArgs_t args); +``` + +The following code snippet in the example accomplishes the fusion plan execution: + +```cpp +miopenExecuteFusionPlan(mio::handle(), fusePlanDesc, input.desc, input.data, + output.desc, output.data, fusionArgs); +``` + +It may be noted that it is an error to attempt to execute a fusion plan that is either not compiled or has been invalidated by changing the input tensor descriptor or any of the operation parameters. + + +## Cleanup +Once the application is done with the fusion plan, the fusion plan and the fusion args objects may be destroyed using the API calls: + +```cpp +miopenStatus_t miopenDestroyFusionPlan(miopenFusionPlanDescriptor_t fusePlanDesc); +``` +Once the fusion plan object is destroyed, all the operations created are destroyed automatically and do not need any special cleanup. + + +## Supported Fusions +The tables below outlines the supported fusions for fp32 and fp16 as well as any applicable constraints. **(C = convolution, B = bias, N = batch normalization, A = activation)** +Fusion Plans with grouped convolutions are not supported. + + +![Convolution based fp32 fusion](data/fp32fusions.png) + + +![Convolution based fp16 fusion](data/fp16fusions.png) + + +## Performance Comparison to Non-Fused Kernels + + +The following graph depicts the speedup gained for a fused Convolution+Bias+Activation over a non-fused version, all configurations have a batch size of 64: + +![CBA Graph](data/cba.png) + +Speedup obtained by fusing Batchnorm (spatial mode) with Activation are presented in the graph below: + +![Batchnorm activation fusion](data/na.png) From b36bd8c51e3948e66b36718c88a948452f4ee131 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:27:11 -0800 Subject: [PATCH 26/67] Create debugging_and_logging.md --- docs/tutorials/debugging_and_logging.md | 249 ++++++++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100644 docs/tutorials/debugging_and_logging.md diff --git a/docs/tutorials/debugging_and_logging.md b/docs/tutorials/debugging_and_logging.md new file mode 100644 index 0000000000..3ae5db123a --- /dev/null +++ b/docs/tutorials/debugging_and_logging.md @@ -0,0 +1,249 @@ +Debugging and Logging +===================== + +## Logging + +All logging messages output to standard error stream (`stderr`). The following environment variables can be used to control logging: + +* `MIOPEN_ENABLE_LOGGING` - Enables printing the basic layer by layer MIOpen API call information with actual parameters (configurations). Important for debugging. Disabled by default. + +* `MIOPEN_ENABLE_LOGGING_CMD` - A user can use this environmental variable to output the associated `MIOpenDriver` command line(s) onto console. Disabled by default. + +> **_NOTE 1:_ These two and other two-state ("boolean") environment variables can be set to the following values:** +> ``` +> 1, on, yes, true, enable, enabled - to enable feature +> 0, off, no, false, disable, disabled - to disable feature +> ``` + +* `MIOPEN_LOG_LEVEL` - In addition to API call information and driver commands, MIOpen prints various information related to the progress of its internal operations. This information can be useful both for debugging and for understanding the principles of operation of the library. The `MIOPEN_LOG_LEVEL` environment variable controls the verbosity of these messages. Allowed values are: + * 0 - Default. Works as level 4 for Release builds, level 5 for Debug builds. + * 1 - Quiet. No logging messages. + * 2 - Fatal errors only (not used yet). + * 3 - Errors and fatals. + * 4 - All errors and warnings. + * 5 - Info. All the above plus information for debugging purposes. + * 6 - Detailed info. All the above plus more detailed information for debugging. + * 7 - Trace: the most detailed debugging info plus all above. + +> **_NOTE 2:_ When asking for technical support, please include the console log obtained with the following settings:** +> ``` +> export MIOPEN_ENABLE_LOGGING=1 +> export MIOPEN_ENABLE_LOGGING_CMD=1 +> export MIOPEN_LOG_LEVEL=6 +> ``` + +* `MIOPEN_ENABLE_LOGGING_MPMT` - When enabled, each log line is prefixed with information which allows the user to identify records printed from different processes and/or threads. Useful for debugging multi-process/multi-threaded apps. + +* `MIOPEN_ENABLE_LOGGING_ELAPSED_TIME` - Adds a timestamp to each log line. Indicates the time elapsed since the previous log message, in milliseconds. + +## Layer Filtering + +The following list of environment variables allow for enabling/disabling various kinds of kernels and algorithms. This can be helpful for both debugging MIOpen and integration with frameworks. + +> **_NOTE 3:_ These variables can be set to the following values:** +> ``` +> 1, yes, true, enable, enabled - to enable kernels/algorithm +> 0, no, false, disable, disabled - to disable kernels/algorithm +> ``` + +If a variable is not set, then MIOpen behaves as if it is set to `enabled`, unless otherwise specified. So all kinds of kernels/algorithms are enabled by default and the below variables can be used for disabling them. + +> **_WARNING:_** **When the library is used with layer filtering, the results of `Find()` calls become narrower than during normal operation. This means that relevant find-db entries would not include some solutions that normally should be there.** **_Therefore the subsequent Immediate mode `Get()` calls may return incomplete information or even run into Fallback path._** + +In order to rehabilitate the Immediate mode, the user can: +- Re-enable all solvers and re-run the same `Find()` calls that have been run before, +- Or, completely remove the User find-db. + +### Filtering by algorithm + +These variables control the sets (families) of convolution Solutions. For example, Direct algorithm is implemented in several Solutions that use OpenCL, GCN assembly etc. The corresponding variable can disable them all. +* `MIOPEN_DEBUG_CONV_FFT` - FFT convolution algorithm. +* `MIOPEN_DEBUG_CONV_DIRECT` - Direct convolution algorithm. +* `MIOPEN_DEBUG_CONV_GEMM` - GEMM convolution algorithm. +* `MIOPEN_DEBUG_CONV_WINOGRAD` - Winograd convolution algorithm. +* `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM` - Implicit GEMM convolution algorithm. + +### Filtering by build method + +* `MIOPEN_DEBUG_GCN_ASM_KERNELS` - Kernels written in assembly language. Currently these used in many convolutions (some Direct solvers, Winograd kernels, fused convolutions), batch normalization. +* `MIOPEN_DEBUG_HIP_KERNELS` - Convoluton kernels written in HIP (today, all these implement ImplicitGemm algorithm). +* `MIOPEN_DEBUG_OPENCL_CONVOLUTIONS` - Convolution kernels written in OpenCL (note that _only_ convolutions affected). +* `MIOPEN_DEBUG_AMD_ROCM_PRECOMPILED_BINARIES` - Binary kernels. Right now the library does not use binaries. + +### Filtering out all Solutions except one + +* `MIOPEN_DEBUG_FIND_ONLY_SOLVER=solution_id`, where `solution_id` should be either numeric or string identifier of some Solution. Directly affects only `Find()` calls _(however there is some indirect connection to Immediate mode; please see the "Warning" above.)_ + - If `solution_id` denotes some applicable Solution, then only that Solution will be found (plus GEMM and FFT, if these applicable, see _Note 4_). + - Else, if `solution_id` is valid but not applicable, then `Find()` would fail with all algorithms (again, except GEMM and FFT, see _Note 4_) + - Otherwise the `solution_id` is invalid (i.e. it doesn't match any existing Solution), and the `Find()` call would fail. + +> **_NOTE 4:_** This env. variable does not affect the "gemm" and "fft" solutions. For now, GEMM and FFT can be disabled only at algorithm level (see above). + +### Filtering the Solutions on individual basis + +Some of the Solutions have individual controls available. These affect both Find and Immediate modes. _Note the "Warning" above._ + +Direct Solutions: +* `MIOPEN_DEBUG_CONV_DIRECT_ASM_3X3U` - `ConvAsm3x3U`. +* `MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U` - `ConvAsm1x1U`. +* `MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1UV2` - `ConvAsm1x1UV2`. +* `MIOPEN_DEBUG_CONV_DIRECT_ASM_5X10U2V2` - `ConvAsm5x10u2v2f1`, `ConvAsm5x10u2v2b1`. +* `MIOPEN_DEBUG_CONV_DIRECT_ASM_7X7C3H224W224` - `ConvAsm7x7c3h224w224k64u2v2p3q3f1`. +* `MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW3X3` - `ConvAsmBwdWrW3x3`. +* `MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW1X1` - `ConvAsmBwdWrW1x1`. +* `MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD11X11` - `ConvOclDirectFwd11x11`. +* `MIOPEN_DEBUG_CONV_DIRECT_OCL_FWDGEN` - `ConvOclDirectFwdGen`. +* `MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD` - `ConvOclDirectFwd`. +* `MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD1X1` - `ConvOclDirectFwd`. +* `MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW2` - `ConvOclBwdWrW2` (where n = `{1,2,4,8,16}`), and `ConvOclBwdWrW2NonTunable`. +* `MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW53` - `ConvOclBwdWrW53`. +* `MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW1X1` - `ConvOclBwdWrW1x1` + +Winograd Solutions: +* `MIOPEN_DEBUG_AMD_WINOGRAD_3X3` - `ConvBinWinograd3x3U`, FP32 Winograd Fwd/Bwd, filter size fixed to 3x3. +* `MIOPEN_DEBUG_AMD_WINOGRAD_RXS` - `ConvBinWinogradRxS`, FP32/FP16 F(3,3) Fwd/Bwd and FP32 F(3,2) WrW Winograd. Subsets: + * `MIOPEN_DEBUG_AMD_WINOGRAD_RXS_WRW` - FP32 F(3,2) WrW convolutions only. + * `MIOPEN_DEBUG_AMD_WINOGRAD_RXS_FWD_BWD` - FP32/FP16 F(3,3) Fwd/Bwd. +* `MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F3X2` - `ConvBinWinogradRxSf3x2`, FP32/FP16 Fwd/Bwd F(3,2) Winograd. +* `MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F2X3` - `ConvBinWinogradRxSf2x3`, FP32/FP16 Fwd/Bwd F(2,3) Winograd, serves group convolutions only. +* `MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F2X3_G1` - `ConvBinWinogradRxSf2x3g1`, FP32/FP16 Fwd/Bwd F(2,3) Winograd, for non-group convolutions. + +* Multi-pass Winograd: + * `MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X2` - `ConvWinograd3x3MultipassWrW<3-2>`, WrW F(3,2), stride 2 only. + * `MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X3` - `ConvWinograd3x3MultipassWrW<3-3>`, WrW F(3,3), stride 2 only. + * `MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X4` - `ConvWinograd3x3MultipassWrW<3-4>`, WrW F(3,4). + * `MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X5` - `ConvWinograd3x3MultipassWrW<3-5>`, WrW F(3,5). + * `MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X6` - `ConvWinograd3x3MultipassWrW<3-6>`, WrW F(3,6). + * `MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F5X3` - `ConvWinograd3x3MultipassWrW<5-3>`, WrW F(5,3). + * `MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F5X4` - `ConvWinograd3x3MultipassWrW<5-4>`, WrW F(5,4). + * `MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F7X2`: + * `ConvWinograd3x3MultipassWrW<7-2>`, WrW F(7,2) + * `ConvWinograd3x3MultipassWrW<7-2-1-1>`, WrW F(7x1,2x1) + * `ConvWinograd3x3MultipassWrW<1-1-7-2>`, WrW F(1x7,1x2) + * `MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F7X3`: + * `ConvWinograd3x3MultipassWrW<7-3>`, WrW F(7,3) + * `ConvWinograd3x3MultipassWrW<7-3-1-1>`, WrW F(7x1,3x1) + * `ConvWinograd3x3MultipassWrW<1-1-7-3>`, WrW F(1x7,1x3) + * `MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F2X3` - `ConvMPBidirectWinograd<2-3>`, FWD/BWD F(2,3) + * `MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F3X3` - `ConvMPBidirectWinograd<3-3>`, FWD/BWD F(3,3) + * `MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F4X3` - `ConvMPBidirectWinograd<4-3>`, FWD/BWD F(4,3) + * `MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F5X3` - `ConvMPBidirectWinograd<5-3>`, FWD/BWD F(5,3) + * `MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F6X3` - `ConvMPBidirectWinograd<6-3>`, FWD/BWD F(6,3) + * `MIOPEN_DEBUG_AMD_MP_BD_XDLOPS_WINOGRAD_F2X3` - `ConvMPBidirectWinograd_xdlops<2-3>`, FWD/BWD F(2,3) + * `MIOPEN_DEBUG_AMD_MP_BD_XDLOPS_WINOGRAD_F3X3` - `ConvMPBidirectWinograd_xdlops<3-3>`, FWD/BWD F(3,3) + * `MIOPEN_DEBUG_AMD_MP_BD_XDLOPS_WINOGRAD_F4X3` - `ConvMPBidirectWinograd_xdlops<4-3>`, FWD/BWD F(4,3) + * `MIOPEN_DEBUG_AMD_MP_BD_XDLOPS_WINOGRAD_F5X3` - `ConvMPBidirectWinograd_xdlops<5-3>`, FWD/BWD F(5,3) + * `MIOPEN_DEBUG_AMD_MP_BD_XDLOPS_WINOGRAD_F6X3` - `ConvMPBidirectWinograd_xdlops<6-3>`, FWD/BWD F(6,3) + * `MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_EXPEREMENTAL_FP16_TRANSFORM - `ConvMPBidirectWinograd*`, FWD/BWD FP16 experemental mode. Disabled by default. This mode is experimental. Use it at your own risk. +* `MIOPEN_DEBUG_AMD_FUSED_WINOGRAD` - Fused FP32 F(3,3) Winograd, variable filter size. + +Implicit GEMM Solutions: +* ASM Implicit GEMM + * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_V4R1` - `ConvAsmImplicitGemmV4R1DynamicFwd` + * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_V4R1_1X1` - `ConvAsmImplicitGemmV4R1DynamicFwd_1x1` + * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_BWD_V4R1` - `ConvAsmImplicitGemmV4R1DynamicBwd` + * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_WRW_V4R1` - `ConvAsmImplicitGemmV4R1DynamicWrw` + * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_XDLOPS` - `ConvAsmImplicitGemmGTCDynamicFwdXdlops` + * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_BWD_GTC_XDLOPS` - `ConvAsmImplicitGemmGTCDynamicBwdXdlops` + * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_WRW_GTC_XDLOPS` - `ConvAsmImplicitGemmGTCDynamicWrwXdlops` +* HIP Implicit GEMM + * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R1` - `ConvHipImplicitGemmV4R1Fwd` + * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R4` - `ConvHipImplicitGemmV4R4Fwd` + * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_BWD_V1R1` - `ConvHipImplicitGemmBwdDataV1R1` + * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_BWD_V4R1` - `ConvHipImplicitGemmBwdDataV4R1` + * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R1` - `ConvHipImplicitGemmV4R1WrW` + * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R4` - `ConvHipImplicitGemmV4R4WrW` + * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R4_XDLOPS` - `ConvHipImplicitGemmForwardV4R4Xdlops` + * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R5_XDLOPS` - `ConvHipImplicitGemmForwardV4R5Xdlops` + * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_BWD_V1R1_XDLOPS` - `ConvHipImplicitGemmBwdDataV1R1Xdlops` + * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_BWD_V4R1_XDLOPS` - `ConvHipImplicitGemmBwdDataV4R1Xdlops` + * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R4_XDLOPS` - `ConvHipImplicitGemmWrwV4R4Xdlops` + * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R4_PADDED_GEMM_XDLOPS` - `ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm` + * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R4_PADDED_GEMM_XDLOPS` - `ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm` + +## rocBlas Logging and Behavior +The `ROCBLAS_LAYER` environmental variable can be set to output GEMM information: +* `ROCBLAS_LAYER=` - is not set, there is no logging +* `ROCBLAS_LAYER=1` - is set to 1, then there is trace logging +* `ROCBLAS_LAYER=2` - is set to 2, then there is bench logging +* `ROCBLAS_LAYER=3` - is set to 3, then there is both trace and bench logging + +Additionally, using environment variable "MIOPEN_GEMM_ENFORCE_BACKEND", can override the default behavior. The default behavior which is to use +both MIOpenGEMM and rocBlas depending on the input configuration: + +* `MIOPEN_GEMM_ENFORCE_BACKEND=1`, use rocBLAS if enabled +* `MIOPEN_GEMM_ENFORCE_BACKEND=2`, use MIOpenGEMM for FP32, use rocBLAS for FP16 if enabled +* `MIOPEN_GEMM_ENFORCE_BACKEND=3`, no gemm will be called +* `MIOPEN_GEMM_ENFORCE_BACKEND=4`, use MIOpenTensile for FP32, use rocBLAS for FP16 if enabled +* `MIOPEN_GEMM_ENFORCE_BACKEND=`, use default behavior + +To disable using rocBlas entirely, set the configuration flag `-DMIOPEN_USE_ROCBLAS=Off` during MIOpen configuration. + +More information on logging with rocBlas can be found [here](https://github.com/ROCmSoftwarePlatform/rocBLAS/wiki/5.Logging). + + +## Numerical Checking + +MIOpen provides the environmental variable `MIOPEN_CHECK_NUMERICS` to allow users to debug potential numerical abnormalities. Setting this variable will scan all inputs and outputs of each kernel called and attempt to detect infinities (infs), not-a-number (NaN), or all zeros. The environment variable has several settings that will help with debugging: + +* `MIOPEN_CHECK_NUMERICS=0x01`: Fully informative, prints results from all checks to console +* `MIOPEN_CHECK_NUMERICS=0x02`: Warning information, prints results only if abnormality detected +* `MIOPEN_CHECK_NUMERICS=0x04`: Throw error on detection, MIOpen execute MIOPEN_THROW on abnormal result +* `MIOPEN_CHECK_NUMERICS=0x08`: Abort on abnormal result, this will allow users to drop into a debugging session +* `MIOPEN_CHECK_NUMERICS=0x10`: Print stats, this will compute and print mean/absmean/min/max (note, this is much slower) + + +## Controlling Parallel Compilation + +MIOpen's Convolution Find() calls will compile and benchmark a set of `solvers` contained in `miopenConvAlgoPerf_t` this is done in parallel per `miopenConvAlgorithm_t`. Parallelism per algorithm is set to 20 threads. Typically there are far fewer threads spawned due to the limited number of kernels under any given algorithm. The level of parallelism can be controlled using the environment variable `MIOPEN_COMPILE_PARALLEL_LEVEL`. + +For example, to disable multi-threaded compilation: +``` +export MIOPEN_COMPILE_PARALLEL_LEVEL=1 +``` + + +## Experimental controls + +> **_NOTE 5: Using experimental controls may result in:_** +> * Performance drops +> * Computation inaccuracies +> * Run-time errors +> * Other kinds of unexpected behavior +> +> **_It is strongly recommended to use them only with the explicit permission or request of the library developers._** + +### Code Object (CO) version selection (EXPERIMENTAL) + +Different ROCm versions use Code Object files of different versions (or, in other words, formats). The library uses suitable version automatically. The following variables allow for experimenting and triaging possible problems related to CO version: +* `MIOPEN_DEBUG_AMD_ROCM_METADATA_ENFORCE` - Affects kernels written in GCN assembly language. + * `0` or unset - Automatically detect the required CO version and assemble to that version. This is the default. + * `1` - Do not auto-detect Code Object version, always assemble v2 Code Objects. + * `2` - Behave as if both CO v2 and v3 are supported (see `MIOPEN_DEBUG_AMD_ROCM_METADATA_PREFER_OLDER`). + * `3` - Always assemble v3 Code Objects. +* `MIOPEN_DEBUG_AMD_ROCM_METADATA_PREFER_OLDER` - This variable affects only assembly kernels, and only when ROCm supports both CO v2 and CO v3 (like ROCm 2.10). By default, the newer format is used (CO v3). When this variable is _enabled_, the behavior is reversed. +* `MIOPEN_DEBUG_OPENCL_ENFORCE_CODE_OBJECT_VERSION` - Enforces Code Object format for OpenCL kernels. Works with HIP backend only (`cmake ... -DMIOPEN_BACKEND=HIP...`). + * Unset - Automatically detect the required CO version. This is the default. + * `2` - Always build to CO v2. + * `3` - Always build to CO v3. + * `4` - Always build to CO v4. + +### Winograd Multi-pass Maximum Workspace throttling + +`MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_WORKSPACE_MAX` - `ConvWinograd3x3MultipassWrW`, WrW +`MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_WORKSPACE_MAX` - `ConvMPBidirectWinograd*`, FWD BWD + +Syntax of value: +* decimal or hex (with `0x` prefix) value that should fit into 64-bit unsigned integer. +* If syntax is violated, then the behavior is unspecified. + +Semantics: +* Sets the **_limit_** (max allowed workspace size) for Multi-pass (MP) Winograd Solutions, in bytes. +* Affects all MP Winograd Solutions. If a Solution needs more workspace than the limit, then it does not apply. +* If unset, then _the default_ limit is used. Current default is `2000000000` (~1.862 GiB) for gfx900 and gfx906/60 (or less CUs). No default limit is set for other GPUs. +* Special values: +``` + 0 - Use the default limit, as if the variable is unset. + 1 - Completely prohibit the use of workspace. +-1 - Remove the default limit. +``` From e745fe93a356f50e3d4aeff041a26b43557fbd75 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:28:29 -0800 Subject: [PATCH 27/67] Create MI200alternateimplementation.md --- docs/tutorials/MI200alternateimplementation.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 docs/tutorials/MI200alternateimplementation.md diff --git a/docs/tutorials/MI200alternateimplementation.md b/docs/tutorials/MI200alternateimplementation.md new file mode 100644 index 0000000000..c100b17a7c --- /dev/null +++ b/docs/tutorials/MI200alternateimplementation.md @@ -0,0 +1,13 @@ +## MI200 MFMA Behavior Specifics + +The MI200 MFMA_F16, MFMA_BF16 and MFMA_BF16_1K flush subnormal input/output data to zero. This behavior might affect the convolution operation in certain workloads due to the limited exponent range of the half-precision floating point datatypes. + +An alternate implementation for the half precision data-type is available in MIOpen which utilizes conversion instructions to utilizes the BFloat16 data-types larger exponent range, albeit with reduced accuracy. The following salients apply to this alternate implementation: + +* It is disabled by default in the Forward convolution operations. + +* It is enabled by default in the backward data and backward weights convolution operations. + +* The default MIOpen behaviors described above may be overridden using the `miopenSetConvolutionAttribute` API call and passing the convolution descriptor for the appropriate convolution operation and the `MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL` convolution attribute with a non-zero value to engage the alternate implementation. + +* The behavior might also be overridden using the `MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP16_ALT_IMPL` environment variable. The above variable when set to a value of `1` engages the alternate implementation while a value of `0` disables it. Keep in mind the environment variable impacts the convolution operation in all directions. From 30d9b49e5a0f1625334a10149f34c9c7af85f536 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:29:35 -0800 Subject: [PATCH 28/67] Create MIOpenportingguide.md --- docs/tutorials/MIOpenportingguide.md | 2062 ++++++++++++++++++++++++++ 1 file changed, 2062 insertions(+) create mode 100644 docs/tutorials/MIOpenportingguide.md diff --git a/docs/tutorials/MIOpenportingguide.md b/docs/tutorials/MIOpenportingguide.md new file mode 100644 index 0000000000..a23d0071ad --- /dev/null +++ b/docs/tutorials/MIOpenportingguide.md @@ -0,0 +1,2062 @@ +# MIOpen Porting Guide + + +## The key differences between MIOpen and cuDNN: +* MIOpen only supports 4-D tensors in the NCHW and NHWC storage format. This means all the __“\*Nd\*”__ APIs in cuDNN do not have a corresponding API in MIOpen. +* MIOpen only supports __`float(fp32)`__ data-type. +* MIOpen supports __2D Convolutions__ and __3D Convolutions__. +* MIOpen only supports __2D Pooling__. +* Calling miopenFindConvolution*Algorithm() is *mandatory* before calling any Convolution API. +* Typical calling sequence for Convolution APIs for MIOpen is: + * miopenConvolution*GetWorkSpaceSize() // returns the workspace size required by Find() + * miopenFindConvolution*Algorithm() // returns performance info about various algorithms + * miopenConvolution*() +* MIOpen does not support __Preferences__ for convolutions. +* MIOpen does not support Softmax modes. MIOpen implements the __SOFTMAX_MODE_CHANNEL__ flavor. +* MIOpen does not support __Transform-Tensor__, __Dropout__, __RNNs__, and __Divisive Normalization__. + +



+ +## Helpful MIOpen Environment Variables +`MIOPEN_ENABLE_LOGGING=1` – log all the MIOpen APIs called including the parameters passed to +those APIs. \ +`MIOPEN_DEBUG_AMD_ROCM_PRECOMPILED_BINARIES=0` – disable Winograd convolution +algorithm. \ +`MIOPEN_DEBUG_GCN_ASM_KERNELS=0` – disable hand-tuned asm. kernels for Direct convolution +algorithm. Fall-back to kernels written in high-level language. \ +`MIOPEN_DEBUG_CONV_FFT=0` – disable FFT convolution algorithm. \ +`MIOPEN_DEBUG_CONV_DIRECT=0` – disable Direct convolution algorithm. + +



+ + +## API differences + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+Operation + + + +cuDNN API + + +MIOpen API +
+ + +```c++ +cudnnStatus_t +cudnnCreate( + cudnnHandle_t *handle) +``` + + +```c++ +miopenStatus_t +miopenCreate( + miopenHandle_t *handle) +``` +
+ + +```c++ +cudnnStatus_t +cudnnDestroy( + cudnnHandle_t handle) +``` + + +```c++ +miopenStatus_t +miopenDestroy( + miopenHandle_t handle) +``` +
+Handle + + +```c++ +cudnnStatus_t +cudnnSetStream( + cudnnHandle_t handle, + cudaStream_t streamId) +``` + + +```c++ +miopenStatus_t +miopenSetStream( + miopenHandle_t handle, + miopenAcceleratorQueue_t streamID) +``` +
+ + +```c++ +cudnnStatus_t +cudnnGetStream( + cudnnHandle_t handle, + cudaStream_t *streamId) +``` + + +```c++ +miopenStatus_t +miopenGetStream( + miopenHandle_t handle, + miopenAcceleratorQueue_t *streamID) +``` +
+ + +```c++ +cudnnStatus_t +cudnnCreateTensorDescriptor( + cudnnTensorDescriptor_t *tensorDesc) +``` + + +```c++ +miopenStatus_t +miopenCreateTensorDescriptor( + miopenTensorDescriptor_t + *tensorDesc) +``` +
+ + +```c++ +cudnnStatus_t +cudnnSetTensor4dDescriptor( + cudnnTensorDescriptor_t tensorDesc, + cudnnTensorFormat_t format, + cudnnDataType_t dataType, + int n, + int c, + int h, + int w) +``` + + +```c++ +// Only `NCHW` format is supported +miopenStatus_t miopenSet4dTensorDescriptor( + miopenTensorDescriptor_t tensorDesc, + miopenDataType_t dataType, + int n, + int c, + int h, + int w) +``` +
+ + +```c++ +cudnnStatus_t +cudnnGetTensor4dDescriptor( + cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t *dataType, + int *n, + int *c, + int *h, + int *w, + int *nStride, + int *cStride, + int *hStride, + int *wStride) +``` + + +```c++ +miopenStatus_t +miopenGet4dTensorDescriptor( + miopenTensorDescriptor_t tensorDesc, + miopenDataType_t *dataType, + int *n, + int *c, + int *h, + int *w, + int *nStride, + int *cStride, + int *hStride, + int *wStride) +``` +
+Tensor + + +```c++ +cudnnStatus_t +cudnnDestroyTensorDescriptor( + cudnnTensorDescriptor_t tensorDesc) +``` + + +```c++ +miopenStatus_t +miopenDestroyTensorDescriptor( + miopenTensorDescriptor_t tensorDesc) +``` +
+ + +```c++ +cudnnStatus_t +cudnnAddTensor( + cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C) +``` + + +```c++ +//Set tensorOp to miopenOpTensorAdd +miopenStatus_t +miopenOpTensor( + miopenHandle_t handle, + miopenTensorOp_t tensorOp, + const void *alpha1, + constmiopenTensorDescriptor_t aDesc, + const void *A, + const void *alpha2, + const miopenTensorDescriptor_t bDesc, + const void *B, + const void *beta, + const miopenTensorDescriptor_t cDesc, + void *C) +// For Forward Bias use +// miopenConvolutionForwardBias. +``` +
+ + +```c++ +cudnnStatus_t +cudnnOpTensor( + cudnnHandle_t handle, + const cudnnOpTensorDescriptor_t opTensorDesc, + const void *alpha1, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *alpha2, + const cudnnTensorDescriptor_t bDesc, + const void *B, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C) +``` + + +```c++ +miopenStatus_t +miopenOpTensor( + miopenHandle_t handle, + miopenTensorOp_t tensorOp, + const void *alpha1, + const miopenTensorDescriptor_t aDesc, + const void *A, const void *alpha2, + const miopenTensorDescriptor_t bDesc, + const void *B, + const void *beta, + const miopenTensorDescriptor_t cDesc, + void *C) +``` +
+ + +```c++ +cudnnStatus_t +cudnnSetTensor( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t yDesc, + void *y, + const void *valuePtr) +``` + + +```c++ +miopenStatus_t +miopenSetTensor( + miopenHandle_t handle, + const miopenTensorDescriptor_t yDesc, + void *y, + const void *alpha) +``` +
+ + +```c++ +cudnnStatus_t +cudnnScaleTensor( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t yDesc, + void *y, + const void *alpha) +``` + + +```c++ +miopenStatus_t +miopenScaleTensor( + miopenHandle_t handle, + const miopenTensorDescriptor_t yDesc, + void *y, + const void *alpha) +``` +
+Filter + + +```c++ +cudnnStatus_t +cudnnCreateFilterDescriptor( + cudnnFilterDescriptor_t *filterDesc) +``` + + +```c++ +// All *FilterDescriptor* APIs are substituted by +// the respective TensorDescriptor APIs. +``` +
+ + +```c++ +cudnnStatus_t +cudnnCreateConvolutionDescriptor( + cudnnConvolutionDescriptor_t *convDesc) +``` + + +```c++ +miopenStatus_t +miopenCreateConvolutionDescriptor( + miopenConvolutionDescriptor_t *convDesc) +``` +
+ + +```c++ +cudnnStatus_t +cudnnSetConvolution2dDescriptor( + cudnnConvolutionDescriptor_t convDesc, + int pad_h, + int pad_w, + int u, + int v, + int upscalex, + int upscaley, + cudnnConvolutionMode_t mode) +``` + + +```c++ +miopenStatus_t +miopenInitConvolutionDescriptor( + miopenConvolutionDescriptor_t convDesc, + miopenConvolutionMode_t mode, + int pad_h, + int pad_w, + int u, + int v, + int upscalex, + int upscaley) +``` +
+ + +```c++ +cudnnStatus_t +cudnnGetConvolution2dDescriptor( + const cudnnConvolutionDescriptor_t convDesc, + int *pad_h, + int *pad_y, + int *u, + int *v, + int *upscalex, + int *upscaley, + cudnnConvolutionMode_t *mode) +``` + + +```c++ +miopenStatus_t +miopenGetConvolutionDescriptor( + miopenConvolutionDescriptor_t convDesc, + miopenConvolutionMode_t *mode, + int *pad_h, + int *pad_y, + int *u, + int *v, + int *upscalex, + int *upscaley) +``` +
+ + +```c++ +cudnnStatus_t +cudnnGetConvolution2dForwardOutputDim( + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, + int *n, + int *c, + int *h, + int *w) +``` + + +```c++ +miopenStatus_t +miopenGetConvolutionForwardOutputDim( + miopenConvolutionDescriptor_t convDesc, + const miopenTensorDescriptor_t inputTensorDesc, + const miopenTensorDescriptor_t filterDesc, + int *n, + int *c, + int *h, + int *w) +``` +
+ + +```c++ +cudnnStatus_t +cudnnDestroyConvolutionDescriptor( + cudnnConvolutionDescriptor_t convDesc) +``` + + +```c++ +miopenStatus_t +miopenDestroyConvolutionDescriptor( + miopenConvolutionDescriptor_t convDesc) +``` +
+Convolution + + +```c++ +cudnnStatus_t +cudnnFindConvolutionForwardAlgorithm( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults) + +``` +```c++ +cudnnStatus_t +cudnnFindConvolutionForwardAlgorithmEx( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + void *y, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes) + +``` +```c++ +cudnnStatus_t +cudnnGetConvolutionForwardAlgorithm( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + cudnnConvolutionFwdPreference_t preference, + size_t memoryLimitInBytes, + cudnnConvolutionFwdAlgo_t *algo) +``` + + +```c++ +// FindConvolution() is mandatory. +// Allocate workspace prior to running this API. +// A table with times and memory requirements +// for different algorithms is returned. +// Users can choose the top-most algorithm if +// they only care about the fastest algorithm. +miopenStatus_t +miopenFindConvolutionForwardAlgorithm( + miopenHandle_t handle, + const miopenTensorDescriptor_t xDesc, + const void *x, + const miopenTensorDescriptor_t wDesc, + const void *w, + const miopenConvolutionDescriptor_t convDesc, + const miopenTensorDescriptor_t yDesc, + void *y, + const int requestAlgoCount, + int *returnedAlgoCount, + miopenConvAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSize, + bool exhaustiveSearch) +``` +
+ + +```c++ +cudnnStatus_t +cudnnGetConvolutionForwardWorkspaceSize( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + cudnnConvolutionFwdAlgo_t algo, + size_t *sizeInBytes) +``` + + +```c++ +miopenStatus_t +miopenConvolutionForwardGetWorkSpaceSize( + miopenHandle_t handle, + const miopenTensorDescriptor_t wDesc, + const miopenTensorDescriptor_t xDesc, + const miopenConvolutionDescriptor_t convDesc, + const miopenTensorDescriptor_t yDesc, + size_t *workSpaceSize) +``` +
+ + +```c++ +cudnnStatus_t +cudnnConvolutionForward( + cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionFwdAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y) +``` + + +```c++ +miopenStatus_t +miopenConvolutionForward( + miopenHandle_t handle, + const void *alpha, + const miopenTensorDescriptor_t xDesc, + const void *x, + const miopenTensorDescriptor_t wDesc, + const void *w, + const miopenConvolutionDescriptor_t convDesc, + miopenConvFwdAlgorithm_t algo, + const void *beta, + const miopenTensorDescriptor_t yDesc, + void *y, + void *workSpace, + size_t workSpaceSize) +``` +
+ + +```c++ +cudnnStatus_t +cudnnConvolutionBackwardBias( + cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *beta, + const cudnnTensorDescriptor_t dbDesc, + void *db) +``` + + +```c++ +miopenStatus_t +miopenConvolutionBackwardBias( + miopenHandle_t handle, + const void *alpha, + const miopenTensorDescriptor_t dyDesc, + const void *dy, + const void *beta, + const miopenTensorDescriptor_t dbDesc, + void *db) +``` +
+ + +```c++ +cudnnStatus_t +cudnnFindConvolutionBackwardFilterAlgorithm( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) +``` +```c++ +cudnnStatus_t +cudnnFindConvolutionBackwardFilterAlgorithmEx( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t dyDesc, + const void *y, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + void *dw, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes) + +``` +```c++ +cudnnStatus_t +cudnnGetConvolutionBackwardFilterAlgorithm( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + cudnnConvolutionBwdFilterPreference_t preference, + size_t memoryLimitInBytes, + cudnnConvolutionBwdFilterAlgo_t *algo) +``` + + +```c++ +// FindConvolution() is mandatory. +// Allocate workspace prior to running this API. +// A table with times and memory requirements +// for different algorithms is returned. +// Users can choose the top-most algorithm if +// they only care about the fastest algorithm. +miopenStatus_t +miopenFindConvolutionBackwardWeightsAlgorithm( + miopenHandle_t handle, + const miopenTensorDescriptor_t dyDesc, + const void *dy, + const miopenTensorDescriptor_t xDesc, + const void *x, + const miopenConvolutionDescriptor_t convDesc, + const miopenTensorDescriptor_t dwDesc, + void *dw, + const int requestAlgoCount, + int *returnedAlgoCount, + miopenConvAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSize, + bool exhaustiveSearch) +``` +
+ + +```c++ +cudnnStatus_t +cudnnGetConvolutionBackwardFilterWorkspaceSize( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, + cudnnConvolutionBwdFilterAlgo_t algo, + size_t *sizeInBytes) +``` + + +```c++ +miopenStatus_t +miopenConvolutionBackwardWeightsGetWorkSpaceSize( + miopenHandle_t handle, + const miopenTensorDescriptor_t dyDesc, + const miopenTensorDescriptor_t xDesc, + const miopenConvolutionDescriptor_t convDesc, + const miopenTensorDescriptor_t dwDesc, + size_t *workSpaceSize) +``` +
+ + +```c++ +cudnnStatus_t +cudnnConvolutionBackwardFilter( + cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdFilterAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnFilterDescriptor_t dwDesc, + void *dw) +``` + + +```c++ +miopenStatus_t +miopenConvolutionBackwardWeights( + miopenHandle_t handle, + const void *alpha, + const miopenTensorDescriptor_t dyDesc, + const void *dy, + const miopenTensorDescriptor_t xDesc, + const void *x, + const miopenConvolutionDescriptor_t convDesc, + miopenConvBwdWeightsAlgorithm_t algo, + const void *beta, + const miopenTensorDescriptor_t dwDesc, + void *dw, + void *workSpace, + size_t workSpaceSize) +``` +
+ + +```c++ +cudnnStatus_t +cudnnGetConvolutionBackwardDataWorkspaceSize( + cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + cudnnConvolutionBwdDataAlgo_t algo, + size_t *sizeInBytes) +``` + + +```c++ +miopenStatus_t +miopenConvolutionBackwardDataGetWorkSpaceSize( + miopenHandle_t handle, + const miopenTensorDescriptor_t dyDesc, + const miopenTensorDescriptor_t wDesc, + const miopenConvolutionDescriptor_t convDesc, + const miopenTensorDescriptor_t dxDesc, + size_t *workSpaceSize) +``` +
+ + +```c++ +cudnnStatus_t +cudnnFindConvolutionBackwardDataAlgorithm( + cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults) + +``` +```c++ +cudnnStatus_t +cudnnFindConvolutionBackwardDataAlgorithmEx( + cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes) + +``` +```c++ +cudnnStatus_t +cudnnGetConvolutionBackwardDataAlgorithm( + cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + cudnnConvolutionBwdDataPreference_t preference, + size_t memoryLimitInBytes, + cudnnConvolutionBwdDataAlgo_t *algo) +``` + + +```c++ +// FindConvolution() is mandatory. +// Allocate workspace prior to running this API. +// A table with times and memory requirements +// for different algorithms is returned. +// Users can choose the top-most algorithm if +// they only care about the fastest algorithm. +miopenStatus_t +miopenFindConvolutionBackwardDataAlgorithm( + miopenHandle_t handle, + const miopenTensorDescriptor_t dyDesc, + const void *dy, + const miopenTensorDescriptor_t wDesc, + const void *w, + const miopenConvolutionDescriptor_t convDesc, + const miopenTensorDescriptor_t dxDesc, + const void *dx, + const int requestAlgoCount, + int *returnedAlgoCount, + miopenConvAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSize, + bool exhaustiveSearch) +``` +
+ + +```c++ +cudnnStatus_t +cudnnConvolutionBackwardData( + cudnnHandle_t handle, + const void *alpha, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdDataAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx) +``` + + +```c++ + miopenStatus_t + miopenConvolutionBackwardData( + miopenHandle_t handle, + const void *alpha, + const miopenTensorDescriptor_t dyDesc, + const void *dy, + const miopenTensorDescriptor_t wDesc, + const void *w, + const miopenConvolutionDescriptor_t convDesc, + miopenConvBwdDataAlgorithm_t algo, + const void *beta, + const miopenTensorDescriptor_t dxDesc, + void *dx, + void *workSpace, + size_t workSpaceSize) +``` +
+Softmax + + +```c++ +cudnnStatus_t +cudnnSoftmaxForward( + cudnnHandle_t handle, + cudnnSoftmaxAlgorithm_t algo, + cudnnSoftmaxMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y) +``` + + +```c++ +miopenStatus_t +miopenSoftmaxForward( + miopenHandle_t handle, + const void *alpha, + const miopenTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const miopenTensorDescriptor_t yDesc, + void *y) +``` +
+ + +```c++ +cudnnStatus_t +cudnnSoftmaxBackward( + cudnnHandle_t handle, + cudnnSoftmaxAlgorithm_t algo, + cudnnSoftmaxMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx) +``` + + +```c++ +miopenStatus_t +miopenSoftmaxBackward( + miopenHandle_t handle, + const void *alpha, + const miopenTensorDescriptor_t yDesc, + const void *y, + const miopenTensorDescriptor_t dyDesc, + const void *dy, + const void *beta, + const miopenTensorDescriptor_t dxDesc, + void *dx) +``` +
+ + +```c++ +cudnnStatus_t +cudnnCreatePoolingDescriptor( + cudnnPoolingDescriptor_t *poolingDesc) + +``` + + +```c++ +miopenStatus_t +miopenCreatePoolingDescriptor( + miopenPoolingDescriptor_t *poolDesc) +``` +
+ + +```c++ +cudnnStatus_t +cudnnSetPooling2dDescriptor( + cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t mode, + cudnnNanPropagation_t maxpoolingNanOpt, + int windowHeight, + int windowWidth, + int verticalPadding, + int horizontalPadding, + int verticalStride, + int horizontalStride) +``` + + +```c++ +miopenStatus_t +miopenSet2dPoolingDescriptor( + miopenPoolingDescriptor_t poolDesc, + miopenPoolingMode_t mode, + int windowHeight, + int windowWidth, + int pad_h, + int pad_w, + int u, + int v) +``` +
+ + +```c++ +cudnnStatus_t +cudnnGetPooling2dDescriptor( + const cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, + int *windowHeight, + int *windowWidth, + int *verticalPadding, + int *horizontalPadding, + int *verticalStride, + int *horizontalStride) +``` + + +```c++ +miopenStatus_t +miopenGet2dPoolingDescriptor( + const miopenPoolingDescriptor_t poolDesc, + miopenPoolingMode_t *mode, + int *windowHeight, + int *windowWidth, + int *pad_h, + int *pad_w, + int *u, + int *v) +``` +
+Pooling + + +```c++ +cudnnStatus_t +cudnnGetPooling2dForwardOutputDim( + const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int *n, + int *c, + int *h, + int *w) +``` + + +```c++ +miopenStatus_t +miopenGetPoolingForwardOutputDim( + const miopenPoolingDescriptor_t poolDesc, + const miopenTensorDescriptor_t tensorDesc, + int *n, + int *c, + int *h, + int *w) +``` +
+ + +```c++ +cudnnStatus_t +cudnnDestroyPoolingDescriptor( + cudnnPoolingDescriptor_t poolingDesc) +``` + + +```c++ +miopenStatus_t +miopenDestroyPoolingDescriptor( + miopenPoolingDescriptor_t poolDesc) +``` +
+ + +```c++ +cudnnStatus_t +cudnnPoolingForward( + cudnnHandle_t handle, + const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y) +``` + + +```c++ +miopenStatus_t +miopenPoolingForward( + miopenHandle_t handle, + const miopenPoolingDescriptor_t poolDesc, + const void *alpha, + const miopenTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const miopenTensorDescriptor_t yDesc, + void *y, + bool do_backward, + void *workSpace, + size_t workSpaceSize) +``` +
+ + + + +```c++ +miopenStatus_t +miopenPoolingGetWorkSpaceSize( + const miopenTensorDescriptor_t yDesc, + size_t *workSpaceSize) +``` +
+ + +```c++ +cudnnStatus_t +cudnnPoolingBackward( + cudnnHandle_t handle, + const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx) +``` + + +```c++ +miopenStatus_t +miopenPoolingBackward( + miopenHandle_t handle, + const miopenPoolingDescriptor_t poolDesc, + const void *alpha, + const miopenTensorDescriptor_t yDesc, + const void *y, + const miopenTensorDescriptor_t dyDesc, + const void *dy, + const miopenTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const miopenTensorDescriptor_t dxDesc, + void *dx, + const void *workspace) +``` +
+ + +```c++ +cudnnStatus_t +cudnnCreateActivationDescriptor( + cudnnActivationDescriptor_t *activationDesc) +``` + + +```c++ +miopenStatus_t +miopenCreateActivationDescriptor( + miopenActivationDescriptor_t *activDesc) +``` +
+ + +```c++ +cudnnStatus_t +cudnnSetActivationDescriptor( + cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t mode, + cudnnNanPropagation_t reluNanOpt, + double reluCeiling) +``` + + +```c++ +miopenStatus_t +miopenSetActivationDescriptor( + const miopenActivationDescriptor_t activDesc, + miopenActivationMode_t mode, + double activAlpha, + double activBeta, + double activPower) +``` +
+Activation + + +```c++ +cudnnStatus_t +cudnnGetActivationDescriptor( + const cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t *mode, + cudnnNanPropagation_t *reluNanOpt, + double *reluCeiling) +``` + + +```c++ +miopenStatus_t +miopenGetActivationDescriptor( + const miopenActivationDescriptor_t activDesc, + miopenActivationMode_t *mode, + double *activAlpha, + double *activBeta, + double *activPower) +``` +
+ + +```c++ +cudnnStatus_t +cudnnDestroyActivationDescriptor( + cudnnActivationDescriptor_t activationDesc) +``` + + +```c++ +miopenStatus_t +miopenDestroyActivationDescriptor( + miopenActivationDescriptor_t activDesc) +``` +
+ + +```c++ +cudnnStatus_t +cudnnActivationForward( + cudnnHandle_t handle, + cudnnActivationDescriptor_t activationDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y) +``` + + +```c++ +miopenStatus_t +miopenActivationForward( + miopenHandle_t handle, + const miopenActivationDescriptor_t activDesc, + const void *alpha, + const miopenTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const miopenTensorDescriptor_t yDesc, + void *y) +``` +
+ + +```c++ +cudnnStatus_t +cudnnActivationBackward( + cudnnHandle_t handle, + cudnnActivationDescriptor_t activationDesc, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx) +``` + + +```c++ +miopenStatus_t +miopenActivationBackward( + miopenHandle_t handle, + const miopenActivationDescriptor_t activDesc, + const void *alpha, + const miopenTensorDescriptor_t yDesc, + const void *y, + const miopenTensorDescriptor_t dyDesc, + const void *dy, + const miopenTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const miopenTensorDescriptor_t dxDesc, + void *dx) +``` +
+ + +```c++ +cudnnStatus_t +cudnnCreateLRNDescriptor( + cudnnLRNDescriptor_t *normDesc) +``` + + +```c++ +miopenStatus_t +miopenCreateLRNDescriptor( + miopenLRNDescriptor_t + *lrnDesc) +``` +
+ + +```c++ +cudnnStatus_t +cudnnSetLRNDescriptor( + cudnnLRNDescriptor_t normDesc, + unsigned lrnN, + double lrnAlpha, + double lrnBeta, + double lrnK) +``` + + +```c++ +miopenStatus_t +miopenSetLRNDescriptor( + const miopenLRNDescriptor_t lrnDesc, + miopenLRNMode_t mode, + unsigned lrnN, + double lrnAlpha, + double lrnBeta, + double lrnK) +``` +
+ + +```c++ +cudnnStatus_t +cudnnGetLRNDescriptor( + cudnnLRNDescriptor_t normDesc, + unsigned* lrnN, + double* lrnAlpha, + double* lrnBeta, + double* lrnK) +``` + + +```c++ +miopenStatus_t +miopenGetLRNDescriptor( + const miopenLRNDescriptor_t lrnDesc, + miopenLRNMode_t *mode, + unsigned *lrnN, + double *lrnAlpha, + double *lrnBeta, + double *lrnK) + +``` +
+ LRN + + +```c++ +cudnnStatus_t +cudnnDestroyLRNDescriptor( + cudnnLRNDescriptor_t lrnDesc) +``` + + +```c++ +miopenStatus_t +miopenDestroyLRNDescriptor( + miopenLRNDescriptor_t lrnDesc) +``` +
+ + +```c++ +cudnnStatus_t +cudnnLRNCrossChannelForward( + cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnLRNMode_t lrnMode, + const void* alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y) +``` + + +```c++ +miopenStatus_t +miopenLRNForward( + miopenHandle_t handle, + const miopenLRNDescriptor_t lrnDesc, + const void *alpha, + const miopenTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const miopenTensorDescriptor_t yDesc, + void *y, + bool do_backward, + void *workspace) +``` +
+ + +```c++ +cudnnStatus_t +cudnnLRNCrossChannelBackward( + cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnLRNMode_t lrnMode, + const void* alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx) +``` + + +```c++ +miopenStatus_t +miopenLRNBackward( + miopenHandle_t handle, + const miopenLRNDescriptor_t lrnDesc, + const void *alpha, + const miopenTensorDescriptor_t yDesc, + const void *y, + const miopenTensorDescriptor_t dyDesc, + const void *dy, + const miopenTensorDescriptor_t xDesc, + const void *x, const void *beta, + const miopenTensorDescriptor_t dxDesc, + void *dx, + const void *workspace) +``` +
+ + + + + +```c++ +miopenStatus_t +miopenLRNGetWorkSpaceSize( + const miopenTensorDescriptor_t yDesc, + size_t *workSpaceSize) +``` +
+ + +```c++ +cudnnStatus_t +cudnnDeriveBNTensorDescriptor( + cudnnTensorDescriptor_t derivedBnDesc, + const cudnnTensorDescriptor_t xDesc, + cudnnBatchNormMode_t mode) +``` + + +```c++ +miopenStatus_t +miopenDeriveBNTensorDescriptor( + miopenTensorDescriptor_t derivedBnDesc, + const miopenTensorDescriptor_t xDesc, + miopenBatchNormMode_t bn_mode) +``` +
+ + +```c++ +cudnnStatus_t +cudnnBatchNormalizationForwardTraining( + cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + void *alpha, + void *beta, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t yDesc, + void *y, + const cudnnTensorDescriptor_t + bnScaleBiasMeanVarDesc, + void *bnScale, + void *bnBias, + double exponentialAverageFactor, + void *resultRunningMean, + void *resultRunningVariance, + double epsilon, + void *resultSaveMean, + void *resultSaveInvVariance) +``` + + +```c++ +miopenStatus_t +miopenBatchNormalizationForwardTraining( + miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void *alpha, + void *beta, + const miopenTensorDescriptor_t xDesc, + const void *x, + const miopenTensorDescriptor_t yDesc, + void *y, + const miopenTensorDescriptor_t + bnScaleBiasMeanVarDesc, + void *bnScale, + void *bnBias, + double expAvgFactor, + void *resultRunningMean, + void *resultRunningVariance, + double epsilon, + void *resultSaveMean, + void *resultSaveInvVariance) +``` +
+ Batch Normalization + + +```c++ +cudnnStatus_t +cudnnnBatchNormalizationForwardInference( + cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + void *alpha, + void *beta, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t yDesc, + void *y, + const cudnnTensorDescriptor_t + bnScaleBiasMeanVarDesc, + const void *bnScale, + void *bnBias, + const void *estimatedMean, + const void *estimatedVariance, + double epsilon) +``` + + +```c++ +miopenStatus_t +miopenBatchNormalizationForwardInference( + miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void *alpha, + void *beta, + const miopenTensorDescriptor_t xDesc, + const void *x, + const miopenTensorDescriptor_t yDesc, + void *y, + const miopenTensorDescriptor_t + bnScaleBiasMeanVarDesc, + void *bnScale, + void *bnBias, + void *estimatedMean, + void *estimatedVariance, + double epsilon) +``` +
+ + +```c++ +cudnnStatus_t +cudnnBatchNormalizationBackward( + cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + const void *alphaDataDiff, + const void *betaDataDiff, + const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + const cudnnTensorDescriptor_t + bnScaleBiasDiffDesc, + const void *bnScale, + void *resultBnScaleDiff, + void *resultBnBiasDiff, + double epsilon, + const void *savedMean, + const void *savedInvVariance) +``` + + +```c++ +miopenStatus_t +miopenBatchNormalizationBackward( + miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + const void *alphaDataDiff, + const void *betaDataDiff, + const void *alphaParamDiff, + const void *betaParamDiff, + const miopenTensorDescriptor_t xDesc, + const void *x, + const miopenTensorDescriptor_t dyDesc, + const void *dy, + const miopenTensorDescriptor_t dxDesc, + void *dx, + const miopenTensorDescriptor_t + bnScaleBiasDiffDesc, + const void *bnScale, + void *resultBnScaleDiff, + void *resultBnBiasDiff, + double epsilon, + const void *savedMean, + const void *savedInvVariance) +``` +
+ +

+
From 0ebea5e3301e1b607a1853cfd2f000daff67eb61 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:31:02 -0800 Subject: [PATCH 29/67] Create index.rst --- docs/tutorials/index.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/tutorials/index.rst diff --git a/docs/tutorials/index.rst b/docs/tutorials/index.rst new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/docs/tutorials/index.rst @@ -0,0 +1 @@ + From ca4c03b952630559b73ab6728680b38103c00f33 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:33:25 -0800 Subject: [PATCH 30/67] Create install.md --- docs/tutorials/quick-start/install.md | 77 +++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 docs/tutorials/quick-start/install.md diff --git a/docs/tutorials/quick-start/install.md b/docs/tutorials/quick-start/install.md new file mode 100644 index 0000000000..0932cd2563 --- /dev/null +++ b/docs/tutorials/quick-start/install.md @@ -0,0 +1,77 @@ +## Prerequisites + +* More information about ROCm stack via [ROCm Information Portal](https://docs.amd.com/). +* A ROCm enabled platform, more info [here](https://rocm.github.io/install.html). +* Base software stack, which includes: + * HIP - + * HIP and HCC libraries and header files. + * OpenCL - OpenCL libraries and header files. +* [MIOpenGEMM](https://github.com/ROCmSoftwarePlatform/MIOpenGEMM) - enable various functionalities including transposed and dilated convolutions. + * This is optional on the HIP backend, and required on the OpenCL backend. + * Users can enable this library using the cmake configuration flag `-DMIOPEN_USE_MIOPENGEMM=On`, which is enabled by default when OpenCL backend is chosen. +* [ROCm cmake](https://github.com/RadeonOpenCompute/rocm-cmake) - provide cmake modules for common build tasks needed for the ROCM software stack. +* [Half](http://half.sourceforge.net/) - IEEE 754-based half-precision floating point library +* [Boost](http://www.boost.org/) + * MIOpen uses `boost-system` and `boost-filesystem` packages to enable persistent [kernel cache](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/cache.html) + * Version 1.79 is recommended, older version may need patches to work on newer systems, e.g. boost1{69,70,72} w/glibc-2.34 +* [SQLite3](https://sqlite.org/index.html) - reading and writing performance database +* [MIOpenTENSILE](https://github.com/ROCmSoftwarePlatform/MIOpenTensile) - users can enable this library using the cmake configuration flag`-DMIOPEN_USE_MIOPENTENSILE=On`. (deprecated after ROCm 5.1.1) +* [rocBLAS](https://github.com/ROCmSoftwarePlatform/rocBLAS) - AMD library for Basic Linear Algebra Subprograms (BLAS) on the ROCm platform. + * Minimum version branch for pre-ROCm 3.5 [master-rocm-2.10](https://github.com/ROCmSoftwarePlatform/rocBLAS/tree/master-rocm-2.10) + * Minimum version branch for post-ROCm 3.5 [master-rocm-3.5](https://github.com/ROCmSoftwarePlatform/rocBLAS/releases/tag/rocm-3.5.0) +* [MLIR](https://github.com/ROCmSoftwarePlatform/llvm-project-mlir) - (Multi-Level Intermediate Representation) with its MIOpen dialect to support and complement kernel development. +* [Composable Kernel](https://github.com/ROCmSoftwarePlatform/composable_kernel) - C++ templated device library for GEMM-like and reduction-like operators. + +## Installing MIOpen with pre-built packages + +MIOpen can be installed on Ubuntu using `apt-get`. + +For OpenCL backend: `apt-get install miopen-opencl` + +For HIP backend: `apt-get install miopen-hip` + +Currently both the backends cannot be installed on the same system simultaneously. If a different backend other than what currently exists on the system is desired, please uninstall the existing backend completely and then install the new backend. + +## Installing MIOpen kernels package + +MIOpen provides an optional pre-compiled kernels package to reduce the startup latency. These precompiled kernels comprise a select set of popular input configurations and will expand in future release to contain additional coverage. + +Note that all compiled kernels are locally cached in the folder `$HOME/.cache/miopen/`, so precompiled kernels reduce the startup latency only for the first execution of a neural network. Precompiled kernels do not reduce startup time on subsequent runs. + +To install the kernels package for your GPU architecture, use the following command: + +``` +apt-get install miopenkernels-- +``` + +Where `` is the GPU architecture ( for example, `gfx900`, `gfx906`, `gfx1030` ) and `` is the number of CUs available in the GPU (for example 56 or 64 etc). + +Not installing these packages would not impact the functioning of MIOpen, since MIOpen will compile these kernels on the target machine once the kernel is run. However, the compilation step may significantly increase the startup time for different operations. + +The script `utils/install_precompiled_kernels.sh` provided as part of MIOpen automates the above process, it queries the user machine for the GPU architecture and then installs the appropriate package. It may be invoked as: + +``` +./utils/install_precompiled_kernels.sh +``` + +The above script depends on the __rocminfo__ package to query the GPU architecture. + +More info can be found [here](https://github.com/ROCmSoftwarePlatform/MIOpen/blob/develop/docs/cache.md#installing-pre-compiled-kernels). + +## Installing the dependencies + +The dependencies can be installed with the `install_deps.cmake`, script: `cmake -P install_deps.cmake` + +This will install by default to `/usr/local` but it can be installed in another location with `--prefix` argument: +``` +cmake -P install_deps.cmake --prefix +``` +An example cmake step can be: +``` +cmake -P install_deps.cmake --minimum --prefix /root/MIOpen/install_dir +``` +This prefix can used to specify the dependency path during the configuration phase using the `CMAKE_PREFIX_PATH`. + +* MIOpen's HIP backend uses [rocBLAS](https://github.com/ROCmSoftwarePlatform/rocBLAS) by default. Users can install rocBLAS minimum release by using `apt-get install rocblas`. To disable using rocBLAS set the configuration flag `-DMIOPEN_USE_ROCBLAS=Off`. rocBLAS is *not* available for the OpenCL backend. + +* MIOpen's OpenCL backend uses [MIOpenGEMM](https://github.com/ROCmSoftwarePlatform/MIOpenGEMM) by default. Users can install MIOpenGEMM minimum release by using `apt-get install miopengemm`. From 70ee48ce25dc49af4a75acb41c7c3eb4acc21cc3 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:33:57 -0800 Subject: [PATCH 31/67] Create embed.md --- docs/tutorials/quick-start/embed.md | 99 +++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 docs/tutorials/quick-start/embed.md diff --git a/docs/tutorials/quick-start/embed.md b/docs/tutorials/quick-start/embed.md new file mode 100644 index 0000000000..5065d8a56e --- /dev/null +++ b/docs/tutorials/quick-start/embed.md @@ -0,0 +1,99 @@ +Building MIOpen for Embedded Systems +==================================== + + + +### Install dependencies +Install minimum dependencies (default location /usr/local): +``` +cmake -P install_deps.cmake --minimum --prefix /some/local/dir +``` + +Create build directory: +``` +mkdir build; cd build; +``` + +### Configuring for an embedded build +Minimal static build configuration line without embedded precompiled kernels package, or Find-Db: +``` +CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_BACKEND=HIP -DMIOPEN_EMBED_BUILD=On -DCMAKE_PREFIX_PATH="/some/local/dir" .. +``` + +To enable HIP kernels in MIOpen while using embedded builds add: `-DMIOPEN_USE_HIP_KERNELS=On` to the configure line. +For example: +``` +CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_BACKEND=HIP -DMIOPEN_USE_HIP_KERNELS=On -DMIOPEN_EMBED_BUILD=On -DCMAKE_PREFIX_PATH="/some/local/dir" .. +``` + + +### Embedding Find-Db and Performance database: +The Find-db provides a database of known convolution inputs. This allows user to have the best tuned kernels for their network. Embedding find-db requires a semi-colon separated list of architecture CU pairs to embed on-disk DBs in the binary; e.g., gfx906_60;gfx900_56. + +Example: +``` +CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_EMBED_BUILD=On -DMIOPEN_EMBED_DB=gfx900_56 .. +``` + +This will configure the build directory for embedding not just the find-db, but also the performance database. + +### Embedding the precompiled kernels package: +To prevent the loss of performance due to compile time overhead, a build of MIOpen can take advantage of embedding the precompiled kernels package. The precompiled kernels package contains convolution kernels of known inputs and allows the user to avoid compiling kernels during runtime. + +### Embedding precompiled package + +#### Using a package install +To install the precompiled kernels package use the command: +``` +apt-get install miopenkernels-- +``` +Where `` is the GPU architecture (for example, gfx900, gfx906) and `` is the number of CUs available in the GPU (for example 56 or 64 etc). + +Not installing the precompiled kernel package would not impact the functioning of MIOpen, since MIOpen will compile these kernels on the target machine once the kernel is run, however, the compilation step may significantly increase the startup time for different operations. + +The script `utils/install_precompiled_kernels.sh` provided as part of MIOpen automates the above process, it queries the user machine for the GPU architecture and then installs the appropriate package. It may be invoked as: +``` +./utils/install_precompiled_kernels.sh +``` + +To embed the precompiled kernels package, configure cmake using the `MIOPEN_BINCACHE_PATH` +Example: +``` +CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_BINCACHE_PATH=/path/to/package/install -DMIOPEN_EMBED_BUILD=On .. +``` + +#### Using the URL to a kernels binary +Alternatively, the flag `MIOPEN_BINCACHE_PATH` can be used with a URL that contains the binary. +Example: +``` +CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_BINCACHE_PATH=/URL/to/binary -DMIOPEN_EMBED_BUILD=On .. +``` + +Precompiled kernels packages are installed in `/opt/rocm/miopen/share/miopen/db`. +An example with the architecture gfx900 with 56 compute units: +``` +CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_BINCACHE_PATH=/opt/rocm/miopen/share/miopen/db/gfx900_56.kdb -DMIOPEN_EMBED_BUILD=On .. +``` + + +As of ROCm 3.8 / MIOpen 2.7 precompiled kernels binaries are located at [repo.radeon.com](http://repo.radeon.com/rocm/miopen-kernel/) +For example for the architecture gfx906 with 64 compute units: +``` +CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_BINCACHE_PATH=http://repo.radeon.com/rocm/miopen-kernel/rel-3.8/gfx906_60.kdb -DMIOPEN_EMBED_BUILD=On .. +``` + +### Full configuration line: +Putting it all together, building MIOpen statically, and embedding the performance database, find-db, and the precompiled kernels binary: +``` +CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_BINCACHE_PATH=/path/to/package/install -DMIOPEN_EMBED_BUILD=On -DMIOPEN_EMBED_DB=gfx900_56 .. +``` + +After configuration is complete, run: +``` +make -j +``` + + + + + From 759eb1dc29833034066b86ff85774ea42aa32f12 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:35:06 -0800 Subject: [PATCH 32/67] Create driver.md --- docs/tutorials/quick-start/driver.md | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 docs/tutorials/quick-start/driver.md diff --git a/docs/tutorials/quick-start/driver.md b/docs/tutorials/quick-start/driver.md new file mode 100644 index 0000000000..df090b4366 --- /dev/null +++ b/docs/tutorials/quick-start/driver.md @@ -0,0 +1,9 @@ +## Building the driver + +MIOpen provides an [application-driver](https://github.com/ROCmSoftwarePlatform/MIOpen/tree/master/driver) which can be used to execute any one particular layer in isolation and measure performance and verification of the library. + +The driver can be built using the `MIOpenDriver` target: + +` cmake --build . --config Release --target MIOpenDriver ` **OR** ` make MIOpenDriver ` + +Documentation on how to run the driver is [here](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/driver.html). From a2c18ca584e50213a0decc82e2e3f80abb4037fb Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:37:31 -0800 Subject: [PATCH 33/67] Update _toc.yml.in --- docs/sphinx/_toc.yml.in | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 82713fdde8..716e947717 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -19,20 +19,20 @@ root: index subtrees: - caption: What is MIOpen? entries: - - file: find_and_immediate.rst - - file: finddb.rst - - file: cache.rst - - file: perfdatabase.rst - - file: Getting_Started_FusionAPI.rst - - file: DebugAndLogging.rst - - file: MI200AlternateImplementation.rst - - file: MIOpen_Porting_Guide.rst + - file: tutorials/find_and_immediate.rst + - file: tutorials/finddb.rst + - file: tutorials/cache.rst + - file: tutorials/perfdatabase.rst + - file: tutorials/Getting_Started_FusionAPI.rst + - file: tutorials/DebugAndLogging.rst + - file: tutorials/MI200AlternateImplementation.rst + - file: tutorials/MIOpen_Porting_Guide.rst - caption: Quick-start entries: - - file: reference/install.rst - - file: reference/embed.rst - - file: reference/driver.rst + - file: tutorials/install.rst + - file: tutorials/embed.rst + - file: tutorials/driver.rst - caption: API reference entries: - file: reference/apireference.rst From f5f9f2527bbc4e376d018df05f16a0d2da839807 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:38:39 -0800 Subject: [PATCH 34/67] Delete docs/DebugAndLogging.md --- docs/DebugAndLogging.md | 249 ---------------------------------------- 1 file changed, 249 deletions(-) delete mode 100644 docs/DebugAndLogging.md diff --git a/docs/DebugAndLogging.md b/docs/DebugAndLogging.md deleted file mode 100644 index 3ae5db123a..0000000000 --- a/docs/DebugAndLogging.md +++ /dev/null @@ -1,249 +0,0 @@ -Debugging and Logging -===================== - -## Logging - -All logging messages output to standard error stream (`stderr`). The following environment variables can be used to control logging: - -* `MIOPEN_ENABLE_LOGGING` - Enables printing the basic layer by layer MIOpen API call information with actual parameters (configurations). Important for debugging. Disabled by default. - -* `MIOPEN_ENABLE_LOGGING_CMD` - A user can use this environmental variable to output the associated `MIOpenDriver` command line(s) onto console. Disabled by default. - -> **_NOTE 1:_ These two and other two-state ("boolean") environment variables can be set to the following values:** -> ``` -> 1, on, yes, true, enable, enabled - to enable feature -> 0, off, no, false, disable, disabled - to disable feature -> ``` - -* `MIOPEN_LOG_LEVEL` - In addition to API call information and driver commands, MIOpen prints various information related to the progress of its internal operations. This information can be useful both for debugging and for understanding the principles of operation of the library. The `MIOPEN_LOG_LEVEL` environment variable controls the verbosity of these messages. Allowed values are: - * 0 - Default. Works as level 4 for Release builds, level 5 for Debug builds. - * 1 - Quiet. No logging messages. - * 2 - Fatal errors only (not used yet). - * 3 - Errors and fatals. - * 4 - All errors and warnings. - * 5 - Info. All the above plus information for debugging purposes. - * 6 - Detailed info. All the above plus more detailed information for debugging. - * 7 - Trace: the most detailed debugging info plus all above. - -> **_NOTE 2:_ When asking for technical support, please include the console log obtained with the following settings:** -> ``` -> export MIOPEN_ENABLE_LOGGING=1 -> export MIOPEN_ENABLE_LOGGING_CMD=1 -> export MIOPEN_LOG_LEVEL=6 -> ``` - -* `MIOPEN_ENABLE_LOGGING_MPMT` - When enabled, each log line is prefixed with information which allows the user to identify records printed from different processes and/or threads. Useful for debugging multi-process/multi-threaded apps. - -* `MIOPEN_ENABLE_LOGGING_ELAPSED_TIME` - Adds a timestamp to each log line. Indicates the time elapsed since the previous log message, in milliseconds. - -## Layer Filtering - -The following list of environment variables allow for enabling/disabling various kinds of kernels and algorithms. This can be helpful for both debugging MIOpen and integration with frameworks. - -> **_NOTE 3:_ These variables can be set to the following values:** -> ``` -> 1, yes, true, enable, enabled - to enable kernels/algorithm -> 0, no, false, disable, disabled - to disable kernels/algorithm -> ``` - -If a variable is not set, then MIOpen behaves as if it is set to `enabled`, unless otherwise specified. So all kinds of kernels/algorithms are enabled by default and the below variables can be used for disabling them. - -> **_WARNING:_** **When the library is used with layer filtering, the results of `Find()` calls become narrower than during normal operation. This means that relevant find-db entries would not include some solutions that normally should be there.** **_Therefore the subsequent Immediate mode `Get()` calls may return incomplete information or even run into Fallback path._** - -In order to rehabilitate the Immediate mode, the user can: -- Re-enable all solvers and re-run the same `Find()` calls that have been run before, -- Or, completely remove the User find-db. - -### Filtering by algorithm - -These variables control the sets (families) of convolution Solutions. For example, Direct algorithm is implemented in several Solutions that use OpenCL, GCN assembly etc. The corresponding variable can disable them all. -* `MIOPEN_DEBUG_CONV_FFT` - FFT convolution algorithm. -* `MIOPEN_DEBUG_CONV_DIRECT` - Direct convolution algorithm. -* `MIOPEN_DEBUG_CONV_GEMM` - GEMM convolution algorithm. -* `MIOPEN_DEBUG_CONV_WINOGRAD` - Winograd convolution algorithm. -* `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM` - Implicit GEMM convolution algorithm. - -### Filtering by build method - -* `MIOPEN_DEBUG_GCN_ASM_KERNELS` - Kernels written in assembly language. Currently these used in many convolutions (some Direct solvers, Winograd kernels, fused convolutions), batch normalization. -* `MIOPEN_DEBUG_HIP_KERNELS` - Convoluton kernels written in HIP (today, all these implement ImplicitGemm algorithm). -* `MIOPEN_DEBUG_OPENCL_CONVOLUTIONS` - Convolution kernels written in OpenCL (note that _only_ convolutions affected). -* `MIOPEN_DEBUG_AMD_ROCM_PRECOMPILED_BINARIES` - Binary kernels. Right now the library does not use binaries. - -### Filtering out all Solutions except one - -* `MIOPEN_DEBUG_FIND_ONLY_SOLVER=solution_id`, where `solution_id` should be either numeric or string identifier of some Solution. Directly affects only `Find()` calls _(however there is some indirect connection to Immediate mode; please see the "Warning" above.)_ - - If `solution_id` denotes some applicable Solution, then only that Solution will be found (plus GEMM and FFT, if these applicable, see _Note 4_). - - Else, if `solution_id` is valid but not applicable, then `Find()` would fail with all algorithms (again, except GEMM and FFT, see _Note 4_) - - Otherwise the `solution_id` is invalid (i.e. it doesn't match any existing Solution), and the `Find()` call would fail. - -> **_NOTE 4:_** This env. variable does not affect the "gemm" and "fft" solutions. For now, GEMM and FFT can be disabled only at algorithm level (see above). - -### Filtering the Solutions on individual basis - -Some of the Solutions have individual controls available. These affect both Find and Immediate modes. _Note the "Warning" above._ - -Direct Solutions: -* `MIOPEN_DEBUG_CONV_DIRECT_ASM_3X3U` - `ConvAsm3x3U`. -* `MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U` - `ConvAsm1x1U`. -* `MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1UV2` - `ConvAsm1x1UV2`. -* `MIOPEN_DEBUG_CONV_DIRECT_ASM_5X10U2V2` - `ConvAsm5x10u2v2f1`, `ConvAsm5x10u2v2b1`. -* `MIOPEN_DEBUG_CONV_DIRECT_ASM_7X7C3H224W224` - `ConvAsm7x7c3h224w224k64u2v2p3q3f1`. -* `MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW3X3` - `ConvAsmBwdWrW3x3`. -* `MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW1X1` - `ConvAsmBwdWrW1x1`. -* `MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD11X11` - `ConvOclDirectFwd11x11`. -* `MIOPEN_DEBUG_CONV_DIRECT_OCL_FWDGEN` - `ConvOclDirectFwdGen`. -* `MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD` - `ConvOclDirectFwd`. -* `MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD1X1` - `ConvOclDirectFwd`. -* `MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW2` - `ConvOclBwdWrW2` (where n = `{1,2,4,8,16}`), and `ConvOclBwdWrW2NonTunable`. -* `MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW53` - `ConvOclBwdWrW53`. -* `MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW1X1` - `ConvOclBwdWrW1x1` - -Winograd Solutions: -* `MIOPEN_DEBUG_AMD_WINOGRAD_3X3` - `ConvBinWinograd3x3U`, FP32 Winograd Fwd/Bwd, filter size fixed to 3x3. -* `MIOPEN_DEBUG_AMD_WINOGRAD_RXS` - `ConvBinWinogradRxS`, FP32/FP16 F(3,3) Fwd/Bwd and FP32 F(3,2) WrW Winograd. Subsets: - * `MIOPEN_DEBUG_AMD_WINOGRAD_RXS_WRW` - FP32 F(3,2) WrW convolutions only. - * `MIOPEN_DEBUG_AMD_WINOGRAD_RXS_FWD_BWD` - FP32/FP16 F(3,3) Fwd/Bwd. -* `MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F3X2` - `ConvBinWinogradRxSf3x2`, FP32/FP16 Fwd/Bwd F(3,2) Winograd. -* `MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F2X3` - `ConvBinWinogradRxSf2x3`, FP32/FP16 Fwd/Bwd F(2,3) Winograd, serves group convolutions only. -* `MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F2X3_G1` - `ConvBinWinogradRxSf2x3g1`, FP32/FP16 Fwd/Bwd F(2,3) Winograd, for non-group convolutions. - -* Multi-pass Winograd: - * `MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X2` - `ConvWinograd3x3MultipassWrW<3-2>`, WrW F(3,2), stride 2 only. - * `MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X3` - `ConvWinograd3x3MultipassWrW<3-3>`, WrW F(3,3), stride 2 only. - * `MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X4` - `ConvWinograd3x3MultipassWrW<3-4>`, WrW F(3,4). - * `MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X5` - `ConvWinograd3x3MultipassWrW<3-5>`, WrW F(3,5). - * `MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F3X6` - `ConvWinograd3x3MultipassWrW<3-6>`, WrW F(3,6). - * `MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F5X3` - `ConvWinograd3x3MultipassWrW<5-3>`, WrW F(5,3). - * `MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F5X4` - `ConvWinograd3x3MultipassWrW<5-4>`, WrW F(5,4). - * `MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F7X2`: - * `ConvWinograd3x3MultipassWrW<7-2>`, WrW F(7,2) - * `ConvWinograd3x3MultipassWrW<7-2-1-1>`, WrW F(7x1,2x1) - * `ConvWinograd3x3MultipassWrW<1-1-7-2>`, WrW F(1x7,1x2) - * `MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_F7X3`: - * `ConvWinograd3x3MultipassWrW<7-3>`, WrW F(7,3) - * `ConvWinograd3x3MultipassWrW<7-3-1-1>`, WrW F(7x1,3x1) - * `ConvWinograd3x3MultipassWrW<1-1-7-3>`, WrW F(1x7,1x3) - * `MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F2X3` - `ConvMPBidirectWinograd<2-3>`, FWD/BWD F(2,3) - * `MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F3X3` - `ConvMPBidirectWinograd<3-3>`, FWD/BWD F(3,3) - * `MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F4X3` - `ConvMPBidirectWinograd<4-3>`, FWD/BWD F(4,3) - * `MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F5X3` - `ConvMPBidirectWinograd<5-3>`, FWD/BWD F(5,3) - * `MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_F6X3` - `ConvMPBidirectWinograd<6-3>`, FWD/BWD F(6,3) - * `MIOPEN_DEBUG_AMD_MP_BD_XDLOPS_WINOGRAD_F2X3` - `ConvMPBidirectWinograd_xdlops<2-3>`, FWD/BWD F(2,3) - * `MIOPEN_DEBUG_AMD_MP_BD_XDLOPS_WINOGRAD_F3X3` - `ConvMPBidirectWinograd_xdlops<3-3>`, FWD/BWD F(3,3) - * `MIOPEN_DEBUG_AMD_MP_BD_XDLOPS_WINOGRAD_F4X3` - `ConvMPBidirectWinograd_xdlops<4-3>`, FWD/BWD F(4,3) - * `MIOPEN_DEBUG_AMD_MP_BD_XDLOPS_WINOGRAD_F5X3` - `ConvMPBidirectWinograd_xdlops<5-3>`, FWD/BWD F(5,3) - * `MIOPEN_DEBUG_AMD_MP_BD_XDLOPS_WINOGRAD_F6X3` - `ConvMPBidirectWinograd_xdlops<6-3>`, FWD/BWD F(6,3) - * `MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_EXPEREMENTAL_FP16_TRANSFORM - `ConvMPBidirectWinograd*`, FWD/BWD FP16 experemental mode. Disabled by default. This mode is experimental. Use it at your own risk. -* `MIOPEN_DEBUG_AMD_FUSED_WINOGRAD` - Fused FP32 F(3,3) Winograd, variable filter size. - -Implicit GEMM Solutions: -* ASM Implicit GEMM - * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_V4R1` - `ConvAsmImplicitGemmV4R1DynamicFwd` - * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_V4R1_1X1` - `ConvAsmImplicitGemmV4R1DynamicFwd_1x1` - * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_BWD_V4R1` - `ConvAsmImplicitGemmV4R1DynamicBwd` - * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_WRW_V4R1` - `ConvAsmImplicitGemmV4R1DynamicWrw` - * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_XDLOPS` - `ConvAsmImplicitGemmGTCDynamicFwdXdlops` - * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_BWD_GTC_XDLOPS` - `ConvAsmImplicitGemmGTCDynamicBwdXdlops` - * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_WRW_GTC_XDLOPS` - `ConvAsmImplicitGemmGTCDynamicWrwXdlops` -* HIP Implicit GEMM - * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R1` - `ConvHipImplicitGemmV4R1Fwd` - * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R4` - `ConvHipImplicitGemmV4R4Fwd` - * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_BWD_V1R1` - `ConvHipImplicitGemmBwdDataV1R1` - * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_BWD_V4R1` - `ConvHipImplicitGemmBwdDataV4R1` - * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R1` - `ConvHipImplicitGemmV4R1WrW` - * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R4` - `ConvHipImplicitGemmV4R4WrW` - * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R4_XDLOPS` - `ConvHipImplicitGemmForwardV4R4Xdlops` - * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R5_XDLOPS` - `ConvHipImplicitGemmForwardV4R5Xdlops` - * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_BWD_V1R1_XDLOPS` - `ConvHipImplicitGemmBwdDataV1R1Xdlops` - * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_BWD_V4R1_XDLOPS` - `ConvHipImplicitGemmBwdDataV4R1Xdlops` - * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R4_XDLOPS` - `ConvHipImplicitGemmWrwV4R4Xdlops` - * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R4_PADDED_GEMM_XDLOPS` - `ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm` - * `MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R4_PADDED_GEMM_XDLOPS` - `ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm` - -## rocBlas Logging and Behavior -The `ROCBLAS_LAYER` environmental variable can be set to output GEMM information: -* `ROCBLAS_LAYER=` - is not set, there is no logging -* `ROCBLAS_LAYER=1` - is set to 1, then there is trace logging -* `ROCBLAS_LAYER=2` - is set to 2, then there is bench logging -* `ROCBLAS_LAYER=3` - is set to 3, then there is both trace and bench logging - -Additionally, using environment variable "MIOPEN_GEMM_ENFORCE_BACKEND", can override the default behavior. The default behavior which is to use -both MIOpenGEMM and rocBlas depending on the input configuration: - -* `MIOPEN_GEMM_ENFORCE_BACKEND=1`, use rocBLAS if enabled -* `MIOPEN_GEMM_ENFORCE_BACKEND=2`, use MIOpenGEMM for FP32, use rocBLAS for FP16 if enabled -* `MIOPEN_GEMM_ENFORCE_BACKEND=3`, no gemm will be called -* `MIOPEN_GEMM_ENFORCE_BACKEND=4`, use MIOpenTensile for FP32, use rocBLAS for FP16 if enabled -* `MIOPEN_GEMM_ENFORCE_BACKEND=`, use default behavior - -To disable using rocBlas entirely, set the configuration flag `-DMIOPEN_USE_ROCBLAS=Off` during MIOpen configuration. - -More information on logging with rocBlas can be found [here](https://github.com/ROCmSoftwarePlatform/rocBLAS/wiki/5.Logging). - - -## Numerical Checking - -MIOpen provides the environmental variable `MIOPEN_CHECK_NUMERICS` to allow users to debug potential numerical abnormalities. Setting this variable will scan all inputs and outputs of each kernel called and attempt to detect infinities (infs), not-a-number (NaN), or all zeros. The environment variable has several settings that will help with debugging: - -* `MIOPEN_CHECK_NUMERICS=0x01`: Fully informative, prints results from all checks to console -* `MIOPEN_CHECK_NUMERICS=0x02`: Warning information, prints results only if abnormality detected -* `MIOPEN_CHECK_NUMERICS=0x04`: Throw error on detection, MIOpen execute MIOPEN_THROW on abnormal result -* `MIOPEN_CHECK_NUMERICS=0x08`: Abort on abnormal result, this will allow users to drop into a debugging session -* `MIOPEN_CHECK_NUMERICS=0x10`: Print stats, this will compute and print mean/absmean/min/max (note, this is much slower) - - -## Controlling Parallel Compilation - -MIOpen's Convolution Find() calls will compile and benchmark a set of `solvers` contained in `miopenConvAlgoPerf_t` this is done in parallel per `miopenConvAlgorithm_t`. Parallelism per algorithm is set to 20 threads. Typically there are far fewer threads spawned due to the limited number of kernels under any given algorithm. The level of parallelism can be controlled using the environment variable `MIOPEN_COMPILE_PARALLEL_LEVEL`. - -For example, to disable multi-threaded compilation: -``` -export MIOPEN_COMPILE_PARALLEL_LEVEL=1 -``` - - -## Experimental controls - -> **_NOTE 5: Using experimental controls may result in:_** -> * Performance drops -> * Computation inaccuracies -> * Run-time errors -> * Other kinds of unexpected behavior -> -> **_It is strongly recommended to use them only with the explicit permission or request of the library developers._** - -### Code Object (CO) version selection (EXPERIMENTAL) - -Different ROCm versions use Code Object files of different versions (or, in other words, formats). The library uses suitable version automatically. The following variables allow for experimenting and triaging possible problems related to CO version: -* `MIOPEN_DEBUG_AMD_ROCM_METADATA_ENFORCE` - Affects kernels written in GCN assembly language. - * `0` or unset - Automatically detect the required CO version and assemble to that version. This is the default. - * `1` - Do not auto-detect Code Object version, always assemble v2 Code Objects. - * `2` - Behave as if both CO v2 and v3 are supported (see `MIOPEN_DEBUG_AMD_ROCM_METADATA_PREFER_OLDER`). - * `3` - Always assemble v3 Code Objects. -* `MIOPEN_DEBUG_AMD_ROCM_METADATA_PREFER_OLDER` - This variable affects only assembly kernels, and only when ROCm supports both CO v2 and CO v3 (like ROCm 2.10). By default, the newer format is used (CO v3). When this variable is _enabled_, the behavior is reversed. -* `MIOPEN_DEBUG_OPENCL_ENFORCE_CODE_OBJECT_VERSION` - Enforces Code Object format for OpenCL kernels. Works with HIP backend only (`cmake ... -DMIOPEN_BACKEND=HIP...`). - * Unset - Automatically detect the required CO version. This is the default. - * `2` - Always build to CO v2. - * `3` - Always build to CO v3. - * `4` - Always build to CO v4. - -### Winograd Multi-pass Maximum Workspace throttling - -`MIOPEN_DEBUG_AMD_WINOGRAD_MPASS_WORKSPACE_MAX` - `ConvWinograd3x3MultipassWrW`, WrW -`MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_WORKSPACE_MAX` - `ConvMPBidirectWinograd*`, FWD BWD - -Syntax of value: -* decimal or hex (with `0x` prefix) value that should fit into 64-bit unsigned integer. -* If syntax is violated, then the behavior is unspecified. - -Semantics: -* Sets the **_limit_** (max allowed workspace size) for Multi-pass (MP) Winograd Solutions, in bytes. -* Affects all MP Winograd Solutions. If a Solution needs more workspace than the limit, then it does not apply. -* If unset, then _the default_ limit is used. Current default is `2000000000` (~1.862 GiB) for gfx900 and gfx906/60 (or less CUs). No default limit is set for other GPUs. -* Special values: -``` - 0 - Use the default limit, as if the variable is unset. - 1 - Completely prohibit the use of workspace. --1 - Remove the default limit. -``` From 7f29f72d872c81af6e67471a4c849bfe68cec5ff Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:38:54 -0800 Subject: [PATCH 35/67] Delete docs/Getting_Started_FusionAPI.md --- docs/Getting_Started_FusionAPI.md | 209 ------------------------------ 1 file changed, 209 deletions(-) delete mode 100644 docs/Getting_Started_FusionAPI.md diff --git a/docs/Getting_Started_FusionAPI.md b/docs/Getting_Started_FusionAPI.md deleted file mode 100644 index ed437ea4ee..0000000000 --- a/docs/Getting_Started_FusionAPI.md +++ /dev/null @@ -1,209 +0,0 @@ -Fusion API: Getting Started -=========================== -## Introduction -Increasing depth of deep learning networks necessitate the need for novel mechanisms to improve performance on GPUs. One mechanism to achieve higher efficiency is to _fuse_ separate kernels into a single kernel to reduce off-chip memory access and avoid kernel launch overhead. This document outlines the addition of a Fusion API to the MIOpen library. The fusion API would allow users to specify operators that they wants to fuse in a single kernel, compile it and then launch the kernel. While not all combinations might be supported by the library, the API is flexible enough to allow the specification of many operations in any order from a finite set of supported operations. The API provides a mechanism to report unsupported combinations. - -A complete example of the Fusion API in the context of MIOpen is given [here](https://github.com/ROCmSoftwarePlatform/MIOpenExamples/tree/master/fusion). We will use code from the example project as we go along. The example project creates a fusion plan to merge the convolution, bias and activation operations. For a list of supported fusion operations and associated constraints please refer to the [Supported Fusions](#supported-fusions) section. The example depicts bare-bones code without any error checking or even populating the tensors with meaningful data in the interest of simplicity. - -The following list outlines the steps required - -- Create a fusion plan -- Create and add the convolution, bias and activation operators -- Compile the Fusion Plan -- Set the runtime arguments for each operator -- Execute the fusion plan -- Cleanup - -The above steps assume that an MIOpen handle object has already been initialized. Moreover, the order in which operators are created is important, since it represents the order of operations on the data itself. Therefore a fusion plan with convolution created before activation is a different fusion plan as opposed to if activation was added before convolution. - -The following sections further elaborate the above steps as well as give code examples to make these ideas concrete. - -### Intended Audience -The primary consumers of the fusion API are high level frameworks such as TensorFlow/XLA or PyTorch etc. - -## Create a Fusion Plan -A **Fusion Plan** is the data structure which holds all the metadata about the users fusion intent as well as logic to **Compile** and **Execute** a fusion plan. As mentioned earlier, a fusion plan holds the order in which different opertions would be applied on the data, but it also specifies the _axis_ of fusion as well. Currently only **vertical** (sequential) fusions are supported implying the flow of data between operations is sequential. - -A fusion plan is created using the API call `miopenCreateFusionPlan` with the signature: - -```cpp -miopenStatus_t -miopenCreateFusionPlan(miopenFusionPlanDescriptor_t* fusePlanDesc, -const miopenFusionDirection_t fuseDirection,const miopenTensorDescriptor_t inputDesc); -``` - -The *input tensor descriptor* specifies the geometry of the incoming data. Since the data geometry of the intermediate operations can be derived from the *input tensor descriptor*, therefore only the *input tensor descriptor* is required for the fusion plan and not for the individual operations. In our fusion example the following lines of code accomplish this: -```cpp -miopenCreateFusionPlan(&fusePlanDesc, miopenVerticalFusion, input.desc); -``` -Where `fusePlanDesc` is an object of type `miopenFusionPlanDescriptor_t` and `input.desc` is the `miopenTensorDescriptor_t` object. - -## Create and add Operators -The fusion API introduces the notion of **operators** which represent different operations that are intended to be fused together by the API consumer. Currently, the API supports the following operators: - -* Convolution Forward -* Activation Forward -* BatchNorm Inference -* Bias Forward - -Notice that _Bias_ is a separate operator, although it is typically only used with convolution. This list is expected to grow as support for more operators is added to the API, moreover, operators for backward passes are in the works as well. - -The fusion API provides calls for the creation of the supported operators, here we would describe the process for the convolution operator, details for other operators may be found in the [miopen header file](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/fusion.html) - -Once the fusion plan descriptor is created, two or more operators can be added to it by using the individual operator creation API calls. Creation of an operator might fail if the API does not support the fusion of the operations being added and report back immediately to the user. For our example we need to add the Convolution, Bias and Activation operations to our freshly minted fusion plan. This is done using the following calls for the Convolution, Bias and Activation operations respectively: - -```cpp -miopenStatus_t -miopenCreateOpConvForward(miopenFusionPlanDescriptor_t fusePlanDesc, - miopenFusionOpDescriptor_t* convOp, - miopenConvolutionDescriptor_t convDesc, - const miopenTensorDescriptor_t wDesc); -miopenStatus_t -miopenCreateOpBiasForward(miopenFusionPlanDescriptor_t fusePlanDesc, - miopenFusionOpDescriptor_t* biasOp, - const miopenTensorDescriptor_t bDesc); - -miopenStatus_t -miopenCreateOpActivationForward(miopenFusionPlanDescriptor_t fusePlanDesc, - miopenFusionOpDescriptor_t* activOp, - miopenActivationMode_t mode); -``` - -The following lines in the fusion example project use these API calls to create and insert the operators in the fusion plan: - -```cpp -miopenCreateOpConvForward(fusePlanDesc, &convoOp, conv_desc, weights.desc); -miopenCreateOpBiasForward(fusePlanDesc, &biasOp, bias.desc); -miopenCreateOpActivationForward(fusePlanDesc, &activOp, miopenActivationRELU); -``` - -It may be noted that `conv_desc` is the regular MIOpen Convolution descriptor and is created in the standard way before it is referenced here. For more details on creating and setting the convolution descriptor please refer to the example code as well as the [MIOpen documentation](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/convolution.html). In the above snippet `weights.desc` refers to the `miopenTensorDescriptor_t` for the convolution operations and `bias.desc` refers to the object of the same type for the bias operation. The order of insertion of operators indicates the order in which the operations would be performed on the data. Therefore, the above code implies that the convolution operation would be the first operation to execute on the incoming data, followed by the bias and activation operations. - -During this process, it is important that the returned codes be checked to make sure that the operations as well as their order is supported. The operator insertion might fail for a number of reasons such as unsupported sequence of operations, unsupported dimensions of the input or in case of convolution unsupported dimensions for the filters. In the above example, these aspects are ignored for the sake of simplicity. - -## Compile the Fusion Plan - -Following the operator addition, the user would compile the fusion plan, to populate the MIOpen kernel cache with the fused kernel and make it ready for execution. The API call that accomplishes this is: - -```cpp -miopenStatus_t -miopenCompileFusionPlan(miopenHandle_t handle, miopenFusionPlanDescriptor_t fusePlanDesc); -``` - -The corresponding code snippet in the example is as follows: - -```cpp -auto status = miopenCompileFusionPlan(mio::handle(), fusePlanDesc); -if (status != miopenStatusSuccess) { -return -1; -} -``` -In order to compile the fusion plan, the user is assumed to have acquired an MIOpen handle object, in the example code above this is accomplished using the `mio::handle()` helper function. While a fusion plan itself is not bound to a MIOpen handle object, it would however need to be recompiled for each handle separately. It may be noted that compilation of a fusion plan might fail for a number of reasons, moreover it is not assured that a fused version of the kernel would offer any performance improvement over the separately run kernels. - -Compiling a fusion plan is a costly operation in terms of run-time. Therefore, it is recommended that a fusion plan should only be compiled once and may be reused for execution with different runtime parameters as described in the next section. - -## Set the runtime arguments - -While the underlying MIOpen descriptor of the fusion operator specifies the data geometry and parameters, the fusion plan still needs access to the data to execute a successfully compiled fusion plan. The arguments mechanism in the Fusion API provides such data before a fusion plan may be executed. For example the convolution operator requires *weights* to carry out the convolution computation, a bias operator requires the actual bias values etc. Therefore, before a fusion plan may be executed, arguments required by each fusion operator need to be specified. To begin, we create the `miopenOperatorArgs_t` object using: - -```cpp -miopenStatus_t miopenCreateOperatorArgs(miopenOperatorArgs_t* args); -``` - -Once created, runtime arguments for each operation may be set. In our running example, the forward convolution operator requires the convolution weights argument which is supplied using the API call: - -```cpp -miopenStatus_t -miopenSetOpArgsConvForward(miopenOperatorArgs_t args, - const miopenFusionOpDescriptor_t convOp, - const void* alpha, - const void* beta, - const void* w); -``` - -Similarly the parameters for bias and activation are given by: - -```cpp -miopenStatus_t miopenSetOpArgsBiasForward(miopenOperatorArgs_t args, - const miopenFusionOpDescriptor_t biasOp, - const void* alpha, - const void* beta, - const void* bias); - -miopenStatus_t miopenSetOpArgsActivForward(miopenOperatorArgs_t args, - const miopenFusionOpDescriptor_t activOp, - const void* alpha, - const void* beta, - double activAlpha, - double activBeta, - double activGamma); -``` - -In our example code, we set the arguments for the operations as follows: - -```cpp -miopenSetOpArgsConvForward(fusionArgs, convoOp, &alpha, &beta, weights.data); -miopenSetOpArgsActivForward(fusionArgs, activOp, &alpha, &beta, activ_alpha, - activ_beta, activ_gamma); -miopenSetOpArgsBiasForward(fusionArgs, biasOp, &alpha, &beta, bias.data); -``` - -This separation between the fusion plan and the arguments required by each operator allows better reuse of the fusion plan with different arguments as well as avoids the necessity of recompiling the fusion plan to run the same combination of operators with different arguments. - -As mentioned in the section [Compile the Fusion Plan](#compile-the-fusion-plan) earlier, the compilation step for a fusion plan might be costly, therefore a fusion plan should only be compiled once in its lifetime. A fusion plan needs not be recompiled if the input desciptor or any of the parameters to the `miopenCreateOp*` API calls are different, otherwise a compiled fusion plan may be reused again and again with a different set of arguments. In our example this is demonstrated in lines 77 - 85 of `main.cpp`. - -## Execute a Fusion Plan - -Once the fusion plan has been compiled and arguments set for each operator, it may be executed with the API call given below passing it the actual data to be processed. - -```cpp -miopenStatus_t -miopenExecuteFusionPlan(const miopenHandle_t handle, - const miopenFusionPlanDescriptor_t fusePlanDesc, - const miopenTensorDescriptor_t inputDesc, - const void* input, - const miopenTensorDescriptor_t outputDesc, - void* output, - miopenOperatorArgs_t args); -``` - -The following code snippet in the example accomplishes the fusion plan execution: - -```cpp -miopenExecuteFusionPlan(mio::handle(), fusePlanDesc, input.desc, input.data, - output.desc, output.data, fusionArgs); -``` - -It may be noted that it is an error to attempt to execute a fusion plan that is either not compiled or has been invalidated by changing the input tensor descriptor or any of the operation parameters. - - -## Cleanup -Once the application is done with the fusion plan, the fusion plan and the fusion args objects may be destroyed using the API calls: - -```cpp -miopenStatus_t miopenDestroyFusionPlan(miopenFusionPlanDescriptor_t fusePlanDesc); -``` -Once the fusion plan object is destroyed, all the operations created are destroyed automatically and do not need any special cleanup. - - -## Supported Fusions -The tables below outlines the supported fusions for fp32 and fp16 as well as any applicable constraints. **(C = convolution, B = bias, N = batch normalization, A = activation)** -Fusion Plans with grouped convolutions are not supported. - - -![Convolution based fp32 fusion](data/fp32fusions.png) - - -![Convolution based fp16 fusion](data/fp16fusions.png) - - -## Performance Comparison to Non-Fused Kernels - - -The following graph depicts the speedup gained for a fused Convolution+Bias+Activation over a non-fused version, all configurations have a batch size of 64: - -![CBA Graph](data/cba.png) - -Speedup obtained by fusing Batchnorm (spatial mode) with Activation are presented in the graph below: - -![Batchnorm activation fusion](data/na.png) From 29d889b742d295ac0df85afe7c9946d0122b60b3 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:39:11 -0800 Subject: [PATCH 36/67] Delete docs/MI200AlternateImplementation.md --- docs/MI200AlternateImplementation.md | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 docs/MI200AlternateImplementation.md diff --git a/docs/MI200AlternateImplementation.md b/docs/MI200AlternateImplementation.md deleted file mode 100644 index f5354eb0c0..0000000000 --- a/docs/MI200AlternateImplementation.md +++ /dev/null @@ -1,13 +0,0 @@ -## MI200 MFMA Behavior Specifics - -The MI200 MFMA_F16, MFMA_BF16 and MFMA_BF16_1K flush subnormal input/output data to zero. This behavior might affect the convolution operation in certain workloads due to the limited exponent range of the half-precision floating point datatypes. - -An alternate implementation for the half precision data-type is available in MIOpen which utilizes conversion instructions to utilizes the BFloat16 data-types larger exponent range, albeit with reduced accuracy. The following salients apply to this alternate implementation: - -* It is disabled by default in the Forward convolution operations. - -* It is enabled by default in the backward data and backward weights convolution operations. - -* The default MIOpen behaviors described above may be overridden using the `miopenSetConvolutionAttribute` API call and passing the convolution descriptor for the appropriate convolution operation and the `MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL` convolution attribute with a non-zero value to engage the alternate implementation. - -* The behavior might also be overridden using the `MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP16_ALT_IMPL` environment variable. The above variable when set to a value of `1` engages the alternate implementation while a value of `0` disables it. Keep in mind the environment variable impacts the convolution operation in all directions. \ No newline at end of file From a98c1791fbe175e8f1df6076e28c335f7ad11a1b Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:39:23 -0800 Subject: [PATCH 37/67] Delete docs/MIOpen_Porting_Guide.md --- docs/MIOpen_Porting_Guide.md | 2063 ---------------------------------- 1 file changed, 2063 deletions(-) delete mode 100644 docs/MIOpen_Porting_Guide.md diff --git a/docs/MIOpen_Porting_Guide.md b/docs/MIOpen_Porting_Guide.md deleted file mode 100644 index 52ccf15434..0000000000 --- a/docs/MIOpen_Porting_Guide.md +++ /dev/null @@ -1,2063 +0,0 @@ - -# MIOpen Porting Guide - - -## The key differences between MIOpen and cuDNN: -* MIOpen only supports 4-D tensors in the NCHW and NHWC storage format. This means all the __“\*Nd\*”__ APIs in cuDNN do not have a corresponding API in MIOpen. -* MIOpen only supports __`float(fp32)`__ data-type. -* MIOpen supports __2D Convolutions__ and __3D Convolutions__. -* MIOpen only supports __2D Pooling__. -* Calling miopenFindConvolution*Algorithm() is *mandatory* before calling any Convolution API. -* Typical calling sequence for Convolution APIs for MIOpen is: - * miopenConvolution*GetWorkSpaceSize() // returns the workspace size required by Find() - * miopenFindConvolution*Algorithm() // returns performance info about various algorithms - * miopenConvolution*() -* MIOpen does not support __Preferences__ for convolutions. -* MIOpen does not support Softmax modes. MIOpen implements the __SOFTMAX_MODE_CHANNEL__ flavor. -* MIOpen does not support __Transform-Tensor__, __Dropout__, __RNNs__, and __Divisive Normalization__. - -



- -## Helpful MIOpen Environment Variables -`MIOPEN_ENABLE_LOGGING=1` – log all the MIOpen APIs called including the parameters passed to -those APIs. \ -`MIOPEN_DEBUG_AMD_ROCM_PRECOMPILED_BINARIES=0` – disable Winograd convolution -algorithm. \ -`MIOPEN_DEBUG_GCN_ASM_KERNELS=0` – disable hand-tuned asm. kernels for Direct convolution -algorithm. Fall-back to kernels written in high-level language. \ -`MIOPEN_DEBUG_CONV_FFT=0` – disable FFT convolution algorithm. \ -`MIOPEN_DEBUG_CONV_DIRECT=0` – disable Direct convolution algorithm. - -



- - -## API differences - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Operation - - - -cuDNN API - - -MIOpen API -
- - -```c++ -cudnnStatus_t -cudnnCreate( - cudnnHandle_t *handle) -``` - - -```c++ -miopenStatus_t -miopenCreate( - miopenHandle_t *handle) -``` -
- - -```c++ -cudnnStatus_t -cudnnDestroy( - cudnnHandle_t handle) -``` - - -```c++ -miopenStatus_t -miopenDestroy( - miopenHandle_t handle) -``` -
-Handle - - -```c++ -cudnnStatus_t -cudnnSetStream( - cudnnHandle_t handle, - cudaStream_t streamId) -``` - - -```c++ -miopenStatus_t -miopenSetStream( - miopenHandle_t handle, - miopenAcceleratorQueue_t streamID) -``` -
- - -```c++ -cudnnStatus_t -cudnnGetStream( - cudnnHandle_t handle, - cudaStream_t *streamId) -``` - - -```c++ -miopenStatus_t -miopenGetStream( - miopenHandle_t handle, - miopenAcceleratorQueue_t *streamID) -``` -
- - -```c++ -cudnnStatus_t -cudnnCreateTensorDescriptor( - cudnnTensorDescriptor_t *tensorDesc) -``` - - -```c++ -miopenStatus_t -miopenCreateTensorDescriptor( - miopenTensorDescriptor_t - *tensorDesc) -``` -
- - -```c++ -cudnnStatus_t -cudnnSetTensor4dDescriptor( - cudnnTensorDescriptor_t tensorDesc, - cudnnTensorFormat_t format, - cudnnDataType_t dataType, - int n, - int c, - int h, - int w) -``` - - -```c++ -// Only `NCHW` format is supported -miopenStatus_t miopenSet4dTensorDescriptor( - miopenTensorDescriptor_t tensorDesc, - miopenDataType_t dataType, - int n, - int c, - int h, - int w) -``` -
- - -```c++ -cudnnStatus_t -cudnnGetTensor4dDescriptor( - cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t *dataType, - int *n, - int *c, - int *h, - int *w, - int *nStride, - int *cStride, - int *hStride, - int *wStride) -``` - - -```c++ -miopenStatus_t -miopenGet4dTensorDescriptor( - miopenTensorDescriptor_t tensorDesc, - miopenDataType_t *dataType, - int *n, - int *c, - int *h, - int *w, - int *nStride, - int *cStride, - int *hStride, - int *wStride) -``` -
-Tensor - - -```c++ -cudnnStatus_t -cudnnDestroyTensorDescriptor( - cudnnTensorDescriptor_t tensorDesc) -``` - - -```c++ -miopenStatus_t -miopenDestroyTensorDescriptor( - miopenTensorDescriptor_t tensorDesc) -``` -
- - -```c++ -cudnnStatus_t -cudnnAddTensor( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C) -``` - - -```c++ -//Set tensorOp to miopenOpTensorAdd -miopenStatus_t -miopenOpTensor( - miopenHandle_t handle, - miopenTensorOp_t tensorOp, - const void *alpha1, - constmiopenTensorDescriptor_t aDesc, - const void *A, - const void *alpha2, - const miopenTensorDescriptor_t bDesc, - const void *B, - const void *beta, - const miopenTensorDescriptor_t cDesc, - void *C) -// For Forward Bias use -// miopenConvolutionForwardBias. -``` -
- - -```c++ -cudnnStatus_t -cudnnOpTensor( - cudnnHandle_t handle, - const cudnnOpTensorDescriptor_t opTensorDesc, - const void *alpha1, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *alpha2, - const cudnnTensorDescriptor_t bDesc, - const void *B, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C) -``` - - -```c++ -miopenStatus_t -miopenOpTensor( - miopenHandle_t handle, - miopenTensorOp_t tensorOp, - const void *alpha1, - const miopenTensorDescriptor_t aDesc, - const void *A, const void *alpha2, - const miopenTensorDescriptor_t bDesc, - const void *B, - const void *beta, - const miopenTensorDescriptor_t cDesc, - void *C) -``` -
- - -```c++ -cudnnStatus_t -cudnnSetTensor( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t yDesc, - void *y, - const void *valuePtr) -``` - - -```c++ -miopenStatus_t -miopenSetTensor( - miopenHandle_t handle, - const miopenTensorDescriptor_t yDesc, - void *y, - const void *alpha) -``` -
- - -```c++ -cudnnStatus_t -cudnnScaleTensor( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t yDesc, - void *y, - const void *alpha) -``` - - -```c++ -miopenStatus_t -miopenScaleTensor( - miopenHandle_t handle, - const miopenTensorDescriptor_t yDesc, - void *y, - const void *alpha) -``` -
-Filter - - -```c++ -cudnnStatus_t -cudnnCreateFilterDescriptor( - cudnnFilterDescriptor_t *filterDesc) -``` - - -```c++ -// All *FilterDescriptor* APIs are substituted by -// the respective TensorDescriptor APIs. -``` -
- - -```c++ -cudnnStatus_t -cudnnCreateConvolutionDescriptor( - cudnnConvolutionDescriptor_t *convDesc) -``` - - -```c++ -miopenStatus_t -miopenCreateConvolutionDescriptor( - miopenConvolutionDescriptor_t *convDesc) -``` -
- - -```c++ -cudnnStatus_t -cudnnSetConvolution2dDescriptor( - cudnnConvolutionDescriptor_t convDesc, - int pad_h, - int pad_w, - int u, - int v, - int upscalex, - int upscaley, - cudnnConvolutionMode_t mode) -``` - - -```c++ -miopenStatus_t -miopenInitConvolutionDescriptor( - miopenConvolutionDescriptor_t convDesc, - miopenConvolutionMode_t mode, - int pad_h, - int pad_w, - int u, - int v, - int upscalex, - int upscaley) -``` -
- - -```c++ -cudnnStatus_t -cudnnGetConvolution2dDescriptor( - const cudnnConvolutionDescriptor_t convDesc, - int *pad_h, - int *pad_y, - int *u, - int *v, - int *upscalex, - int *upscaley, - cudnnConvolutionMode_t *mode) -``` - - -```c++ -miopenStatus_t -miopenGetConvolutionDescriptor( - miopenConvolutionDescriptor_t convDesc, - miopenConvolutionMode_t *mode, - int *pad_h, - int *pad_y, - int *u, - int *v, - int *upscalex, - int *upscaley) -``` -
- - -```c++ -cudnnStatus_t -cudnnGetConvolution2dForwardOutputDim( - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - const cudnnFilterDescriptor_t filterDesc, - int *n, - int *c, - int *h, - int *w) -``` - - -```c++ -miopenStatus_t -miopenGetConvolutionForwardOutputDim( - miopenConvolutionDescriptor_t convDesc, - const miopenTensorDescriptor_t inputTensorDesc, - const miopenTensorDescriptor_t filterDesc, - int *n, - int *c, - int *h, - int *w) -``` -
- - -```c++ -cudnnStatus_t -cudnnDestroyConvolutionDescriptor( - cudnnConvolutionDescriptor_t convDesc) -``` - - -```c++ -miopenStatus_t -miopenDestroyConvolutionDescriptor( - miopenConvolutionDescriptor_t convDesc) -``` -
-Convolution - - -```c++ -cudnnStatus_t -cudnnFindConvolutionForwardAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults) - -``` -```c++ -cudnnStatus_t -cudnnFindConvolutionForwardAlgorithmEx( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - void *y, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes) - -``` -```c++ -cudnnStatus_t -cudnnGetConvolutionForwardAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - cudnnConvolutionFwdPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionFwdAlgo_t *algo) -``` - - -```c++ -// FindConvolution() is mandatory. -// Allocate workspace prior to running this API. -// A table with times and memory requirements -// for different algorithms is returned. -// Users can choose the top-most algorithm if -// they only care about the fastest algorithm. -miopenStatus_t -miopenFindConvolutionForwardAlgorithm( - miopenHandle_t handle, - const miopenTensorDescriptor_t xDesc, - const void *x, - const miopenTensorDescriptor_t wDesc, - const void *w, - const miopenConvolutionDescriptor_t convDesc, - const miopenTensorDescriptor_t yDesc, - void *y, - const int requestAlgoCount, - int *returnedAlgoCount, - miopenConvAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSize, - bool exhaustiveSearch) -``` -
- - -```c++ -cudnnStatus_t -cudnnGetConvolutionForwardWorkspaceSize( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - cudnnConvolutionFwdAlgo_t algo, - size_t *sizeInBytes) -``` - - -```c++ -miopenStatus_t -miopenConvolutionForwardGetWorkSpaceSize( - miopenHandle_t handle, - const miopenTensorDescriptor_t wDesc, - const miopenTensorDescriptor_t xDesc, - const miopenConvolutionDescriptor_t convDesc, - const miopenTensorDescriptor_t yDesc, - size_t *workSpaceSize) -``` -
- - -```c++ -cudnnStatus_t -cudnnConvolutionForward( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionFwdAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) -``` - - -```c++ -miopenStatus_t -miopenConvolutionForward( - miopenHandle_t handle, - const void *alpha, - const miopenTensorDescriptor_t xDesc, - const void *x, - const miopenTensorDescriptor_t wDesc, - const void *w, - const miopenConvolutionDescriptor_t convDesc, - miopenConvFwdAlgorithm_t algo, - const void *beta, - const miopenTensorDescriptor_t yDesc, - void *y, - void *workSpace, - size_t workSpaceSize) -``` -
- - -```c++ -cudnnStatus_t -cudnnConvolutionBackwardBias( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *beta, - const cudnnTensorDescriptor_t dbDesc, - void *db) -``` - - -```c++ -miopenStatus_t -miopenConvolutionBackwardBias( - miopenHandle_t handle, - const void *alpha, - const miopenTensorDescriptor_t dyDesc, - const void *dy, - const void *beta, - const miopenTensorDescriptor_t dbDesc, - void *db) -``` -
- - -```c++ -cudnnStatus_t -cudnnFindConvolutionBackwardFilterAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) -``` -```c++ -cudnnStatus_t -cudnnFindConvolutionBackwardFilterAlgorithmEx( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *y, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - void *dw, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes) - -``` -```c++ -cudnnStatus_t -cudnnGetConvolutionBackwardFilterAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - cudnnConvolutionBwdFilterPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionBwdFilterAlgo_t *algo) -``` - - -```c++ -// FindConvolution() is mandatory. -// Allocate workspace prior to running this API. -// A table with times and memory requirements -// for different algorithms is returned. -// Users can choose the top-most algorithm if -// they only care about the fastest algorithm. -miopenStatus_t -miopenFindConvolutionBackwardWeightsAlgorithm( - miopenHandle_t handle, - const miopenTensorDescriptor_t dyDesc, - const void *dy, - const miopenTensorDescriptor_t xDesc, - const void *x, - const miopenConvolutionDescriptor_t convDesc, - const miopenTensorDescriptor_t dwDesc, - void *dw, - const int requestAlgoCount, - int *returnedAlgoCount, - miopenConvAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSize, - bool exhaustiveSearch) -``` -
- - -```c++ -cudnnStatus_t -cudnnGetConvolutionBackwardFilterWorkspaceSize( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t gradDesc, - cudnnConvolutionBwdFilterAlgo_t algo, - size_t *sizeInBytes) -``` - - -```c++ -miopenStatus_t -miopenConvolutionBackwardWeightsGetWorkSpaceSize( - miopenHandle_t handle, - const miopenTensorDescriptor_t dyDesc, - const miopenTensorDescriptor_t xDesc, - const miopenConvolutionDescriptor_t convDesc, - const miopenTensorDescriptor_t dwDesc, - size_t *workSpaceSize) -``` -
- - -```c++ -cudnnStatus_t -cudnnConvolutionBackwardFilter( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdFilterAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnFilterDescriptor_t dwDesc, - void *dw) -``` - - -```c++ -miopenStatus_t -miopenConvolutionBackwardWeights( - miopenHandle_t handle, - const void *alpha, - const miopenTensorDescriptor_t dyDesc, - const void *dy, - const miopenTensorDescriptor_t xDesc, - const void *x, - const miopenConvolutionDescriptor_t convDesc, - miopenConvBwdWeightsAlgorithm_t algo, - const void *beta, - const miopenTensorDescriptor_t dwDesc, - void *dw, - void *workSpace, - size_t workSpaceSize) -``` -
- - -```c++ -cudnnStatus_t -cudnnGetConvolutionBackwardDataWorkspaceSize( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - cudnnConvolutionBwdDataAlgo_t algo, - size_t *sizeInBytes) -``` - - -```c++ -miopenStatus_t -miopenConvolutionBackwardDataGetWorkSpaceSize( - miopenHandle_t handle, - const miopenTensorDescriptor_t dyDesc, - const miopenTensorDescriptor_t wDesc, - const miopenConvolutionDescriptor_t convDesc, - const miopenTensorDescriptor_t dxDesc, - size_t *workSpaceSize) -``` -
- - -```c++ -cudnnStatus_t -cudnnFindConvolutionBackwardDataAlgorithm( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults) - -``` -```c++ -cudnnStatus_t -cudnnFindConvolutionBackwardDataAlgorithmEx( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes) - -``` -```c++ -cudnnStatus_t -cudnnGetConvolutionBackwardDataAlgorithm( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - cudnnConvolutionBwdDataPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionBwdDataAlgo_t *algo) -``` - - -```c++ -// FindConvolution() is mandatory. -// Allocate workspace prior to running this API. -// A table with times and memory requirements -// for different algorithms is returned. -// Users can choose the top-most algorithm if -// they only care about the fastest algorithm. -miopenStatus_t -miopenFindConvolutionBackwardDataAlgorithm( - miopenHandle_t handle, - const miopenTensorDescriptor_t dyDesc, - const void *dy, - const miopenTensorDescriptor_t wDesc, - const void *w, - const miopenConvolutionDescriptor_t convDesc, - const miopenTensorDescriptor_t dxDesc, - const void *dx, - const int requestAlgoCount, - int *returnedAlgoCount, - miopenConvAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSize, - bool exhaustiveSearch) -``` -
- - -```c++ -cudnnStatus_t -cudnnConvolutionBackwardData( - cudnnHandle_t handle, - const void *alpha, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdDataAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) -``` - - -```c++ - miopenStatus_t - miopenConvolutionBackwardData( - miopenHandle_t handle, - const void *alpha, - const miopenTensorDescriptor_t dyDesc, - const void *dy, - const miopenTensorDescriptor_t wDesc, - const void *w, - const miopenConvolutionDescriptor_t convDesc, - miopenConvBwdDataAlgorithm_t algo, - const void *beta, - const miopenTensorDescriptor_t dxDesc, - void *dx, - void *workSpace, - size_t workSpaceSize) -``` -
-Softmax - - -```c++ -cudnnStatus_t -cudnnSoftmaxForward( - cudnnHandle_t handle, - cudnnSoftmaxAlgorithm_t algo, - cudnnSoftmaxMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) -``` - - -```c++ -miopenStatus_t -miopenSoftmaxForward( - miopenHandle_t handle, - const void *alpha, - const miopenTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const miopenTensorDescriptor_t yDesc, - void *y) -``` -
- - -```c++ -cudnnStatus_t -cudnnSoftmaxBackward( - cudnnHandle_t handle, - cudnnSoftmaxAlgorithm_t algo, - cudnnSoftmaxMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) -``` - - -```c++ -miopenStatus_t -miopenSoftmaxBackward( - miopenHandle_t handle, - const void *alpha, - const miopenTensorDescriptor_t yDesc, - const void *y, - const miopenTensorDescriptor_t dyDesc, - const void *dy, - const void *beta, - const miopenTensorDescriptor_t dxDesc, - void *dx) -``` -
- - -```c++ -cudnnStatus_t -cudnnCreatePoolingDescriptor( - cudnnPoolingDescriptor_t *poolingDesc) - -``` - - -```c++ -miopenStatus_t -miopenCreatePoolingDescriptor( - miopenPoolingDescriptor_t *poolDesc) -``` -
- - -```c++ -cudnnStatus_t -cudnnSetPooling2dDescriptor( - cudnnPoolingDescriptor_t poolingDesc, - cudnnPoolingMode_t mode, - cudnnNanPropagation_t maxpoolingNanOpt, - int windowHeight, - int windowWidth, - int verticalPadding, - int horizontalPadding, - int verticalStride, - int horizontalStride) -``` - - -```c++ -miopenStatus_t -miopenSet2dPoolingDescriptor( - miopenPoolingDescriptor_t poolDesc, - miopenPoolingMode_t mode, - int windowHeight, - int windowWidth, - int pad_h, - int pad_w, - int u, - int v) -``` -
- - -```c++ -cudnnStatus_t -cudnnGetPooling2dDescriptor( - const cudnnPoolingDescriptor_t poolingDesc, - cudnnPoolingMode_t *mode, - cudnnNanPropagation_t *maxpoolingNanOpt, - int *windowHeight, - int *windowWidth, - int *verticalPadding, - int *horizontalPadding, - int *verticalStride, - int *horizontalStride) -``` - - -```c++ -miopenStatus_t -miopenGet2dPoolingDescriptor( - const miopenPoolingDescriptor_t poolDesc, - miopenPoolingMode_t *mode, - int *windowHeight, - int *windowWidth, - int *pad_h, - int *pad_w, - int *u, - int *v) -``` -
-Pooling - - -```c++ -cudnnStatus_t -cudnnGetPooling2dForwardOutputDim( - const cudnnPoolingDescriptor_t poolingDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - int *n, - int *c, - int *h, - int *w) -``` - - -```c++ -miopenStatus_t -miopenGetPoolingForwardOutputDim( - const miopenPoolingDescriptor_t poolDesc, - const miopenTensorDescriptor_t tensorDesc, - int *n, - int *c, - int *h, - int *w) -``` -
- - -```c++ -cudnnStatus_t -cudnnDestroyPoolingDescriptor( - cudnnPoolingDescriptor_t poolingDesc) -``` - - -```c++ -miopenStatus_t -miopenDestroyPoolingDescriptor( - miopenPoolingDescriptor_t poolDesc) -``` -
- - -```c++ -cudnnStatus_t -cudnnPoolingForward( - cudnnHandle_t handle, - const cudnnPoolingDescriptor_t poolingDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) -``` - - -```c++ -miopenStatus_t -miopenPoolingForward( - miopenHandle_t handle, - const miopenPoolingDescriptor_t poolDesc, - const void *alpha, - const miopenTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const miopenTensorDescriptor_t yDesc, - void *y, - bool do_backward, - void *workSpace, - size_t workSpaceSize) -``` -
- - - - -```c++ -miopenStatus_t -miopenPoolingGetWorkSpaceSize( - const miopenTensorDescriptor_t yDesc, - size_t *workSpaceSize) -``` -
- - -```c++ -cudnnStatus_t -cudnnPoolingBackward( - cudnnHandle_t handle, - const cudnnPoolingDescriptor_t poolingDesc, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) -``` - - -```c++ -miopenStatus_t -miopenPoolingBackward( - miopenHandle_t handle, - const miopenPoolingDescriptor_t poolDesc, - const void *alpha, - const miopenTensorDescriptor_t yDesc, - const void *y, - const miopenTensorDescriptor_t dyDesc, - const void *dy, - const miopenTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const miopenTensorDescriptor_t dxDesc, - void *dx, - const void *workspace) -``` -
- - -```c++ -cudnnStatus_t -cudnnCreateActivationDescriptor( - cudnnActivationDescriptor_t *activationDesc) -``` - - -```c++ -miopenStatus_t -miopenCreateActivationDescriptor( - miopenActivationDescriptor_t *activDesc) -``` -
- - -```c++ -cudnnStatus_t -cudnnSetActivationDescriptor( - cudnnActivationDescriptor_t activationDesc, - cudnnActivationMode_t mode, - cudnnNanPropagation_t reluNanOpt, - double reluCeiling) -``` - - -```c++ -miopenStatus_t -miopenSetActivationDescriptor( - const miopenActivationDescriptor_t activDesc, - miopenActivationMode_t mode, - double activAlpha, - double activBeta, - double activPower) -``` -
-Activation - - -```c++ -cudnnStatus_t -cudnnGetActivationDescriptor( - const cudnnActivationDescriptor_t activationDesc, - cudnnActivationMode_t *mode, - cudnnNanPropagation_t *reluNanOpt, - double *reluCeiling) -``` - - -```c++ -miopenStatus_t -miopenGetActivationDescriptor( - const miopenActivationDescriptor_t activDesc, - miopenActivationMode_t *mode, - double *activAlpha, - double *activBeta, - double *activPower) -``` -
- - -```c++ -cudnnStatus_t -cudnnDestroyActivationDescriptor( - cudnnActivationDescriptor_t activationDesc) -``` - - -```c++ -miopenStatus_t -miopenDestroyActivationDescriptor( - miopenActivationDescriptor_t activDesc) -``` -
- - -```c++ -cudnnStatus_t -cudnnActivationForward( - cudnnHandle_t handle, - cudnnActivationDescriptor_t activationDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) -``` - - -```c++ -miopenStatus_t -miopenActivationForward( - miopenHandle_t handle, - const miopenActivationDescriptor_t activDesc, - const void *alpha, - const miopenTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const miopenTensorDescriptor_t yDesc, - void *y) -``` -
- - -```c++ -cudnnStatus_t -cudnnActivationBackward( - cudnnHandle_t handle, - cudnnActivationDescriptor_t activationDesc, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) -``` - - -```c++ -miopenStatus_t -miopenActivationBackward( - miopenHandle_t handle, - const miopenActivationDescriptor_t activDesc, - const void *alpha, - const miopenTensorDescriptor_t yDesc, - const void *y, - const miopenTensorDescriptor_t dyDesc, - const void *dy, - const miopenTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const miopenTensorDescriptor_t dxDesc, - void *dx) -``` -
- - -```c++ -cudnnStatus_t -cudnnCreateLRNDescriptor( - cudnnLRNDescriptor_t *normDesc) -``` - - -```c++ -miopenStatus_t -miopenCreateLRNDescriptor( - miopenLRNDescriptor_t - *lrnDesc) -``` -
- - -```c++ -cudnnStatus_t -cudnnSetLRNDescriptor( - cudnnLRNDescriptor_t normDesc, - unsigned lrnN, - double lrnAlpha, - double lrnBeta, - double lrnK) -``` - - -```c++ -miopenStatus_t -miopenSetLRNDescriptor( - const miopenLRNDescriptor_t lrnDesc, - miopenLRNMode_t mode, - unsigned lrnN, - double lrnAlpha, - double lrnBeta, - double lrnK) -``` -
- - -```c++ -cudnnStatus_t -cudnnGetLRNDescriptor( - cudnnLRNDescriptor_t normDesc, - unsigned* lrnN, - double* lrnAlpha, - double* lrnBeta, - double* lrnK) -``` - - -```c++ -miopenStatus_t -miopenGetLRNDescriptor( - const miopenLRNDescriptor_t lrnDesc, - miopenLRNMode_t *mode, - unsigned *lrnN, - double *lrnAlpha, - double *lrnBeta, - double *lrnK) - -``` -
- LRN - - -```c++ -cudnnStatus_t -cudnnDestroyLRNDescriptor( - cudnnLRNDescriptor_t lrnDesc) -``` - - -```c++ -miopenStatus_t -miopenDestroyLRNDescriptor( - miopenLRNDescriptor_t lrnDesc) -``` -
- - -```c++ -cudnnStatus_t -cudnnLRNCrossChannelForward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnLRNMode_t lrnMode, - const void* alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) -``` - - -```c++ -miopenStatus_t -miopenLRNForward( - miopenHandle_t handle, - const miopenLRNDescriptor_t lrnDesc, - const void *alpha, - const miopenTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const miopenTensorDescriptor_t yDesc, - void *y, - bool do_backward, - void *workspace) -``` -
- - -```c++ -cudnnStatus_t -cudnnLRNCrossChannelBackward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnLRNMode_t lrnMode, - const void* alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) -``` - - -```c++ -miopenStatus_t -miopenLRNBackward( - miopenHandle_t handle, - const miopenLRNDescriptor_t lrnDesc, - const void *alpha, - const miopenTensorDescriptor_t yDesc, - const void *y, - const miopenTensorDescriptor_t dyDesc, - const void *dy, - const miopenTensorDescriptor_t xDesc, - const void *x, const void *beta, - const miopenTensorDescriptor_t dxDesc, - void *dx, - const void *workspace) -``` -
- - - - - -```c++ -miopenStatus_t -miopenLRNGetWorkSpaceSize( - const miopenTensorDescriptor_t yDesc, - size_t *workSpaceSize) -``` -
- - -```c++ -cudnnStatus_t -cudnnDeriveBNTensorDescriptor( - cudnnTensorDescriptor_t derivedBnDesc, - const cudnnTensorDescriptor_t xDesc, - cudnnBatchNormMode_t mode) -``` - - -```c++ -miopenStatus_t -miopenDeriveBNTensorDescriptor( - miopenTensorDescriptor_t derivedBnDesc, - const miopenTensorDescriptor_t xDesc, - miopenBatchNormMode_t bn_mode) -``` -
- - -```c++ -cudnnStatus_t -cudnnBatchNormalizationForwardTraining( - cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - void *alpha, - void *beta, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t yDesc, - void *y, - const cudnnTensorDescriptor_t - bnScaleBiasMeanVarDesc, - void *bnScale, - void *bnBias, - double exponentialAverageFactor, - void *resultRunningMean, - void *resultRunningVariance, - double epsilon, - void *resultSaveMean, - void *resultSaveInvVariance) -``` - - -```c++ -miopenStatus_t -miopenBatchNormalizationForwardTraining( - miopenHandle_t handle, - miopenBatchNormMode_t bn_mode, - void *alpha, - void *beta, - const miopenTensorDescriptor_t xDesc, - const void *x, - const miopenTensorDescriptor_t yDesc, - void *y, - const miopenTensorDescriptor_t - bnScaleBiasMeanVarDesc, - void *bnScale, - void *bnBias, - double expAvgFactor, - void *resultRunningMean, - void *resultRunningVariance, - double epsilon, - void *resultSaveMean, - void *resultSaveInvVariance) -``` -
- Batch Normalization - - -```c++ -cudnnStatus_t -cudnnnBatchNormalizationForwardInference( - cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - void *alpha, - void *beta, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t yDesc, - void *y, - const cudnnTensorDescriptor_t - bnScaleBiasMeanVarDesc, - const void *bnScale, - void *bnBias, - const void *estimatedMean, - const void *estimatedVariance, - double epsilon) -``` - - -```c++ -miopenStatus_t -miopenBatchNormalizationForwardInference( - miopenHandle_t handle, - miopenBatchNormMode_t bn_mode, - void *alpha, - void *beta, - const miopenTensorDescriptor_t xDesc, - const void *x, - const miopenTensorDescriptor_t yDesc, - void *y, - const miopenTensorDescriptor_t - bnScaleBiasMeanVarDesc, - void *bnScale, - void *bnBias, - void *estimatedMean, - void *estimatedVariance, - double epsilon) -``` -
- - -```c++ -cudnnStatus_t -cudnnBatchNormalizationBackward( - cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - const void *alphaDataDiff, - const void *betaDataDiff, - const void *alphaParamDiff, - const void *betaParamDiff, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - const cudnnTensorDescriptor_t - bnScaleBiasDiffDesc, - const void *bnScale, - void *resultBnScaleDiff, - void *resultBnBiasDiff, - double epsilon, - const void *savedMean, - const void *savedInvVariance) -``` - - -```c++ -miopenStatus_t -miopenBatchNormalizationBackward( - miopenHandle_t handle, - miopenBatchNormMode_t bn_mode, - const void *alphaDataDiff, - const void *betaDataDiff, - const void *alphaParamDiff, - const void *betaParamDiff, - const miopenTensorDescriptor_t xDesc, - const void *x, - const miopenTensorDescriptor_t dyDesc, - const void *dy, - const miopenTensorDescriptor_t dxDesc, - void *dx, - const miopenTensorDescriptor_t - bnScaleBiasDiffDesc, - const void *bnScale, - void *resultBnScaleDiff, - void *resultBnBiasDiff, - double epsilon, - const void *savedMean, - const void *savedInvVariance) -``` -
- -

-
From b136967f3397ff880e204acef7c3cd1ef7e3cc81 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:39:35 -0800 Subject: [PATCH 38/67] Delete docs/apireference.rst --- docs/apireference.rst | 26 -------------------------- 1 file changed, 26 deletions(-) delete mode 100644 docs/apireference.rst diff --git a/docs/apireference.rst b/docs/apireference.rst deleted file mode 100644 index 4f69fcdaf0..0000000000 --- a/docs/apireference.rst +++ /dev/null @@ -1,26 +0,0 @@ - -API Reference -============= - - -.. toctree:: - :maxdepth: 4 - :caption: Contents: - - datatypes - handle - tensor - activation - convolution - rnn - batchnorm - lrn - pooling - softmax - fusion - loss - dropout - reduction - layernorm - sum - From 9468d3924bc9aef47d382094aea22ff5bb576c0a Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:39:48 -0800 Subject: [PATCH 39/67] Delete docs/cache.md --- docs/cache.md | 33 --------------------------------- 1 file changed, 33 deletions(-) delete mode 100644 docs/cache.md diff --git a/docs/cache.md b/docs/cache.md deleted file mode 100644 index e8a08ff5c7..0000000000 --- a/docs/cache.md +++ /dev/null @@ -1,33 +0,0 @@ -Kernel Cache -============ - -MIOpen will cache binary kernels to disk, so they don't need to be compiled the next time the application is run. This cache is stored by default in `$HOME/.cache/miopen`. This location can be customized at build time by setting the `MIOPEN_CACHE_DIR` cmake variable. - -Clear the cache ---------------- - -The cache can be cleared by simply deleting the cache directory (i.e., `$HOME/.cache/miopen`). This should only be needed for development purposes or to free disk space. The cache does not need to be cleared when upgrading MIOpen. - -Disabling the cache -------------------- - -The are several ways to disable the cache. This is generally useful for development purposes. The cache can be disabled during build by either setting `MIOPEN_CACHE_DIR` to an empty string, or setting `BUILD_DEV=ON` when configuring cmake. The cache can also be disabled at runtime by setting the `MIOPEN_DISABLE_CACHE` environment variable to true. - -Updating MIOpen and removing the cache --------------------------------------- -For MIOpen version 2.3 and earlier, if the compiler changes, or the user modifies the kernels then the cache must be deleted for the MIOpen version in use; e.g., `rm -rf $HOME/.cache/miopen/`. More information about the cache can be found [here](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/cache.html). - -For MIOpen version 2.4 and later, MIOpen's kernel cache directory is versioned so that users' cached kernels will not collide when upgrading from earlier version. - -Installing pre-compiled kernels -------------------------------- -GPU architecture-specific pre-compiled kernel packages are available in the ROCm package repositories, to reduce the startup latency of MIOpen kernels. In essence, these packages have the kernel cache file mentioned above and install them in the ROCm installation directory along with other MIOpen artifacts. Thus, when launching a kernel, MIOpen will first check for the existence of a kernel in the kernel cache installed in the MIOpen installation directory. If the file does not exist or the required kernel is not found, the kernel is compiled and placed in the user's kernel cache. - -These packages are optional for the functioning of MIOpen and must be separately installed from MIOpen. Users who wish to conserve disk space may choose not to install these packages at the cost of higher startup latency. Users have the flexibility to only install kernel packages for installed device architecture, thus minimizing disk space usage. - -If MIOpen kernels package is not installed, or if we do not deliver the kernels suitable for the user's GPU, then the user will get warning message like this: -> MIOpen(HIP): Warning [SQLiteBase] Missing system database file:gfx906_60.kdb Performance may degrade - -The performance degradation mentioned in the warning only affects the network start-up time (aka "initial iteration time") and thus can be safely ignored. - -Please refer to the MIOpen installation instructions: [installing MIOpen kernels package](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/install.html#installing-miopen-kernels-package) for guidance on installing the MIOpen kernels package. From bae376fa86a7307d1dbb5f789b757107363b97a9 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:40:16 -0800 Subject: [PATCH 40/67] Delete docs/driver.md --- docs/driver.md | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 docs/driver.md diff --git a/docs/driver.md b/docs/driver.md deleted file mode 100644 index df090b4366..0000000000 --- a/docs/driver.md +++ /dev/null @@ -1,9 +0,0 @@ -## Building the driver - -MIOpen provides an [application-driver](https://github.com/ROCmSoftwarePlatform/MIOpen/tree/master/driver) which can be used to execute any one particular layer in isolation and measure performance and verification of the library. - -The driver can be built using the `MIOpenDriver` target: - -` cmake --build . --config Release --target MIOpenDriver ` **OR** ` make MIOpenDriver ` - -Documentation on how to run the driver is [here](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/driver.html). From 83a7bad7cc5e79875e779a90efdd4b0dcdfb2f34 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:40:36 -0800 Subject: [PATCH 41/67] Create index.rst --- docs/reference/index.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/reference/index.rst diff --git a/docs/reference/index.rst b/docs/reference/index.rst new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/docs/reference/index.rst @@ -0,0 +1 @@ + From b4130e0bcac895403b028ef6ba06cf39230c132a Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:40:54 -0800 Subject: [PATCH 42/67] Delete docs/find_and_immediate.md --- docs/find_and_immediate.md | 189 ------------------------------------- 1 file changed, 189 deletions(-) delete mode 100644 docs/find_and_immediate.md diff --git a/docs/find_and_immediate.md b/docs/find_and_immediate.md deleted file mode 100644 index 313ea9c26e..0000000000 --- a/docs/find_and_immediate.md +++ /dev/null @@ -1,189 +0,0 @@ -Find and Immediate Mode -======================= - - - -## Find API - -MIOpen contains several convolution algorithms for each stage of training or inference. Pre-MIOpen version 2.0 users needed to call Find methods in order generate a set of applicable algorithms. - -A typical workflow for the find stage: - -``` -miopenConvolutionForwardGetWorkSpaceSize(handle, - weightTensorDesc, - inputTensorDesc, - convDesc, - outputTensorDesc, - &maxWorkSpaceSize); - -// < allocate workspace > - - -// NOTE: -// miopenFindConvolution*() call is expensive in terms of execution time and required workspace. -// Therefore it is highly recommended to save off the selected algorithm and workspace required so that -// can be reused later within the lifetime of the same MIOpen handle object. -// In this way, there should be is no need to invoke miopenFind*() more than once per application lifetime. - -miopenFindConvolutionForwardAlgorithm(handle, - inputTensorDesc, - input_device_mem, - weightTensorDesc, - weight_device_mem, - convDesc, - outputTensorDesc, - output_device_mem,, - request_algo_count, - &ret_algo_count, - perf_results, - workspace_device_mem, - maxWorkSpaceSize, - 1); - -// < select fastest algorithm > - -// < free previously allocated workspace and allocate workspace required for the selected algorithm> - -miopenConvolutionForward(handle, &alpha, - inputTensorDesc, - input_device_mem, - weightTensorDesc, - weight_device_mem, - convDesc, - perf_results[0].fwd_algo, // use the fastest algo - &beta, - outputTensorDesc, - output_device_mem, - workspace_device_mem, - perf_results[0].memory); //workspace size -``` - - -The results of Find() are returned in an array of `miopenConvAlgoPerf_t` structs in order of performance, with the fastest at index 0. - -This call sequence is executed once per session as it is inherently expensive. Of those, `miopenFindConvolution*()` is the most expensive call. It caches its own results on disk, so the subsequent calls during the same MIOpen session will execute faster. However, it is better to remember results of `miopenFindConvolution*()` in the application, as recommended above. - -Internally MIOpen's Find calls will compile and benchmark a set of `solvers` contained in `miopenConvAlgoPerf_t` this is done in parallel per `miopenConvAlgorithm_t`. The level of parallelism can be controlled using an environment variable. See the debugging section [controlling parallel compilation](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/DebugAndLogging.html#controlling-parallel-compilation) for more details. - - -## Immediate Mode API - -MIOpen v2.0 introduces the immediate which removes the requirement for the `miopenFindConvolution*()` calls and their associated runtime costs. In this mode, the user can query the MIOpen runtime for all the supported _solutions_ for a given convolution configuration. These solutions may either be using the same algorithm or different ones. The sequence of operations for in immediate mode is similar to launching regular convolutions in MIOpen i.e. through the use of the `miopenFindConvolution*()` API. However, in this case the different APIs have much lower runtime cost. A typical convolution call would be similar to the following sequence of calls: - -* The user constructs the MIOpen handle and relevant descriptors such as the convolution descriptor as usual. -* With the above data structures, the user calls `miopenConvolution*GetSolutionCount` to get the **maximum** number of supported solutions for the convolution descriptor in question. -* The count obtained above is used to allocate memory for the `miopenConvSolution_t` structure introduced in MIOpen v2.0 -* The user calls `miopenConvolution*GetSolution` to populate the `miopenConvSolution_t` structures allocated above. The returned list is ordered in the order of best performance, thus the first element would be the fastest. -* While the above structure returns the amount of workspace required for an algorithm, the user may inquire the amount of a workspace required for a known solution id by using the `miopenConvolution*GetSolutionWorkspaceSize` API call. However, this is not a requirement, since the strucure returned by `miopenConvolution*GetSolution` would already have this information. -* Now the user may initiate the convolution operation in _immediate_ mode by calling `miopenConvolution*Immediate`. Which would populate the output tensor descriptor with the respective convolution result. However, the first call to `miopenConvolution*Immediate` may consume more time since the kernel may not be present in the kernel cache and may need to be compiled. -* Optionally, the user may compile the solution of choice by calling `miopenConvolution*CompileSolution` which would ensure that the kernel represented by the chosen solution is populated in the kernel cache a priori, removing the necessity for compiling the kernel in question. - - -``` -miopenConvolutionForwardGetSolutionCount(handle, - weightTensorDesc, - inputTensorDesc, - convDesc, - outputTensorDesc, - &solutionCount); - - -// < allocate an array of miopenConvSolution_t of size solutionCount > - - -miopenConvolutionForwardGetSolution(handle, - weightTensorDesc, - inputTensorDesc, - convDesc, - outputTensorDesc, - solutionCount, - &actualCount, - solutions); - -// < select a solution from solutions array > - -miopenConvolutionForwardGetSolutionWorkspaceSize(handle, - weightTensorDesc, - inputTensorDesc, - convDesc, - outputTensorDesc, - selected->solution_id, - &ws_size); - -// < allocate solution workspace of size ws_size > - - -// This stage is optional -miopenConvolutionForwardCompileSolution(handle, - weightTensorDesc, - inputTensorDesc, - convDesc, - outputTensorDesc, - selected->solution_id); - - - - miopenConvolutionForwardImmediate(handle, - weightTensor, - weight_device_mem, - inputTensorDesc, - input_device_mem, - convDesc, - outputTensorDesc, - output_device_mem, - workspace_device_mem, - ws_size, - selected->solution_id); -``` - -## Immediate Mode Fallback - -The immediate mode is underpinned by the [Find-Db](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/finddb.html), however it may not contain every configuration of interest. If Find-Db encounters a database miss it has two fallback paths it can take, depending on whether the cmake variable MIOPEN_ENABLE_AI_IMMED_MODE_FALLBACK is set to ON or OFF. However, if the user requires the best possible performance they should run the Find stage at least once. - -### 1. AI-based Heuristic Fallback (Default) - -If MIOPEN_ENABLE_AI_IMMED_MODE_FALLBACK is set to ON, which it is by default, Immediate Mode's behavior on a database miss is to use an AI-based heurisitic to pick the optimal solution. First, the applicability of the AI-based heuristic for the given configuration is checked. If the heuristic is applicable, it feeds various parameters of the given configuration into a neural network which has been tuned to predict the optimal solution with 90% accuracy. - -### 2. Weighted Throughput Index Based Fallback - -When MIOPEN_ENABLE_AI_IMMED_MODE_FALLBACK is set to OFF, or the AI Heuristic is not applicable for the given convolution configuration, Immediate mode's behavior on encountering a database miss is to use a Weighted Thoughput Index (WTI) based mechanism to estimate which solution would be optimal based upon parameters of the convolution configuration. - - - -## Limitations of Immediate Mode - -### Architectual Limitations -The system Find-Db has only been populated for the following architectures: - * gfx906 with 64 CUs - * gfx906 with 60 CUs - * gfx900 with 64 CUs - * gfx900 with 56 CUs - -If the user's architecture is not listed above they will need to run the Find API once on their system per application in order to take advantage of immediate mode's more efficient behavior. - - -### Backend Limitations - -OpenCL support for immediate mode via the fallback is limited to fp32 datatypes. This is because this current release's fallback path goes through GEMM which on the OpenCL is serviced through MIOpenGEMM -- which itself only contains support for fp32. The HIP backend uses rocBLAS as its fallback path which contains a richer set of datatypes. - - -### Find Modes - -MIOpen provides a set of Find modes which are used to accelerate the Find calls. The different modes are set by using the environment variable `MIOPEN_FIND_MODE`, and setting it to one of the values: - -- `NORMAL`, or `1`: Normal Find: This is the full Find mode call, which will benchmark all the solvers and return a list. -- `FAST`, or `2`: Fast Find: Checks the [Find-Db](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/finddb.html) for an entry. If there is a Find-Db hit, use that entry. If there is a miss, utilize the Immediate mode fallback. If Start-up times are expected to be faster, but worse GPU performance. -- `HYBRID`, or `3`, or unset `MIOPEN_FIND_MODE`: Hybrid Find: Checks the [Find-Db](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/finddb.html) for an entry. If there is a Find-Db hit, use that entry. If there is a miss, use the existing Find machinery. Slower start-up times than Fast Find, but no GPU performance drop. -- `4`: This value is reserved and should not be used. -- `DYNAMIC_HYBRID`, or `5`: Dynamic Hybrid Find: Checks the [Find-Db](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/finddb.html) for an entry. If there is a Find-Db hit, uses that entry. If there is a miss, uses the existing Find machinery with skipping non-dynamic kernels. Faster start-up times than Hybrid Find, but GPU performance may be a bit worse. - - Currently, the default Find mode is `DYNAMIC_HYBRID`. To run the full `NORMAL` Find mode, set the environment as: - ``` - export MIOPEN_FIND_MODE=NORMAL - ``` - Or, - ``` - export MIOPEN_FIND_MODE=1 - ``` - From 3a194ee53624226edbbfbb4ef5ae0aea2b26a3f0 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:41:07 -0800 Subject: [PATCH 43/67] Delete docs/embed.md --- docs/embed.md | 100 -------------------------------------------------- 1 file changed, 100 deletions(-) delete mode 100644 docs/embed.md diff --git a/docs/embed.md b/docs/embed.md deleted file mode 100644 index f5cf0d9ae3..0000000000 --- a/docs/embed.md +++ /dev/null @@ -1,100 +0,0 @@ - -Building MIOpen for Embedded Systems -==================================== - - - -### Install dependencies -Install minimum dependencies (default location /usr/local): -``` -cmake -P install_deps.cmake --minimum --prefix /some/local/dir -``` - -Create build directory: -``` -mkdir build; cd build; -``` - -### Configuring for an embedded build -Minimal static build configuration line without embedded precompiled kernels package, or Find-Db: -``` -CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_BACKEND=HIP -DMIOPEN_EMBED_BUILD=On -DCMAKE_PREFIX_PATH="/some/local/dir" .. -``` - -To enable HIP kernels in MIOpen while using embedded builds add: `-DMIOPEN_USE_HIP_KERNELS=On` to the configure line. -For example: -``` -CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_BACKEND=HIP -DMIOPEN_USE_HIP_KERNELS=On -DMIOPEN_EMBED_BUILD=On -DCMAKE_PREFIX_PATH="/some/local/dir" .. -``` - - -### Embedding Find-Db and Performance database: -The Find-db provides a database of known convolution inputs. This allows user to have the best tuned kernels for their network. Embedding find-db requires a semi-colon separated list of architecture CU pairs to embed on-disk DBs in the binary; e.g., gfx906_60;gfx900_56. - -Example: -``` -CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_EMBED_BUILD=On -DMIOPEN_EMBED_DB=gfx900_56 .. -``` - -This will configure the build directory for embedding not just the find-db, but also the performance database. - -### Embedding the precompiled kernels package: -To prevent the loss of performance due to compile time overhead, a build of MIOpen can take advantage of embedding the precompiled kernels package. The precompiled kernels package contains convolution kernels of known inputs and allows the user to avoid compiling kernels during runtime. - -### Embedding precompiled package - -#### Using a package install -To install the precompiled kernels package use the command: -``` -apt-get install miopenkernels-- -``` -Where `` is the GPU architecture (for example, gfx900, gfx906) and `` is the number of CUs available in the GPU (for example 56 or 64 etc). - -Not installing the precompiled kernel package would not impact the functioning of MIOpen, since MIOpen will compile these kernels on the target machine once the kernel is run, however, the compilation step may significantly increase the startup time for different operations. - -The script `utils/install_precompiled_kernels.sh` provided as part of MIOpen automates the above process, it queries the user machine for the GPU architecture and then installs the appropriate package. It may be invoked as: -``` -./utils/install_precompiled_kernels.sh -``` - -To embed the precompiled kernels package, configure cmake using the `MIOPEN_BINCACHE_PATH` -Example: -``` -CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_BINCACHE_PATH=/path/to/package/install -DMIOPEN_EMBED_BUILD=On .. -``` - -#### Using the URL to a kernels binary -Alternatively, the flag `MIOPEN_BINCACHE_PATH` can be used with a URL that contains the binary. -Example: -``` -CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_BINCACHE_PATH=/URL/to/binary -DMIOPEN_EMBED_BUILD=On .. -``` - -Precompiled kernels packages are installed in `/opt/rocm/miopen/share/miopen/db`. -An example with the architecture gfx900 with 56 compute units: -``` -CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_BINCACHE_PATH=/opt/rocm/miopen/share/miopen/db/gfx900_56.kdb -DMIOPEN_EMBED_BUILD=On .. -``` - - -As of ROCm 3.8 / MIOpen 2.7 precompiled kernels binaries are located at [repo.radeon.com](http://repo.radeon.com/rocm/miopen-kernel/) -For example for the architecture gfx906 with 64 compute units: -``` -CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_BINCACHE_PATH=http://repo.radeon.com/rocm/miopen-kernel/rel-3.8/gfx906_60.kdb -DMIOPEN_EMBED_BUILD=On .. -``` - -### Full configuration line: -Putting it all together, building MIOpen statically, and embedding the performance database, find-db, and the precompiled kernels binary: -``` -CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_BINCACHE_PATH=/path/to/package/install -DMIOPEN_EMBED_BUILD=On -DMIOPEN_EMBED_DB=gfx900_56 .. -``` - -After configuration is complete, run: -``` -make -j -``` - - - - - From bc5e1967b75c8faf6ab3ea23cc8a98eee00ca3da Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:41:49 -0800 Subject: [PATCH 44/67] Delete docs/finddb.md --- docs/finddb.md | 44 -------------------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 docs/finddb.md diff --git a/docs/finddb.md b/docs/finddb.md deleted file mode 100644 index e6e21b901b..0000000000 --- a/docs/finddb.md +++ /dev/null @@ -1,44 +0,0 @@ -Find-Db Database -================ - -Prior to MIOpen 2.0, users utilized calls such as `miopenFindConvolution*Algorithm()` to gather a set of convolution algorithms in the form of an array of `miopenConvSolution_t` structs. This process is time consuming because it requires online benchmarking of competing algorithms. In MIOpen 2.0 an [immediate mode](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/find_and_immediate.html) is introduced. - -Immediate mode is based on a database which contains the results of calls to the legacy Find() stage. This database is called `Find-Db`. It consists of two parts: -- **System Find-Db**, a system-wide storage which holds the pre-run values for the most applicable configurations, -- **User Find-Db**, a per-user storage which is intended to hold results for arbitrary user-run configurations. It also performs double duty as a cache for the Find() stage. - -The User Find-Db **always takes precedence** over System Find-Db. - -By default, System Find-Db resides within MIOpen's install location, while User Find-Db resides in the user's home directory. See [Setting up locations](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/install.html#setting-up-locations) for more information. - - * The System Find-Db is *not* modified upon installation of MIOpen. - * There are separate Find databases for HIP and OpenCL backends. - -### Populating the User Find-Db - -MIOpen collects Find-db information during the following MIOpen API calls: -- `miopenFindConvolutionForwardAlgorithm()` -- `miopenFindConvolutionBackwardDataAlgorithm()` -- `miopenFindConvolutionBackwardWeightsAlgorithm()` - -During the call, find data entries are collected for one _problem configuration_ (implicitly defined by the tensor descriptors and convolution descriptor passed to API function). - - -### Updating MIOpen and the User Find-Db - -When the user installs a new version of MIOpen, the new version of MIOpen will _ignore_ old **User find-db*** files. Thus, the user is _not required_ to move or delete their old User find-db files. However, the user may wish to re-collect the information into their brand new **User find-db**. This should be done in the same way as it was done with the previous version of the library -- _if_ it was done. This would keep Immediate mode optimized. - - -### Disabling Find-Db - -By default MIOpen will use the Find-Db. Users can disable the Find-Db by setting the environmental variable `MIOPEN_DEBUG_DISABLE_FIND_DB` to 1: -``` -export MIOPEN_DEBUG_DISABLE_FIND_DB=1 -``` - -**Note:** The System Find-Db has the ability to be cached into memory and may increase performance dramatically. To disable this option use the cmake configuration flag: -``` --DMIOPEN_DEBUG_FIND_DB_CACHING=Off -``` - - From cc59168aea308884f4a900f958fdb0f56d697510 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:42:01 -0800 Subject: [PATCH 45/67] Delete docs/install.md --- docs/install.md | 77 ------------------------------------------------- 1 file changed, 77 deletions(-) delete mode 100644 docs/install.md diff --git a/docs/install.md b/docs/install.md deleted file mode 100644 index 0932cd2563..0000000000 --- a/docs/install.md +++ /dev/null @@ -1,77 +0,0 @@ -## Prerequisites - -* More information about ROCm stack via [ROCm Information Portal](https://docs.amd.com/). -* A ROCm enabled platform, more info [here](https://rocm.github.io/install.html). -* Base software stack, which includes: - * HIP - - * HIP and HCC libraries and header files. - * OpenCL - OpenCL libraries and header files. -* [MIOpenGEMM](https://github.com/ROCmSoftwarePlatform/MIOpenGEMM) - enable various functionalities including transposed and dilated convolutions. - * This is optional on the HIP backend, and required on the OpenCL backend. - * Users can enable this library using the cmake configuration flag `-DMIOPEN_USE_MIOPENGEMM=On`, which is enabled by default when OpenCL backend is chosen. -* [ROCm cmake](https://github.com/RadeonOpenCompute/rocm-cmake) - provide cmake modules for common build tasks needed for the ROCM software stack. -* [Half](http://half.sourceforge.net/) - IEEE 754-based half-precision floating point library -* [Boost](http://www.boost.org/) - * MIOpen uses `boost-system` and `boost-filesystem` packages to enable persistent [kernel cache](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/cache.html) - * Version 1.79 is recommended, older version may need patches to work on newer systems, e.g. boost1{69,70,72} w/glibc-2.34 -* [SQLite3](https://sqlite.org/index.html) - reading and writing performance database -* [MIOpenTENSILE](https://github.com/ROCmSoftwarePlatform/MIOpenTensile) - users can enable this library using the cmake configuration flag`-DMIOPEN_USE_MIOPENTENSILE=On`. (deprecated after ROCm 5.1.1) -* [rocBLAS](https://github.com/ROCmSoftwarePlatform/rocBLAS) - AMD library for Basic Linear Algebra Subprograms (BLAS) on the ROCm platform. - * Minimum version branch for pre-ROCm 3.5 [master-rocm-2.10](https://github.com/ROCmSoftwarePlatform/rocBLAS/tree/master-rocm-2.10) - * Minimum version branch for post-ROCm 3.5 [master-rocm-3.5](https://github.com/ROCmSoftwarePlatform/rocBLAS/releases/tag/rocm-3.5.0) -* [MLIR](https://github.com/ROCmSoftwarePlatform/llvm-project-mlir) - (Multi-Level Intermediate Representation) with its MIOpen dialect to support and complement kernel development. -* [Composable Kernel](https://github.com/ROCmSoftwarePlatform/composable_kernel) - C++ templated device library for GEMM-like and reduction-like operators. - -## Installing MIOpen with pre-built packages - -MIOpen can be installed on Ubuntu using `apt-get`. - -For OpenCL backend: `apt-get install miopen-opencl` - -For HIP backend: `apt-get install miopen-hip` - -Currently both the backends cannot be installed on the same system simultaneously. If a different backend other than what currently exists on the system is desired, please uninstall the existing backend completely and then install the new backend. - -## Installing MIOpen kernels package - -MIOpen provides an optional pre-compiled kernels package to reduce the startup latency. These precompiled kernels comprise a select set of popular input configurations and will expand in future release to contain additional coverage. - -Note that all compiled kernels are locally cached in the folder `$HOME/.cache/miopen/`, so precompiled kernels reduce the startup latency only for the first execution of a neural network. Precompiled kernels do not reduce startup time on subsequent runs. - -To install the kernels package for your GPU architecture, use the following command: - -``` -apt-get install miopenkernels-- -``` - -Where `` is the GPU architecture ( for example, `gfx900`, `gfx906`, `gfx1030` ) and `` is the number of CUs available in the GPU (for example 56 or 64 etc). - -Not installing these packages would not impact the functioning of MIOpen, since MIOpen will compile these kernels on the target machine once the kernel is run. However, the compilation step may significantly increase the startup time for different operations. - -The script `utils/install_precompiled_kernels.sh` provided as part of MIOpen automates the above process, it queries the user machine for the GPU architecture and then installs the appropriate package. It may be invoked as: - -``` -./utils/install_precompiled_kernels.sh -``` - -The above script depends on the __rocminfo__ package to query the GPU architecture. - -More info can be found [here](https://github.com/ROCmSoftwarePlatform/MIOpen/blob/develop/docs/cache.md#installing-pre-compiled-kernels). - -## Installing the dependencies - -The dependencies can be installed with the `install_deps.cmake`, script: `cmake -P install_deps.cmake` - -This will install by default to `/usr/local` but it can be installed in another location with `--prefix` argument: -``` -cmake -P install_deps.cmake --prefix -``` -An example cmake step can be: -``` -cmake -P install_deps.cmake --minimum --prefix /root/MIOpen/install_dir -``` -This prefix can used to specify the dependency path during the configuration phase using the `CMAKE_PREFIX_PATH`. - -* MIOpen's HIP backend uses [rocBLAS](https://github.com/ROCmSoftwarePlatform/rocBLAS) by default. Users can install rocBLAS minimum release by using `apt-get install rocblas`. To disable using rocBLAS set the configuration flag `-DMIOPEN_USE_ROCBLAS=Off`. rocBLAS is *not* available for the OpenCL backend. - -* MIOpen's OpenCL backend uses [MIOpenGEMM](https://github.com/ROCmSoftwarePlatform/MIOpenGEMM) by default. Users can install MIOpenGEMM minimum release by using `apt-get install miopengemm`. From c516a261cba0cb1226192d41bd6829516e6279a7 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:43:00 -0800 Subject: [PATCH 46/67] Delete docs/perfdatabase.md --- docs/perfdatabase.md | 67 -------------------------------------------- 1 file changed, 67 deletions(-) delete mode 100644 docs/perfdatabase.md diff --git a/docs/perfdatabase.md b/docs/perfdatabase.md deleted file mode 100644 index f28671b1dc..0000000000 --- a/docs/perfdatabase.md +++ /dev/null @@ -1,67 +0,0 @@ -Performance Database -==================== - -Many of MIOpen kernels have parameters which affect their performance. Setting these parameters to optimal values allows reaching the best possible throughput. These optimal values depend on many things, including network configuration, GPU type, clock frequencies, ROCm version etc. Because of these dependencies and also due to enormous number of possible network configurations, it is virtually impossible to supply all values that users may need together with the library. Instead, MIOpen provides a set of pre-tuned values for the _most applicable_ network configurations, **and** also means for expanding the set of optimized values. MIOpen's performance database contains these pre-tuned parameter values as well as optimized parameters tuned by users. - -The performance database consists of two parts: -- **System Performance Database**, a system-wide storage which holds the pre-tuned values for the most applicable configurations, -- **User Performance Database**, a per-user storage which is intended to hold optimized values for arbitrary configurations. - -User PerfDb **always takes precedence** over System PerfDb. - -MIOpen also has auto-tuning functionality, which is able to find optimized kernel parameter values for a specific configuration. The auto-tune process may take a substantial amount of time, however, once the optimized values are found, they are stored in the User PerfDb. MIOpen then will automatically read and use these parameter values when needed again instead of running the expensive auto-tuning search. - -By default, System PerfDb resides within MIOpen's install location, while User PerfDb resides in the user's home directory. See [Setting up locations](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/install.html#setting-up-locations) for more information. - -The System PerfDb is not modified upon installation of MIOpen. - -## Auto-tuning the kernels. - -MIOpen performs auto-tuning during the following MIOpen API calls: -- `miopenFindConvolutionForwardAlgorithm()` -- `miopenFindConvolutionBackwardDataAlgorithm()` -- `miopenFindConvolutionBackwardWeightsAlgorithm()` - -During the call, auto-tuning is performed only for one _problem configuration_ (implicitly defined by the tensor descriptors passed to API function). - -The following conditions must be met for the auto-tune to begin: -- The applicable kernel(s) has tuning parameters. -- The passed value of `exhaustiveSearch` parameter is `true`, and -- Both System and User PerfDb do not yet contain values for the relevant _problem configuration_. - -The latter two conditions may be overridden by _enforcing_ the search by means of the following environment variable: -- `MIOPEN_FIND_ENFORCE` - -This variable may also be used for _removing_ values from User PerfDb, see below. - -### MIOPEN_FIND_ENFORCE - -Both symbolic (case-insensitive) and numeric values are supported. - -**NONE (1)** - -Setting the value to "NONE", or "1" will have no change in the default behavior. - -**DB_UPDATE (2)** - -Auto-tune will not be skipped even if PerfDb already contains optimized values. If auto-tune is requested via API, then MIOpen will perform it and update PerfDb. - -This mode can be used for fine-tuning the MIOpen installation on the user's system. When MIOpen is in this mode, the applications that use it may take quite long to finish. - -**SEARCH (3)** - -MIOpen will perform auto-tune even if not requested via MIOpen API. In other words, the library will behave as if `exhaustiveSearch` parameter set to `true` even this is not really so. If optimized values already reside in PerfDb, then auto-tune will not be performed. - -This mode allows for tuning the apps that do not anticipate means for getting the best performance from MIOpen. When MIOpen is in this mode, the first run of the user's app may take substantially longer time than expected. - -**SEARCH_DB_UPDATE (4)** - -A combination of SEARCH and DB_UPDATE. MIOpen performs auto-tune (and updates User PerfDb) on each `miopenFindConvolution*()` call. It is not recommended to use this mode except for debugging purposes. - -**DB_CLEAN (5)** - -Use with care. MIOpen **removes** optimized values related to given _problem configuration_ from the User PerfDb. Auto-tune is blocked, even if it is explicitly requested. System PerfDb left intact. - -### Updating MIOpen and the User Db - -It is important to note that if the user installs a new version of MIOpen, it is recommended that the user move, or delete their old user performance database file. This will prevent older database entries from poluting the configurations shipped with the newer system database. The user perf db is named `miopen.udb` and is located at the user perf db path. From e26621d69c42752c59c0c893de6c7f10ea3baca3 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:45:12 -0800 Subject: [PATCH 47/67] Update _toc.yml.in --- docs/sphinx/_toc.yml.in | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 716e947717..2ead1deec5 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -30,9 +30,9 @@ subtrees: - caption: Quick-start entries: - - file: tutorials/install.rst - - file: tutorials/embed.rst - - file: tutorials/driver.rst + - file: tutorials/quick-start/install.rst + - file: tutorials/quick-start/embed.rst + - file: tutorials/quick-start/driver.rst - caption: API reference entries: - file: reference/apireference.rst From 2ec24605886a75b1280b2c15bb9df159da60e282 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:51:04 -0800 Subject: [PATCH 48/67] Update _toc.yml.in --- docs/sphinx/_toc.yml.in | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 2ead1deec5..72599bf386 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -19,6 +19,7 @@ root: index subtrees: - caption: What is MIOpen? entries: + - file: tutorials/what-is-MIOpen.rst - file: tutorials/find_and_immediate.rst - file: tutorials/finddb.rst - file: tutorials/cache.rst @@ -33,10 +34,13 @@ subtrees: - file: tutorials/quick-start/install.rst - file: tutorials/quick-start/embed.rst - file: tutorials/quick-start/driver.rst + + - file: reference/index.rst - caption: API reference entries: - file: reference/apireference.rst title: API library + - file: tutorials/index.rst - caption: Tutorials entries: From 1762b8b1fc987e14f6620a54c523beb0cecf29d3 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:54:03 -0800 Subject: [PATCH 49/67] Update _toc.yml.in --- docs/sphinx/_toc.yml.in | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 72599bf386..533b7b2b85 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -21,13 +21,21 @@ subtrees: entries: - file: tutorials/what-is-MIOpen.rst - file: tutorials/find_and_immediate.rst + title: Find and Immediate - file: tutorials/finddb.rst + title: Find database - file: tutorials/cache.rst + title: Cache - file: tutorials/perfdatabase.rst + title: Performance database - file: tutorials/Getting_Started_FusionAPI.rst + title: Getting started with Fusion API - file: tutorials/DebugAndLogging.rst + title: Debugging and logging - file: tutorials/MI200AlternateImplementation.rst + title: Alternate implementation for MI200 - file: tutorials/MIOpen_Porting_Guide.rst + title: MIOpen Porting Guide - caption: Quick-start entries: From df8b27df3e830dbc277e4848990083792d6e53bc Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:55:45 -0800 Subject: [PATCH 50/67] Update _toc.yml.in Add titles --- docs/sphinx/_toc.yml.in | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 533b7b2b85..ac43aa62db 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -40,8 +40,11 @@ subtrees: - caption: Quick-start entries: - file: tutorials/quick-start/install.rst + title: MIOpen installation - file: tutorials/quick-start/embed.rst + title: Embedding MIOpen - file: tutorials/quick-start/driver.rst + title: About MIOpen driver - file: reference/index.rst - caption: API reference From 71f26d3c05b3290a0787d8895d7142edd630348e Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:59:44 -0800 Subject: [PATCH 51/67] Update _toc.yml.in --- docs/sphinx/_toc.yml.in | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index ac43aa62db..04007355c9 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -38,13 +38,14 @@ subtrees: title: MIOpen Porting Guide - caption: Quick-start - entries: - - file: tutorials/quick-start/install.rst - title: MIOpen installation - - file: tutorials/quick-start/embed.rst - title: Embedding MIOpen - - file: tutorials/quick-start/driver.rst - title: About MIOpen driver + subtrees: + - entries: + - file: tutorials/quick-start/install.rst + title: MIOpen installation + - file: tutorials/quick-start/embed.rst + title: Embedding MIOpen + - file: tutorials/quick-start/driver.rst + title: About MIOpen driver - file: reference/index.rst - caption: API reference @@ -54,11 +55,14 @@ subtrees: - file: tutorials/index.rst - caption: Tutorials - entries: - - file: tutorials/install.rst - - file: tutorials/embed.rst - - file: tutorials/driver.rst - title: Installing and building MIOpen + subtrees + - entries: + - file: tutorials/install.rst + title: MIOpen installation + - file: tutorials/embed.rst + title: Embedding MIOpen + - file: tutorials/driver.rst + title: Installing and building MIOpen Contributing to this documentation ======================================================= From 671a10c3d9db3a7ec8c8157600c1f56cb0287a28 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 20:17:48 -0800 Subject: [PATCH 52/67] Update index.rst --- docs/reference/index.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 8b13789179..c087fa8d59 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -1 +1,5 @@ +Indices and tables +================== +* :ref:`genindex` +* :ref:`search` From 98c702efcecde732aed83c1fb8dd491f7668c268 Mon Sep 17 00:00:00 2001 From: srawat <120587655+SwRaw@users.noreply.github.com> Date: Tue, 12 Dec 2023 10:52:45 +0530 Subject: [PATCH 53/67] Update .readthedocs.yaml --- .readthedocs.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 9e6678abe5..fd918a157d 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -6,13 +6,13 @@ version: 2 sphinx: configuration: docs/conf.py -formats: [htmlzip, pdf, epub] +formats: [htmlzip, pdf] python: install: - requirements: docs/sphinx/requirements.txt build: - os: ubuntu-22.04 + os: ubuntu-20.04 tools: python: "3.8" From 3cf98ef210e34a54cd58eb51f63be0eabcfe6721 Mon Sep 17 00:00:00 2001 From: srawat <120587655+SwRaw@users.noreply.github.com> Date: Tue, 12 Dec 2023 10:58:58 +0530 Subject: [PATCH 54/67] Update _toc.yml.in --- docs/sphinx/_toc.yml.in | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 04007355c9..524b6caa51 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -2,13 +2,9 @@ :description: MIOpen documentation and API reference library :keywords: MIOpen, ROCm, API, documentation -******************************************************************** -MIOpen documentation -******************************************************************** +# Welcome to the MIOpen docs home page! To learn more, see :ref:`what-is-MIOpen` -Welcome to the MIOpen docs home page! To learn more, see :ref:`what-is-MIOpen` - -Our documentation is structured as follows: +# Our documentation is structured as follows: # Anywhere {branch} is used, the branch name will be substituted. # These comments will also be removed. From 0ffdf184438750c03a33e3bd587c7751bae4c0f0 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Mon, 11 Dec 2023 21:55:14 -0800 Subject: [PATCH 55/67] Update _toc.yml.in --- docs/sphinx/_toc.yml.in | 70 +++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 44 deletions(-) diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 524b6caa51..8e785f01bd 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -1,38 +1,32 @@ -.. meta:: - :description: MIOpen documentation and API reference library - :keywords: MIOpen, ROCm, API, documentation - -# Welcome to the MIOpen docs home page! To learn more, see :ref:`what-is-MIOpen` - -# Our documentation is structured as follows: - # Anywhere {branch} is used, the branch name will be substituted. # These comments will also be removed. defaults: numbered: False maxdepth: 7 root: index -subtrees: - caption: What is MIOpen? - entries: - - file: tutorials/what-is-MIOpen.rst - - file: tutorials/find_and_immediate.rst - title: Find and Immediate - - file: tutorials/finddb.rst - title: Find database - - file: tutorials/cache.rst - title: Cache - - file: tutorials/perfdatabase.rst - title: Performance database - - file: tutorials/Getting_Started_FusionAPI.rst - title: Getting started with Fusion API - - file: tutorials/DebugAndLogging.rst - title: Debugging and logging - - file: tutorials/MI200AlternateImplementation.rst - title: Alternate implementation for MI200 - - file: tutorials/MIOpen_Porting_Guide.rst - title: MIOpen Porting Guide + subtrees: + - entries: + - file: tutorials/what-is-MIOpen.rst + title: What is MIOpen? + - file: tutorials/find_and_immediate.rst + title: Find and Immediate + - file: tutorials/finddb.rst + title: Find database + - file: tutorials/cache.rst + title: Cache + - file: tutorials/perfdatabase.rst + title: Performance database + - file: tutorials/Getting_Started_FusionAPI.rst + title: Getting started with Fusion API + - file: tutorials/DebugAndLogging.rst + title: Debugging and logging + - file: tutorials/MI200AlternateImplementation.rst + title: Alternate implementation for MI200 + - file: tutorials/MIOpen_Porting_Guide.rst + title: MIOpen Porting Guide + - file: tutorials/index.rst - caption: Quick-start subtrees: - entries: @@ -45,13 +39,14 @@ subtrees: - file: reference/index.rst - caption: API reference - entries: - - file: reference/apireference.rst - title: API library + subtrees: + - entries + - file: reference/apireference.rst + title: API library - file: tutorials/index.rst - caption: Tutorials - subtrees + subtrees: - entries: - file: tutorials/install.rst title: MIOpen installation @@ -60,16 +55,3 @@ subtrees: - file: tutorials/driver.rst title: Installing and building MIOpen -Contributing to this documentation -======================================================= - -We welcome collaboration! If you'd like to contribute to our documentation, you can find instructions -in our `Contributing to ROCm `_ section, which discusses: - -* `Document structure `_ -* `Toolchains `_ -* `Documentation builds `_ -* `How to provide feedback `_ - -Licensing information for all ROCm components is listed on our -`Licensing `_ page. From 6af3b718757076522cd173df18ab2b711f15cb1a Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Tue, 12 Dec 2023 07:09:15 -0800 Subject: [PATCH 56/67] Update _toc.yml.in From 64d6b3acacfa4d55dd685e13820d3a8430fe6a77 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Tue, 12 Dec 2023 07:53:30 -0800 Subject: [PATCH 57/67] Update what-is-MIOpen.rst --- docs/tutorials/what-is-MIOpen.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/what-is-MIOpen.rst b/docs/tutorials/what-is-MIOpen.rst index b31c022f60..b28f2f9c93 100644 --- a/docs/tutorials/what-is-MIOpen.rst +++ b/docs/tutorials/what-is-MIOpen.rst @@ -13,5 +13,5 @@ What is MIOpen? MIOpen is AMD’s deep learning primitives library, which provides highly optimized, and hand-tuned implementations of different operators such as convolution, batch normalization, pooling, softmax, activation and layers for Recurrent Neural Networks (RNNs), used in both training and inference. Moreover, MIOpen is fully open-source including all its -GPU kernels; complementing AMD’s open-source ROCm stack [10]. MIOpen is the first to extend the open-source +GPU kernels; complementing AMD’s open source ROCm stack [10]. MIOpen is the first to extend the open source advantage into GPU vendor libraries thereby, continuing to embark on the same ethos as the deep learning community. From e75956ff3bc5519e99619f852d3dca5c9b32e4a2 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Tue, 12 Dec 2023 08:05:19 -0800 Subject: [PATCH 58/67] Update what-is-MIOpen.rst --- docs/tutorials/what-is-MIOpen.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/tutorials/what-is-MIOpen.rst b/docs/tutorials/what-is-MIOpen.rst index b28f2f9c93..7eea4610d3 100644 --- a/docs/tutorials/what-is-MIOpen.rst +++ b/docs/tutorials/what-is-MIOpen.rst @@ -12,6 +12,6 @@ What is MIOpen? MIOpen is AMD’s deep learning primitives library, which provides highly optimized, and hand-tuned implementations of different operators such as convolution, batch normalization, pooling, softmax, activation and layers for Recurrent Neural -Networks (RNNs), used in both training and inference. Moreover, MIOpen is fully open-source including all its -GPU kernels; complementing AMD’s open source ROCm stack [10]. MIOpen is the first to extend the open source +Networks (RNNs), used in both training and inference. Moreover, MIOpen is fully open source including all its +GPU kernels; complementing AMD’s open source ROCm stack. MIOpen is the first to extend the open source advantage into GPU vendor libraries thereby, continuing to embark on the same ethos as the deep learning community. From e79fa33e4be65f8d5cd600a8b6f44b949b1ccf70 Mon Sep 17 00:00:00 2001 From: Roopa Malavally <56051583+Rmalavally@users.noreply.github.com> Date: Tue, 12 Dec 2023 08:06:12 -0800 Subject: [PATCH 59/67] Update what-is-MIOpen.rst Minor tweaks --- docs/tutorials/what-is-MIOpen.rst | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/tutorials/what-is-MIOpen.rst b/docs/tutorials/what-is-MIOpen.rst index 7eea4610d3..1ef06dd523 100644 --- a/docs/tutorials/what-is-MIOpen.rst +++ b/docs/tutorials/what-is-MIOpen.rst @@ -10,8 +10,4 @@ What is MIOpen? ********************* -MIOpen is AMD’s deep learning primitives library, which provides highly optimized, and hand-tuned implementations of -different operators such as convolution, batch normalization, pooling, softmax, activation and layers for Recurrent Neural -Networks (RNNs), used in both training and inference. Moreover, MIOpen is fully open source including all its -GPU kernels; complementing AMD’s open source ROCm stack. MIOpen is the first to extend the open source -advantage into GPU vendor libraries thereby, continuing to embark on the same ethos as the deep learning community. +MIOpen is AMD’s deep learning primitives library, which provides highly optimized and hand-tuned implementations of different operators such as convolution, batch normalization, pooling, softmax, activation and layers for Recurrent Neural Networks (RNNs), used in both training and inference. Moreover, MIOpen is fully open source, including all its GPU kernels, complementing AMD’s open source ROCm stack. MIOpen is the first to extend the open source advantage into GPU vendor libraries thereby, continuing to embark on the same ethos as the deep learning community. From 054b3d1d7e39d61ce4660ec9f349c4c255a351b5 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 18 Jan 2024 22:59:55 +0000 Subject: [PATCH 60/67] Rmalavally-patch-1 : edit index.rst file and added install.rst --- docs/reference/index.rst | 16 ++++-- docs/tutorials/quick-start/install.rst | 71 ++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 4 deletions(-) create mode 100644 docs/tutorials/quick-start/install.rst diff --git a/docs/reference/index.rst b/docs/reference/index.rst index c087fa8d59..57a20a2af2 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -1,5 +1,13 @@ Indices and tables -================== - -* :ref:`genindex` -* :ref:`search` +======== +.. toctree:: + what-is-MIOpen + find_and_immediate + finddb + cache + perfdatabase + Getting_Started_FusionAPI + DebugAndLogging + MI200AlternateImplementation + MIOpen_Porting_Guide + \ No newline at end of file diff --git a/docs/tutorials/quick-start/install.rst b/docs/tutorials/quick-start/install.rst new file mode 100644 index 0000000000..371f3e331c --- /dev/null +++ b/docs/tutorials/quick-start/install.rst @@ -0,0 +1,71 @@ +## Building MIOpen from source + +### Configuring with cmake + +First create a build directory: + +```shell +mkdir build; cd build; +``` + +Next configure cmake. The preferred backend for MIOpen can be set using the `-DMIOPEN_BACKEND` cmake variable. + +### For the HIP backend (ROCm 3.5 and later), run + +Set the C++ compiler to `clang++`. + +```shell +export CXX= +cmake -DMIOPEN_BACKEND=HIP -DCMAKE_PREFIX_PATH=";;" .. +``` + +An example cmake step can be: + +```shell +export CXX=/opt/rocm/llvm/bin/clang++ && \ +cmake -DMIOPEN_BACKEND=HIP -DCMAKE_PREFIX_PATH="/opt/rocm/;/opt/rocm/hip;/root/MIOpen/install_dir" .. +``` + +Note: When specifying the path for the `CMAKE_PREFIX_PATH` variable, **do not** use the `~` shorthand for the user home directory. + +### For OpenCL, run + +```shell +cmake -DMIOPEN_BACKEND=OpenCL .. +``` + +The above assumes that OpenCL is installed in one of the standard locations. If not, then manually set these cmake variables: + +```shell +cmake -DMIOPEN_BACKEND=OpenCL -DMIOPEN_HIP_COMPILER= -DOPENCL_LIBRARIES= -DOPENCL_INCLUDE_DIRS= .. +``` + +And an example setting the dependency path for an envirnment in ROCm 3.5 and later: + +```shell +cmake -DMIOPEN_BACKEND=OpenCL -DMIOPEN_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ -DCMAKE_PREFIX_PATH="/opt/rocm/;/opt/rocm/hip;/root/MIOpen/install_dir" .. +``` + +### Setting Up Locations + +By default the install location is set to '/opt/rocm', this can be set by using `CMAKE_INSTALL_PREFIX`: + +```shell +cmake -DMIOPEN_BACKEND=OpenCL -DCMAKE_INSTALL_PREFIX= .. +``` + +## Using docker + +The easiest way is to use docker. You can build the top-level docker file: + +```shell +docker build -t miopen-image . +``` + +Then to enter the development environment use `docker run`, for example: + +```shell +docker run -it -v $HOME:/data --privileged --rm --device=/dev/kfd --device /dev/dri:/dev/dri:rw --volume /dev/dri:/dev/dri:rw -v /var/lib/docker/:/var/lib/docker --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined miopen-image +``` + +Prebuilt docker images can be found on [ROCm's public docker hub here](https://hub.docker.com/r/rocm/miopen/tags). \ No newline at end of file From d3cc79b0b552281834f3129aa7087d7abf126613 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 18 Jan 2024 23:16:15 +0000 Subject: [PATCH 61/67] Rmalavally-patch-1 : fix code block in install.rst --- docs/tutorials/quick-start/install.rst | 36 +++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/docs/tutorials/quick-start/install.rst b/docs/tutorials/quick-start/install.rst index 371f3e331c..59c99d9046 100644 --- a/docs/tutorials/quick-start/install.rst +++ b/docs/tutorials/quick-start/install.rst @@ -4,9 +4,9 @@ First create a build directory: -```shell +.. code-block:: bash mkdir build; cd build; -``` + Next configure cmake. The preferred backend for MIOpen can be set using the `-DMIOPEN_BACKEND` cmake variable. @@ -14,58 +14,58 @@ Next configure cmake. The preferred backend for MIOpen can be set using the `-DM Set the C++ compiler to `clang++`. -```shell +.. code-block:: bash export CXX= cmake -DMIOPEN_BACKEND=HIP -DCMAKE_PREFIX_PATH=";;" .. -``` + An example cmake step can be: -```shell +.. code-block:: bash export CXX=/opt/rocm/llvm/bin/clang++ && \ cmake -DMIOPEN_BACKEND=HIP -DCMAKE_PREFIX_PATH="/opt/rocm/;/opt/rocm/hip;/root/MIOpen/install_dir" .. -``` + Note: When specifying the path for the `CMAKE_PREFIX_PATH` variable, **do not** use the `~` shorthand for the user home directory. ### For OpenCL, run -```shell +.. code-block:: bash cmake -DMIOPEN_BACKEND=OpenCL .. -``` + The above assumes that OpenCL is installed in one of the standard locations. If not, then manually set these cmake variables: -```shell +.. code-block:: bash cmake -DMIOPEN_BACKEND=OpenCL -DMIOPEN_HIP_COMPILER= -DOPENCL_LIBRARIES= -DOPENCL_INCLUDE_DIRS= .. -``` + And an example setting the dependency path for an envirnment in ROCm 3.5 and later: -```shell +.. code-block:: bash cmake -DMIOPEN_BACKEND=OpenCL -DMIOPEN_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ -DCMAKE_PREFIX_PATH="/opt/rocm/;/opt/rocm/hip;/root/MIOpen/install_dir" .. -``` + ### Setting Up Locations By default the install location is set to '/opt/rocm', this can be set by using `CMAKE_INSTALL_PREFIX`: -```shell +.. code-block:: bash cmake -DMIOPEN_BACKEND=OpenCL -DCMAKE_INSTALL_PREFIX= .. -``` + ## Using docker The easiest way is to use docker. You can build the top-level docker file: -```shell +.. code-block:: bash docker build -t miopen-image . -``` + Then to enter the development environment use `docker run`, for example: -```shell +.. code-block:: bash docker run -it -v $HOME:/data --privileged --rm --device=/dev/kfd --device /dev/dri:/dev/dri:rw --volume /dev/dri:/dev/dri:rw -v /var/lib/docker/:/var/lib/docker --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined miopen-image -``` + Prebuilt docker images can be found on [ROCm's public docker hub here](https://hub.docker.com/r/rocm/miopen/tags). \ No newline at end of file From bd1c96d599eceed2bac63638acdf6de0fcdc9328 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 18 Jan 2024 23:21:30 +0000 Subject: [PATCH 62/67] Rmalavally-patch-1 : fix install.rst --- docs/tutorials/quick-start/install.rst | 49 ++++++++++---------------- 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/docs/tutorials/quick-start/install.rst b/docs/tutorials/quick-start/install.rst index 59c99d9046..36a79f3c5c 100644 --- a/docs/tutorials/quick-start/install.rst +++ b/docs/tutorials/quick-start/install.rst @@ -1,71 +1,60 @@ -## Building MIOpen from source +Building MIOpen from source +~~~~~~~~~~ -### Configuring with cmake +Configuring with cmake +---------- First create a build directory: .. code-block:: bash -mkdir build; cd build; + + mkdir build; cd build; Next configure cmake. The preferred backend for MIOpen can be set using the `-DMIOPEN_BACKEND` cmake variable. -### For the HIP backend (ROCm 3.5 and later), run +For the HIP backend (ROCm 3.5 and later), run +---------- Set the C++ compiler to `clang++`. .. code-block:: bash -export CXX= -cmake -DMIOPEN_BACKEND=HIP -DCMAKE_PREFIX_PATH=";;" .. + export CXX= + cmake -DMIOPEN_BACKEND=HIP -DCMAKE_PREFIX_PATH=";;" .. An example cmake step can be: .. code-block:: bash -export CXX=/opt/rocm/llvm/bin/clang++ && \ -cmake -DMIOPEN_BACKEND=HIP -DCMAKE_PREFIX_PATH="/opt/rocm/;/opt/rocm/hip;/root/MIOpen/install_dir" .. + export CXX=/opt/rocm/llvm/bin/clang++ && \ + cmake -DMIOPEN_BACKEND=HIP -DCMAKE_PREFIX_PATH="/opt/rocm/;/opt/rocm/hip;/root/MIOpen/install_dir" .. Note: When specifying the path for the `CMAKE_PREFIX_PATH` variable, **do not** use the `~` shorthand for the user home directory. -### For OpenCL, run - -.. code-block:: bash -cmake -DMIOPEN_BACKEND=OpenCL .. - - -The above assumes that OpenCL is installed in one of the standard locations. If not, then manually set these cmake variables: - -.. code-block:: bash -cmake -DMIOPEN_BACKEND=OpenCL -DMIOPEN_HIP_COMPILER= -DOPENCL_LIBRARIES= -DOPENCL_INCLUDE_DIRS= .. - - -And an example setting the dependency path for an envirnment in ROCm 3.5 and later: - -.. code-block:: bash -cmake -DMIOPEN_BACKEND=OpenCL -DMIOPEN_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ -DCMAKE_PREFIX_PATH="/opt/rocm/;/opt/rocm/hip;/root/MIOpen/install_dir" .. - -### Setting Up Locations +Setting Up Locations +---------- By default the install location is set to '/opt/rocm', this can be set by using `CMAKE_INSTALL_PREFIX`: .. code-block:: bash -cmake -DMIOPEN_BACKEND=OpenCL -DCMAKE_INSTALL_PREFIX= .. + cmake -DMIOPEN_BACKEND=OpenCL -DCMAKE_INSTALL_PREFIX= .. -## Using docker +Building MIOpen using docker +~~~~~~~~~~ The easiest way is to use docker. You can build the top-level docker file: .. code-block:: bash -docker build -t miopen-image . + docker build -t miopen-image . Then to enter the development environment use `docker run`, for example: .. code-block:: bash -docker run -it -v $HOME:/data --privileged --rm --device=/dev/kfd --device /dev/dri:/dev/dri:rw --volume /dev/dri:/dev/dri:rw -v /var/lib/docker/:/var/lib/docker --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined miopen-image + docker run -it -v $HOME:/data --privileged --rm --device=/dev/kfd --device /dev/dri:/dev/dri:rw --volume /dev/dri:/dev/dri:rw -v /var/lib/docker/:/var/lib/docker --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined miopen-image Prebuilt docker images can be found on [ROCm's public docker hub here](https://hub.docker.com/r/rocm/miopen/tags). \ No newline at end of file From 880be0beecf40298dfbc0c0d96f9ccbf9c299a70 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 18 Jan 2024 23:23:13 +0000 Subject: [PATCH 63/67] Rmalavally-patch-1 : fix install.rst --- docs/tutorials/quick-start/install.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/quick-start/install.rst b/docs/tutorials/quick-start/install.rst index 36a79f3c5c..c405a6aed3 100644 --- a/docs/tutorials/quick-start/install.rst +++ b/docs/tutorials/quick-start/install.rst @@ -19,10 +19,10 @@ For the HIP backend (ROCm 3.5 and later), run Set the C++ compiler to `clang++`. .. code-block:: bash + export CXX= cmake -DMIOPEN_BACKEND=HIP -DCMAKE_PREFIX_PATH=";;" .. - An example cmake step can be: .. code-block:: bash From 4eb7f47803d92643713150b4a853ab9f8e5ba708 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 18 Jan 2024 23:23:47 +0000 Subject: [PATCH 64/67] Rmalavally-patch-1 : render code section in install.rst --- docs/tutorials/quick-start/install.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/tutorials/quick-start/install.rst b/docs/tutorials/quick-start/install.rst index c405a6aed3..dc92d25b53 100644 --- a/docs/tutorials/quick-start/install.rst +++ b/docs/tutorials/quick-start/install.rst @@ -26,6 +26,7 @@ Set the C++ compiler to `clang++`. An example cmake step can be: .. code-block:: bash + export CXX=/opt/rocm/llvm/bin/clang++ && \ cmake -DMIOPEN_BACKEND=HIP -DCMAKE_PREFIX_PATH="/opt/rocm/;/opt/rocm/hip;/root/MIOpen/install_dir" .. @@ -39,6 +40,7 @@ Setting Up Locations By default the install location is set to '/opt/rocm', this can be set by using `CMAKE_INSTALL_PREFIX`: .. code-block:: bash + cmake -DMIOPEN_BACKEND=OpenCL -DCMAKE_INSTALL_PREFIX= .. @@ -48,12 +50,14 @@ Building MIOpen using docker The easiest way is to use docker. You can build the top-level docker file: .. code-block:: bash + docker build -t miopen-image . Then to enter the development environment use `docker run`, for example: .. code-block:: bash + docker run -it -v $HOME:/data --privileged --rm --device=/dev/kfd --device /dev/dri:/dev/dri:rw --volume /dev/dri:/dev/dri:rw -v /var/lib/docker/:/var/lib/docker --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined miopen-image From 888eb7cb225ec7147fcdcff9d626eecfdc1b7eed Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 18 Jan 2024 23:24:55 +0000 Subject: [PATCH 65/67] Rmalavally-patch-1 : render all code section in install.rst --- docs/tutorials/quick-start/install.rst | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/tutorials/quick-start/install.rst b/docs/tutorials/quick-start/install.rst index dc92d25b53..ea5ee38aa4 100644 --- a/docs/tutorials/quick-start/install.rst +++ b/docs/tutorials/quick-start/install.rst @@ -30,7 +30,6 @@ An example cmake step can be: export CXX=/opt/rocm/llvm/bin/clang++ && \ cmake -DMIOPEN_BACKEND=HIP -DCMAKE_PREFIX_PATH="/opt/rocm/;/opt/rocm/hip;/root/MIOpen/install_dir" .. - Note: When specifying the path for the `CMAKE_PREFIX_PATH` variable, **do not** use the `~` shorthand for the user home directory. @@ -43,7 +42,6 @@ By default the install location is set to '/opt/rocm', this can be set by using cmake -DMIOPEN_BACKEND=OpenCL -DCMAKE_INSTALL_PREFIX= .. - Building MIOpen using docker ~~~~~~~~~~ @@ -53,12 +51,10 @@ The easiest way is to use docker. You can build the top-level docker file: docker build -t miopen-image . - Then to enter the development environment use `docker run`, for example: .. code-block:: bash - - docker run -it -v $HOME:/data --privileged --rm --device=/dev/kfd --device /dev/dri:/dev/dri:rw --volume /dev/dri:/dev/dri:rw -v /var/lib/docker/:/var/lib/docker --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined miopen-image + docker run -it -v $HOME:/data --privileged --rm --device=/dev/kfd --device /dev/dri:/dev/dri:rw --volume /dev/dri:/dev/dri:rw -v /var/lib/docker/:/var/lib/docker --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined miopen-image Prebuilt docker images can be found on [ROCm's public docker hub here](https://hub.docker.com/r/rocm/miopen/tags). \ No newline at end of file From 286b8d7428937c6330c0cbb794a496d9b1f9d523 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 18 Jan 2024 23:24:55 +0000 Subject: [PATCH 66/67] Rmalavally-patch-1 : render all code section in install.rst --- docs/tutorials/quick-start/install.rst | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/tutorials/quick-start/install.rst b/docs/tutorials/quick-start/install.rst index dc92d25b53..ea5ee38aa4 100644 --- a/docs/tutorials/quick-start/install.rst +++ b/docs/tutorials/quick-start/install.rst @@ -30,7 +30,6 @@ An example cmake step can be: export CXX=/opt/rocm/llvm/bin/clang++ && \ cmake -DMIOPEN_BACKEND=HIP -DCMAKE_PREFIX_PATH="/opt/rocm/;/opt/rocm/hip;/root/MIOpen/install_dir" .. - Note: When specifying the path for the `CMAKE_PREFIX_PATH` variable, **do not** use the `~` shorthand for the user home directory. @@ -43,7 +42,6 @@ By default the install location is set to '/opt/rocm', this can be set by using cmake -DMIOPEN_BACKEND=OpenCL -DCMAKE_INSTALL_PREFIX= .. - Building MIOpen using docker ~~~~~~~~~~ @@ -53,12 +51,10 @@ The easiest way is to use docker. You can build the top-level docker file: docker build -t miopen-image . - Then to enter the development environment use `docker run`, for example: .. code-block:: bash - - docker run -it -v $HOME:/data --privileged --rm --device=/dev/kfd --device /dev/dri:/dev/dri:rw --volume /dev/dri:/dev/dri:rw -v /var/lib/docker/:/var/lib/docker --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined miopen-image + docker run -it -v $HOME:/data --privileged --rm --device=/dev/kfd --device /dev/dri:/dev/dri:rw --volume /dev/dri:/dev/dri:rw -v /var/lib/docker/:/var/lib/docker --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined miopen-image Prebuilt docker images can be found on [ROCm's public docker hub here](https://hub.docker.com/r/rocm/miopen/tags). \ No newline at end of file From 651739887f369fc04582a3df61d3bc996b73e4af Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Fri, 19 Jan 2024 14:49:17 +0000 Subject: [PATCH 67/67] Rmalavally-patch-1: fix referencing multiple docs --- docs/reference/apireference.rst | 2 ++ docs/reference/index.rst | 16 ++++++---------- docs/tutorials/quick-start/install.rst | 2 ++ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/reference/apireference.rst b/docs/reference/apireference.rst index 777ea10b0b..ce34f6d97b 100644 --- a/docs/reference/apireference.rst +++ b/docs/reference/apireference.rst @@ -1,3 +1,5 @@ +.. _apireference: + API Reference ============= diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 57a20a2af2..b6356494fc 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -1,13 +1,9 @@ Indices and tables ======== .. toctree:: - what-is-MIOpen - find_and_immediate - finddb - cache - perfdatabase - Getting_Started_FusionAPI - DebugAndLogging - MI200AlternateImplementation - MIOpen_Porting_Guide - \ No newline at end of file + :maxdepth: 2 + :caption: Contents: + + :ref:`what-is-MIOpen` + :ref:`install` + :ref:`apireference` \ No newline at end of file diff --git a/docs/tutorials/quick-start/install.rst b/docs/tutorials/quick-start/install.rst index ea5ee38aa4..611e9fd746 100644 --- a/docs/tutorials/quick-start/install.rst +++ b/docs/tutorials/quick-start/install.rst @@ -1,3 +1,5 @@ +.. _install: + Building MIOpen from source ~~~~~~~~~~