From 51761b3af172b4fc54ce0a3abc302e203d2bf44a Mon Sep 17 00:00:00 2001
From: Yifan Xiong <yifan.xiong@microsoft.com>
Date: Fri, 14 Apr 2023 20:57:55 +0800
Subject: [PATCH 01/33] Release - SuperBench v0.8.0 (#517)

**Description**

Cherry-pick bug fixes from v0.8.0 to main.

**Major Revisions**

* Monitor - Fix the cgroup version checking logic (#502)
* Benchmark - Fix matrix size overflow issue in cuBLASLt GEMM (#503)
* Benchmark - Fix wrong torch usage in communication wrapper for
Distributed Inference Benchmark (#505)
* Analyzer - Fix bug in Python 3.8 due to pandas API change (#504)
* Bug - Fix bug to get metric from cmd when error happens (#506)
* Monitor - Collect realtime GPU power when benchmarking (#507)
* Add num_workers argument in model benchmark (#511)
* Remove unreachable condition when writing host list (#512)
* Update cuda11.8 image to cuda12.1 based on nvcr23.03 (#513)
* Doc - Fix wrong unit of cpu-memory-bw-latency in doc (#515)
* Docs - Upgrade version and release note (#508)

Co-authored-by: guoshzhao <guzhao@microsoft.com>
Co-authored-by: Ziyue Yang <ziyyang@microsoft.com>
Co-authored-by: Yuting Jiang <yutingjiang@microsoft.com>
---
 .github/workflows/build-image.yml             |  6 +-
 ...uda11.8.dockerfile => cuda12.1.dockerfile} | 50 +++++------
 docs/developer-guides/using-docker.mdx        |  4 +-
 docs/getting-started/installation.mdx         |  4 +-
 docs/getting-started/run-superbench.md        |  2 +-
 docs/superbench-config.mdx                    |  2 +-
 .../benchmarks/micro-benchmarks.md            | 24 +++---
 docs/user-tutorial/container-images.mdx       |  6 ++
 docs/user-tutorial/data-diagnosis.md          |  2 +-
 docs/user-tutorial/result-summary.md          |  2 +-
 superbench/__init__.py                        |  2 +-
 superbench/analyzer/data_analysis.py          | 14 +++-
 superbench/analyzer/data_diagnosis.py         |  2 +-
 superbench/analyzer/result_summary.py         |  4 +-
 .../cublaslt_gemm/cublaslt_gemm.cu            | 15 ++--
 .../cublaslt_gemm/cublaslt_utils.cc           | 82 +++++++++----------
 .../cublaslt_gemm/cublaslt_utils.h            | 14 ++--
 .../micro_benchmarks/cudnn_function.py        | 22 +++--
 .../micro_benchmarks/dist_inference.py        |  2 +-
 .../benchmarks/model_benchmarks/model_base.py |  7 ++
 .../model_benchmarks/pytorch_base.py          |  2 +-
 superbench/common/utils/device_manager.py     | 16 ++++
 .../utils/gen_traffic_pattern_config.py       | 21 +++--
 superbench/config/amd_mi100_hpe.yaml          |  2 +-
 superbench/config/amd_mi100_z53.yaml          |  2 +-
 .../inference/standard_nc64as_t4_v3.yaml      |  2 +-
 .../inference/standard_nc96ads_a100_v4.yaml   |  2 +-
 .../inference/standard_nv18ads_a10_v5.yaml    |  2 +-
 superbench/config/azure_ndmv4.yaml            |  2 +-
 superbench/config/azure_ndv4.yaml             |  2 +-
 superbench/config/default.yaml                |  2 +-
 superbench/monitor/monitor.py                 | 22 ++---
 superbench/monitor/record.py                  | 16 ++++
 superbench/runner/runner.py                   |  3 +-
 .../model_benchmarks/test_model_base.py       |  2 +
 tests/monitor/test_monitor.py                 |  4 +-
 tests/monitor/test_monitor_record.py          |  9 ++
 website/blog/2023-04-14-release-0-8.md        | 44 ++++++++++
 website/docusaurus.config.js                  |  2 +-
 website/package-lock.json                     |  2 +-
 website/package.json                          |  2 +-
 41 files changed, 265 insertions(+), 162 deletions(-)
 rename dockerfile/{cuda11.8.dockerfile => cuda12.1.dockerfile} (81%)
 create mode 100644 website/blog/2023-04-14-release-0-8.md

diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml
index 6e599e9c7..824418a6f 100644
--- a/.github/workflows/build-image.yml
+++ b/.github/workflows/build-image.yml
@@ -24,9 +24,9 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: cuda11.8
-          dockerfile: cuda11.8
-          tags: superbench/main:cuda11.8
+        - name: cuda12.1
+          dockerfile: cuda12.1
+          tags: superbench/main:cuda12.1
         - name: cuda11.1.1
           dockerfile: cuda11.1.1
           tags: superbench/main:cuda11.1.1,superbench/superbench:latest
diff --git a/dockerfile/cuda11.8.dockerfile b/dockerfile/cuda12.1.dockerfile
similarity index 81%
rename from dockerfile/cuda11.8.dockerfile
rename to dockerfile/cuda12.1.dockerfile
index 7615b60e3..4a257bf43 100644
--- a/dockerfile/cuda11.8.dockerfile
+++ b/dockerfile/cuda12.1.dockerfile
@@ -1,18 +1,18 @@
-FROM nvcr.io/nvidia/pytorch:22.12-py3
+FROM nvcr.io/nvidia/pytorch:23.03-py3
 
 # OS:
 #   - Ubuntu: 20.04
 #   - OpenMPI: 4.1.5a1
 #   - Docker Client: 20.10.8
 # NVIDIA:
-#   - CUDA: 11.8.0
-#   - cuDNN: 8.7.0.84
-#   - NCCL: v2.15.5-1
+#   - CUDA: 12.1.0
+#   - cuDNN: 8.8.1.3
+#   - NCCL: v2.17.1-1
 # Mellanox:
-#   - OFED: 5.2-2.2.3.0
-#   - HPC-X: v2.8.3
+#   - OFED: 5.2-2.2.3.0 # TODO: revisit OFED version (unchanged from the CUDA 11.8 image)
+#   - HPC-X: v2.14
 # Intel:
-#   - mlc: v3.9a
+#   - mlc: v3.10
 
 LABEL maintainer="SuperBench"
 
@@ -71,37 +71,27 @@ RUN mkdir -p /root/.ssh && \
 # Install OFED
 ENV OFED_VERSION=5.2-2.2.3.0
 RUN cd /tmp && \
-    wget -q http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
+    wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
     tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
     MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \
     rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
 
 # Install HPC-X
+ENV HPCX_VERSION=v2.14
 RUN cd /opt && \
     rm -rf hpcx && \
-    wget -q https://azhpcstor.blob.core.windows.net/azhpc-images-store/hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz && \
-    tar xf hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz && \
-    ln -s hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64 hpcx && \
-    rm hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz
+    wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64.tbz -O hpcx.tbz && \
+    tar xf hpcx.tbz && \
+    mv hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64 hpcx && \
+    rm hpcx.tbz
 
 # Install Intel MLC
 RUN cd /tmp && \
-    wget -q https://downloadmirror.intel.com/736634/mlc_v3.9a.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/763324/mlc_v3.10.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz
 
-ENV PATH="${PATH}" \
-    LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \
-    SB_HOME=/opt/superbench \
-    SB_MICRO_PATH=/opt/superbench \
-    ANSIBLE_DEPRECATION_WARNINGS=FALSE \
-    ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections
-
-RUN echo PATH="$PATH" > /etc/environment && \
-    echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
-    echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment
-
 # Install AOCC compiler
 RUN cd /tmp && \
     wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
@@ -115,6 +105,18 @@ RUN cd /tmp && \
     mv amd-blis /opt/AMD && \
     rm -rf aocl-blis-linux-aocc-4.0.tar.gz
 
+
+ENV PATH="${PATH}" \
+    LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \
+    SB_HOME=/opt/superbench \
+    SB_MICRO_PATH=/opt/superbench \
+    ANSIBLE_DEPRECATION_WARNINGS=FALSE \
+    ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections
+
+RUN echo PATH="$PATH" > /etc/environment && \
+    echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
+    echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment
+
 # Add config files
 ADD dockerfile/etc /opt/microsoft/
 
diff --git a/docs/developer-guides/using-docker.mdx b/docs/developer-guides/using-docker.mdx
index 621e9cffd..b73891853 100644
--- a/docs/developer-guides/using-docker.mdx
+++ b/docs/developer-guides/using-docker.mdx
@@ -29,7 +29,7 @@ You need to [clone the code](./development.md#set-up) first before building the
 export DOCKER_BUILDKIT=1
 docker buildx build \
   --platform linux/amd64 --cache-to type=inline,mode=max \
-  --tag superbench-dev --file dockerfile/cuda11.1.1.dockerfile .
+  --tag superbench-dev --file dockerfile/cuda12.1.dockerfile .
 ```
 
 </TabItem>
@@ -39,7 +39,7 @@ docker buildx build \
 export DOCKER_BUILDKIT=1
 docker buildx build \
   --platform linux/amd64 --cache-to type=inline,mode=max \
-  --tag superbench-dev --file dockerfile/rocm4.2-pytorch1.7.0.dockerfile .
+  --tag superbench-dev --file dockerfile/rocm5.1.x.dockerfile .
 ```
 
 </TabItem>
diff --git a/docs/getting-started/installation.mdx b/docs/getting-started/installation.mdx
index 3172605cb..82c1fc9c3 100644
--- a/docs/getting-started/installation.mdx
+++ b/docs/getting-started/installation.mdx
@@ -45,7 +45,7 @@ but it is not strictly necessary.
 
 ```bash
 # create a new virtual environment
-python3 -m venv --system-site-packages ./venv
+python3 -m venv ./venv
 # activate the virtual environment
 source ./venv/bin/activate
 
@@ -61,7 +61,7 @@ You can clone the source from GitHub and build it.
 :::note Note
 You should checkout corresponding tag to use release version, for example,
 
-`git clone -b v0.7.0 https://github.com/microsoft/superbenchmark`
+`git clone -b v0.8.0 https://github.com/microsoft/superbenchmark`
 :::
 
 ```bash
diff --git a/docs/getting-started/run-superbench.md b/docs/getting-started/run-superbench.md
index e97626c56..32a8c6d80 100644
--- a/docs/getting-started/run-superbench.md
+++ b/docs/getting-started/run-superbench.md
@@ -27,7 +27,7 @@ sb deploy -f remote.ini --host-password [password]
 :::note Note
 You should deploy corresponding Docker image to use release version, for example,
 
-`sb deploy -f local.ini -i superbench/superbench:v0.7.0-cuda11.1.1`
+`sb deploy -f local.ini -i superbench/superbench:v0.8.0-cuda12.1`
 
 You should note that version of git repo only determines version of sb CLI, and not the sb container. You should define the container version even if you specified a release version for the git clone.
 
diff --git a/docs/superbench-config.mdx b/docs/superbench-config.mdx
index 8fe6aa1ea..8802830b2 100644
--- a/docs/superbench-config.mdx
+++ b/docs/superbench-config.mdx
@@ -70,7 +70,7 @@ superbench:
 <TabItem value='example'>
 
 ```yaml
-version: v0.7
+version: v0.8
 superbench:
   enable: benchmark_1
   monitor:
diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
index 2788a2815..b2e43db3f 100644
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -180,11 +180,11 @@ Performed by [High-Performance Linpack Benchmark for Distributed-Memory Computer
 
 #### Metrics
 
-| Name                | Unit               | Description                                                                |
-|---------------------|--------------------|----------------------------------------------------------------------------|
-| cpu-hpl/tests_pass  |                    | HPL completed running and correctness test has passed (1: pass, 0: fail).  |
-| cpu-hpl/throughput  | bandwidth (GFlops) | Compute bandwidth.                                                         |
-| cpu-hpl/time        | time (s)           | Time elapsed during HPL run.                                               |
+| Name               | Unit               | Description                                                               |
+|--------------------|--------------------|---------------------------------------------------------------------------|
+| cpu-hpl/tests_pass |                    | HPL completed running and correctness test has passed (1: pass, 0: fail). |
+| cpu-hpl/throughput | bandwidth (GFlops) | Compute bandwidth.                                                        |
+| cpu-hpl/time       | time (s)           | Time elapsed during HPL run.                                              |
 
 ### `cpu-stream`
 
@@ -216,13 +216,13 @@ performed by [Intel MLC Tool](https://www.intel.com/content/www/us/en/developer/
 
 | Name                                                                    | Unit             | Description                                                         |
 |-------------------------------------------------------------------------|------------------|---------------------------------------------------------------------|
-| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_bw  | bandwidth (GB/s) | Former NUMA to latter NUMA memory bandwidth.                        |
-| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_lat | time (us)        | Former NUMA to latter NUMA memory latency.                          |
-| cpu-memory-bw-latency/mem\_max\_bandwidth\_all\_reads\_bw               | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, full read.                      |
-| cpu-memory-bw-latency/mem\_max\_bandwidth\_3_1\_reads-writes\_bw        | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 3 : 1.           |
-| cpu-memory-bw-latency/mem\_max\_bandwidth\_2_1\_reads-writes\_bw        | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 2 : 1.           |
-| cpu-memory-bw-latency/mem\_max\_bandwidth\_1_1\_reads-writes\_bw        | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 1 : 1.           |
-| cpu-memory-bw-latency/mem\_max\_bandwidth\_stream-triad\_like\_bw       | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, with stream-triad like pattern. |
+| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_bw  | bandwidth (MB/s) | Former NUMA to latter NUMA memory bandwidth.                        |
+| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_lat | time (ns)        | Former NUMA to latter NUMA memory latency.                          |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_all\_reads\_bw               | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, full read.                      |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_3_1\_reads-writes\_bw        | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 3 : 1.           |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_2_1\_reads-writes\_bw        | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 2 : 1.           |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_1_1\_reads-writes\_bw        | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 1 : 1.           |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_stream-triad\_like\_bw       | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, with stream-triad like pattern. |
 
 ### `mem-bw`
 
diff --git a/docs/user-tutorial/container-images.mdx b/docs/user-tutorial/container-images.mdx
index 112bbf25a..27cf8da6f 100644
--- a/docs/user-tutorial/container-images.mdx
+++ b/docs/user-tutorial/container-images.mdx
@@ -29,6 +29,8 @@ available tags are listed below for all stable versions.
 
 | Tag               | Description                        |
 |-------------------|------------------------------------|
+| v0.8.0-cuda12.1   | SuperBench v0.8.0 with CUDA 12.1   |
+| v0.8.0-cuda11.1.1 | SuperBench v0.8.0 with CUDA 11.1.1 |
 | v0.7.0-cuda11.8   | SuperBench v0.7.0 with CUDA 11.8   |
 | v0.7.0-cuda11.1.1 | SuperBench v0.7.0 with CUDA 11.1.1 |
 | v0.6.0-cuda11.1.1 | SuperBench v0.6.0 with CUDA 11.1.1 |
@@ -43,6 +45,10 @@ available tags are listed below for all stable versions.
 
 | Tag                           | Description                                      |
 |-------------------------------|--------------------------------------------------|
+| v0.8.0-rocm5.1.3              | SuperBench v0.8.0 with ROCm 5.1.3                |
+| v0.8.0-rocm5.1.1              | SuperBench v0.8.0 with ROCm 5.1.1                |
+| v0.8.0-rocm5.0.1              | SuperBench v0.8.0 with ROCm 5.0.1                |
+| v0.8.0-rocm5.0                | SuperBench v0.8.0 with ROCm 5.0                  |
 | v0.7.0-rocm5.1.3              | SuperBench v0.7.0 with ROCm 5.1.3                |
 | v0.7.0-rocm5.1.1              | SuperBench v0.7.0 with ROCm 5.1.1                |
 | v0.7.0-rocm5.0.1              | SuperBench v0.7.0 with ROCm 5.0.1                |
diff --git a/docs/user-tutorial/data-diagnosis.md b/docs/user-tutorial/data-diagnosis.md
index e4094f83a..94a2a025d 100644
--- a/docs/user-tutorial/data-diagnosis.md
+++ b/docs/user-tutorial/data-diagnosis.md
@@ -65,7 +65,7 @@ superbench:
 example:
 ```yaml
 # SuperBench rules
-version: v0.7
+version: v0.8
 superbench:
   rules:
     failure-rule:
diff --git a/docs/user-tutorial/result-summary.md b/docs/user-tutorial/result-summary.md
index d9053d3b6..e53738ff8 100644
--- a/docs/user-tutorial/result-summary.md
+++ b/docs/user-tutorial/result-summary.md
@@ -58,7 +58,7 @@ superbench:
 
 ```yaml title="Example"
 # SuperBench rules
-version: v0.7
+version: v0.8
 superbench:
   rules:
     kernel_launch:
diff --git a/superbench/__init__.py b/superbench/__init__.py
index a0b5f7c02..5b85c9a9a 100644
--- a/superbench/__init__.py
+++ b/superbench/__init__.py
@@ -6,5 +6,5 @@
 Provide hardware and software benchmarks for AI systems.
 """
 
-__version__ = '0.7.0'
+__version__ = '0.8.0'
 __author__ = 'Microsoft'
diff --git a/superbench/analyzer/data_analysis.py b/superbench/analyzer/data_analysis.py
index d7ac40f1b..5a7fb1ed8 100644
--- a/superbench/analyzer/data_analysis.py
+++ b/superbench/analyzer/data_analysis.py
@@ -31,11 +31,13 @@ def statistic(raw_data_df):
         logger.warning('DataAnalyzer: empty data.')
         return data_statistics_df
     try:
+        raw_data_df = raw_data_df.apply(pd.to_numeric, errors='coerce')
+        raw_data_df = raw_data_df.dropna(axis=1, how='all')
         data_statistics_df = raw_data_df.describe()
-        data_statistics_df.loc['1%'] = raw_data_df.quantile(0.01)
-        data_statistics_df.loc['5%'] = raw_data_df.quantile(0.05)
-        data_statistics_df.loc['95%'] = raw_data_df.quantile(0.95)
-        data_statistics_df.loc['99%'] = raw_data_df.quantile(0.99)
+        data_statistics_df.loc['1%'] = raw_data_df.quantile(0.01, numeric_only=True)
+        data_statistics_df.loc['5%'] = raw_data_df.quantile(0.05, numeric_only=True)
+        data_statistics_df.loc['95%'] = raw_data_df.quantile(0.95, numeric_only=True)
+        data_statistics_df.loc['99%'] = raw_data_df.quantile(0.99, numeric_only=True)
         statistics_error = []
         for column in list(raw_data_df.columns):
             if column not in list(data_statistics_df.columns) and not raw_data_df[column].isnull().all():
@@ -122,6 +124,8 @@ def correlation(raw_data_df):
         logger.warning('DataAnalyzer: empty data.')
         return data_corr_df
     try:
+        raw_data_df = raw_data_df.apply(pd.to_numeric, errors='coerce')
+        raw_data_df = raw_data_df.dropna(axis=1, how='all')
         data_corr_df = raw_data_df.corr()
         statistics_error = []
         for column in list(raw_data_df.columns):
@@ -181,6 +185,8 @@ def generate_baseline(raw_data_df, output_dir):
         output_dir (str): the directory of output file
     """
     try:
+        raw_data_df = raw_data_df.apply(pd.to_numeric, errors='coerce')
+        raw_data_df = raw_data_df.dropna(axis=1, how='all')
         if not isinstance(raw_data_df, pd.DataFrame):
             logger.error('DataAnalyzer: the type of raw data is not pd.DataFrame')
             return
diff --git a/superbench/analyzer/data_diagnosis.py b/superbench/analyzer/data_diagnosis.py
index f7a906560..ee5d705b6 100644
--- a/superbench/analyzer/data_diagnosis.py
+++ b/superbench/analyzer/data_diagnosis.py
@@ -285,7 +285,7 @@ def output_diagnosis_in_excel(self, raw_data_df, data_not_accept_df, output_path
                 logger.log_and_raise(exception=IOError, msg='DataDiagnosis: excel_data_output - invalid file path.')
             file_handler.output_excel_raw_data(writer, raw_data_df, 'Raw Data')
             file_handler.output_excel_data_not_accept(writer, data_not_accept_df, rules)
-            writer.save()
+            writer.close()
         except Exception as e:
             logger.log_and_raise(exception=Exception, msg='DataDiagnosis: excel_data_output - {}'.format(str(e)))
 
diff --git a/superbench/analyzer/result_summary.py b/superbench/analyzer/result_summary.py
index e269d70a5..09954a8dc 100644
--- a/superbench/analyzer/result_summary.py
+++ b/superbench/analyzer/result_summary.py
@@ -117,7 +117,7 @@ def _merge_summary(self, summary):
         summary_df = pd.DataFrame()
         for category in summary:
             for i in range(len(summary[category])):
-                summary_df = summary_df.append([summary[category][i]], ignore_index=True)
+                summary_df = pd.concat([summary_df, pd.DataFrame([summary[category][i]])], ignore_index=True)
         return summary_df
 
     def _generate_summary(self, round):
@@ -217,7 +217,7 @@ def output_summary_in_excel(self, raw_data_df, summary, output_path):
                 file_handler.merge_column_in_excel(worksheet, row, 1)
             else:
                 logger.error('ResultSummary: excel_data_output - summary is empty.')
-            writer.save()
+            writer.close()
         except Exception as e:
             logger.error('ResultSummary: excel_data_output - {}'.format(str(e)))
 
diff --git a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu
index bc8478274..788b1989d 100644
--- a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu
+++ b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu
@@ -88,20 +88,21 @@ template <typename T> cudaDataType_t get_datatype() {
 }
 
 template <typename Ta, typename Tb, typename Tout>
-float timing_matmul_tn(int m, int n, int k, int batch, int warmup, int iter) {
+float timing_matmul_tn(size_t m, size_t n, size_t k, size_t batch, int warmup, int iter) {
     // init matrix
     Ta *matrix_a = nullptr;
     Tb *matrix_b = nullptr;
     Tout *matrix_out = nullptr;
-    cudaMalloc(&matrix_a, m * k * std::max(batch, 1) * sizeof(Ta));
-    cudaMalloc(&matrix_b, k * n * std::max(batch, 1) * sizeof(Tb));
-    cudaMalloc(&matrix_out, m * n * std::max(batch, 1) * sizeof(Tout));
+    batch = std::max<size_t>(batch, 1);
+    cudaMalloc(&matrix_a, m * k * batch * sizeof(Ta));
+    cudaMalloc(&matrix_b, k * n * batch * sizeof(Tb));
+    cudaMalloc(&matrix_out, m * n * batch * sizeof(Tout));
 
-    init_matrix<Ta><<<216, 1024>>>(matrix_a, 1.f, m * k * std::max(batch, 1));
-    init_matrix<Tb><<<216, 1024>>>(matrix_b, 2.f, k * n * std::max(batch, 1));
+    init_matrix<Ta><<<216, 1024>>>(matrix_a, 1.f, m * k * batch);
+    init_matrix<Tb><<<216, 1024>>>(matrix_b, 2.f, k * n * batch);
 
     // init gemm
-    int lda = k, ldb = k, ldd = m;
+    size_t lda = k, ldb = k, ldd = m;
     std::unique_ptr<cublasLtGemm> gemm = std::make_unique<cublasLtGemm>();
     gemm->Init();
     gemm->Setup(m, n, k, batch, lda, ldb, ldd, get_datatype<Ta>(), get_datatype<Tb>(), get_datatype<Tout>(),
diff --git a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc
index a91304c5a..4842c22d1 100644
--- a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc
+++ b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc
@@ -5,12 +5,12 @@
 
 void cublasLtGemm::Init() {
     cublasLtHandle_t handle;
-    checkCublasStatus(cublasLtCreate(&handle));
+    CUBLAS_CHECK(cublasLtCreate(&handle));
     handle_.reset(handle);
 
     /* preference can be initialized without arguments */
     cublasLtMatmulPreference_t preference;
-    checkCublasStatus(cublasLtMatmulPreferenceCreate(&preference));
+    CUBLAS_CHECK(cublasLtMatmulPreferenceCreate(&preference));
     preference_.reset(preference);
 }
 
@@ -24,32 +24,32 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l
     // force c_type
     cudaDataType_t c_type = d_type;
     // Create matrix descriptors.
-    checkCublasStatus(
+    CUBLAS_CHECK(
         cublasLtMatrixLayoutCreate(&a_desc, a_type, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda));
-    checkCublasStatus(
+    CUBLAS_CHECK(
         cublasLtMatrixLayoutCreate(&b_desc, b_type, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb));
-    checkCublasStatus(cublasLtMatrixLayoutCreate(&c_desc, c_type, m, n, ldd));
-    checkCublasStatus(cublasLtMatrixLayoutCreate(&d_desc, d_type, m, n, ldd));
+    CUBLAS_CHECK(cublasLtMatrixLayoutCreate(&c_desc, c_type, m, n, ldd));
+    CUBLAS_CHECK(cublasLtMatrixLayoutCreate(&d_desc, d_type, m, n, ldd));
 
     // strided batch gemm
     if (batch > 0) {
         int64_t stridea = m * k, strideb = k * n, stridec = m * n, strided = m * n;
-        checkCublasStatus(
+        CUBLAS_CHECK(
             cublasLtMatrixLayoutSetAttribute(a_desc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch)));
-        checkCublasStatus(cublasLtMatrixLayoutSetAttribute(a_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
-                                                           &stridea, sizeof(stridea)));
-        checkCublasStatus(
+        CUBLAS_CHECK(cublasLtMatrixLayoutSetAttribute(a_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea,
+                                                      sizeof(stridea)));
+        CUBLAS_CHECK(
             cublasLtMatrixLayoutSetAttribute(b_desc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch)));
-        checkCublasStatus(cublasLtMatrixLayoutSetAttribute(b_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
-                                                           &strideb, sizeof(strideb)));
-        checkCublasStatus(
+        CUBLAS_CHECK(cublasLtMatrixLayoutSetAttribute(b_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb,
+                                                      sizeof(strideb)));
+        CUBLAS_CHECK(
             cublasLtMatrixLayoutSetAttribute(c_desc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch)));
-        checkCublasStatus(cublasLtMatrixLayoutSetAttribute(c_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
-                                                           &stridec, sizeof(stridec)));
-        checkCublasStatus(
+        CUBLAS_CHECK(cublasLtMatrixLayoutSetAttribute(c_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec,
+                                                      sizeof(stridec)));
+        CUBLAS_CHECK(
             cublasLtMatrixLayoutSetAttribute(d_desc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch)));
-        checkCublasStatus(cublasLtMatrixLayoutSetAttribute(d_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET,
-                                                           &strided, sizeof(strided)));
+        CUBLAS_CHECK(cublasLtMatrixLayoutSetAttribute(d_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strided,
+                                                      sizeof(strided)));
     }
     a_desc_.reset(a_desc);
     b_desc_.reset(b_desc);
@@ -64,7 +64,7 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l
         gemm_compute_type = CUBLAS_COMPUTE_64F;
 
     cublasLtMatmulDesc_t op_desc = nullptr;
-    checkCublasStatus(cublasLtMatmulDescCreate(&op_desc, gemm_compute_type, CUDA_R_32F));
+    CUBLAS_CHECK(cublasLtMatmulDescCreate(&op_desc, gemm_compute_type, CUDA_R_32F));
     op_desc_.reset(op_desc);
 
     if (a_type == CUDA_R_8F_E5M2 || b_type == CUDA_R_8F_E5M2 || a_type == CUDA_R_8F_E4M3 || b_type == CUDA_R_8F_E4M3) {
@@ -73,33 +73,31 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l
         cublasLtMatmulDescSetAttribute(op_desc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(fastAccuMode));
     }
 
-    checkCublasStatus(
-        cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa)));
-    checkCublasStatus(
-        cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb)));
+    CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa)));
+    CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb)));
 
     if (a_scale_inverse != nullptr) {
-        checkCublasStatus(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
-                                                         &a_scale_inverse, sizeof(a_scale_inverse)));
+        CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
+                                                    &a_scale_inverse, sizeof(a_scale_inverse)));
     }
     if (b_scale_inverse != nullptr) {
-        checkCublasStatus(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
-                                                         &b_scale_inverse, sizeof(b_scale_inverse)));
+        CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
+                                                    &b_scale_inverse, sizeof(b_scale_inverse)));
     }
-    checkCublasStatus(
+    CUBLAS_CHECK(
         cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue)));
 }
 
 size_t cublasLtGemm::GetAlgorithm(int max_algorithm_count, size_t max_workspace_size) {
-    checkCublasStatus(cublasLtMatmulPreferenceSetAttribute(preference_.get(), CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
-                                                           &max_workspace_size, sizeof(max_workspace_size)));
+    CUBLAS_CHECK(cublasLtMatmulPreferenceSetAttribute(preference_.get(), CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
+                                                      &max_workspace_size, sizeof(max_workspace_size)));
 
     int found_algorithm_count = 0;
     std::vector<cublasLtMatmulHeuristicResult_t> results(max_algorithm_count);
     // Though we query all of possible algorithm, we will use the first later
-    checkCublasStatus(cublasLtMatmulAlgoGetHeuristic(handle_.get(), op_desc_.get(), a_desc_.get(), b_desc_.get(),
-                                                     c_desc_.get(), d_desc_.get(), preference_.get(),
-                                                     max_algorithm_count, results.data(), &found_algorithm_count));
+    CUBLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(handle_.get(), op_desc_.get(), a_desc_.get(), b_desc_.get(),
+                                                c_desc_.get(), d_desc_.get(), preference_.get(), max_algorithm_count,
+                                                results.data(), &found_algorithm_count));
     if (found_algorithm_count == 0) {
         throw std::runtime_error("Unable to find any suitable algorithms");
     }
@@ -111,13 +109,13 @@ size_t cublasLtGemm::GetAlgorithm(int max_algorithm_count, size_t max_workspace_
 
 void cublasLtGemm::Execute(void *matrix_a, void *matrix_b, void *matrix_c, void *matrix_d, float alpha, float beta,
                            void *workspace, size_t workspace_size, cudaStream_t stream) {
-    checkCublasStatus(cublasLtMatmul(handle_.get(), op_desc_.get(), static_cast<const void *>(&alpha), /* alpha */
-                                     matrix_a,                                                         /* A */
-                                     a_desc_.get(), matrix_b,                                          /* B */
-                                     b_desc_.get(), static_cast<const void *>(&beta),                  /* beta */
-                                     matrix_c,                                                         /* C */
-                                     c_desc_.get(), matrix_d,                                          /* D */
-                                     d_desc_.get(), &heuristic_results_.front().algo,                  /* algo */
-                                     workspace,                                                        /* workspace */
-                                     workspace_size, stream));                                         /* stream */
+    CUBLAS_CHECK(cublasLtMatmul(handle_.get(), op_desc_.get(), static_cast<const void *>(&alpha), /* alpha */
+                                matrix_a,                                                         /* A */
+                                a_desc_.get(), matrix_b,                                          /* B */
+                                b_desc_.get(), static_cast<const void *>(&beta),                  /* beta */
+                                matrix_c,                                                         /* C */
+                                c_desc_.get(), matrix_d,                                          /* D */
+                                d_desc_.get(), &heuristic_results_.front().algo,                  /* algo */
+                                workspace,                                                        /* workspace */
+                                workspace_size, stream));                                         /* stream */
 }
diff --git a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.h b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.h
index ca1f8fcfd..f89f934e4 100644
--- a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.h
+++ b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.h
@@ -10,12 +10,14 @@
 
 #include <cublasLt.h>
 
-inline void checkCublasStatus(cublasStatus_t status) {
-    if (status != CUBLAS_STATUS_SUCCESS) {
-        printf("cuBLAS API failed with status %s\n", cublasGetStatusString(status));
-        throw std::logic_error("cuBLAS API failed");
-    }
-}
+#define CUBLAS_CHECK(func)                                                                                             \
+    do {                                                                                                               \
+        cublasStatus_t status = func;                                                                                  \
+        if (status != CUBLAS_STATUS_SUCCESS) {                                                                         \
+            printf("cuBLAS call %s failed at %s:%d '%s'\n", #func, __FILE__, __LINE__, cublasGetStatusString(status)); \
+            exit(EXIT_FAILURE);                                                                                        \
+        }                                                                                                              \
+    } while (0)
 
 class cublasLtGemm {
   public:
diff --git a/superbench/benchmarks/micro_benchmarks/cudnn_function.py b/superbench/benchmarks/micro_benchmarks/cudnn_function.py
index 4c7f08193..82384ae8b 100644
--- a/superbench/benchmarks/micro_benchmarks/cudnn_function.py
+++ b/superbench/benchmarks/micro_benchmarks/cudnn_function.py
@@ -408,23 +408,21 @@ def _process_raw_result(self, cmd_idx, raw_output):
             True if the raw output string is valid and result can be extracted.
         """
         self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
-
+        metric = ''
         try:
             lines = raw_output.splitlines()
-            metric = ''
+
+            cmd_config = json.loads(self._commands[cmd_idx].split('--config_json')[-1].replace(' ', '')[1:-1])
+            for key in sorted(cmd_config.keys()):
+                if 'name' in key:
+                    metric = key + '_' + str(cmd_config[key]) + metric
+                else:
+                    metric = metric + '_' + key + '_' + str(cmd_config[key])
+            metric = metric.replace(' ', '').replace(',', '_')
+
             error = False
             raw_data = []
             for line in lines:
-                if '[function config]' in line:
-                    metric = ''
-                    metric_json_str = line[line.index('[function config]: ') +
-                                           len('[function config]: '):].replace(' ', '').replace(':', '_')[1:-1]
-                    metric_list = metric_json_str.split(',')
-                    for key in metric_list:
-                        if 'name' in key:
-                            metric = key + metric
-                        else:
-                            metric = metric + '_' + key
                 if '[raw_data]' in line:
                     raw_data = line[line.index('[raw_data]: ') + len('[raw_data]: '):]
                     raw_data = raw_data.split(',')
diff --git a/superbench/benchmarks/micro_benchmarks/dist_inference.py b/superbench/benchmarks/micro_benchmarks/dist_inference.py
index 4083c16ad..535c4fbf6 100644
--- a/superbench/benchmarks/micro_benchmarks/dist_inference.py
+++ b/superbench/benchmarks/micro_benchmarks/dist_inference.py
@@ -121,7 +121,7 @@ def __all_gather_wrapper(self, x):
         Return:
             Tensor after all-gather.
         """
-        output = torch.empty_like([x.shape[0] * self.num_ranks] + list(x.shape[1:]))
+        output = torch.empty([x.shape[0] * self.num_ranks] + list(x.shape[1:]), dtype=x.dtype, device=x.device)
         dist.all_gather_into_tensor(output, x)
         return output
 
diff --git a/superbench/benchmarks/model_benchmarks/model_base.py b/superbench/benchmarks/model_benchmarks/model_base.py
index 133ee76f4..a51c05850 100644
--- a/superbench/benchmarks/model_benchmarks/model_base.py
+++ b/superbench/benchmarks/model_benchmarks/model_base.py
@@ -78,6 +78,13 @@ def add_parser_arguments(self):
             required=False,
             help='The number of batch size.',
         )
+        self._parser.add_argument(
+            '--num_workers',
+            type=int,
+            default=8,
+            required=False,
+            help='Number of subprocesses to use for data loading.',
+        )
         self._parser.add_argument(
             '--precision',
             type=Precision,
diff --git a/superbench/benchmarks/model_benchmarks/pytorch_base.py b/superbench/benchmarks/model_benchmarks/pytorch_base.py
index ce1cca93b..f0cb52319 100644
--- a/superbench/benchmarks/model_benchmarks/pytorch_base.py
+++ b/superbench/benchmarks/model_benchmarks/pytorch_base.py
@@ -181,7 +181,7 @@ def _init_dataloader(self):
             dataset=self._dataset,
             batch_size=self._args.batch_size,
             shuffle=False,
-            num_workers=8,
+            num_workers=self._args.num_workers,
             sampler=train_sampler,
             drop_last=True,
             pin_memory=self._args.pin_memory
diff --git a/superbench/common/utils/device_manager.py b/superbench/common/utils/device_manager.py
index aeb62b586..2a6a8a889 100644
--- a/superbench/common/utils/device_manager.py
+++ b/superbench/common/utils/device_manager.py
@@ -72,6 +72,22 @@ def get_device_temperature(self, idx):
             temp = None
         return temp
 
+    def get_device_power(self, idx):
+        """Get the realtime power of device, unit: watt.
+
+        Args:
+            idx (int): device index.
+
+        Return:
+            temp (float): the realtime power of device, None means failed to get the data.
+        """
+        try:
+            power = nvml.nvmlDeviceGetPowerUsage(self._device_handlers[idx])
+        except Exception as err:
+            logger.error('Get device power failed: {}'.format(str(err)))
+            return None
+        return int(int(power) / 1000)
+
     def get_device_power_limit(self, idx):
         """Get the power management limit of device, unit: watt.
 
diff --git a/superbench/common/utils/gen_traffic_pattern_config.py b/superbench/common/utils/gen_traffic_pattern_config.py
index 97864784c..84a2e65d0 100644
--- a/superbench/common/utils/gen_traffic_pattern_config.py
+++ b/superbench/common/utils/gen_traffic_pattern_config.py
@@ -182,15 +182,14 @@ def gen_traffic_pattern_host_groups(host_list, pattern, mpi_pattern_path, benchm
         logger.error('Unsupported traffic pattern: {}'.format(pattern.type))
     host_groups = __convert_config_to_host_group(config, host_list)
     # write traffic pattern host groups to specified path
-    if pattern.mpi_pattern:
-        with open(mpi_pattern_path, 'a') as f:
-            f.write('benchmark_name: {} pattern_type: {}'.format(benchmark_name, pattern.type) + '\n')
-            for host_group in host_groups:
-                row = []
-                for host_list in host_group:
-                    group = ','.join(host_list)
-                    row.append(group)
-                group = ';'.join(row)
-                f.write(group + '\n')
-            f.write('\n')
+    with open(mpi_pattern_path, 'a') as f:
+        f.write('benchmark_name: {} pattern_type: {}'.format(benchmark_name, pattern.type) + '\n')
+        for host_group in host_groups:
+            row = []
+            for host_list in host_group:
+                group = ','.join(host_list)
+                row.append(group)
+            group = ';'.join(row)
+            f.write(group + '\n')
+        f.write('\n')
     return host_groups
diff --git a/superbench/config/amd_mi100_hpe.yaml b/superbench/config/amd_mi100_hpe.yaml
index 4f6a68a6e..150424c0f 100644
--- a/superbench/config/amd_mi100_hpe.yaml
+++ b/superbench/config/amd_mi100_hpe.yaml
@@ -3,7 +3,7 @@
 # Server:
 #   - Product: HPE Apollo 6500
 
-version: v0.7
+version: v0.8
 superbench:
   enable: null
   var:
diff --git a/superbench/config/amd_mi100_z53.yaml b/superbench/config/amd_mi100_z53.yaml
index 9ef423a3e..188c93547 100644
--- a/superbench/config/amd_mi100_z53.yaml
+++ b/superbench/config/amd_mi100_z53.yaml
@@ -4,7 +4,7 @@
 #   - Product: G482-Z53
 #   - Link: https://www.gigabyte.cn/FileUpload/Global/MicroSite/553/G482-Z53.html
 
-version: v0.7
+version: v0.8
 superbench:
   enable: null
   var:
diff --git a/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml b/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml
index 4ba445909..62e0d6586 100644
--- a/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml
+++ b/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml
@@ -1,4 +1,4 @@
-version: v0.7
+version: v0.8
 superbench:
   enable: null
   monitor:
diff --git a/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml b/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml
index 56dc89b15..337affacf 100644
--- a/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml
+++ b/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml
@@ -1,4 +1,4 @@
-version: v0.7
+version: v0.8
 superbench:
   enable: null
   monitor:
diff --git a/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml b/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml
index d980488a2..f95469cb0 100644
--- a/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml
+++ b/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml
@@ -1,4 +1,4 @@
-version: v0.7
+version: v0.8
 superbench:
   enable: null
   monitor:
diff --git a/superbench/config/azure_ndmv4.yaml b/superbench/config/azure_ndmv4.yaml
index 1d23a2ebb..e482d6ed0 100644
--- a/superbench/config/azure_ndmv4.yaml
+++ b/superbench/config/azure_ndmv4.yaml
@@ -3,7 +3,7 @@
 # Azure NDm A100 v4
 # reference: https://docs.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series
 
-version: v0.7
+version: v0.8
 superbench:
   enable: null
   monitor:
diff --git a/superbench/config/azure_ndv4.yaml b/superbench/config/azure_ndv4.yaml
index 02317c144..cb9a93ddc 100644
--- a/superbench/config/azure_ndv4.yaml
+++ b/superbench/config/azure_ndv4.yaml
@@ -1,5 +1,5 @@
 # SuperBench Config
-version: v0.7
+version: v0.8
 superbench:
   enable: null
   monitor:
diff --git a/superbench/config/default.yaml b/superbench/config/default.yaml
index bff622a51..60d6be7b0 100644
--- a/superbench/config/default.yaml
+++ b/superbench/config/default.yaml
@@ -1,5 +1,5 @@
 # SuperBench Config
-version: v0.7
+version: v0.8
 superbench:
   enable: null
   monitor:
diff --git a/superbench/monitor/monitor.py b/superbench/monitor/monitor.py
index 0945965eb..b3d01711d 100644
--- a/superbench/monitor/monitor.py
+++ b/superbench/monitor/monitor.py
@@ -38,16 +38,7 @@ def __init__(self, container_name, sample_duration, sample_interval, output_file
         self.__unit_MiByte = 1024 * 1024 * 1.0
 
         self.__output_handler = open(self.__output_file, 'a')
-
         self.__cgroup = 1
-        output = run_command('grep cgroup /proc/filesystems', quiet=True)
-        if output.returncode != 0:
-            logger.error('Failed to check the cgroup version, will assume using cgroup V1.')
-        else:
-            if 'cgroup2' in output.stdout:
-                self.__cgroup = 2
-
-        logger.info('cgroup version: {}.'.format(self.__cgroup))
 
     def __preprocess(self):
         """Preprocess/preparation operations before the monitoring.
@@ -77,13 +68,15 @@ def __preprocess(self):
             container_pid = output.stdout
 
             try:
-                if self.__cgroup == 1:
-                    self._cpu_file = glob.glob('/sys/fs/cgroup/cpuacct/docker/{}*/cpuacct.stat'.format(container_id))[0]
+                cpu_file_cgroup_v1 = glob.glob('/sys/fs/cgroup/cpuacct/docker/{}*/cpuacct.stat'.format(container_id))
+                if len(cpu_file_cgroup_v1) > 0:
+                    self._cpu_file = cpu_file_cgroup_v1[0]
                     self._mem_file = glob.glob(
                         '/sys/fs/cgroup/memory/docker/{}*/memory.usage_in_bytes'.format(container_id)
                     )[0]
                     self._net_file = '/proc/{}/net/dev'.format(container_pid)
                 else:
+                    self.__cgroup = 2
                     self._cpu_file = glob.glob(
                         '/sys/fs/cgroup/system.slice/docker-{}*.scope/cpu.stat'.format(container_id)
                     )[0]
@@ -99,10 +92,12 @@ def __preprocess(self):
                 )
                 return False
         else:
-            if self.__cgroup == 1:
-                self._cpu_file = '/sys/fs/cgroup/cpuacct/cpuacct.stat'
+            cpu_file_cgroup_v1 = '/sys/fs/cgroup/cpuacct/cpuacct.stat'
+            if os.path.exists(cpu_file_cgroup_v1):
+                self._cpu_file = cpu_file_cgroup_v1
                 self._mem_file = '/sys/fs/cgroup/memory/memory.usage_in_bytes'
             else:
+                self.__cgroup = 2
                 self._cpu_file = '/sys/fs/cgroup/cpu.stat'
                 self._mem_file = '/sys/fs/cgroup/memory.stat'
             self._net_file = '/proc/net/dev'
@@ -199,6 +194,7 @@ def __sample_gpu_metrics(self, record):
         for i in range(device_count):
             record.gpu_usage.append(dm.device_manager.get_device_utilization(i))
             record.gpu_temperature.append(dm.device_manager.get_device_temperature(i))
+            record.gpu_power.append(dm.device_manager.get_device_power(i))
             record.gpu_power_limit.append(dm.device_manager.get_device_power_limit(i))
             mem_used, mem_total = dm.device_manager.get_device_memory(i)
             record.gpu_mem_used.append(mem_used)
diff --git a/superbench/monitor/record.py b/superbench/monitor/record.py
index 73ff7c3a6..3b229f108 100644
--- a/superbench/monitor/record.py
+++ b/superbench/monitor/record.py
@@ -14,6 +14,7 @@ class MonitorRecord:
     """Record class to save all monitoring data."""
     reduce_ops = {
         'gpu_temperature': ReduceType.MAX,
+        'gpu_power': ReduceType.MAX,
         'gpu_power_limit': ReduceType.MIN,
         'gpu_corrected_ecc': ReduceType.LAST,
         'gpu_uncorrected_ecc': ReduceType.LAST,
@@ -28,6 +29,7 @@ def __init__(self):
         self.__mem_total = None
         self.__gpu_usage = list()
         self.__gpu_temperature = list()
+        self.__gpu_power = list()
         self.__gpu_power_limit = list()
         self.__gpu_mem_used = list()
         self.__gpu_mem_total = list()
@@ -112,6 +114,20 @@ def gpu_temperature(self, gpu_temperature):
         """
         self.__gpu_temperature = gpu_temperature
 
+    @property
+    def gpu_power(self):
+        """Decoration function to access __gpu_power."""
+        return self.__gpu_power
+
+    @gpu_power.setter
+    def gpu_power(self, gpu_power):
+        """Set the gpu realtime power, unit: Watt.
+
+        Args:
+            gpu_power(list): list of gpu realtime power.
+        """
+        self.__gpu_power = gpu_power
+
     @property
     def gpu_power_limit(self):
         """Decoration function to access __gpu_power_limit."""
diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index 29d114b14..28b5c7186 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -387,8 +387,9 @@ def __merge_monitor_metrics(self, node_path):
                 metrics_dict[metric].append(value)
 
         for metric, values in metrics_dict.items():
+            prefix = metric.split(':')[0]
             for pattern, reduce_type in MonitorRecord.reduce_ops.items():
-                if pattern in metric:
+                if pattern == prefix:
                     reduce_func = Reducer.get_reduce_func(reduce_type)
                     metric_name = 'monitor/{}'.format(metric)
                     metrics_summary[metric_name] = reduce_func(values)
diff --git a/tests/benchmarks/model_benchmarks/test_model_base.py b/tests/benchmarks/model_benchmarks/test_model_base.py
index 926088aea..deba3a438 100644
--- a/tests/benchmarks/model_benchmarks/test_model_base.py
+++ b/tests/benchmarks/model_benchmarks/test_model_base.py
@@ -167,6 +167,7 @@ def test_arguments_related_interfaces():
   --no_gpu              Disable GPU training.
   --num_steps int       The number of test step.
   --num_warmup int      The number of warmup step.
+  --num_workers int     Number of subprocesses to use for data loading.
   --pin_memory          Enable option to pin memory in data loader.
   --precision Precision [Precision ...]
                         Model precision. E.g. fp8_hybrid fp8_e4m3 fp8_e5m2
@@ -206,6 +207,7 @@ def test_preprocess():
   --no_gpu              Disable GPU training.
   --num_steps int       The number of test step.
   --num_warmup int      The number of warmup step.
+  --num_workers int     Number of subprocesses to use for data loading.
   --pin_memory          Enable option to pin memory in data loader.
   --precision Precision [Precision ...]
                         Model precision. E.g. fp8_hybrid fp8_e4m3 fp8_e5m2
diff --git a/tests/monitor/test_monitor.py b/tests/monitor/test_monitor.py
index 0fa601e21..16ca151a7 100644
--- a/tests/monitor/test_monitor.py
+++ b/tests/monitor/test_monitor.py
@@ -44,8 +44,8 @@ def test_monitor(self):
 
         monitor._Monitor__sample_gpu_metrics(record)
         gpu_list_metrics = [
-            record.gpu_usage, record.gpu_temperature, record.gpu_power_limit, record.gpu_mem_used, record.gpu_mem_total,
-            record.gpu_corrected_ecc, record.gpu_uncorrected_ecc
+            record.gpu_usage, record.gpu_temperature, record.gpu_power, record.gpu_power_limit, record.gpu_mem_used,
+            record.gpu_mem_total, record.gpu_corrected_ecc, record.gpu_uncorrected_ecc
         ]
         for metric in gpu_list_metrics:
             assert (metric)
diff --git a/tests/monitor/test_monitor_record.py b/tests/monitor/test_monitor_record.py
index 13b866fd9..069dbf6fd 100644
--- a/tests/monitor/test_monitor_record.py
+++ b/tests/monitor/test_monitor_record.py
@@ -17,6 +17,7 @@ def test_monitor_record():
     mr.mem_total = 1024
     mr.gpu_usage = [90, 80, 86, 72, 79, 81, 94, 85]
     mr.gpu_temperature = [62, 75, 69, 63, 72, 77, 80, 71]
+    mr.gpu_power = [257, 290, 280, 262, 291, 284, 281, 273]
     mr.gpu_power_limit = [400, 400, 400, 350, 400, 400, 400, 400]
     mr.gpu_mem_used = [2550, 2680, 2543, 2588, 2612, 2603, 2515, 2593]
     mr.gpu_mem_total = [16777216, 16777216, 16777216, 16777216, 16777216, 16777216, 16777216, 16777216]
@@ -59,6 +60,14 @@ def test_monitor_record():
         'gpu_temperature:5': 77,
         'gpu_temperature:6': 80,
         'gpu_temperature:7': 71,
+        'gpu_power:0': 257,
+        'gpu_power:1': 290,
+        'gpu_power:2': 280,
+        'gpu_power:3': 262,
+        'gpu_power:4': 291,
+        'gpu_power:5': 284,
+        'gpu_power:6': 281,
+        'gpu_power:7': 273,
         'gpu_power_limit:0': 400,
         'gpu_power_limit:1': 400,
         'gpu_power_limit:2': 400,
diff --git a/website/blog/2023-04-14-release-0-8.md b/website/blog/2023-04-14-release-0-8.md
new file mode 100644
index 000000000..2124ebc04
--- /dev/null
+++ b/website/blog/2023-04-14-release-0-8.md
@@ -0,0 +1,44 @@
+---
+slug: release-sb-v0.8
+title: Releasing SuperBench v0.8
+author: Peng Cheng
+author_title: SuperBench Team
+author_url: https://github.com/cp5555
+author_image_url: https://github.com/cp5555.png
+tags: [superbench, announcement, release]
+---
+
+We are very happy to announce that **SuperBench 0.8.0 version** is officially released today!
+
+You can install and try superbench by following [Getting Started Tutorial](https://microsoft.github.io/superbenchmark/docs/getting-started/installation).
+
+## SuperBench 0.8.0 Release Notes
+
+### SuperBench Improvements
+
+- Support SuperBench Executor running on Windows.
+- Remove fixed rccl version in rocm5.1.x docker file.
+- Upgrade networkx version to fix installation compatibility issue.
+- Pin setuptools version to v65.7.0.
+- Limit ansible_runner version for Python 3.6.
+- Support cgroup V2 when reading system metrics in monitor.
+- Fix analyzer bug in Python 3.8 due to pandas api change.
+- Collect real-time GPU power in monitor.
+- Remove unreachable condition when writing host list in mpi mode.
+- Upgrade Docker image with cuda12.1, nccl 2.17.1-1, hpcx v2.14, and mlc 3.10.
+- Fix wrong unit of cpu-memory-bw-latency in document.
+
+### Micro-benchmark Improvements
+
+- Add STREAM benchmark for sustainable memory bandwidth and the corresponding computation rate.
+- Add HPL Benchmark for HPC Linpack Benchmark.
+- Support flexible warmup and non-random data initialization in cublas-benchmark.
+- Support error tolerance in micro-benchmark for CuDNN function.
+- Add distributed inference benchmark.
+- Support tensor core precisions (e.g., FP8) and batch/shape range in cublaslt gemm.
+
+### Model Benchmark Improvements
+
+- Fix torch.dist init issue with multiple models.
+- Support TE FP8 in BERT/GPT2 model.
+- Add num_workers configurations in model benchmark.
diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js
index 7e780b1c6..cc583913d 100644
--- a/website/docusaurus.config.js
+++ b/website/docusaurus.config.js
@@ -101,7 +101,7 @@ module.exports = {
     announcementBar: {
       id: 'supportus',
       content:
-        '📢 <a href="https://microsoft.github.io/superbenchmark/blog/release-sb-v0.7">v0.7.0</a> has been released! ' +
+        '📢 <a href="https://microsoft.github.io/superbenchmark/blog/release-sb-v0.8">v0.8.0</a> has been released! ' +
         '⭐️ If you like SuperBench, give it a star on <a target="_blank" rel="noopener noreferrer" href="https://github.com/microsoft/superbenchmark">GitHub</a>! ⭐️',
     },
     algolia: {
diff --git a/website/package-lock.json b/website/package-lock.json
index 369418ed1..7526213de 100644
--- a/website/package-lock.json
+++ b/website/package-lock.json
@@ -1,6 +1,6 @@
 {
   "name": "superbench-website",
-  "version": "0.7.0",
+  "version": "0.8.0",
   "lockfileVersion": 1,
   "requires": true,
   "dependencies": {
diff --git a/website/package.json b/website/package.json
index f4d217d67..c761f26d8 100644
--- a/website/package.json
+++ b/website/package.json
@@ -1,6 +1,6 @@
 {
   "name": "superbench-website",
-  "version": "0.7.0",
+  "version": "0.8.0",
   "private": true,
   "scripts": {
     "docusaurus": "docusaurus",

From 4cb431cab4dfc43f61e4adf7712fb3e9ebe48e25 Mon Sep 17 00:00:00 2001
From: Ziyue Yang <ziyyang@microsoft.com>
Date: Mon, 24 Apr 2023 10:17:49 +0800
Subject: [PATCH 02/33] Benchmarks - Revise step time collection in distributed
 inference benchmark (#524)

**Description**
This commit revises the distributed inference benchmark to report a
unified step time by taking the maximum step time across GPUs.
---
 superbench/benchmarks/micro_benchmarks/dist_inference.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/superbench/benchmarks/micro_benchmarks/dist_inference.py b/superbench/benchmarks/micro_benchmarks/dist_inference.py
index 535c4fbf6..8e51b6bd8 100644
--- a/superbench/benchmarks/micro_benchmarks/dist_inference.py
+++ b/superbench/benchmarks/micro_benchmarks/dist_inference.py
@@ -14,6 +14,7 @@
 from superbench.benchmarks import DistributedImpl, DistributedBackend, BenchmarkRegistry, ReturnCode, Precision
 from superbench.benchmarks.micro_benchmarks import MicroBenchmark
 from superbench.benchmarks.context import Enum
+from superbench.benchmarks.reducer import ReduceType
 
 
 class ComputationKernelType(Enum):
@@ -390,7 +391,7 @@ def _process_data(self, step_times):
         Return:
             True if _process_data succeeds.
         """
-        if not self._process_numeric_result('step_times', step_times, cal_percentile=True):
+        if not self._process_numeric_result('step_times', step_times, reduce_type=ReduceType.MAX, cal_percentile=True):
             return False
         return True
 

From 664c59a14d376510e43cf16bdd4e1eead2f0f923 Mon Sep 17 00:00:00 2001
From: Yifan Xiong <yifan.xiong@microsoft.com>
Date: Fri, 28 Apr 2023 11:36:11 +0800
Subject: [PATCH 03/33] Docs - Update version in README (#529)

Update version in README.
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 22e3932af..ffcd51960 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@
 
 __SuperBench__ is a validation and profiling tool for AI infrastructure.
 
-📢 [v0.7.0](https://github.com/microsoft/superbenchmark/releases/tag/v0.7.0) has been released!
+📢 [v0.8.0](https://github.com/microsoft/superbenchmark/releases/tag/v0.8.0) has been released!
 
 ## _Check [aka.ms/superbench](https://aka.ms/superbench) for more details._
 

From f38a9829d048d47012dd7aa679ea479ba3edd3c1 Mon Sep 17 00:00:00 2001
From: guoshzhao <guzhao@microsoft.com>
Date: Fri, 28 Apr 2023 13:15:47 +0800
Subject: [PATCH 04/33] ModelBenchmarks - Fix early stop logic due to
 num_steps. (#522)

**Description**
Model benchmarks can stop based on the `num_steps` or `duration` config;
each takes effect only when it is set to a value greater than 0.
If both are greater than 0, the benchmark stops as soon as either
condition is reached.
---
 docs/superbench-config.mdx                    |  4 +--
 .../benchmarks/model_benchmarks/model_base.py |  7 +++-
 .../model_benchmarks/test_model_base.py       | 35 +++++++++++++++++++
 3 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/docs/superbench-config.mdx b/docs/superbench-config.mdx
index 8802830b2..5720a8125 100644
--- a/docs/superbench-config.mdx
+++ b/docs/superbench-config.mdx
@@ -344,10 +344,10 @@ There have four common parameters for all benchmarks:
 
 For Model-Benchmark, there have some parameters that can control the elapsed time.
 * duration: the elapsed time of benchmark in seconds.
-* num_warmup: the number of warmup step.
+* num_warmup: the number of warmup steps, should be a non-negative integer.
 * num_steps: the number of test step.
 
-If `duration > 0` and `num_warmup + num_steps > 0`, then benchmark will take the least as the elapsed time. Otherwise only one of them will take effect.
+If both `duration > 0` and `num_steps > 0`, the benchmark stops at whichever limit is reached first. Otherwise, only the one that is greater than 0 takes effect.
 
 ## `Mode` Schema
 
diff --git a/superbench/benchmarks/model_benchmarks/model_base.py b/superbench/benchmarks/model_benchmarks/model_base.py
index a51c05850..6238c2b0e 100644
--- a/superbench/benchmarks/model_benchmarks/model_base.py
+++ b/superbench/benchmarks/model_benchmarks/model_base.py
@@ -204,6 +204,11 @@ def _preprocess(self):
             )
         )
 
+        if self._args.num_warmup < 0:
+            logger.error('num_warmup should be positive integer, while {} is set.'.format(self._args.num_warmup))
+            self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
+            return False
+
         if not self._init_distributed_setting():
             self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
             return False
@@ -374,7 +379,7 @@ def _is_finished(self, curr_step, curr_time):
 
         if (
             (self._args.duration > 0 and (curr_time - self._sub_benchmark_start_time) >= self._args.duration)
-            or (total_steps > 0 and curr_step >= total_steps)
+            or (self._args.num_steps > 0 and curr_step >= total_steps)
         ):
             return True
 
diff --git a/tests/benchmarks/model_benchmarks/test_model_base.py b/tests/benchmarks/model_benchmarks/test_model_base.py
index deba3a438..1b6af1775 100644
--- a/tests/benchmarks/model_benchmarks/test_model_base.py
+++ b/tests/benchmarks/model_benchmarks/test_model_base.py
@@ -20,6 +20,7 @@ def __init__(self, name, parameters=''):
         """
         super().__init__(name, parameters)
         self._supported_precision = [Precision.FLOAT32, Precision.FLOAT16]
+        self._sub_benchmark_start_time = 0
 
     def add_parser_arguments(self):
         """Add the specified arguments."""
@@ -377,3 +378,37 @@ def test_check_result_format():
     # Negative case for __check_raw_data() - invalid benchmark result.
     assert (benchmark._Benchmark__check_result_format() is False)
     assert (benchmark.return_code == ReturnCode.INVALID_BENCHMARK_RESULT)
+
+
+def test_is_finished():
+    """Test interface Benchmark._is_finished()."""
+    # Only step takes effect, benchmarking finish due to step.
+    benchmark = create_benchmark('--num_warmup 32 --num_steps 128 --duration 0')
+    benchmark._preprocess()
+    end_time = 2
+    curr_step = 50
+    assert (benchmark._is_finished(curr_step, end_time) is False)
+    curr_step = 160
+    assert (benchmark._is_finished(curr_step, end_time))
+
+    # Only duration takes effect, benchmarking finish due to duration.
+    benchmark = create_benchmark('--num_warmup 32 --num_steps 0 --duration 10')
+    benchmark._preprocess()
+    benchmark._sub_benchmark_start_time = 0
+    curr_step = 50
+    end_time = 1
+    assert (benchmark._is_finished(curr_step, end_time) is False)
+    end_time = 10
+    assert (benchmark._is_finished(curr_step, end_time))
+
+    # Both step and duration take effect.
+    benchmark = create_benchmark('--num_warmup 32 --num_steps 128 --duration 10')
+    benchmark._preprocess()
+    # Benchmarking finish due to step.
+    curr_step = 160
+    end_time = 2
+    assert (benchmark._is_finished(curr_step, end_time))
+    # Benchmarking finish due to duration.
+    curr_step = 50
+    end_time = 10
+    assert (benchmark._is_finished(curr_step, end_time))

From 4c0d96e5d8dcd234084dfdaa02ccf647dda8f775 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?F=CC=B7N=CC=B7?= <kaw411gh0st@gmail.com>
Date: Thu, 4 May 2023 07:55:42 +0700
Subject: [PATCH 05/33] Docs - Fix typo on kernel_parameters and kernel_modules
 in system-config (#528)

**Description**
The commands and examples for kernel_parameters and kernel_modules were swapped.
---
 docs/user-tutorial/system-config.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/docs/user-tutorial/system-config.md b/docs/user-tutorial/system-config.md
index 1daef4c7b..dbde728d3 100644
--- a/docs/user-tutorial/system-config.md
+++ b/docs/user-tutorial/system-config.md
@@ -91,22 +91,22 @@ id: system-config
         <b>Kernel</b>
       </td>
       <td>kernel_modules</td>
-      <td>sysctl</td>
+      <td>lsmod</td>
       <td>list of active kernel modules</td>
       <td>
-        "abi.vsyscall32": "1",<br />
-        "debug.exception-trace": "1",<br />
+        "Module": "binfmt_misc",<br />
+        "Size": "24576",<br />
+        "Used": "1"<br />
         ...
       </td>
     </tr>
     <tr>
       <td>kernel_parameters</td>
-      <td>lsmod</td>
+      <td>sysctl</td>
       <td>kernel parameters</td>
       <td>
-        "Module": "binfmt_misc",<br />
-        "Size": "24576",<br />
-        "Used": "1"<br />
+        "abi.vsyscall32": "1",<br />
+        "debug.exception-trace": "1",<br />
         ...
       </td>
     </tr>

From a1cd3c94750631a0ac6a01b93a234500a0d6838f Mon Sep 17 00:00:00 2001
From: Yifan Xiong <yifan.xiong@microsoft.com>
Date: Tue, 23 May 2023 17:25:35 +0800
Subject: [PATCH 06/33] Runner - Add signal handler in runner (#530)

Add signal handler in runner to gracefully exit when receiving SIGINT
(<kbd>Ctrl</kbd>+<kbd>C</kbd>) or SIGTERM during benchmark execution.
---
 setup.py                     |  1 +
 superbench/runner/ansible.py |  5 +++--
 superbench/runner/runner.py  | 21 ++++++++++++++++++++-
 3 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index b42639eea..30d3d1878 100644
--- a/setup.py
+++ b/setup.py
@@ -198,6 +198,7 @@ def run(self):
                 'types-pkg_resources',
                 'types-pyyaml',
                 'typing-extensions>=3.10',
+                'urllib3<2.0',
                 'vcrpy>=4.1.1',
                 'yapf==0.31.0',
             ],
diff --git a/superbench/runner/ansible.py b/superbench/runner/ansible.py
index c012edc5c..fc71b7bd6 100644
--- a/superbench/runner/ansible.py
+++ b/superbench/runner/ansible.py
@@ -59,11 +59,12 @@ def __init__(self, config):
                 self._config['cmdline'] += ' --ask-pass --ask-become-pass'
         logger.info(self._config)
 
-    def run(self, ansible_config, sudo=False):    # pragma: no cover
+    def run(self, ansible_config, cancel_callback=None, sudo=False):    # pragma: no cover
         """Run Ansible runner.
 
         Args:
             ansible_config (dict): Ansible config dict.
+            cancel_callback (Callable): Ansible runner cancel callback.
             sudo (bool): Run as sudo or not. Defaults to False.
 
         Returns:
@@ -73,7 +74,7 @@ def run(self, ansible_config, sudo=False):    # pragma: no cover
             logger.info('Run as sudo ...')
             ansible_config['cmdline'] += ' --become'
         with tempfile.TemporaryDirectory(prefix='ansible') as tmpdir:
-            r = ansible_runner.run(private_data_dir=tmpdir, **ansible_config)
+            r = ansible_runner.run(private_data_dir=tmpdir, cancel_callback=cancel_callback, **ansible_config)
             logger.debug(r.stats)
         if r.rc == 0:
             logger.info('Run succeed, return code {}.'.format(r.rc))
diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index 28b5c7186..d91020bfb 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -4,8 +4,10 @@
 """SuperBench Runner."""
 
 import os
+import sys
 import json
 import random
+import signal
 from pathlib import Path
 from pprint import pformat
 from collections import defaultdict
@@ -233,6 +235,18 @@ def fetch_results(self):    # pragma: no cover
             )
         )
 
+    def __signal_handler(self, signum, frame):
+        """Handle SIGINT/SIGTERM by cleaning up and exiting with status 128 + signum.
+
+        Args:
+            signum (int): Number of the received signal.
+            frame (FrameType): Stack frame that was executing when the signal arrived.
+        """
+        if signum == signal.SIGINT or signum == signal.SIGTERM:
+            logger.info('Killed by %s, exiting ...', signal.Signals(signum).name)
+            self.cleanup()
+            sys.exit(128 + signum)
+
     def __create_results_summary(self):    # pragma: no cover
         """Create the result summary file of all nodes."""
         all_results = list()
@@ -438,12 +452,17 @@ def _run_proc(self, benchmark_name, mode, vars):
             # we do not expect timeout in ansible unless subprocess hangs
             ansible_runner_config['timeout'] = timeout + 60
 
-        rc = self._ansible_client.run(ansible_runner_config, sudo=(not self._docker_config.skip))
+        # overwrite ansible runner's default signal handler with main process's
+        rc = self._ansible_client.run(
+            ansible_runner_config, cancel_callback=lambda: None, sudo=(not self._docker_config.skip)
+        )
         return rc
 
     def run(self):
         """Run the SuperBench benchmarks distributedly."""
         self.check_env()
+        signal.signal(signal.SIGINT, self.__signal_handler)
+        signal.signal(signal.SIGTERM, self.__signal_handler)
         for benchmark_name in self._sb_benchmarks:
             if benchmark_name not in self._sb_enabled_benchmarks:
                 continue

From f4dab9f7baf00dffdf7d2f27e7f9e76b816ffb47 Mon Sep 17 00:00:00 2001
From: Yifan Xiong <yifan.xiong@microsoft.com>
Date: Wed, 14 Jun 2023 10:51:45 +0800
Subject: [PATCH 07/33] Update error message in setup (#538)

Update error message in setup, require wheel for pip>=23.1.
---
 setup.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 30d3d1878..af65fc690 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,10 @@
 try:
     pkg_resources.require(['pip>=18', 'setuptools>=45, <66'])
 except (pkg_resources.VersionConflict, pkg_resources.DistributionNotFound):
-    print('Try update pip/setuptools versions, for example, python3 -m pip install --upgrade pip setuptools==65.7')
+    print(
+        '\033[93mTry update pip/setuptools versions, for example, '
+        'python3 -m pip install --upgrade pip wheel setuptools==65.7\033[0m'
+    )
     raise
 
 here = pathlib.Path(__file__).parent.resolve()

From e909ddd0caae9b5b5b94a1f74f7cbbe2eab59733 Mon Sep 17 00:00:00 2001
From: guoshzhao <guzhao@microsoft.com>
Date: Fri, 16 Jun 2023 17:50:09 +0800
Subject: [PATCH 08/33] Benchmarks - Update outdate references (#539)

**Description**
Update outdated reference links that returned 404.
---
 superbench/benchmarks/model_benchmarks/pytorch_bert.py |  6 +++---
 superbench/benchmarks/model_benchmarks/pytorch_gpt2.py | 10 +++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/superbench/benchmarks/model_benchmarks/pytorch_bert.py b/superbench/benchmarks/model_benchmarks/pytorch_bert.py
index d43c188b5..d32c586b3 100644
--- a/superbench/benchmarks/model_benchmarks/pytorch_bert.py
+++ b/superbench/benchmarks/model_benchmarks/pytorch_bert.py
@@ -71,7 +71,7 @@ def __init__(self, name, parameters=''):
     def add_parser_arguments(self):
         """Add the BERT-specified arguments.
 
-        BERT model reference: https://huggingface.co/transformers/model_doc/bert.html
+        BERT model reference: https://huggingface.co/docs/transformers/model_doc/bert
         """
         super().add_parser_arguments()
 
@@ -227,7 +227,7 @@ def _inference_step(self, precision):
 
 
 # Register BERT Large benchmark.
-# Reference: https://huggingface.co/transformers/pretrained_models.html
+# Reference: https://huggingface.co/transformers/v3.3.1/pretrained_models.html
 BenchmarkRegistry.register_benchmark(
     'pytorch-bert-large',
     PytorchBERT,
@@ -235,7 +235,7 @@ def _inference_step(self, precision):
 )
 
 # Register BERT Base benchmark.
-# Reference: https://huggingface.co/transformers/pretrained_models.html
+# Reference: https://huggingface.co/transformers/v3.3.1/pretrained_models.html
 BenchmarkRegistry.register_benchmark(
     'pytorch-bert-base',
     PytorchBERT,
diff --git a/superbench/benchmarks/model_benchmarks/pytorch_gpt2.py b/superbench/benchmarks/model_benchmarks/pytorch_gpt2.py
index 77c8e4145..4ddcb7d6e 100644
--- a/superbench/benchmarks/model_benchmarks/pytorch_gpt2.py
+++ b/superbench/benchmarks/model_benchmarks/pytorch_gpt2.py
@@ -71,7 +71,7 @@ def __init__(self, name, parameters=''):
     def add_parser_arguments(self):
         """Add the GPT2-specified arguments.
 
-        GPT2 model reference: https://huggingface.co/transformers/model_doc/gpt2.html
+        GPT2 model reference: https://huggingface.co/docs/transformers/model_doc/gpt2
         """
         super().add_parser_arguments()
 
@@ -221,25 +221,25 @@ def _inference_step(self, precision):
 
 
 # Register GPT2 benchmark with 117M parameters.
-# Reference: https://huggingface.co/transformers/pretrained_models.html
+# Reference: https://huggingface.co/transformers/v3.3.1/pretrained_models.html
 BenchmarkRegistry.register_benchmark(
     'pytorch-gpt2-small', PytorchGPT2, parameters='--hidden_size=768 --num_hidden_layers=12 --num_attention_heads=12'
 )
 
 # Register GPT2 benchmark with 345M parameters.
-# Reference: https://huggingface.co/transformers/pretrained_models.html
+# Reference: https://huggingface.co/transformers/v3.3.1/pretrained_models.html
 BenchmarkRegistry.register_benchmark(
     'pytorch-gpt2-medium', PytorchGPT2, parameters='--hidden_size=1024 --num_hidden_layers=24 --num_attention_heads=16'
 )
 
 # Register GPT2 benchmark with 774M parameters.
-# Reference: https://huggingface.co/transformers/pretrained_models.html
+# Reference: https://huggingface.co/transformers/v3.3.1/pretrained_models.html
 BenchmarkRegistry.register_benchmark(
     'pytorch-gpt2-large', PytorchGPT2, parameters='--hidden_size=1280 --num_hidden_layers=36 --num_attention_heads=20'
 )
 
 # Register GPT2 benchmark with 1558M parameters.
-# Reference: https://huggingface.co/transformers/pretrained_models.html
+# Reference: https://huggingface.co/transformers/v3.3.1/pretrained_models.html
 BenchmarkRegistry.register_benchmark(
     'pytorch-gpt2-xl', PytorchGPT2, parameters='--hidden_size=1600 --num_hidden_layers=48 --num_attention_heads=25'
 )

From bbb0e24342a69df7ed547d8eb3ca630091a4925f Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Wed, 21 Jun 2023 09:58:13 +0800
Subject: [PATCH 09/33] Benchmarks - Add support for DirectX GPU platform
 (#536)

**Description**
Add support for DirectX GPU platform.

**Major Revision**
- Add DirectX platform for benchmark registry
- Add gpu_vendor identify for AMD and NVIDIA with win driver
---
 superbench/benchmarks/context.py | 1 +
 superbench/common/devices/gpu.py | 4 ++++
 superbench/executor/executor.py  | 2 ++
 3 files changed, 7 insertions(+)

diff --git a/superbench/benchmarks/context.py b/superbench/benchmarks/context.py
index cb9a756ed..d1ad2237b 100644
--- a/superbench/benchmarks/context.py
+++ b/superbench/benchmarks/context.py
@@ -24,6 +24,7 @@ class Platform(Enum):
     CPU = 'CPU'
     CUDA = 'CUDA'
     ROCM = 'ROCm'
+    DIRECTX = 'DirectX'
 
 
 class Framework(Enum):
diff --git a/superbench/common/devices/gpu.py b/superbench/common/devices/gpu.py
index 9cbb06a9c..e12889e10 100644
--- a/superbench/common/devices/gpu.py
+++ b/superbench/common/devices/gpu.py
@@ -29,6 +29,10 @@ def get_vendor(self):
             if not list(Path('/dev/dri').glob('card*')):
                 logger.warning('Cannot find AMD GPU device.')
             return 'amd'
+        if list(Path(r'C:\Windows\System32').glob('*DriverStore/FileRepository/nv*.inf_amd64_*/nvapi64.dll')):
+            return 'nvidia-graphics'
+        if list(Path(r'C:\Windows\System32').glob('*DriverStore/FileRepository/u*.inf_amd64_*/*/aticfx64.dll')):
+            return 'amd-graphics'
         return None
 
     @property
diff --git a/superbench/executor/executor.py b/superbench/executor/executor.py
index f78806981..ca2b78093 100644
--- a/superbench/executor/executor.py
+++ b/superbench/executor/executor.py
@@ -87,6 +87,8 @@ def __get_platform(self):
                 return Platform.CUDA
             elif gpu.vendor == 'amd':
                 return Platform.ROCM
+            elif gpu.vendor == 'amd-graphics' or gpu.vendor == 'nvidia-graphics':
+                return Platform.DIRECTX
         except Exception as e:
             logger.error(e)
         return Platform.CPU

From 44ef531465d555af6a4e72d82b77baf53e77d39b Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Wed, 28 Jun 2023 05:35:11 +0000
Subject: [PATCH 10/33] Dockerfile - Add SuperBench Windows Dockerfile (#534)

**Description**
Add dockerfile for win10 and building script for directx_benchmarks.

**Major Revision**
- Add docker file for win10 and required scripts to install the
dependency
- Add building script to build all directx vs benchmarks
- Add call of building script in Makefile

---------

Co-authored-by: yukirora <yuting.jiang@microsoft.com>
Co-authored-by: Yifan Xiong <yifan.xiong@microsoft.com>
---
 .github/workflows/build-win.yml            | 46 +++++++++++++++
 Makefile                                   |  3 +
 dockerfile/directx/enable-graphics-apis.py | 69 ++++++++++++++++++++++
 dockerfile/directx/install-components.bat  |  9 +++
 dockerfile/directx/mini_vsconfig.json      | 14 +++++
 dockerfile/directx12.dockerfile            | 65 ++++++++++++++++++++
 superbench/benchmarks/build.bat            | 18 ++++++
 7 files changed, 224 insertions(+)
 create mode 100644 .github/workflows/build-win.yml
 create mode 100644 dockerfile/directx/enable-graphics-apis.py
 create mode 100644 dockerfile/directx/install-components.bat
 create mode 100644 dockerfile/directx/mini_vsconfig.json
 create mode 100644 dockerfile/directx12.dockerfile
 create mode 100644 superbench/benchmarks/build.bat

diff --git a/.github/workflows/build-win.yml b/.github/workflows/build-win.yml
new file mode 100644
index 000000000..7226af8c7
--- /dev/null
+++ b/.github/workflows/build-win.yml
@@ -0,0 +1,46 @@
+name: Build on Windows
+
+on:
+  push:
+    branches:
+    - main
+    - release/*
+  pull_request:
+    branches:
+    - main
+    - release/*
+
+jobs:
+  docker:
+    name: Docker build win2004
+    runs-on: [self-hosted, windows, x64, win2004]
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v2
+      with:
+        submodules: true
+    - name: Build Docker image
+      working-directory: .
+      shell: pwsh
+      run: |
+        docker build `
+          --file dockerfile/directx12.dockerfile `
+          --label org.opencontainers.image.source=${{ github.event.repository.html_url }} `
+          --label org.opencontainers.image.created=${{ github.event.repository.pushed_at }} `
+          --label org.opencontainers.image.revision=${{ github.sha }} `
+          --platform windows/amd64 `
+          --isolation=process `
+          --tag $env:TAG .
+      env:
+        TAG: superbench/main:win2004
+    - name: Push Docker image
+      if: ${{ github.event_name != 'pull_request' }}
+      shell: pwsh
+      run: |
+        docker login -u $env:USER -p $env:PASS
+        docker push $env:TAG
+        docker logout
+      env:
+        TAG: superbench/main:win2004
+        USER: ${{ secrets.DOCKERHUB_USERNAME }}
+        PASS: ${{ secrets.DOCKERHUB_TOKEN }}
diff --git a/Makefile b/Makefile
index 8e43caadd..a7b8f05d1 100644
--- a/Makefile
+++ b/Makefile
@@ -14,6 +14,9 @@ cppformat:
 cppbuild:
 	cd ./superbench/benchmarks/ && bash build.sh
 
+directxbuild:
+	cd ./superbench/benchmarks/ && build.bat
+
 thirdparty:
 	cd ./third_party/ && make all
 
diff --git a/dockerfile/directx/enable-graphics-apis.py b/dockerfile/directx/enable-graphics-apis.py
new file mode 100644
index 000000000..7f6d0e3cd
--- /dev/null
+++ b/dockerfile/directx/enable-graphics-apis.py
@@ -0,0 +1,69 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Enables graphics APIs in the Windows container."""
+# Reference to
+# https://github.com/EpicGames/UnrealEngine/blob/release/Engine/Extras/Containers/Dockerfiles/windows/runtime/enable-graphics-apis.ps1
+
+import os
+import shutil
+import glob
+
+
+def copy_to_system32(source_directory, filenames, rename=None):
+    """Copy files from source_directory into System32; rename optionally maps a filename to its new name."""
+    for filename in filenames:
+        source = os.path.join(source_directory, filename)
+        destination = os.path.join('C:\\Windows\\System32', filename)
+        if rename and filename in rename:
+            renamed = rename[filename]
+            destination = os.path.join('C:\\Windows\\System32', renamed)
+        try:
+            print(f'Copying {source} to {destination}')
+            shutil.copy2(source, destination)
+        except Exception as e:
+            print(f'Warning: failed to copy file {filename}. Reason: {str(e)}')
+
+
+# Attempt to locate the NVIDIA Display Driver directory in the host system's driver store
+nvidia_sentinel_file = glob.glob('C:\\Windows\\System32\\HostDriverStore\\FileRepository\\nv*.inf_amd64_*\\nvapi64.dll')
+if nvidia_sentinel_file:
+    nvidia_directory = os.path.dirname(nvidia_sentinel_file[0])
+    print(f'Found NVIDIA Display Driver directory: {nvidia_directory}')
+
+    print('\nEnabling NVIDIA NVAPI support:')
+    copy_to_system32(nvidia_directory, ['nvapi64.dll'])
+
+    print('\nEnabling NVIDIA NVENC support:')
+    copy_to_system32(nvidia_directory, ['nvEncodeAPI64.dll', 'nvEncMFTH264x.dll', 'nvEncMFThevcx.dll'])
+
+    print('\nEnabling NVIDIA CUVID/NVDEC support:')
+    copy_to_system32(
+        nvidia_directory, ['nvcuvid64.dll', 'nvDecMFTMjpeg.dll', 'nvDecMFTMjpegx.dll'],
+        {'nvcuvid64.dll': 'nvcuvid.dll'}
+    )
+
+    print('\nEnabling NVIDIA CUDA support:')
+    copy_to_system32(
+        nvidia_directory, ['nvcuda64.dll', 'nvcuda_loader64.dll', 'nvptxJitCompiler64.dll'],
+        {'nvcuda_loader64.dll': 'nvcuda.dll'}
+    )
+
+    print('\n')
+
+# Attempt to locate the AMD Display Driver directory in the host system's driver store
+amd_sentinel_file = glob.glob('C:\\Windows\\System32\\HostDriverStore\\FileRepository\\u*.inf_amd64_*\\*\\aticfx64.dll')
+if amd_sentinel_file:
+    amd_directory = os.path.dirname(amd_sentinel_file[0])
+    print(f'Found AMD Display Driver directory: {amd_directory}')
+
+    print('\nCopying AMD DirectX driver files:')
+    copy_to_system32(amd_directory, ['aticfx64.dll', 'atidxx64.dll'])
+
+    print('\nEnabling AMD Display Library (ADL) support:')
+    copy_to_system32(amd_directory, ['atiadlxx.dll', 'atiadlxy.dll'])
+
+    print('\nEnabling AMD Advanced Media Framework (AMF) support:')
+    copy_to_system32(amd_directory, ['amfrt64.dll', 'amfrtdrv64.dll', 'amdihk64.dll'])
+
+    print('\n')
diff --git a/dockerfile/directx/install-components.bat b/dockerfile/directx/install-components.bat
new file mode 100644
index 000000000..95c42380a
--- /dev/null
+++ b/dockerfile/directx/install-components.bat
@@ -0,0 +1,9 @@
+REM Copyright (c) Microsoft Corporation - All rights reserved
+REM Licensed under the MIT License
+
+curl -s -L https://aka.ms/vs/17/release/vs_BuildTools.exe -o "vs_BuildTools.exe"
+start /b /wait vs_BuildTools.exe --config  %SB_HOME%\dockerfile\directx\mini_vsconfig.json --wait --quiet --norestart --nocache
+if %errorlevel% neq 0 (
+  exit /b %errorlevel%
+)
+del "vs_BuildTools.exe"
diff --git a/dockerfile/directx/mini_vsconfig.json b/dockerfile/directx/mini_vsconfig.json
new file mode 100644
index 000000000..f22b84143
--- /dev/null
+++ b/dockerfile/directx/mini_vsconfig.json
@@ -0,0 +1,14 @@
+{
+  "version": "1.0",
+  "components": [
+    "Microsoft.VisualStudio.Component.Windows10SDK.19041",
+    "Microsoft.VisualStudio.Workload.VCTools",
+    "Microsoft.Component.MSBuild",
+    "Microsoft.VisualStudio.Component.CoreBuildTools",
+    "Microsoft.VisualStudio.Workload.MSBuildTools",
+    "Microsoft.VisualStudio.Component.VC.CoreBuildTools",
+    "Microsoft.VisualStudio.Component.VC.Tools.x86.x64",
+    "Microsoft.VisualStudio.Component.VC.14.35.17.5.ATL.Spectre",
+    "Microsoft.VisualStudio.Component.VC.14.35.17.5.MFC.Spectre"
+  ]
+}
diff --git a/dockerfile/directx12.dockerfile b/dockerfile/directx12.dockerfile
new file mode 100644
index 000000000..1a958d69a
--- /dev/null
+++ b/dockerfile/directx12.dockerfile
@@ -0,0 +1,65 @@
+FROM mcr.microsoft.com/windows:2004
+
+
+# Install Python and additional packages
+# Download Python
+ADD https://www.python.org/ftp/python/3.9.7/python-3.9.7-amd64.exe python-installer.exe
+# Install Python
+RUN python-installer.exe /quiet InstallAllUsers=1 PrependPath=1 && DEL python-installer.exe
+# Verify Python Was Successfully Installed
+RUN python --version && \
+    python -m ensurepip --upgrade
+
+# Install choco and install some necessary packages
+RUN powershell -Command "Set-ExecutionPolicy Bypass -Scope Process -Force; \
+    [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; \
+    iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1'))"
+RUN choco install -y vcredist-all vim git make
+
+# Retrieve the DirectX runtime files required by the Unreal Engine, since even the full Windows base image does not include them
+RUN mkdir C:\GatheredDlls
+RUN curl -s -L "https://download.microsoft.com/download/8/4/A/84A35BF1-DAFE-4AE8-82AF-AD2AE20B6B14/directx_Jun2010_redist.exe" --output %TEMP%\directx_redist.exe && \
+    start /wait %TEMP%\directx_redist.exe /Q /T:%TEMP%\DirectX && \
+    expand %TEMP%\DirectX\APR2007_xinput_x64.cab -F:xinput1_3.dll C:\GatheredDlls\ && \
+    expand %TEMP%\DirectX\Feb2010_X3DAudio_x64.cab -F:X3DAudio1_7.dll C:\GatheredDlls\ && \
+    expand %TEMP%\DirectX\Jun2010_D3DCompiler_43_x64.cab -F:D3DCompiler_43.dll C:\GatheredDlls\ && \
+    expand %TEMP%\DirectX\Jun2010_XAudio_x64.cab -F:XAudio2_7.dll C:\GatheredDlls\ && \
+    expand %TEMP%\DirectX\Jun2010_XAudio_x64.cab -F:XAPOFX1_5.dll C:\GatheredDlls\ && \
+    break
+
+# Retrieve the DirectX shader compiler files needed for DirectX Raytracing (DXR)
+RUN curl -s -L "https://github.com/microsoft/DirectXShaderCompiler/releases/download/v1.6.2104/dxc_2021_04-20.zip" --output %TEMP%\dxc.zip && \
+    powershell -Command "Expand-Archive -Path \"$env:TEMP\dxc.zip\" -DestinationPath $env:TEMP" && \
+    xcopy /y %TEMP%\bin\x64\dxcompiler.dll C:\GatheredDlls\ && \
+    xcopy /y %TEMP%\bin\x64\dxil.dll C:\GatheredDlls\ && \
+    break
+
+# Copy the required DLLs to System32 dir
+RUN xcopy C:\GatheredDlls\* C:\windows\System32\ /i
+
+ENV SB_HOME="C:/superbench" \
+    SB_MICRO_PATH="C:/superbench" \
+    WindowsSDKDir="\\Program Files (x86)\\Windows Kits\\10\\"
+
+RUN setx INCLUDE "%include%;%WindowsSDKDir%\\Include" /M && \
+    setx LIB "%lib%;%WindowsSDKDir%\\Lib" /M && \
+    setx PATH "%path%;%SB_MICRO_PATH%\\bin" /M
+
+WORKDIR ${SB_HOME}
+COPY ./ ${SB_HOME}
+
+# Create the bin directory and download nuget.exe into it
+RUN mkdir "%SB_MICRO_PATH%/bin"
+RUN curl -s -L https://dist.nuget.org/win-x86-commandline/latest/nuget.exe -o "%SB_MICRO_PATH%/bin/nuget.exe"
+# Run the setup script to install the visual studio components
+RUN "%SB_HOME%\\dockerfile\\directx\\install-components.bat"
+
+# Install Superbench
+RUN python -m pip install setuptools==65.0.0 && \
+    python -m pip install --no-cache-dir .[amdworker] && \
+    make directxbuild
+
+# Run the entrypoint script for enabling vendor-specific graphics APIs
+RUN powershell -Command "Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope LocalMachine -Force"
+CMD [ "python", "dockerfile/directx/enable-graphics-apis.py" ]
+ENTRYPOINT [ "cmd.exe" ]
diff --git a/superbench/benchmarks/build.bat b/superbench/benchmarks/build.bat
new file mode 100644
index 000000000..8639e1771
--- /dev/null
+++ b/superbench/benchmarks/build.bat
@@ -0,0 +1,18 @@
+@echo off
+REM Copyright (c) Microsoft Corporation - All rights reserved
+REM Licensed under the MIT License
+
+
+SETLOCAL EnableDelayedExpansion
+
+for /r %%F in (*.vcxproj) do (
+    echo Found .vcxproj file: %%~dpF%%~nxF
+    SET "PROJ_PATH=%%~dpF%%~nxF"
+    SET "MSBUILD=C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Current\Bin\MSBuild.exe"
+    REM Download dependencies
+    "!MSBUILD!" "!PROJ_PATH!" -t:restore -p:RestorePackagesConfig=true
+    REM Build project
+    "!MSBUILD!" "!PROJ_PATH!" /p:Configuration=Release /p:AdditionalLibraryDirectories="%WindowsSDKDir%\Lib" /p:AdditionalIncludeDirectories="%WindowsSDKDir%\Include" /p:OutDir="%SB_MICRO_PATH%\bin"
+)
+
+endlocal

From 3a6622f7d3cf09530523fde077fc7154c8510bc7 Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Thu, 29 Jun 2023 02:06:14 +0000
Subject: [PATCH 11/33] Benchmarks: Add benchmark - Add source code of
 DirectXGPUCoreFLOPs microbenchmark (#488)

**Description**
Add source code of DirectXGPUCoreFLOPs microbenchmark.

---------

Co-authored-by: v-junlinlv <v-junlinlv@microsoft.com>
---
 .gitignore                                    |   73 +
 .../BenchmarkOptions.h                        |   68 +
 .../GPUCore.cpp                               |  507 +++
 .../directx_gemm_flops_performance/GPUCore.h  |  151 +
 .../GPUCore.vcxproj                           |  110 +
 .../directx_gemm_flops_performance/Main.cpp   |   11 +
 .../packages.config                           |    4 +
 .../directx_third_party/DXSampleHelper.h      |  275 ++
 .../directx_third_party/d3dx12.h              | 3258 +++++++++++++++++
 .../directx_utils/D3D12Timer.cpp              |   80 +
 .../directx_utils/D3D12Timer.h                |   54 +
 .../micro_benchmarks/directx_utils/Options.h  |  113 +
 12 files changed, 4704 insertions(+)
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.h
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/Main.cpp
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/packages.config
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_third_party/DXSampleHelper.h
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_third_party/d3dx12.h
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_utils/D3D12Timer.cpp
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_utils/D3D12Timer.h
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_utils/Options.h

diff --git a/.gitignore b/.gitignore
index 8872e5df4..e1ab18ca4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -141,3 +141,76 @@ dmypy.json
 
 # Cython debug symbols
 cython_debug/
+
+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+##
+## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
+
+# User-specific files
+*.rsuser
+*.suo
+*.user
+*.userosscache
+*.sln.docstates
+
+# Build results
+[Dd]ebug/
+[Dd]ebugPublic/
+[Rr]elease/
+[Rr]eleases/
+x64/
+x86/
+[Ww][Ii][Nn]32/
+[Aa][Rr][Mm]/
+[Aa][Rr][Mm]64/
+bld/
+[Bb]in/
+[Oo]bj/
+[Ll]og/
+[Ll]ogs/
+
+# Visual Studio 2015/2017 cache/options directory
+.vs/
+
+# Visual Studio 2017 auto generated files
+Generated\ Files/
+
+# MSTest test Results
+[Tt]est[Rr]esult*/
+[Bb]uild[Ll]og.*
+
+# NuGet Packages
+*.nupkg
+# NuGet Symbol Packages
+*.snupkg
+# The packages folder can be ignored because of Package Restore
+**/[Pp]ackages/*
+# except build/, which is used as an MSBuild target.
+!**/[Pp]ackages/build/
+# Uncomment if necessary however generally it will be regenerated when needed
+#!**/[Pp]ackages/repositories.config
+# NuGet v3's project.json files produces more ignorable files
+*.nuget.props
+*.nuget.targets
+
+# Visual C++ cache files
+ipch/
+*.aps
+*.ncb
+*.opendb
+*.opensdf
+*.sdf
+*.cachefile
+*.VC.db
+*.VC.VC.opendb
+
+# Visual Studio code coverage results
+*.coverage
+*.coveragexml
+
+# Visual Studio cache files
+# files ending in .cache can be ignored
+*.[Cc]ache
+# but keep track of directories ending in .cache
+!?*.[Cc]ache/
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h
new file mode 100644
index 000000000..c5207bb4f
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h
@@ -0,0 +1,68 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "../directx_utils/Options.h"
+
+namespace Option {
+enum Precision {
+    F16,
+    F32,
+};
+using PrecisionType = Option::Precision;
+} // namespace Option
+
+class BenchmarkOptions : public Options {
+  public:
+    // Number of warm-up rounds to run before the measured loops.
+    int num_warm_up = 0;
+    // Number of measured benchmark runs.
+    int num_loops = 0;
+    // Dimension m of GEMM.
+    int m = 0;
+    // Dimension n of GEMM.
+    int n = 0;
+    // Dimension k of GEMM.
+    int k = 0;
+    // Precision used for the computation.
+    Option::PrecisionType mode_precision = Option::F32;
+
+    /**
+     * @brief Construct a new BenchmarkOptions object.
+     */
+    BenchmarkOptions(int argc, char *argv[]) : Options(argc, argv) {}
+
+    /**
+     * @brief Parse the arguments; precision accepts --fp16/--fp32 and the legacy --f16/--f32 spellings.
+     */
+    virtual void parse_arguments() {
+        // Warm-up and measured loop counts are independent options, each read from its own flag.
+        num_loops = get_cmd_line_argument_int("--num_loops", 10);
+        num_warm_up = get_cmd_line_argument_int("--num_warm_up", 0);
+        m = get_cmd_line_argument_int("--m", 16 * 256);
+        n = get_cmd_line_argument_int("--n", 16 * 256);
+        k = get_cmd_line_argument_int("--k", 16 * 256);
+        if (get_cmd_line_argument_bool("--fp16") || get_cmd_line_argument_bool("--f16")) {
+            mode_precision = Option::F16;
+        }
+        if (get_cmd_line_argument_bool("--fp32") || get_cmd_line_argument_bool("--f32")) {
+            mode_precision = Option::F32;
+        }
+    }
+
+    /**
+     * @brief Get the option usage.
+     */
+    void get_option_usage() override {
+        std::cout << "Usage: " << std::endl;
+        std::cout << "  --help: Print help message." << std::endl;
+        std::cout << "  --num_loops: The number of benchmark runs." << std::endl;
+        std::cout << "  --num_warm_up: The number of warmup runs." << std::endl;
+        std::cout << "  --m: m dimension of GEMM." << std::endl;
+        std::cout << "  --n: n dimension of GEMM." << std::endl;
+        std::cout << "  --k: k dimension of GEMM." << std::endl;
+        std::cout << "  --fp16: half precision to compute." << std::endl;
+        std::cout << "  --fp32: float precision to compute." << std::endl;
+    }
+};
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp
new file mode 100644
index 000000000..206c49f90
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp
@@ -0,0 +1,507 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// GPUCore.cpp : This file contains the 'main' function. Program execution begins and ends there.
+#include <algorithm>
+#include <array>
+#include <iostream>
+#include <tchar.h>
+#include <vector>
+
+#include <directml.h>
+
+#include "GPUCore.h"
+
+/**
+ * @brief Setup GPU, run the warm-up and timed GEMM dispatches, and print one TFLOPs value per timed loop.
+ */
+void GPUCore::Run() {
+    int m = opts->m;
+    int n = opts->n;
+    int k = opts->k;
+
+    // Setup GPU objects like device and command list.
+    CreatePipeline();
+
+    int loops = opts->num_loops;
+    std::cout << "GPUCoreFLOPs" << std::endl;
+
+    switch (opts->mode_precision) {
+    case Option::F32: {
+        // Prepare input and output data and buffers.
+        PrepareData<float>(opts->m, opts->n, opts->k);
+        // Setup pipeline and compile operator.
+        SetupAndCompileOp(opts->m, opts->n, opts->k, DML_TENSOR_DATA_TYPE_FLOAT32);
+        InitializeOp<float>(opts->m, opts->n, opts->k);
+        for (int i = 0; i < opts->num_warm_up; ++i) {
+            ExecuteComputeOp();
+        }
+        for (int i = 0; i < loops; ++i) {
+            gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute);
+            // Do FLOPs job. Operations are counted in 64 bits because m * n alone can overflow int.
+            double timeInMs = ExecuteComputeOp();
+            auto flops = (int64_t(m) * n * k + int64_t(m) * n) * 2 * 1e-9 / timeInMs;
+            std::cout << flops << " TFLOPs" << std::endl;
+#if defined _PRINT_RESULT
+            PrintResultForDebug<float>(m, n);
+#endif
+        }
+    } break;
+    case Option::F16: {
+        PrepareData<uint16_t>(opts->m, opts->n, opts->k);
+        SetupAndCompileOp(opts->m, opts->n, opts->k, DML_TENSOR_DATA_TYPE_FLOAT16);
+        InitializeOp<uint16_t>(opts->m, opts->n, opts->k);
+        for (int i = 0; i < opts->num_warm_up; ++i) {
+            ExecuteComputeOp();
+        }
+        for (int i = 0; i < loops; ++i) {
+            gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute);
+            // Do FLOPs job. Operations are counted in 64 bits because m * n alone can overflow int.
+            double timeInMs = ExecuteComputeOp();
+            auto flops = (int64_t(m) * n * k + int64_t(m) * n) * 2 * 1e-9 / timeInMs;
+            std::cout << flops << " TFLOPs" << std::endl;
+#if defined _PRINT_RESULT
+            PrintResultForDebug<uint16_t>(m, n);
+#endif
+        }
+    } break;
+    default:
+        std::cout << "Error: Unsupported precision mode." << std::endl;
+        break;
+    }
+}
+
+/**
+ * @brief Create pipeline including
+ *        create device object, command list, command queue
+ *        and synchronization objects. Throws (via ThrowIfFailed) if any object cannot be created.
+ */
+void GPUCore::CreatePipeline() {
+    UINT dxgiFactoryFlags = 0;
+
+#if defined(_DEBUG)
+    // Enable the debug layer (requires the Graphics Tools "optional feature").
+    // NOTE: Enabling the debug layer after device creation will invalidate the active device.
+    {
+        ComPtr<ID3D12Debug> debugController;
+        if (SUCCEEDED(D3D12GetDebugInterface(IID_PPV_ARGS(&debugController)))) {
+            debugController->EnableDebugLayer();
+
+            // Enable additional debug layers.
+            dxgiFactoryFlags |= DXGI_CREATE_FACTORY_DEBUG;
+        }
+    }
+#endif
+
+    ComPtr<IDXGIFactory4> factory;
+    ThrowIfFailed(CreateDXGIFactory2(dxgiFactoryFlags, IID_PPV_ARGS(&factory)));
+
+    ComPtr<IDXGIAdapter1> hardwareAdapter;
+    GetHardwareAdapter(factory.Get(), &hardwareAdapter);
+
+    // Create GPU device object.
+    ThrowIfFailed(D3D12CreateDevice(hardwareAdapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&m_device)));
+
+    DML_CREATE_DEVICE_FLAGS dmlCreateDeviceFlags = DML_CREATE_DEVICE_FLAG_NONE;
+
+#if defined(_DEBUG)
+    // If the project is in a debug build, then enable the Direct3D 12 debug layer.
+    // This is optional (starting in DML_FEATURE_LEVEL_5_2) but strongly recommended!
+
+    // If the project is in a debug build, then enable debugging via DirectML debug layers with this flag.
+    dmlCreateDeviceFlags |= DML_CREATE_DEVICE_FLAG_DEBUG;
+#endif
+
+    ThrowIfFailed(DMLCreateDevice(m_device.Get(), dmlCreateDeviceFlags, IID_PPV_ARGS(m_dmlDevice.GetAddressOf())));
+
+    // Describe the command queue: a DIRECT queue with default flags. The descriptor is value-initialized,
+    // so every field that is not set explicitly below is zero.
+    D3D12_COMMAND_QUEUE_DESC queueDesc = {};
+    queueDesc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE;
+    queueDesc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
+
+    // Create the command queue, allocator and list. Every HRESULT is checked so that a failure is reported
+    // here instead of surfacing later as a null COM pointer dereference.
+    ThrowIfFailed(m_device->CreateCommandQueue(&queueDesc, IID_PPV_ARGS(&m_commandQueue)));
+
+    ThrowIfFailed(
+        m_device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&m_commandAllocator)));
+
+    // The list is used for recording right away; CloseExecuteResetWait() closes, submits and re-opens it.
+    ThrowIfFailed(m_device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, m_commandAllocator.Get(), nullptr,
+                                              IID_PPV_ARGS(&m_commandList)));
+
+    // Create the fence and the event handle used for GPU synchronization; both are required to wait on the GPU.
+    ThrowIfFailed(m_device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&m_fence)));
+    m_currentFence = 1;
+    m_eventHandle = CreateEvent(0, false, false, 0);
+    ThrowIfFailed(m_eventHandle != nullptr ? S_OK : HRESULT_FROM_WIN32(GetLastError()));
+}
+
+/**
+ * @brief Calculates the number of bytes required to store a packed buffer tensor with the specified data type
+    and element count. The formula can be expressed as the following:
+
+    SizeInBytes = roundup(tensorElementCount * ElementSizeInBytes, 4)
+
+    In other words, the size of a tensor is the element count multiplied by the element size (4 bytes for FLOAT32,
+    2 bytes for FLOAT16), rounded up to the nearest 4-byte (DWORD) boundary. Any other data type returns 0.
+    The multiplication is carried out in 64 bits so that tensors of 4 GiB or more do not wrap around.
+    This is a simplified form of the strided formula in DirectMLX.h: only an element count is taken, no strides.
+    @return the size in bytes, or 0 for an unsupported data type.
+
+    Refer to DirectMLX.h (https://github.com/microsoft/DirectML/blob/master/Libraries/DirectMLX.h).
+ */
+inline UINT64 DMLCalcBufferTensorSize(DML_TENSOR_DATA_TYPE dataType, UINT tensorElementCount) {
+    UINT elementSizeInBytes = 0;
+    switch (dataType) {
+    case DML_TENSOR_DATA_TYPE_FLOAT32:
+        elementSizeInBytes = 4;
+        break;
+    case DML_TENSOR_DATA_TYPE_FLOAT16:
+        elementSizeInBytes = 2;
+        break;
+    default:
+        return 0; // Invalid data type
+    }
+    // Widen before multiplying: a UINT * UINT product would wrap around at 4 GiB.
+    const UINT64 sizeInBytes = static_cast<UINT64>(tensorElementCount) * elementSizeInBytes;
+    // Align size to 4 bytes in memory:
+    // round up to the nearest multiple of 4 bytes.
+    return (sizeInBytes + 3) & ~3ull;
+}
+
+/**
+ * @brief Build a buffer-type DML_TENSOR_DESC for the given data type and sizes.
+ */
+std::unique_ptr<DML_TENSOR_DESC> GPUCore::CreateTensorDesc(DML_TENSOR_DATA_TYPE dataType, UINT *tensorSizes,
+                                                           int dimensionCount) {
+    auto outerDesc = std::make_unique<DML_TENSOR_DESC>();
+    auto innerDesc = std::make_unique<DML_BUFFER_TENSOR_DESC>();
+
+    // The outer descriptor carries the type tag and a pointer to the buffer descriptor.
+    outerDesc->Type = DML_TENSOR_TYPE_BUFFER;
+
+    // Describe the buffer; the element count is the product of the first four entries of tensorSizes.
+    const UINT elementCount = tensorSizes[0] * tensorSizes[1] * tensorSizes[2] * tensorSizes[3];
+    innerDesc->DataType = dataType;
+    innerDesc->Flags = DML_TENSOR_FLAG_NONE;
+    innerDesc->DimensionCount = dimensionCount;
+    innerDesc->Sizes = tensorSizes;
+    innerDesc->Strides = nullptr;
+    innerDesc->TotalTensorSizeInBytes = DMLCalcBufferTensorSize(dataType, elementCount);
+
+    // Ownership of the buffer descriptor is given up here; only the raw pointer is stored in the outer descriptor.
+    outerDesc->Desc = innerDesc.release();
+
+    return outerDesc;
+}
+
+/**
+ * @brief Setup and compile the DirectML GEMM operator for A (m x k) * B (k x n) -> Output (m x n).
+ */
+void GPUCore::SetupAndCompileOp(int m, int n, int k, DML_TENSOR_DATA_TYPE dataType) {
+    // Create DirectML operator(s). Operators represent abstract functions such as "multiply", "reduce",
+    // "convolution", or even compound operations such as recurrent neural nets. This benchmark creates an instance
+    // of the GEMM operator with no C tensor, no transposes, Alpha = 1 and Beta = 0.
+    std::unique_ptr<DML_GEMM_OPERATOR_DESC> dmlGEMMOperatorDesc = std::make_unique<DML_GEMM_OPERATOR_DESC>();
+    // Descriptors stay owned by the unique_ptrs (get(), not release()) so they are freed on return, not leaked.
+    UINT tensorSizesA[4] = {1, 1, static_cast<UINT>(m), static_cast<UINT>(k)};
+    std::unique_ptr<DML_TENSOR_DESC> dmlTensorDescA = CreateTensorDesc(dataType, tensorSizesA, ARRAYSIZE(tensorSizesA));
+    dmlGEMMOperatorDesc->ATensor = dmlTensorDescA.get();
+    UINT tensorSizesB[4] = {1, 1, static_cast<UINT>(k), static_cast<UINT>(n)};
+    std::unique_ptr<DML_TENSOR_DESC> dmlTensorDescB = CreateTensorDesc(dataType, tensorSizesB, ARRAYSIZE(tensorSizesB));
+    dmlGEMMOperatorDesc->BTensor = dmlTensorDescB.get();
+    UINT tensorSizes[4] = {1, 1, static_cast<UINT>(m), static_cast<UINT>(n)};
+    std::unique_ptr<DML_TENSOR_DESC> dmlTensorDescC = CreateTensorDesc(dataType, tensorSizes, ARRAYSIZE(tensorSizes));
+    dmlGEMMOperatorDesc->OutputTensor = dmlTensorDescC.get();
+
+    dmlGEMMOperatorDesc->CTensor = nullptr;
+    dmlGEMMOperatorDesc->TransA = DML_MATRIX_TRANSFORM_NONE;
+    dmlGEMMOperatorDesc->TransB = DML_MATRIX_TRANSFORM_NONE;
+    dmlGEMMOperatorDesc->Alpha = 1.0f;
+    dmlGEMMOperatorDesc->Beta = 0.0f;
+
+    std::unique_ptr<DML_OPERATOR_DESC> dmlOperatorDesc = std::make_unique<DML_OPERATOR_DESC>();
+
+    dmlOperatorDesc->Type = DML_OPERATOR_GEMM;
+    dmlOperatorDesc->Desc = dmlGEMMOperatorDesc.get();
+    ComPtr<IDMLOperator> dmlOperator;
+    ThrowIfFailed(m_dmlDevice->CreateOperator(dmlOperatorDesc.get(), IID_PPV_ARGS(dmlOperator.GetAddressOf())));
+
+    ThrowIfFailed(m_dmlDevice->CompileOperator(dmlOperator.Get(), DML_EXECUTION_FLAG_NONE,
+                                               IID_PPV_ARGS(m_dmlCompiledOperator.GetAddressOf())));
+}
+
+/**
+ * @brief Prepare input and output data and buffers of the tensor elements.
+ */
+template <typename T> void GPUCore::PrepareData(const int m, const int n, const int k) {
+    // Define the tensors. Element counts are widened before multiplying so large m, n, k do not overflow int.
+    std::vector<T> dataA(static_cast<size_t>(m) * k);
+    std::vector<T> dataB(static_cast<size_t>(n) * k);
+
+    // Prepare input data. NOTE: with T = uint16_t (FP16 storage) this writes the bit pattern 0x0001, not half 1.0.
+    std::fill(dataA.begin(), dataA.end(), 1);
+    std::fill(dataB.begin(), dataB.end(), 1);
+
+    UINT64 byteSize = static_cast<UINT64>(m) * k * sizeof(T);
+
+    // Setup input buffer A and upload input data.
+    m_inputBufferA =
+        CreateDefaultBuffer(m_device.Get(), m_commandList.Get(), dataA.data(), byteSize, m_inputUploadBufferA);
+
+    byteSize = static_cast<UINT64>(n) * k * sizeof(T);
+    // Setup input buffer B and upload input data.
+    m_inputBufferB =
+        CreateDefaultBuffer(m_device.Get(), m_commandList.Get(), dataB.data(), byteSize, m_inputUploadBufferB);
+
+    byteSize = static_cast<UINT64>(m) * n * sizeof(T);
+    // Create output buffer.
+    ThrowIfFailed(m_device->CreateCommittedResource(
+        get_rvalue_ptr(CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT)), D3D12_HEAP_FLAG_NONE,
+        get_rvalue_ptr(CD3DX12_RESOURCE_DESC::Buffer(byteSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS)),
+        D3D12_RESOURCE_STATE_UNORDERED_ACCESS, nullptr, IID_PPV_ARGS(&m_outputBuffer)));
+
+    // Create readback buffer.
+    ThrowIfFailed(
+        m_device->CreateCommittedResource(get_rvalue_ptr(CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_READBACK)),
+                                          D3D12_HEAP_FLAG_NONE, get_rvalue_ptr(CD3DX12_RESOURCE_DESC::Buffer(byteSize)),
+                                          D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&m_readBackBuffer)));
+    CloseExecuteResetWait();
+}
+
+/**
+ * @brief Initialize DirectML operator: create the descriptor heap and binding table, run the operator
+ *        initializer, then re-target the binding table at the compiled operator and bind its inputs/outputs.
+ */
+template <typename T> void GPUCore::InitializeOp(int m, int n, int k) {
+    ComPtr<IDMLOperatorInitializer> dmlOperatorInitializer;
+
+    IDMLCompiledOperator *dmlCompiledOperators[] = {m_dmlCompiledOperator.Get()};
+    ThrowIfFailed(m_dmlDevice->CreateOperatorInitializer(ARRAYSIZE(dmlCompiledOperators), dmlCompiledOperators,
+                                                         IID_PPV_ARGS(dmlOperatorInitializer.GetAddressOf())));
+
+    // Query the operator for the required size (in descriptors) of its binding table.
+    // You need to initialize an operator exactly once before it can be executed, and
+    // the two stages require different numbers of descriptors for binding. For simplicity,
+    // we create a single descriptor heap that's large enough to satisfy them both.
+    DML_BINDING_PROPERTIES initializeBindingProperties = dmlOperatorInitializer->GetBindingProperties();
+    DML_BINDING_PROPERTIES executeBindingProperties = m_dmlCompiledOperator->GetBindingProperties();
+    UINT descriptorCount =
+        initializeBindingProperties.RequiredDescriptorCount > executeBindingProperties.RequiredDescriptorCount
+            ? initializeBindingProperties.RequiredDescriptorCount
+            : executeBindingProperties.RequiredDescriptorCount;
+
+    // Create descriptor heaps.
+    std::unique_ptr<D3D12_DESCRIPTOR_HEAP_DESC> descriptorHeapDesc = std::make_unique<D3D12_DESCRIPTOR_HEAP_DESC>();
+    descriptorHeapDesc->Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
+    descriptorHeapDesc->NumDescriptors = descriptorCount;
+    descriptorHeapDesc->Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
+    ThrowIfFailed(m_device->CreateDescriptorHeap(descriptorHeapDesc.release(), _uuidof(m_descriptorHeap),
+                                                 (void **)m_descriptorHeap.GetAddressOf()));
+
+    // Set the descriptor heap(s).
+    ID3D12DescriptorHeap *d3D12DescriptorHeaps[] = {m_descriptorHeap.Get()};
+    m_commandList->SetDescriptorHeaps(ARRAYSIZE(d3D12DescriptorHeaps), d3D12DescriptorHeaps);
+
+    // Create a binding table over the descriptor heap we just created
+    std::unique_ptr<DML_BINDING_TABLE_DESC> dmlBindingTableDesc = std::make_unique<DML_BINDING_TABLE_DESC>();
+    dmlBindingTableDesc->CPUDescriptorHandle = m_descriptorHeap->GetCPUDescriptorHandleForHeapStart();
+    dmlBindingTableDesc->GPUDescriptorHandle = m_descriptorHeap->GetGPUDescriptorHandleForHeapStart();
+    dmlBindingTableDesc->Dispatchable = dmlOperatorInitializer.Get();
+    dmlBindingTableDesc->SizeInDescriptors = descriptorCount;
+    ThrowIfFailed(
+        m_dmlDevice->CreateBindingTable(dmlBindingTableDesc.get(), IID_PPV_ARGS(m_bindingTable.GetAddressOf())));
+
+    // Create the temporary and persistent resources that are necessary for executing an operator.
+    // The temporary resource is scratch memory (used internally by DirectML), whose contents you don't need to define.
+    // The persistent resource is long-lived, and you need to initialize it using the IDMLOperatorInitializer.
+    UINT64 temporaryResourceSize =
+        max(initializeBindingProperties.TemporaryResourceSize, executeBindingProperties.TemporaryResourceSize);
+    UINT64 persistentResourceSize = executeBindingProperties.PersistentResourceSize;
+    // Bind and initialize the operator on the GPU. NOTE(review): temporaryBuffer and persistentBuffer are local
+    // ComPtrs released on return, yet m_bindingTable still refers to them at dispatch time - confirm lifetime.
+    ComPtr<ID3D12Resource> temporaryBuffer;
+    if (temporaryResourceSize != 0) {
+        ThrowIfFailed(m_device->CreateCommittedResource(
+            get_rvalue_ptr(CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT)), D3D12_HEAP_FLAG_NONE,
+            get_rvalue_ptr(
+                CD3DX12_RESOURCE_DESC::Buffer(temporaryResourceSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS)),
+            D3D12_RESOURCE_STATE_COMMON, nullptr, IID_PPV_ARGS(temporaryBuffer.GetAddressOf())));
+
+        if (initializeBindingProperties.TemporaryResourceSize != 0) {
+            DML_BUFFER_BINDING bufferBinding{temporaryBuffer.Get(), 0, temporaryResourceSize};
+            DML_BINDING_DESC bindingDesc{DML_BINDING_TYPE_BUFFER, &bufferBinding};
+            m_bindingTable->BindTemporaryResource(&bindingDesc);
+        }
+    }
+
+    ComPtr<ID3D12Resource> persistentBuffer;
+    if (persistentResourceSize != 0) {
+        ThrowIfFailed(m_device->CreateCommittedResource(
+            get_rvalue_ptr(CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT)), D3D12_HEAP_FLAG_NONE,
+            get_rvalue_ptr(CD3DX12_RESOURCE_DESC::Buffer(persistentResourceSize)), D3D12_RESOURCE_STATE_COMMON, nullptr,
+            IID_PPV_ARGS(persistentBuffer.GetAddressOf())));
+
+        // The persistent resource should be bound as the output to the IDMLOperatorInitializer.
+        DML_BUFFER_BINDING bufferBinding{persistentBuffer.Get(), 0, persistentResourceSize};
+        DML_BINDING_DESC bindingDesc{DML_BINDING_TYPE_BUFFER, &bufferBinding};
+        m_bindingTable->BindOutputs(1, &bindingDesc);
+    }
+
+    ThrowIfFailed(m_dmlDevice->CreateCommandRecorder(IID_PPV_ARGS(&m_dmlCommandRecorder)));
+
+    // Record execution of the operator initializer.
+    m_dmlCommandRecorder->RecordDispatch(m_commandList.Get(), dmlOperatorInitializer.Get(), m_bindingTable.Get());
+    CloseExecuteResetWait();
+
+    // Bind and execute the operator on the GPU.
+    m_commandList->SetDescriptorHeaps(ARRAYSIZE(d3D12DescriptorHeaps), d3D12DescriptorHeaps);
+
+    // Reset the binding table to bind for the operator we want to execute (it was previously used to bind for the
+    // initializer).
+    dmlBindingTableDesc->Dispatchable = m_dmlCompiledOperator.Get();
+
+    ThrowIfFailed(m_bindingTable->Reset(dmlBindingTableDesc.get()));
+
+    if (temporaryResourceSize != 0) {
+        DML_BUFFER_BINDING bufferBinding{temporaryBuffer.Get(), 0, temporaryResourceSize};
+        DML_BINDING_DESC bindingDesc{DML_BINDING_TYPE_BUFFER, &bufferBinding};
+        m_bindingTable->BindTemporaryResource(&bindingDesc);
+    }
+
+    if (persistentResourceSize != 0) {
+        DML_BUFFER_BINDING bufferBinding{persistentBuffer.Get(), 0, persistentResourceSize};
+        DML_BINDING_DESC bindingDesc{DML_BINDING_TYPE_BUFFER, &bufferBinding};
+        m_bindingTable->BindPersistentResource(&bindingDesc);
+    }
+    // NOTE(review): only SetDescriptorHeaps was recorded since the last reset, and this call resets the list again.
+    CloseExecuteResetWait();
+
+    DML_BUFFER_BINDING inputBufferBindingA{m_inputBufferA.Get(), 0, sizeof(T) * m * k};
+    DML_BINDING_DESC inputBindingDescA{DML_BINDING_TYPE_BUFFER, &inputBufferBindingA};
+
+    DML_BUFFER_BINDING inputBufferBindingB{m_inputBufferB.Get(), 0, sizeof(T) * n * k};
+    DML_BINDING_DESC inputBindingDescB{DML_BINDING_TYPE_BUFFER, &inputBufferBindingB};
+    // Third input slot is bound as DML_BINDING_TYPE_NONE (presumably the optional GEMM C tensor - confirm).
+    DML_BUFFER_BINDING bufferBinding = {nullptr, 0, 0};
+    DML_BINDING_DESC inputBindingDesc{DML_BINDING_TYPE_NONE, &bufferBinding};
+
+    std::array<DML_BINDING_DESC, 3> inputBindings = {inputBindingDescA, inputBindingDescB, inputBindingDesc};
+    m_bindingTable->BindInputs(3, inputBindings.data());
+
+    DML_BUFFER_BINDING outputBufferBinding{m_outputBuffer.Get(), 0, sizeof(T) * n * m};
+    DML_BINDING_DESC outputBindingDesc{DML_BINDING_TYPE_BUFFER, &outputBufferBinding};
+
+    m_bindingTable->BindOutputs(1, &outputBindingDesc);
+}
+
+#if defined _PRINT_RESULT
+/**
+ * @brief Print the result of the benchmark for debug.
+ */
+template <typename T> void GPUCore::PrintResultForDebug(int m, int n) {
+    // The output buffer now contains the result of the GEMM operator, so read it back for the CPU to access it.
+    // NOTE: for T = uint16_t (FP16 storage) std::to_string prints the raw 16-bit pattern, not the half value.
+    m_commandList->ResourceBarrier(
+        1, get_rvalue_ptr(CD3DX12_RESOURCE_BARRIER::Transition(
+               m_outputBuffer.Get(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE)));
+
+    m_commandList->CopyResource(m_readBackBuffer.Get(), m_outputBuffer.Get());
+
+    CloseExecuteResetWait();
+    D3D12_RANGE tensorBufferRange{0, static_cast<SIZE_T>(sizeof(T) * n * m)};
+    T *outputBufferData{};
+    ThrowIfFailed(m_readBackBuffer->Map(0, &tensorBufferRange, reinterpret_cast<void **>(&outputBufferData)));
+    std::string outputString = "output tensor: ";
+    for (size_t tensorElementIndex{0}; tensorElementIndex < static_cast<SIZE_T>(m) * n;
+         ++tensorElementIndex, ++outputBufferData) {
+        outputString += std::to_string(*outputBufferData) + ' ';
+    }
+
+    std::cout << outputString << std::endl;
+    D3D12_RANGE emptyRange{0, 0};
+    m_readBackBuffer->Unmap(0, &emptyRange);
+}
+#endif
+
+/**
+ * @brief Execute the computation GEMM op.
+ * @return the elapsed time in ms.
+ */
+double GPUCore::ExecuteComputeOp() {
+    // CloseExecuteResetWait() resets the command list, so bind the descriptor heap again before the dispatch.
+    ID3D12DescriptorHeap *descriptorHeaps[] = {m_descriptorHeap.Get()};
+    m_commandList->SetDescriptorHeaps(ARRAYSIZE(descriptorHeaps), descriptorHeaps);
+    this->gpuTimer.start(m_commandList.Get(), 0);
+    m_dmlCommandRecorder->RecordDispatch(m_commandList.Get(), m_dmlCompiledOperator.Get(), m_bindingTable.Get());
+    this->gpuTimer.stop(m_commandList.Get(), 0);
+    this->gpuTimer.resolveQueryToCPU(m_commandList.Get(), 0);
+    CloseExecuteResetWait();
+    return this->gpuTimer.getElapsedMsByTimestampPair(0);
+}
+
+/**
+ * @brief Close and execute command list, wait until command completed (throws on failure or wait timeout).
+ */
+void GPUCore::CloseExecuteResetWait(DWORD dwMilliseconds) {
+    ThrowIfFailed(m_commandList->Close());
+    ID3D12CommandList *commandLists[] = {m_commandList.Get()};
+    m_commandQueue->ExecuteCommandLists(ARRAYSIZE(commandLists), commandLists);
+    // Signal the fence with the current value, then advance the value for the next submission.
+    const UINT64 fenceL = m_currentFence;
+    ThrowIfFailed(m_commandQueue->Signal(m_fence.Get(), fenceL));
+    m_currentFence++;
+    // Wait until command queue is done. A timed-out or failed wait is an error: the allocator must not be reset yet.
+    if (m_fence->GetCompletedValue() < fenceL) {
+        ThrowIfFailed(m_fence->SetEventOnCompletion(fenceL, m_eventHandle));
+        if (WaitForSingleObject(m_eventHandle, dwMilliseconds) != WAIT_OBJECT_0) {
+            ThrowIfFailed(HRESULT_FROM_WIN32(ERROR_TIMEOUT));
+        }
+    }
+    ThrowIfFailed(m_commandAllocator->Reset());
+    ThrowIfFailed(m_commandList->Reset(m_commandAllocator.Get(), nullptr));
+}
+
+/**
+ * @brief Create a default-heap buffer and record the upload of the initial data through an upload-heap buffer.
+ * @param device the GPU device object used to create both buffers.
+ * @param cmdList the GPU command list the copy and the state transition are recorded on.
+ * @param initData the data to upload.
+ * @param byteSize the size in bytes of the data to upload (also the size of both buffers).
+ * @param uploadBuffer receives the upload-heap buffer used to stage the data.
+ * @return the default buffer object.
+ */
+Microsoft::WRL::ComPtr<ID3D12Resource>
+GPUCore::CreateDefaultBuffer(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, const void *initData,
+                             UINT64 byteSize, Microsoft::WRL::ComPtr<ID3D12Resource> &uploadBuffer) {
+    ComPtr<ID3D12Resource> gpuBuffer;
+
+    // Destination: default-heap buffer allowing unordered access, created in the COPY_DEST state.
+    CD3DX12_HEAP_PROPERTIES gpuHeapProps(D3D12_HEAP_TYPE_DEFAULT);
+    CD3DX12_RESOURCE_DESC gpuBufferDesc =
+        CD3DX12_RESOURCE_DESC::Buffer(byteSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
+    ThrowIfFailed(device->CreateCommittedResource(&gpuHeapProps, D3D12_HEAP_FLAG_NONE, &gpuBufferDesc,
+                                                  D3D12_RESOURCE_STATE_COPY_DEST, nullptr,
+                                                  IID_PPV_ARGS(gpuBuffer.GetAddressOf())));
+
+    // Staging: upload-heap buffer of the same size, created in the GENERIC_READ state.
+    CD3DX12_HEAP_PROPERTIES stagingHeapProps(D3D12_HEAP_TYPE_UPLOAD);
+    CD3DX12_RESOURCE_DESC stagingBufferDesc = CD3DX12_RESOURCE_DESC::Buffer(byteSize);
+    ThrowIfFailed(device->CreateCommittedResource(&stagingHeapProps, D3D12_HEAP_FLAG_NONE, &stagingBufferDesc,
+                                                  D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
+                                                  IID_PPV_ARGS(uploadBuffer.GetAddressOf())));
+
+    // Describe the source data as a single row of byteSize bytes.
+    D3D12_SUBRESOURCE_DATA payload = {};
+    payload.pData = initData;
+    payload.RowPitch = byteSize;
+    payload.SlicePitch = payload.RowPitch;
+
+    UpdateSubresources<1>(cmdList, gpuBuffer.Get(), uploadBuffer.Get(), 0, 0, 1, &payload);
+    CD3DX12_RESOURCE_BARRIER toReadState = CD3DX12_RESOURCE_BARRIER::Transition(
+        gpuBuffer.Get(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_GENERIC_READ);
+    cmdList->ResourceBarrier(1, &toReadState);
+
+    return gpuBuffer;
+}
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.h b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.h
new file mode 100644
index 000000000..e619a75ea
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.h
@@ -0,0 +1,151 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#pragma once
+
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers.
+#endif
+
+#include <DirectXPackedVector.h>
+#include <chrono>
+#include <d3d12.h>
+#include <d3d12shader.h>
+#include <d3dcompiler.h>
+#include <directml.h>
+#include <dxgi1_6.h>
+#include <string>
+#include <unordered_map>
+#include <windowsx.h>
+#include <wrl.h>
+
+// linker
+#pragma comment(lib, "dxguid.lib")
+#pragma comment(lib, "dxgi.lib")
+#pragma comment(lib, "D3D12.lib")
+#pragma comment(lib, "d3dcompiler.lib")
+
+#if defined(_DEBUG)
+#include <dxgidebug.h>
+#endif
+
+#include "../directx_third_party/DXSampleHelper.h"
+#include "../directx_third_party/d3dx12.h"
+#include "../directx_utils/D3D12Timer.h"
+#include "BenchmarkOptions.h"
+
+using namespace std;
+using namespace DirectX;
+// Note that while ComPtr is used to manage the lifetime of resources on the CPU,
+// it has no understanding of the lifetime of resources on the GPU. Apps must account
+// for the GPU lifetime of resources to avoid destroying objects that may still be
+// referenced by the GPU.
+// GPUCore accounts for this by waiting on a fence in CloseExecuteResetWait() after every submission.
+using Microsoft::WRL::ComPtr;
+// Take the address of a temporary (rvalue) so it can be passed to pointer-taking APIs within the same expression.
+template <typename T> T *get_rvalue_ptr(T &&v) { return &v; }
+
+class GPUCore {
+  public:
+    GPUCore(BenchmarkOptions *opts) : opts(opts) {}
+    GPUCore(const GPUCore &) = delete;            // Owns a raw event HANDLE; a copy would close it twice.
+    GPUCore &operator=(const GPUCore &) = delete; // Same reason as the deleted copy constructor.
+    ~GPUCore() { if (m_eventHandle != nullptr) CloseHandle(m_eventHandle); }
+
+    /**
+     * @brief Setup GPU and start benchmark.
+     */
+    void Run();
+
+    /**
+     * @brief Create pipeline including device object, command list, command queue and synchronization objects.
+     */
+    void CreatePipeline();
+
+    /**
+     * @brief Prepare input and output data and buffers of the tensor elements..
+     */
+    template <typename T> void PrepareData(const int m, const int n, const int k);
+
+    /**
+     * @brief Create and initialize DML_TENSOR_DESC.
+     */
+    std::unique_ptr<DML_TENSOR_DESC> CreateTensorDesc(DML_TENSOR_DATA_TYPE dataType, UINT *tensorSizes,
+                                                      int dimensionCount);
+
+    /**
+     * @brief Setup and compile DirectML operator.
+     */
+    void SetupAndCompileOp(int m, int n, int k, DML_TENSOR_DATA_TYPE dataType);
+
+    /**
+     * @brief Initialize DirectML operator.
+     */
+    template <typename T> void InitializeOp(int m, int n, int k);
+
+    /**
+     * @brief Execute the computation GEMM op.
+     * @return the elapsed time in ms.
+     */
+    double ExecuteComputeOp();
+
+    /**
+     * @brief Close and execute command list, wait until command completed.
+     */
+    void CloseExecuteResetWait(DWORD dwMilliseconds = 300000);
+
+#if defined _PRINT_RESULT
+    /**
+     * @brief Print the result of the benchmark for debug.
+     */
+    template <typename T> void PrintResultForDebug(int m, int n);
+#endif
+
+    /**
+     * @brief Create a default buffer and upload data with the upload buffer.
+     * @param device the GPU device object.
+     * @param cmdList the GPU command list object.
+     * @param initData the data that needs to be uploaded.
+     * @param byteSize the size of the data that needs to be uploaded.
+     * @param UploadBuffer the upload buffer used to stage the data.
+     * @return a default buffer object.
+     */
+    Microsoft::WRL::ComPtr<ID3D12Resource> CreateDefaultBuffer(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList,
+                                                               const void *initData, UINT64 byteSize,
+                                                               Microsoft::WRL::ComPtr<ID3D12Resource> &UploadBuffer);
+
+  private:
+    // Pipeline objects.
+    ComPtr<ID3D12Device> m_device = nullptr;
+    ComPtr<ID3D12CommandAllocator> m_commandAllocator = nullptr;
+    ComPtr<ID3D12CommandQueue> m_commandQueue = nullptr;
+    ComPtr<ID3D12GraphicsCommandList> m_commandList = nullptr;
+    ComPtr<IDMLDevice> m_dmlDevice = nullptr;
+    ComPtr<IDMLCommandRecorder> m_dmlCommandRecorder = nullptr;
+    ComPtr<IDMLCompiledOperator> m_dmlCompiledOperator = nullptr;
+    ComPtr<IDMLBindingTable> m_bindingTable = nullptr;
+    ComPtr<ID3D12DescriptorHeap> m_descriptorHeap = nullptr;
+
+    // Input buffer to pass data into GPU.
+    ComPtr<ID3D12Resource> m_inputBufferA = nullptr;
+    ComPtr<ID3D12Resource> m_inputUploadBufferA = nullptr;
+    ComPtr<ID3D12Resource> m_inputBufferB = nullptr;
+    ComPtr<ID3D12Resource> m_inputUploadBufferB = nullptr;
+
+    // Output buffer that result output on GPU.
+    ComPtr<ID3D12Resource> m_outputBuffer = nullptr;
+
+    // Readback buffer to copy data from GPU side to CPU side.
+    ComPtr<ID3D12Resource> m_readBackBuffer = nullptr;
+
+    // Synchronization objects.
+    ComPtr<ID3D12Fence> m_fence = nullptr;
+    UINT64 m_currentFence = 0;
+    HANDLE m_eventHandle = nullptr;
+
+    // GPU timer.
+    D3D12::D3D12Timer gpuTimer;
+
+    // Options.
+    BenchmarkOptions *opts;
+};
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj
new file mode 100644
index 000000000..109d39305
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj
@@ -0,0 +1,110 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <Import Project="packages\Microsoft.AI.DirectML.1.11.0\build\Microsoft.AI.DirectML.props" Condition="Exists('packages\Microsoft.AI.DirectML.1.11.0\build\Microsoft.AI.DirectML.props')" />
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>16.0</VCProjectVersion>
+    <Keyword>Win32Proj</Keyword>
+    <ProjectGuid>{8407ef34-a93c-473a-8fac-2598b2695b61}</ProjectGuid>
+    <RootNamespace>GPUCore</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <RunCodeAnalysis>false</RunCodeAnalysis>
+    <EnableClangTidyCodeAnalysis>true</EnableClangTidyCodeAnalysis>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions);_PRINT_RESULT</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions);</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="..\directx_utils\D3D12Timer.cpp" />
+    <ClCompile Include="GPUCore.cpp" />
+    <ClCompile Include="Main.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\directx_third_party\d3dx12.h" />
+    <ClInclude Include="..\directx_third_party\DXSampleHelper.h" />
+    <ClInclude Include="..\directx_utils\D3D12Timer.h" />
+    <ClInclude Include="..\directx_utils\Options.h" />
+    <ClInclude Include="GPUCore.h" />
+    <ClInclude Include="BenchmarkOptions.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="packages.config" />
+  </ItemGroup>
+  <PropertyGroup>
+    <RestorePackages>true</RestorePackages>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="packages\Microsoft.AI.DirectML.1.11.0\build\Microsoft.AI.DirectML.targets" Condition="Exists('packages\Microsoft.AI.DirectML.1.11.0\build\Microsoft.AI.DirectML.targets')" />
+  </ImportGroup>
+  <Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
+    <PropertyGroup>
+      <ErrorText>This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them.  For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}.</ErrorText>
+    </PropertyGroup>
+    <Error Condition="!Exists('packages\Microsoft.AI.DirectML.1.11.0\build\Microsoft.AI.DirectML.props')" Text="$([System.String]::Format('$(ErrorText)', 'packages\Microsoft.AI.DirectML.1.11.0\build\Microsoft.AI.DirectML.props'))" />
+    <Error Condition="!Exists('packages\Microsoft.AI.DirectML.1.11.0\build\Microsoft.AI.DirectML.targets')" Text="$([System.String]::Format('$(ErrorText)', 'packages\Microsoft.AI.DirectML.1.11.0\build\Microsoft.AI.DirectML.targets'))" />
+  </Target>
+</Project>
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/Main.cpp b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/Main.cpp
new file mode 100644
index 000000000..9e403f5de
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/Main.cpp
@@ -0,0 +1,11 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include "GPUCore.h"
+
+int main(int argc, char *argv[]) {
+    auto options = std::make_unique<BenchmarkOptions>(argc, argv); // built from the raw command line
+    options->init();
+    auto benchmark = std::make_unique<GPUCore>(options.get()); // receives a non-owning pointer to options
+    benchmark->Run();
+}
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/packages.config b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/packages.config
new file mode 100644
index 000000000..0bf9cc34c
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/packages.config
@@ -0,0 +1,4 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<packages>
+  <package id="Microsoft.AI.DirectML" version="1.11.0" targetFramework="native" />
+</packages>
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/directx_third_party/DXSampleHelper.h b/superbench/benchmarks/micro_benchmarks/directx_third_party/DXSampleHelper.h
new file mode 100644
index 000000000..780bb0896
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_third_party/DXSampleHelper.h
@@ -0,0 +1,275 @@
+//*********************************************************
+//
+// Copyright (c) Microsoft. All rights reserved.
+// This code is licensed under the MIT License (MIT).
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+//*********************************************************
+
+#pragma once
+#include <d3d12.h>
+#include <dxgi1_6.h>
+#include <stdexcept>
+#include <wrl.h>
+
+// Note that while ComPtr is used to manage the lifetime of resources on the CPU,
+// it has no understanding of the lifetime of resources on the GPU. Apps must account
+// for the GPU lifetime of resources to avoid destroying objects that may still be
+// referenced by the GPU.
+using Microsoft::WRL::ComPtr;
+
+inline std::string HrToString(HRESULT hr) { // formats hr as "HRESULT of 0x" + 8 upper-case hex digits
+    char text[64] = {};
+    sprintf_s(text, "HRESULT of 0x%08X", static_cast<UINT>(hr));
+    return text;
+}
+
+class HrException : public std::runtime_error { // runtime_error whose message is HrToString(hr)
+  public:
+    HrException(HRESULT hr) : std::runtime_error(HrToString(hr)), m_code(hr) {}
+    HRESULT Error() const { return m_code; } // the HRESULT given at construction
+
+  private:
+    const HRESULT m_code;
+};
+
+// Releases COM pointer p when non-null. do/while(0) makes it one statement, so it is safe in unbraced if/else.
+#define SAFE_RELEASE(p)                                                                                                \
+    do { if (p) { (p)->Release(); } } while (0)
+
+inline void ThrowIfFailed(HRESULT hr) { // raises HrException(hr) on failure codes, otherwise a no-op
+    if (SUCCEEDED(hr))
+        return;
+    throw HrException(hr);
+}
+
+// Writes the directory of the running executable (with trailing backslash) into path; throws on failure/truncation.
+inline void GetAssetsPath(_Out_writes_(pathSize) WCHAR *path, UINT pathSize) {
+    if (path == nullptr) {
+        throw std::exception();
+    }
+
+    // Call the wide-character API explicitly: path is WCHAR*, so the TCHAR macro would not compile without UNICODE.
+    const DWORD length = GetModuleFileNameW(nullptr, path, pathSize);
+    if (length == 0 || length == pathSize) {
+        throw std::exception(); // the call failed, or the path did not fit and was truncated
+    }
+
+    if (WCHAR *lastSlash = wcsrchr(path, L'\\')) {
+        lastSlash[1] = L'\0'; // keep the separator, drop the file name
+    }
+}
+
+// Reads a whole file (< 4 GiB) into a malloc'ed buffer returned via *data / *size; the caller must free(*data).
+inline HRESULT ReadDataFromFile(LPCWSTR filename, byte **data, UINT *size) {
+    using namespace Microsoft::WRL;
+    CREATEFILE2_EXTENDED_PARAMETERS extendedParams = {}; // value-init leaves lpSecurityAttributes/hTemplateFile null
+    extendedParams.dwSize = sizeof(CREATEFILE2_EXTENDED_PARAMETERS);
+    extendedParams.dwFileAttributes = FILE_ATTRIBUTE_NORMAL;
+    extendedParams.dwFileFlags = FILE_FLAG_SEQUENTIAL_SCAN;
+    extendedParams.dwSecurityQosFlags = SECURITY_ANONYMOUS;
+
+    Wrappers::FileHandle file(CreateFile2(filename, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, &extendedParams));
+    if (file.Get() == INVALID_HANDLE_VALUE) {
+        throw std::exception();
+    }
+
+    FILE_STANDARD_INFO fileInfo = {};
+    if (!GetFileInformationByHandleEx(file.Get(), FileStandardInfo, &fileInfo, sizeof(fileInfo)) ||
+        fileInfo.EndOfFile.HighPart != 0) {
+        throw std::exception(); // query failed, or the file is too large for a UINT size
+    }
+
+    const DWORD fileSize = fileInfo.EndOfFile.LowPart;
+    byte *buffer = reinterpret_cast<byte *>(malloc(fileSize));
+    DWORD bytesRead = 0;
+    // ReadFile needs a real byte-count pointer for synchronous reads; also reject allocation failure and short reads.
+    const bool ok = (buffer != nullptr || fileSize == 0) &&
+                    ReadFile(file.Get(), buffer, fileSize, &bytesRead, nullptr) && bytesRead == fileSize;
+    if (!ok) {
+        free(buffer); // nothing is handed to the caller on failure, so release the buffer here
+        throw std::exception();
+    }
+    *data = buffer;
+    *size = fileSize;
+    return S_OK;
+}
+
+// Loads a DDS file via ReadDataFromFile and reports where the payload starts (*offset) and its length (*size).
+inline HRESULT ReadDataFromDDSFile(LPCWSTR filename, byte **data, UINT *offset, UINT *size) {
+    if (FAILED(ReadDataFromFile(filename, data, size))) {
+        return E_FAIL;
+    }
+
+    struct DDS_PIXELFORMAT {
+        UINT size;
+        UINT flags;
+        UINT fourCC;
+        UINT rgbBitCount;
+        UINT rBitMask;
+        UINT gBitMask;
+        UINT bBitMask;
+        UINT aBitMask;
+    };
+
+    struct DDS_HEADER {
+        UINT size;
+        UINT flags;
+        UINT height;
+        UINT width;
+        UINT pitchOrLinearSize;
+        UINT depth;
+        UINT mipMapCount;
+        UINT reserved1[11];
+        DDS_PIXELFORMAT ddsPixelFormat;
+        UINT caps;
+        UINT caps2;
+        UINT caps3;
+        UINT caps4;
+        UINT reserved2;
+    };
+
+    // The parser expects a UINT magic number followed by DDS_HEADER; reject files too short to hold both
+    // before dereferencing the buffer. On E_FAIL the buffer in *data is not freed here (caller still owns it).
+    static const UINT DDS_MAGIC = 0x20534444;
+    const UINT ddsDataOffset = static_cast<UINT>(sizeof(UINT) + sizeof(DDS_HEADER));
+    if (*size < ddsDataOffset || *reinterpret_cast<const UINT *>(*data) != DDS_MAGIC) {
+        return E_FAIL;
+    }
+
+    auto ddsHeader = reinterpret_cast<const DDS_HEADER *>(*data + sizeof(UINT));
+    if (ddsHeader->size != sizeof(DDS_HEADER) || ddsHeader->ddsPixelFormat.size != sizeof(DDS_PIXELFORMAT)) {
+        return E_FAIL;
+    }
+
+    *offset = ddsDataOffset;
+    *size = *size - ddsDataOffset;
+    return S_OK;
+}
+
+// Debug-only naming helpers: with _DEBUG or DBG defined they label the D3D12 object, otherwise they are no-ops.
+#if defined(_DEBUG) || defined(DBG)
+inline void SetName(ID3D12Object *pObject, LPCWSTR name) { pObject->SetName(name); }
+inline void SetNameIndexed(ID3D12Object *pObject, LPCWSTR name, UINT index) {
+    WCHAR indexedName[50];
+    const int written = swprintf_s(indexedName, L"%s[%u]", name, index);
+    if (written > 0)
+        pObject->SetName(indexedName);
+}
+#else
+inline void SetName(ID3D12Object *, LPCWSTR) {}
+inline void SetNameIndexed(ID3D12Object *, LPCWSTR, UINT) {}
+#endif
+
+// Convenience macros for ComPtr<T> variables (or arrays of them):
+// the stringized variable expression becomes the object's name,
+// and the indexed form appends the element index as "[n]".
+#define NAME_D3D12_OBJECT(x) SetName((x).Get(), L#x)
+#define NAME_D3D12_OBJECT_INDEXED(x, n) SetNameIndexed((x)[n].Get(), L#x, n)
+
+inline UINT CalculateConstantBufferByteSize(UINT byteSize) {
+    // Round byteSize up to the placement alignment (mask trick; assumes the alignment is a power of two).
+    const UINT alignMask = D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT - 1;
+    return (byteSize + alignMask) & ~alignMask;
+}
+
+#ifdef D3D_COMPILE_STANDARD_FILE_INCLUDE
+inline Microsoft::WRL::ComPtr<ID3DBlob> CompileShader(const std::wstring &filename, const D3D_SHADER_MACRO *defines,
+                                                      const std::string &entrypoint, const std::string &target) {
+    // Compiles `entrypoint` of the shader file for `target`; throws HrException when compilation fails.
+#if defined(_DEBUG) || defined(DBG)
+    const UINT compileFlags = D3DCOMPILE_DEBUG | D3DCOMPILE_SKIP_OPTIMIZATION;
+#else
+    const UINT compileFlags = 0;
+#endif
+    Microsoft::WRL::ComPtr<ID3DBlob> shaderBlob;
+    Microsoft::WRL::ComPtr<ID3DBlob> messages;
+    const HRESULT result =
+        D3DCompileFromFile(filename.c_str(), defines, D3D_COMPILE_STANDARD_FILE_INCLUDE, entrypoint.c_str(),
+                           target.c_str(), compileFlags, 0, &shaderBlob, &messages);
+
+    // Forward any compiler diagnostics to the debugger output before checking the result.
+    if (messages != nullptr) {
+        OutputDebugStringA(static_cast<const char *>(messages->GetBufferPointer()));
+    }
+    ThrowIfFailed(result);
+    return shaderBlob;
+}
+#endif
+
+// Resets all elements in a ComPtr array.
+template <class T> void ResetComPtrArray(T *comPtrArray) {
+    auto &elements = *comPtrArray; // the pointed-to container must support range-for
+    for (auto &comPtr : elements)
+        comPtr.Reset();
+}
+
+// Resets all elements in a unique_ptr array.
+template <class T> void ResetUniquePtrArray(T *uniquePtrArray) {
+    auto &elements = *uniquePtrArray; // the pointed-to container must support range-for
+    for (auto &owner : elements)
+        owner.reset();
+}
+
+/**
+ * @brief Helper function for acquiring the first available hardware adapter that supports Direct3D 12.
+ *        If no such adapter can be found, *ppAdapter will be set to nullptr.
+ * @param pFactory a pointer to the DXGI factory used to enumerate adapters.
+ * @param[out] ppAdapter a pointer to a pointer that receives the selected adapter.
+ * @param requestHighPerformanceAdapter whether to ask for the high-performance GPU first.
+ */
+inline void GetHardwareAdapter(IDXGIFactory1 *pFactory, IDXGIAdapter1 **ppAdapter,
+                               bool requestHighPerformanceAdapter = FALSE) {
+    // Start from the "nothing found" result.
+    *ppAdapter = nullptr;
+
+    // A candidate qualifies when it is not a software adapter and D3D12CreateDevice succeeds for it.
+    // The device out-pointer is nullptr, so this only probes for support; no device is created.
+    const auto isUsable = [](IDXGIAdapter1 *candidate) -> bool {
+        DXGI_ADAPTER_DESC1 desc;
+        candidate->GetDesc1(&desc);
+
+        if (desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE) {
+            // Don't select the Basic Render Driver adapter.
+            // If you want a software adapter, pass in "/warp" on the command line.
+            return false;
+        }
+        return SUCCEEDED(D3D12CreateDevice(candidate, D3D_FEATURE_LEVEL_11_0, _uuidof(ID3D12Device), nullptr));
+    };
+
+    ComPtr<IDXGIAdapter1> adapter;
+
+    // First pass: if the factory exposes IDXGIFactory6, enumerate by GPU preference
+    // (high-performance when requested, otherwise unspecified).
+    ComPtr<IDXGIFactory6> factory6;
+    if (SUCCEEDED(pFactory->QueryInterface(IID_PPV_ARGS(&factory6)))) {
+        const DXGI_GPU_PREFERENCE preference =
+            requestHighPerformanceAdapter ? DXGI_GPU_PREFERENCE_HIGH_PERFORMANCE : DXGI_GPU_PREFERENCE_UNSPECIFIED;
+        UINT adapterIndex = 0;
+        // Stop at the first adapter that qualifies; a failed enumeration call ends the search.
+        while (SUCCEEDED(factory6->EnumAdapterByGpuPreference(adapterIndex, preference, IID_PPV_ARGS(&adapter)))) {
+            if (isUsable(adapter.Get())) {
+                break;
+            }
+            ++adapterIndex;
+        }
+    }
+
+    // Second pass: only when the first pass left `adapter` empty, walk the adapters in
+    // EnumAdapters1 order and apply the same check.
+    if (adapter.Get() == nullptr) {
+        UINT adapterIndex = 0;
+        while (SUCCEEDED(pFactory->EnumAdapters1(adapterIndex, &adapter))) {
+            if (isUsable(adapter.Get())) {
+                break;
+            }
+            ++adapterIndex;
+        }
+    }
+
+    // Detach() passes the held pointer (or nullptr) to the caller without releasing it.
+    *ppAdapter = adapter.Detach();
+}
diff --git a/superbench/benchmarks/micro_benchmarks/directx_third_party/d3dx12.h b/superbench/benchmarks/micro_benchmarks/directx_third_party/d3dx12.h
new file mode 100644
index 000000000..17b2b79a7
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_third_party/d3dx12.h
@@ -0,0 +1,3258 @@
+//*********************************************************
+//
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License (MIT).
+//
+//*********************************************************
+
+#ifndef __D3DX12_H__
+#define __D3DX12_H__
+
+#include "d3d12.h"
+
+#if defined(__cplusplus)
+
+struct CD3DX12_DEFAULT {};
+extern const DECLSPEC_SELECTANY CD3DX12_DEFAULT D3D12_DEFAULT;
+
+//------------------------------------------------------------------------------------------------
+// Exact member-wise comparison of two viewports (no tolerance is applied).
+inline bool operator==(const D3D12_VIEWPORT &l, const D3D12_VIEWPORT &r) noexcept {
+    const bool sameOrigin = l.TopLeftX == r.TopLeftX && l.TopLeftY == r.TopLeftY;
+    const bool sameExtent = l.Width == r.Width && l.Height == r.Height;
+    return sameOrigin && sameExtent && l.MinDepth == r.MinDepth && l.MaxDepth == r.MaxDepth;
+}
+inline bool operator!=(const D3D12_VIEWPORT &l, const D3D12_VIEWPORT &r) noexcept { return !operator==(l, r); }
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_RECT : public D3D12_RECT { // D3D12_RECT plus convenience constructors
+    CD3DX12_RECT() = default;
+    explicit CD3DX12_RECT(const D3D12_RECT &o) noexcept : D3D12_RECT(o) {}
+    explicit CD3DX12_RECT(LONG Left, LONG Top, LONG Right, LONG Bottom) noexcept {
+        left = Left; // horizontal edges
+        right = Right;
+        top = Top; // vertical edges
+        bottom = Bottom;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_VIEWPORT : public D3D12_VIEWPORT {
+    CD3DX12_VIEWPORT() = default;
+    explicit CD3DX12_VIEWPORT(const D3D12_VIEWPORT &o) noexcept : D3D12_VIEWPORT(o) {}
+    explicit CD3DX12_VIEWPORT(FLOAT topLeftX, FLOAT topLeftY, FLOAT width, FLOAT height,
+                              FLOAT minDepth = D3D12_MIN_DEPTH, FLOAT maxDepth = D3D12_MAX_DEPTH) noexcept {
+        MinDepth = minDepth; // depth range
+        MaxDepth = maxDepth;
+        TopLeftX = topLeftX; // rectangle
+        TopLeftY = topLeftY;
+        Width = width;
+        Height = height;
+    }
+    explicit CD3DX12_VIEWPORT(_In_ ID3D12Resource *pResource, UINT mipSlice = 0, FLOAT topLeftX = 0.0f,
+                              FLOAT topLeftY = 0.0f, FLOAT minDepth = D3D12_MIN_DEPTH,
+                              FLOAT maxDepth = D3D12_MAX_DEPTH) noexcept {
+        // Viewport sized from the resource at mip level `mipSlice`; a size that shifts down to 0 counts as 1.
+        const auto desc = pResource->GetDesc();
+        const UINT64 mipWidth = desc.Width >> mipSlice;
+        const UINT64 mipHeight = desc.Height >> mipSlice;
+        MinDepth = minDepth;
+        MaxDepth = maxDepth;
+        switch (desc.Dimension) {
+        case D3D12_RESOURCE_DIMENSION_BUFFER: // buffers use the full, unshifted width
+            TopLeftX = topLeftX;
+            TopLeftY = 0.0f;
+            Width = float(desc.Width) - topLeftX;
+            Height = 1.0f;
+            break;
+        case D3D12_RESOURCE_DIMENSION_TEXTURE1D:
+            TopLeftX = topLeftX;
+            TopLeftY = 0.0f;
+            Width = (mipWidth ? float(mipWidth) : 1.0f) - topLeftX;
+            Height = 1.0f;
+            break;
+        case D3D12_RESOURCE_DIMENSION_TEXTURE2D:
+        case D3D12_RESOURCE_DIMENSION_TEXTURE3D:
+            TopLeftX = topLeftX;
+            TopLeftY = topLeftY;
+            Width = (mipWidth ? float(mipWidth) : 1.0f) - topLeftX;
+            Height = (mipHeight ? float(mipHeight) : 1.0f) - topLeftY;
+            break;
+        default: // any other dimension leaves the rectangle members unassigned
+            break;
+        }
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_BOX : public D3D12_BOX { // D3D12_BOX with 1D / 2D / 3D convenience constructors
+    CD3DX12_BOX() = default;
+    explicit CD3DX12_BOX(const D3D12_BOX &o) noexcept : D3D12_BOX(o) {}
+    explicit CD3DX12_BOX(LONG Left, LONG Right) noexcept { // 1D: only the x range is caller-supplied
+        left = static_cast<UINT>(Left);
+        right = static_cast<UINT>(Right);
+        top = 0;
+        bottom = 1;
+        front = 0;
+        back = 1;
+    }
+    explicit CD3DX12_BOX(LONG Left, LONG Top, LONG Right, LONG Bottom) noexcept { // 2D: z range fixed to 0..1
+        left = static_cast<UINT>(Left);
+        right = static_cast<UINT>(Right);
+        top = static_cast<UINT>(Top);
+        bottom = static_cast<UINT>(Bottom);
+        front = 0;
+        back = 1;
+    }
+    explicit CD3DX12_BOX(LONG Left, LONG Top, LONG Front, LONG Right, LONG Bottom, LONG Back) noexcept {
+        left = static_cast<UINT>(Left);
+        right = static_cast<UINT>(Right);
+        top = static_cast<UINT>(Top);
+        bottom = static_cast<UINT>(Bottom);
+        front = static_cast<UINT>(Front);
+        back = static_cast<UINT>(Back);
+    }
+};
+inline bool operator==(const D3D12_BOX &l, const D3D12_BOX &r) noexcept { // member-wise equality
+    return l.left == r.left && l.right == r.right && l.top == r.top && l.bottom == r.bottom && l.front == r.front &&
+           l.back == r.back;
+}
+inline bool operator!=(const D3D12_BOX &l, const D3D12_BOX &r) noexcept { return !operator==(l, r); }
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_DEPTH_STENCIL_DESC : public D3D12_DEPTH_STENCIL_DESC {
+    CD3DX12_DEPTH_STENCIL_DESC() = default;
+    explicit CD3DX12_DEPTH_STENCIL_DESC(const D3D12_DEPTH_STENCIL_DESC &o) noexcept : D3D12_DEPTH_STENCIL_DESC(o) {}
+    explicit CD3DX12_DEPTH_STENCIL_DESC(CD3DX12_DEFAULT) noexcept {
+        // Defaults: depth test enabled (LESS, all writes), stencil disabled, both faces keep/keep/keep + ALWAYS.
+        const D3D12_DEPTH_STENCILOP_DESC keepAlways = {D3D12_STENCIL_OP_KEEP, D3D12_STENCIL_OP_KEEP,
+                                                       D3D12_STENCIL_OP_KEEP, D3D12_COMPARISON_FUNC_ALWAYS};
+        FrontFace = BackFace = keepAlways;
+        DepthEnable = TRUE;
+        DepthWriteMask = D3D12_DEPTH_WRITE_MASK_ALL;
+        DepthFunc = D3D12_COMPARISON_FUNC_LESS;
+        StencilEnable = FALSE;
+        StencilReadMask = D3D12_DEFAULT_STENCIL_READ_MASK;
+        StencilWriteMask = D3D12_DEFAULT_STENCIL_WRITE_MASK;
+    }
+    explicit CD3DX12_DEPTH_STENCIL_DESC(BOOL depthEnable, D3D12_DEPTH_WRITE_MASK depthWriteMask,
+                                        D3D12_COMPARISON_FUNC depthFunc, BOOL stencilEnable, UINT8 stencilReadMask,
+                                        UINT8 stencilWriteMask, D3D12_STENCIL_OP frontStencilFailOp,
+                                        D3D12_STENCIL_OP frontStencilDepthFailOp, D3D12_STENCIL_OP frontStencilPassOp,
+                                        D3D12_COMPARISON_FUNC frontStencilFunc, D3D12_STENCIL_OP backStencilFailOp,
+                                        D3D12_STENCIL_OP backStencilDepthFailOp, D3D12_STENCIL_OP backStencilPassOp,
+                                        D3D12_COMPARISON_FUNC backStencilFunc) noexcept {
+        FrontFace.StencilFailOp = frontStencilFailOp; // per-face stencil operations
+        FrontFace.StencilDepthFailOp = frontStencilDepthFailOp;
+        FrontFace.StencilPassOp = frontStencilPassOp;
+        FrontFace.StencilFunc = frontStencilFunc;
+        BackFace.StencilFailOp = backStencilFailOp;
+        BackFace.StencilDepthFailOp = backStencilDepthFailOp;
+        BackFace.StencilPassOp = backStencilPassOp;
+        BackFace.StencilFunc = backStencilFunc;
+        StencilEnable = stencilEnable; // stencil test switches and masks
+        StencilReadMask = stencilReadMask;
+        StencilWriteMask = stencilWriteMask;
+        DepthEnable = depthEnable; // depth test
+        DepthWriteMask = depthWriteMask;
+        DepthFunc = depthFunc;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_DEPTH_STENCIL_DESC1 : public D3D12_DEPTH_STENCIL_DESC1 {
+    CD3DX12_DEPTH_STENCIL_DESC1() = default;
+    explicit CD3DX12_DEPTH_STENCIL_DESC1(const D3D12_DEPTH_STENCIL_DESC1 &o) noexcept : D3D12_DEPTH_STENCIL_DESC1(o) {}
+    explicit CD3DX12_DEPTH_STENCIL_DESC1(const D3D12_DEPTH_STENCIL_DESC &o) noexcept { // upgrade from the older desc
+        DepthBoundsTestEnable = FALSE; // the older desc has no depth-bounds member to copy
+        FrontFace.StencilFailOp = o.FrontFace.StencilFailOp;
+        FrontFace.StencilDepthFailOp = o.FrontFace.StencilDepthFailOp;
+        FrontFace.StencilPassOp = o.FrontFace.StencilPassOp;
+        FrontFace.StencilFunc = o.FrontFace.StencilFunc;
+        BackFace.StencilFailOp = o.BackFace.StencilFailOp;
+        BackFace.StencilDepthFailOp = o.BackFace.StencilDepthFailOp;
+        BackFace.StencilPassOp = o.BackFace.StencilPassOp;
+        BackFace.StencilFunc = o.BackFace.StencilFunc;
+        StencilEnable = o.StencilEnable;
+        StencilReadMask = o.StencilReadMask;
+        StencilWriteMask = o.StencilWriteMask;
+        DepthEnable = o.DepthEnable;
+        DepthWriteMask = o.DepthWriteMask;
+        DepthFunc = o.DepthFunc;
+    }
+    explicit CD3DX12_DEPTH_STENCIL_DESC1(CD3DX12_DEFAULT) noexcept {
+        // Defaults: depth test enabled (LESS, all writes), stencil and depth-bounds off, faces keep/keep/keep + ALWAYS.
+        const D3D12_DEPTH_STENCILOP_DESC keepAlways = {D3D12_STENCIL_OP_KEEP, D3D12_STENCIL_OP_KEEP,
+                                                       D3D12_STENCIL_OP_KEEP, D3D12_COMPARISON_FUNC_ALWAYS};
+        FrontFace = BackFace = keepAlways;
+        DepthBoundsTestEnable = FALSE;
+        DepthEnable = TRUE;
+        DepthWriteMask = D3D12_DEPTH_WRITE_MASK_ALL;
+        DepthFunc = D3D12_COMPARISON_FUNC_LESS;
+        StencilEnable = FALSE;
+        StencilReadMask = D3D12_DEFAULT_STENCIL_READ_MASK;
+        StencilWriteMask = D3D12_DEFAULT_STENCIL_WRITE_MASK;
+    }
+    explicit CD3DX12_DEPTH_STENCIL_DESC1(BOOL depthEnable, D3D12_DEPTH_WRITE_MASK depthWriteMask,
+                                         D3D12_COMPARISON_FUNC depthFunc, BOOL stencilEnable, UINT8 stencilReadMask,
+                                         UINT8 stencilWriteMask, D3D12_STENCIL_OP frontStencilFailOp,
+                                         D3D12_STENCIL_OP frontStencilDepthFailOp, D3D12_STENCIL_OP frontStencilPassOp,
+                                         D3D12_COMPARISON_FUNC frontStencilFunc, D3D12_STENCIL_OP backStencilFailOp,
+                                         D3D12_STENCIL_OP backStencilDepthFailOp, D3D12_STENCIL_OP backStencilPassOp,
+                                         D3D12_COMPARISON_FUNC backStencilFunc, BOOL depthBoundsTestEnable) noexcept {
+        DepthBoundsTestEnable = depthBoundsTestEnable;
+        FrontFace.StencilFailOp = frontStencilFailOp; // per-face stencil operations
+        FrontFace.StencilDepthFailOp = frontStencilDepthFailOp;
+        FrontFace.StencilPassOp = frontStencilPassOp;
+        FrontFace.StencilFunc = frontStencilFunc;
+        BackFace.StencilFailOp = backStencilFailOp;
+        BackFace.StencilDepthFailOp = backStencilDepthFailOp;
+        BackFace.StencilPassOp = backStencilPassOp;
+        BackFace.StencilFunc = backStencilFunc;
+        StencilEnable = stencilEnable; // stencil test switches and masks
+        StencilReadMask = stencilReadMask;
+        StencilWriteMask = stencilWriteMask;
+        DepthEnable = depthEnable; // depth test
+        DepthWriteMask = depthWriteMask;
+        DepthFunc = depthFunc;
+    }
+    operator D3D12_DEPTH_STENCIL_DESC() const noexcept { // downgrade: DepthBoundsTestEnable is not carried over
+        D3D12_DEPTH_STENCIL_DESC legacy;
+        legacy.FrontFace.StencilFailOp = FrontFace.StencilFailOp;
+        legacy.FrontFace.StencilDepthFailOp = FrontFace.StencilDepthFailOp;
+        legacy.FrontFace.StencilPassOp = FrontFace.StencilPassOp;
+        legacy.FrontFace.StencilFunc = FrontFace.StencilFunc;
+        legacy.BackFace.StencilFailOp = BackFace.StencilFailOp;
+        legacy.BackFace.StencilDepthFailOp = BackFace.StencilDepthFailOp;
+        legacy.BackFace.StencilPassOp = BackFace.StencilPassOp;
+        legacy.BackFace.StencilFunc = BackFace.StencilFunc;
+        legacy.StencilEnable = StencilEnable;
+        legacy.StencilReadMask = StencilReadMask;
+        legacy.StencilWriteMask = StencilWriteMask;
+        legacy.DepthEnable = DepthEnable;
+        legacy.DepthWriteMask = DepthWriteMask;
+        legacy.DepthFunc = DepthFunc;
+        return legacy;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_BLEND_DESC : public D3D12_BLEND_DESC {
+    CD3DX12_BLEND_DESC() = default;
+    explicit CD3DX12_BLEND_DESC(const D3D12_BLEND_DESC &o) noexcept : D3D12_BLEND_DESC(o) {}
+    explicit CD3DX12_BLEND_DESC(CD3DX12_DEFAULT) noexcept {
+        AlphaToCoverageEnable = FALSE;
+        IndependentBlendEnable = FALSE;
+
+        // Positional initializer; NOTE(review): member order comes from d3d12.h, which is not visible here.
+        const D3D12_RENDER_TARGET_BLEND_DESC perTargetDefault = {
+            FALSE,
+            FALSE,
+            D3D12_BLEND_ONE, D3D12_BLEND_ZERO, D3D12_BLEND_OP_ADD,
+            D3D12_BLEND_ONE, D3D12_BLEND_ZERO, D3D12_BLEND_OP_ADD,
+            D3D12_LOGIC_OP_NOOP,
+            D3D12_COLOR_WRITE_ENABLE_ALL,
+        };
+        // Give every render-target slot the same description.
+        for (UINT slot = 0; slot < D3D12_SIMULTANEOUS_RENDER_TARGET_COUNT; ++slot) {
+            RenderTarget[slot] = perTargetDefault;
+        }
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_RASTERIZER_DESC : public D3D12_RASTERIZER_DESC {
+    CD3DX12_RASTERIZER_DESC() = default;
+    explicit CD3DX12_RASTERIZER_DESC(const D3D12_RASTERIZER_DESC &o) noexcept : D3D12_RASTERIZER_DESC(o) {}
+    explicit CD3DX12_RASTERIZER_DESC(CD3DX12_DEFAULT) noexcept { // solid fill, back-face culling, SDK default biases
+        ConservativeRaster = D3D12_CONSERVATIVE_RASTERIZATION_MODE_OFF;
+        ForcedSampleCount = 0;
+        AntialiasedLineEnable = FALSE;
+        MultisampleEnable = FALSE;
+        DepthClipEnable = TRUE;
+        SlopeScaledDepthBias = D3D12_DEFAULT_SLOPE_SCALED_DEPTH_BIAS;
+        DepthBiasClamp = D3D12_DEFAULT_DEPTH_BIAS_CLAMP;
+        DepthBias = D3D12_DEFAULT_DEPTH_BIAS;
+        FrontCounterClockwise = FALSE;
+        CullMode = D3D12_CULL_MODE_BACK;
+        FillMode = D3D12_FILL_MODE_SOLID;
+    }
+    explicit CD3DX12_RASTERIZER_DESC(D3D12_FILL_MODE fillMode, D3D12_CULL_MODE cullMode, BOOL frontCounterClockwise,
+                                     INT depthBias, FLOAT depthBiasClamp, FLOAT slopeScaledDepthBias,
+                                     BOOL depthClipEnable, BOOL multisampleEnable, BOOL antialiasedLineEnable,
+                                     UINT forcedSampleCount,
+                                     D3D12_CONSERVATIVE_RASTERIZATION_MODE conservativeRaster) noexcept {
+        ConservativeRaster = conservativeRaster; // each argument is copied to the member of the same name
+        ForcedSampleCount = forcedSampleCount;
+        AntialiasedLineEnable = antialiasedLineEnable;
+        MultisampleEnable = multisampleEnable;
+        DepthClipEnable = depthClipEnable;
+        SlopeScaledDepthBias = slopeScaledDepthBias;
+        DepthBiasClamp = depthBiasClamp;
+        DepthBias = depthBias;
+        FrontCounterClockwise = frontCounterClockwise;
+        CullMode = cullMode;
+        FillMode = fillMode;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_RESOURCE_ALLOCATION_INFO : public D3D12_RESOURCE_ALLOCATION_INFO {  // Constructor helpers over D3D12_RESOURCE_ALLOCATION_INFO; no extra members.
+    CD3DX12_RESOURCE_ALLOCATION_INFO() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_RESOURCE_ALLOCATION_INFO(const D3D12_RESOURCE_ALLOCATION_INFO &o) noexcept  // Copies all fields from an existing plain struct.
+        : D3D12_RESOURCE_ALLOCATION_INFO(o) {}
+    CD3DX12_RESOURCE_ALLOCATION_INFO(UINT64 size, UINT64 alignment) noexcept {  // Stores size into SizeInBytes and alignment into Alignment, unvalidated.
+        SizeInBytes = size;
+        Alignment = alignment;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_HEAP_PROPERTIES : public D3D12_HEAP_PROPERTIES {  // Constructor helpers plus a CPU-accessibility query; declares no data members of its own.
+    CD3DX12_HEAP_PROPERTIES() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_HEAP_PROPERTIES(const D3D12_HEAP_PROPERTIES &o) noexcept : D3D12_HEAP_PROPERTIES(o) {}  // Copies all fields from an existing plain struct.
+    CD3DX12_HEAP_PROPERTIES(D3D12_CPU_PAGE_PROPERTY cpuPageProperty, D3D12_MEMORY_POOL memoryPoolPreference,  // Custom-heap form: caller picks page property and memory pool.
+                            UINT creationNodeMask = 1, UINT nodeMask = 1)
+    noexcept {
+        Type = D3D12_HEAP_TYPE_CUSTOM;  // Always CUSTOM for this overload.
+        CPUPageProperty = cpuPageProperty;
+        MemoryPoolPreference = memoryPoolPreference;
+        CreationNodeMask = creationNodeMask;
+        VisibleNodeMask = nodeMask;
+    }
+    explicit CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE type, UINT creationNodeMask = 1, UINT nodeMask = 1) noexcept {  // Typed-heap form: only the heap type is chosen by the caller.
+        Type = type;
+        CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;  // Page property and pool are set to their UNKNOWN values here.
+        MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
+        CreationNodeMask = creationNodeMask;
+        VisibleNodeMask = nodeMask;
+    }
+    bool IsCPUAccessible() const noexcept {  // True for UPLOAD or READBACK heaps, or CUSTOM heaps with a WRITE_COMBINE / WRITE_BACK page property.
+        return Type == D3D12_HEAP_TYPE_UPLOAD || Type == D3D12_HEAP_TYPE_READBACK ||
+               (Type == D3D12_HEAP_TYPE_CUSTOM && (CPUPageProperty == D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE ||
+                                                   CPUPageProperty == D3D12_CPU_PAGE_PROPERTY_WRITE_BACK));
+    }
+};
+inline bool operator==(const D3D12_HEAP_PROPERTIES &l, const D3D12_HEAP_PROPERTIES &r) noexcept {  // Equal when Type, CPUPageProperty, MemoryPoolPreference and both node masks all match.
+    return l.Type == r.Type && l.CPUPageProperty == r.CPUPageProperty &&
+           l.MemoryPoolPreference == r.MemoryPoolPreference && l.CreationNodeMask == r.CreationNodeMask &&
+           l.VisibleNodeMask == r.VisibleNodeMask;
+}
+inline bool operator!=(const D3D12_HEAP_PROPERTIES &l, const D3D12_HEAP_PROPERTIES &r) noexcept { return !(l == r); }  // Defined as the negation of operator== above.
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_HEAP_DESC : public D3D12_HEAP_DESC {  // Constructor helpers over D3D12_HEAP_DESC: size/alignment given directly or taken from allocation info.
+    CD3DX12_HEAP_DESC() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_HEAP_DESC(const D3D12_HEAP_DESC &o) noexcept : D3D12_HEAP_DESC(o) {}  // Copies all fields from an existing plain desc.
+    CD3DX12_HEAP_DESC(UINT64 size, D3D12_HEAP_PROPERTIES properties, UINT64 alignment = 0,  // Explicit size + caller-built heap properties.
+                      D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE)
+    noexcept {
+        SizeInBytes = size;
+        Properties = properties;
+        Alignment = alignment;
+        Flags = flags;
+    }
+    CD3DX12_HEAP_DESC(UINT64 size, D3D12_HEAP_TYPE type, UINT64 alignment = 0,  // Explicit size + heap type; properties built with default node masks.
+                      D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE)
+    noexcept {
+        SizeInBytes = size;
+        Properties = CD3DX12_HEAP_PROPERTIES(type);
+        Alignment = alignment;
+        Flags = flags;
+    }
+    CD3DX12_HEAP_DESC(UINT64 size, D3D12_CPU_PAGE_PROPERTY cpuPageProperty, D3D12_MEMORY_POOL memoryPoolPreference,  // Explicit size + custom-heap properties.
+                      UINT64 alignment = 0, D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE)
+    noexcept {
+        SizeInBytes = size;
+        Properties = CD3DX12_HEAP_PROPERTIES(cpuPageProperty, memoryPoolPreference);
+        Alignment = alignment;
+        Flags = flags;
+    }
+    CD3DX12_HEAP_DESC(const D3D12_RESOURCE_ALLOCATION_INFO &resAllocInfo, D3D12_HEAP_PROPERTIES properties,  // Size and alignment taken from resAllocInfo; caller-built properties.
+                      D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE)
+    noexcept {
+        SizeInBytes = resAllocInfo.SizeInBytes;
+        Properties = properties;
+        Alignment = resAllocInfo.Alignment;
+        Flags = flags;
+    }
+    CD3DX12_HEAP_DESC(const D3D12_RESOURCE_ALLOCATION_INFO &resAllocInfo, D3D12_HEAP_TYPE type,  // Size and alignment taken from resAllocInfo; properties from heap type.
+                      D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE)
+    noexcept {
+        SizeInBytes = resAllocInfo.SizeInBytes;
+        Properties = CD3DX12_HEAP_PROPERTIES(type);
+        Alignment = resAllocInfo.Alignment;
+        Flags = flags;
+    }
+    CD3DX12_HEAP_DESC(const D3D12_RESOURCE_ALLOCATION_INFO &resAllocInfo, D3D12_CPU_PAGE_PROPERTY cpuPageProperty,  // Size and alignment taken from resAllocInfo; custom-heap properties.
+                      D3D12_MEMORY_POOL memoryPoolPreference, D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE)
+    noexcept {
+        SizeInBytes = resAllocInfo.SizeInBytes;
+        Properties = CD3DX12_HEAP_PROPERTIES(cpuPageProperty, memoryPoolPreference);
+        Alignment = resAllocInfo.Alignment;
+        Flags = flags;
+    }
+    bool IsCPUAccessible() const noexcept {  // Forwards to CD3DX12_HEAP_PROPERTIES::IsCPUAccessible() on the Properties member.
+        return static_cast<const CD3DX12_HEAP_PROPERTIES *>(&Properties)->IsCPUAccessible();  // NOTE(review): base-to-derived cast of a plain member; relies on the wrapper adding no data members - confirm acceptable.
+    }
+};
+inline bool operator==(const D3D12_HEAP_DESC &l, const D3D12_HEAP_DESC &r) noexcept {  // Equal when SizeInBytes, Properties, Alignment and Flags all match.
+    return l.SizeInBytes == r.SizeInBytes && l.Properties == r.Properties && l.Alignment == r.Alignment &&  // Properties comparison uses operator== on D3D12_HEAP_PROPERTIES.
+           l.Flags == r.Flags;
+}
+inline bool operator!=(const D3D12_HEAP_DESC &l, const D3D12_HEAP_DESC &r) noexcept { return !(l == r); }  // Defined as the negation of operator== above.
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_CLEAR_VALUE : public D3D12_CLEAR_VALUE {  // Constructor helpers over D3D12_CLEAR_VALUE for color and depth/stencil clears.
+    CD3DX12_CLEAR_VALUE() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_CLEAR_VALUE(const D3D12_CLEAR_VALUE &o) noexcept : D3D12_CLEAR_VALUE(o) {}  // Copies all fields from an existing plain struct.
+    CD3DX12_CLEAR_VALUE(DXGI_FORMAT format, const FLOAT color[4]) noexcept {  // Color form: color is read through a raw pointer with no null check.
+        Format = format;
+        memcpy(Color, color, sizeof(Color));  // Copies sizeof(Color) bytes, so the caller's array must provide at least that many.
+    }
+    CD3DX12_CLEAR_VALUE(DXGI_FORMAT format, FLOAT depth, UINT8 stencil) noexcept {  // Depth/stencil form.
+        Format = format;
+        memset(&Color, 0, sizeof(Color));  // Zero Color before writing DepthStencil; presumably the two share storage - confirm against d3d12.h.
+        /* Use memcpy to preserve NAN values */
+        memcpy(&DepthStencil.Depth, &depth, sizeof(depth));
+        DepthStencil.Stencil = stencil;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_RANGE : public D3D12_RANGE {  // Constructor helpers over D3D12_RANGE (SIZE_T Begin/End pair).
+    CD3DX12_RANGE() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_RANGE(const D3D12_RANGE &o) noexcept : D3D12_RANGE(o) {}  // Copies both fields from an existing plain range.
+    CD3DX12_RANGE(SIZE_T begin, SIZE_T end) noexcept {  // Stores the bounds as given; no check that begin <= end.
+        Begin = begin;
+        End = end;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_RANGE_UINT64 : public D3D12_RANGE_UINT64 {  // Constructor helpers over D3D12_RANGE_UINT64 (UINT64 Begin/End pair).
+    CD3DX12_RANGE_UINT64() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_RANGE_UINT64(const D3D12_RANGE_UINT64 &o) noexcept : D3D12_RANGE_UINT64(o) {}  // Copies both fields from an existing plain range.
+    CD3DX12_RANGE_UINT64(UINT64 begin, UINT64 end) noexcept {  // Stores the bounds as given; no check that begin <= end.
+        Begin = begin;
+        End = end;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_SUBRESOURCE_RANGE_UINT64 : public D3D12_SUBRESOURCE_RANGE_UINT64 {  // Constructor helpers pairing a subresource index with a UINT64 range.
+    CD3DX12_SUBRESOURCE_RANGE_UINT64() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_SUBRESOURCE_RANGE_UINT64(const D3D12_SUBRESOURCE_RANGE_UINT64 &o) noexcept  // Copies all fields from an existing plain struct.
+        : D3D12_SUBRESOURCE_RANGE_UINT64(o) {}
+    CD3DX12_SUBRESOURCE_RANGE_UINT64(UINT subresource, const D3D12_RANGE_UINT64 &range) noexcept {  // Range supplied as a ready-made struct (copied by value).
+        Subresource = subresource;
+        Range = range;
+    }
+    CD3DX12_SUBRESOURCE_RANGE_UINT64(UINT subresource, UINT64 begin, UINT64 end) noexcept {  // Range supplied as separate begin/end values.
+        Subresource = subresource;
+        Range.Begin = begin;
+        Range.End = end;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_SHADER_BYTECODE : public D3D12_SHADER_BYTECODE {  // Constructor helpers over D3D12_SHADER_BYTECODE; stores a pointer and length, never copies the bytes.
+    CD3DX12_SHADER_BYTECODE() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_SHADER_BYTECODE(const D3D12_SHADER_BYTECODE &o) noexcept : D3D12_SHADER_BYTECODE(o) {}  // Copies pointer and length from an existing plain struct.
+    CD3DX12_SHADER_BYTECODE(_In_ ID3DBlob *pShaderBlob) noexcept {  // Non-explicit, so an ID3DBlob* converts implicitly; pShaderBlob is dereferenced without a null check.
+        pShaderBytecode = pShaderBlob->GetBufferPointer();  // Points into the blob's buffer; the blob must outlive any use of this struct.
+        BytecodeLength = pShaderBlob->GetBufferSize();
+    }
+    CD3DX12_SHADER_BYTECODE(const void *_pShaderBytecode, SIZE_T bytecodeLength) noexcept {  // Raw pointer + length form; values stored as given.
+        pShaderBytecode = _pShaderBytecode;
+        BytecodeLength = bytecodeLength;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_TILED_RESOURCE_COORDINATE : public D3D12_TILED_RESOURCE_COORDINATE {  // Constructor helpers over D3D12_TILED_RESOURCE_COORDINATE.
+    CD3DX12_TILED_RESOURCE_COORDINATE() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_TILED_RESOURCE_COORDINATE(const D3D12_TILED_RESOURCE_COORDINATE &o) noexcept  // Copies all fields from an existing plain struct.
+        : D3D12_TILED_RESOURCE_COORDINATE(o) {}
+    CD3DX12_TILED_RESOURCE_COORDINATE(UINT x, UINT y, UINT z, UINT subresource) noexcept {  // Stores X, Y, Z and Subresource from the same-named arguments.
+        X = x;
+        Y = y;
+        Z = z;
+        Subresource = subresource;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_TILE_REGION_SIZE : public D3D12_TILE_REGION_SIZE {  // Constructor helpers over D3D12_TILE_REGION_SIZE.
+    CD3DX12_TILE_REGION_SIZE() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_TILE_REGION_SIZE(const D3D12_TILE_REGION_SIZE &o) noexcept : D3D12_TILE_REGION_SIZE(o) {}  // Copies all fields from an existing plain struct.
+    CD3DX12_TILE_REGION_SIZE(UINT numTiles, BOOL useBox, UINT width, UINT16 height, UINT16 depth) noexcept {  // All five fields stored as given; no consistency check between numTiles and the box extents.
+        NumTiles = numTiles;
+        UseBox = useBox;
+        Width = width;
+        Height = height;
+        Depth = depth;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_SUBRESOURCE_TILING : public D3D12_SUBRESOURCE_TILING {  // Constructor helpers over D3D12_SUBRESOURCE_TILING.
+    CD3DX12_SUBRESOURCE_TILING() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_SUBRESOURCE_TILING(const D3D12_SUBRESOURCE_TILING &o) noexcept : D3D12_SUBRESOURCE_TILING(o) {}  // Copies all fields from an existing plain struct.
+    CD3DX12_SUBRESOURCE_TILING(UINT widthInTiles, UINT16 heightInTiles, UINT16 depthInTiles,  // Stores the tile extents and start tile index from the same-named arguments.
+                               UINT startTileIndexInOverallResource)
+    noexcept {
+        WidthInTiles = widthInTiles;
+        HeightInTiles = heightInTiles;
+        DepthInTiles = depthInTiles;
+        StartTileIndexInOverallResource = startTileIndexInOverallResource;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_TILE_SHAPE : public D3D12_TILE_SHAPE {  // Constructor helpers over D3D12_TILE_SHAPE.
+    CD3DX12_TILE_SHAPE() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_TILE_SHAPE(const D3D12_TILE_SHAPE &o) noexcept : D3D12_TILE_SHAPE(o) {}  // Copies all fields from an existing plain struct.
+    CD3DX12_TILE_SHAPE(UINT widthInTexels, UINT heightInTexels, UINT depthInTexels) noexcept {  // Stores the three texel extents from the same-named arguments.
+        WidthInTexels = widthInTexels;
+        HeightInTexels = heightInTexels;
+        DepthInTexels = depthInTexels;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_RESOURCE_BARRIER : public D3D12_RESOURCE_BARRIER {  // Static factories that build Transition, Aliasing and UAV barriers by value.
+    CD3DX12_RESOURCE_BARRIER() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_RESOURCE_BARRIER(const D3D12_RESOURCE_BARRIER &o) noexcept : D3D12_RESOURCE_BARRIER(o) {}  // Copies all fields from an existing plain barrier.
+    static inline CD3DX12_RESOURCE_BARRIER  // Builds a TRANSITION barrier; subresource defaults to all subresources, flags to NONE.
+    Transition(_In_ ID3D12Resource *pResource, D3D12_RESOURCE_STATES stateBefore, D3D12_RESOURCE_STATES stateAfter,
+               UINT subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES,
+               D3D12_RESOURCE_BARRIER_FLAGS flags = D3D12_RESOURCE_BARRIER_FLAG_NONE) noexcept {
+        CD3DX12_RESOURCE_BARRIER result = {};  // Start from an empty-brace-initialized barrier.
+        D3D12_RESOURCE_BARRIER &barrier = result;  // Base-type alias of result, used to fill in the Transition member.
+        result.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
+        result.Flags = flags;
+        barrier.Transition.pResource = pResource;  // Pointer stored as given; no AddRef is made here.
+        barrier.Transition.StateBefore = stateBefore;
+        barrier.Transition.StateAfter = stateAfter;
+        barrier.Transition.Subresource = subresource;
+        return result;
+    }
+    static inline CD3DX12_RESOURCE_BARRIER Aliasing(_In_ ID3D12Resource *pResourceBefore,  // Builds an ALIASING barrier; Flags is not assigned and keeps its {}-initialized value.
+                                                    _In_ ID3D12Resource *pResourceAfter) noexcept {
+        CD3DX12_RESOURCE_BARRIER result = {};
+        D3D12_RESOURCE_BARRIER &barrier = result;  // Base-type alias of result, used to fill in the Aliasing member.
+        result.Type = D3D12_RESOURCE_BARRIER_TYPE_ALIASING;
+        barrier.Aliasing.pResourceBefore = pResourceBefore;
+        barrier.Aliasing.pResourceAfter = pResourceAfter;
+        return result;
+    }
+    static inline CD3DX12_RESOURCE_BARRIER UAV(_In_ ID3D12Resource *pResource) noexcept {  // Builds a UAV barrier; Flags is not assigned and keeps its {}-initialized value.
+        CD3DX12_RESOURCE_BARRIER result = {};
+        D3D12_RESOURCE_BARRIER &barrier = result;  // Base-type alias of result, used to fill in the UAV member.
+        result.Type = D3D12_RESOURCE_BARRIER_TYPE_UAV;
+        barrier.UAV.pResource = pResource;
+        return result;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_PACKED_MIP_INFO : public D3D12_PACKED_MIP_INFO {  // Constructor helpers over D3D12_PACKED_MIP_INFO.
+    CD3DX12_PACKED_MIP_INFO() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_PACKED_MIP_INFO(const D3D12_PACKED_MIP_INFO &o) noexcept : D3D12_PACKED_MIP_INFO(o) {}  // Copies all fields from an existing plain struct.
+    CD3DX12_PACKED_MIP_INFO(UINT8 numStandardMips, UINT8 numPackedMips, UINT numTilesForPackedMips,  // Stores the mip counts, tile count and start tile index from the same-named arguments.
+                            UINT startTileIndexInOverallResource)
+    noexcept {
+        NumStandardMips = numStandardMips;
+        NumPackedMips = numPackedMips;
+        NumTilesForPackedMips = numTilesForPackedMips;
+        StartTileIndexInOverallResource = startTileIndexInOverallResource;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_SUBRESOURCE_FOOTPRINT : public D3D12_SUBRESOURCE_FOOTPRINT {  // Constructor helpers over D3D12_SUBRESOURCE_FOOTPRINT.
+    CD3DX12_SUBRESOURCE_FOOTPRINT() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_SUBRESOURCE_FOOTPRINT(const D3D12_SUBRESOURCE_FOOTPRINT &o) noexcept  // Copies all fields from an existing plain struct.
+        : D3D12_SUBRESOURCE_FOOTPRINT(o) {}
+    CD3DX12_SUBRESOURCE_FOOTPRINT(DXGI_FORMAT format, UINT width, UINT height, UINT depth, UINT rowPitch) noexcept {  // Fully explicit form; values stored as given.
+        Format = format;
+        Width = width;
+        Height = height;
+        Depth = depth;
+        RowPitch = rowPitch;
+    }
+    explicit CD3DX12_SUBRESOURCE_FOOTPRINT(const D3D12_RESOURCE_DESC &resDesc, UINT rowPitch) noexcept {  // Derives format and extents from a resource desc; rowPitch is still caller-supplied.
+        Format = resDesc.Format;
+        Width = UINT(resDesc.Width);  // Explicit conversion of the desc's width to UINT.
+        Height = resDesc.Height;
+        Depth = (resDesc.Dimension == D3D12_RESOURCE_DIMENSION_TEXTURE3D ? resDesc.DepthOrArraySize : 1);  // DepthOrArraySize is used as depth only for 3D textures; otherwise depth is 1.
+        RowPitch = rowPitch;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_TEXTURE_COPY_LOCATION : public D3D12_TEXTURE_COPY_LOCATION {  // Constructor helpers for copy locations addressed by subresource index or by placed footprint.
+    CD3DX12_TEXTURE_COPY_LOCATION() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_TEXTURE_COPY_LOCATION(const D3D12_TEXTURE_COPY_LOCATION &o) noexcept  // Copies all fields from an existing plain struct.
+        : D3D12_TEXTURE_COPY_LOCATION(o) {}
+    CD3DX12_TEXTURE_COPY_LOCATION(_In_ ID3D12Resource *pRes) noexcept {  // Resource-only form: SUBRESOURCE_INDEX type with PlacedFootprint reset to {}.
+        pResource = pRes;
+        Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
+        PlacedFootprint = {};
+    }
+    CD3DX12_TEXTURE_COPY_LOCATION(_In_ ID3D12Resource *pRes, D3D12_PLACED_SUBRESOURCE_FOOTPRINT const &Footprint)  // Placed-footprint form: Footprint is copied by value.
+    noexcept {
+        pResource = pRes;
+        Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
+        PlacedFootprint = Footprint;
+    }
+    CD3DX12_TEXTURE_COPY_LOCATION(_In_ ID3D12Resource *pRes, UINT Sub) noexcept {  // Subresource-index form.
+        pResource = pRes;
+        Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
+        PlacedFootprint = {};  // Reset first, then SubresourceIndex is written; order matters if the two fields share storage - confirm against d3d12.h.
+        SubresourceIndex = Sub;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_DESCRIPTOR_RANGE : public D3D12_DESCRIPTOR_RANGE {  // Constructor and Init helpers over D3D12_DESCRIPTOR_RANGE.
+    CD3DX12_DESCRIPTOR_RANGE() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_DESCRIPTOR_RANGE(const D3D12_DESCRIPTOR_RANGE &o) noexcept : D3D12_DESCRIPTOR_RANGE(o) {}  // Copies all fields from an existing plain range.
+    CD3DX12_DESCRIPTOR_RANGE(D3D12_DESCRIPTOR_RANGE_TYPE rangeType, UINT numDescriptors, UINT baseShaderRegister,  // Same parameters and defaults as Init(); forwards to the member Init.
+                             UINT registerSpace = 0,
+                             UINT offsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND)
+    noexcept {
+        Init(rangeType, numDescriptors, baseShaderRegister, registerSpace, offsetInDescriptorsFromTableStart);
+    }
+
+    inline void Init(D3D12_DESCRIPTOR_RANGE_TYPE rangeType, UINT numDescriptors, UINT baseShaderRegister,  // Member form: (re)initializes *this via the static Init.
+                     UINT registerSpace = 0,
+                     UINT offsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND) noexcept {
+        Init(*this, rangeType, numDescriptors, baseShaderRegister, registerSpace, offsetInDescriptorsFromTableStart);
+    }
+
+    static inline void Init(_Out_ D3D12_DESCRIPTOR_RANGE &range, D3D12_DESCRIPTOR_RANGE_TYPE rangeType,  // Static form: writes all five fields of any plain D3D12_DESCRIPTOR_RANGE.
+                            UINT numDescriptors, UINT baseShaderRegister, UINT registerSpace = 0,
+                            UINT offsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND) noexcept {
+        range.RangeType = rangeType;
+        range.NumDescriptors = numDescriptors;
+        range.BaseShaderRegister = baseShaderRegister;
+        range.RegisterSpace = registerSpace;
+        range.OffsetInDescriptorsFromTableStart = offsetInDescriptorsFromTableStart;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_ROOT_DESCRIPTOR_TABLE : public D3D12_ROOT_DESCRIPTOR_TABLE {  // Constructor and Init helpers over D3D12_ROOT_DESCRIPTOR_TABLE.
+    CD3DX12_ROOT_DESCRIPTOR_TABLE() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_ROOT_DESCRIPTOR_TABLE(const D3D12_ROOT_DESCRIPTOR_TABLE &o) noexcept  // Copies count and pointer from an existing plain table.
+        : D3D12_ROOT_DESCRIPTOR_TABLE(o) {}
+    CD3DX12_ROOT_DESCRIPTOR_TABLE(UINT numDescriptorRanges,  // Forwards to the member Init.
+                                  _In_reads_opt_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE *_pDescriptorRanges)
+    noexcept {
+        Init(numDescriptorRanges, _pDescriptorRanges);
+    }
+
+    inline void Init(UINT numDescriptorRanges,  // Member form: (re)initializes *this via the static Init.
+                     _In_reads_opt_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE *_pDescriptorRanges) noexcept {
+        Init(*this, numDescriptorRanges, _pDescriptorRanges);
+    }
+
+    static inline void Init(_Out_ D3D12_ROOT_DESCRIPTOR_TABLE &rootDescriptorTable, UINT numDescriptorRanges,  // Static form: writes both fields of any plain table.
+                            _In_reads_opt_(numDescriptorRanges)
+                                const D3D12_DESCRIPTOR_RANGE *_pDescriptorRanges) noexcept {
+        rootDescriptorTable.NumDescriptorRanges = numDescriptorRanges;
+        rootDescriptorTable.pDescriptorRanges = _pDescriptorRanges;  // Only the pointer is stored; the range array is not copied and must outlive the table.
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_ROOT_CONSTANTS : public D3D12_ROOT_CONSTANTS {  // Constructor and Init helpers over D3D12_ROOT_CONSTANTS.
+    CD3DX12_ROOT_CONSTANTS() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_ROOT_CONSTANTS(const D3D12_ROOT_CONSTANTS &o) noexcept : D3D12_ROOT_CONSTANTS(o) {}  // Copies all fields from an existing plain struct.
+    CD3DX12_ROOT_CONSTANTS(UINT num32BitValues, UINT shaderRegister, UINT registerSpace = 0) noexcept {  // Forwards to the member Init.
+        Init(num32BitValues, shaderRegister, registerSpace);
+    }
+
+    inline void Init(UINT num32BitValues, UINT shaderRegister, UINT registerSpace = 0) noexcept {  // Member form: (re)initializes *this via the static Init.
+        Init(*this, num32BitValues, shaderRegister, registerSpace);
+    }
+
+    static inline void Init(_Out_ D3D12_ROOT_CONSTANTS &rootConstants, UINT num32BitValues, UINT shaderRegister,  // Static form: writes all three fields of any plain D3D12_ROOT_CONSTANTS.
+                            UINT registerSpace = 0) noexcept {
+        rootConstants.Num32BitValues = num32BitValues;
+        rootConstants.ShaderRegister = shaderRegister;
+        rootConstants.RegisterSpace = registerSpace;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_ROOT_DESCRIPTOR : public D3D12_ROOT_DESCRIPTOR {  // Constructor and Init helpers over D3D12_ROOT_DESCRIPTOR.
+    CD3DX12_ROOT_DESCRIPTOR() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_ROOT_DESCRIPTOR(const D3D12_ROOT_DESCRIPTOR &o) noexcept : D3D12_ROOT_DESCRIPTOR(o) {}  // Copies all fields from an existing plain struct.
+    CD3DX12_ROOT_DESCRIPTOR(UINT shaderRegister, UINT registerSpace = 0) noexcept {  // Non-explicit, so a bare UINT converts implicitly; forwards to the member Init.
+        Init(shaderRegister, registerSpace);
+    }
+
+    inline void Init(UINT shaderRegister, UINT registerSpace = 0) noexcept {  // Member form: (re)initializes *this via the static Init.
+        Init(*this, shaderRegister, registerSpace);
+    }
+
+    static inline void Init(_Out_ D3D12_ROOT_DESCRIPTOR &table, UINT shaderRegister, UINT registerSpace = 0) noexcept {  // Static form; the parameter is named `table` but is a root descriptor.
+        table.ShaderRegister = shaderRegister;
+        table.RegisterSpace = registerSpace;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_ROOT_PARAMETER : public D3D12_ROOT_PARAMETER {  // InitAs* helpers that set ParameterType, ShaderVisibility and the matching payload member together.
+    CD3DX12_ROOT_PARAMETER() = default;  // Defaulted: assigns no field values itself; call one of the InitAs* members afterwards.
+    explicit CD3DX12_ROOT_PARAMETER(const D3D12_ROOT_PARAMETER &o) noexcept : D3D12_ROOT_PARAMETER(o) {}  // Copies all fields from an existing plain parameter.
+
+    static inline void  // Static form: makes rootParam a DESCRIPTOR_TABLE parameter.
+    InitAsDescriptorTable(_Out_ D3D12_ROOT_PARAMETER &rootParam, UINT numDescriptorRanges,
+                          _In_reads_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE *pDescriptorRanges,
+                          D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept {
+        rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
+        rootParam.ShaderVisibility = visibility;
+        CD3DX12_ROOT_DESCRIPTOR_TABLE::Init(rootParam.DescriptorTable, numDescriptorRanges, pDescriptorRanges);  // Stores the range pointer only; the array is not copied.
+    }
+
+    static inline void InitAsConstants(_Out_ D3D12_ROOT_PARAMETER &rootParam, UINT num32BitValues, UINT shaderRegister,  // Static form: makes rootParam a 32BIT_CONSTANTS parameter.
+                                       UINT registerSpace = 0,
+                                       D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept {
+        rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS;
+        rootParam.ShaderVisibility = visibility;
+        CD3DX12_ROOT_CONSTANTS::Init(rootParam.Constants, num32BitValues, shaderRegister, registerSpace);
+    }
+
+    static inline void  // Static form: makes rootParam a root CBV parameter.
+    InitAsConstantBufferView(_Out_ D3D12_ROOT_PARAMETER &rootParam, UINT shaderRegister, UINT registerSpace = 0,
+                             D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept {
+        rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV;
+        rootParam.ShaderVisibility = visibility;
+        CD3DX12_ROOT_DESCRIPTOR::Init(rootParam.Descriptor, shaderRegister, registerSpace);
+    }
+
+    static inline void  // Static form: makes rootParam a root SRV parameter.
+    InitAsShaderResourceView(_Out_ D3D12_ROOT_PARAMETER &rootParam, UINT shaderRegister, UINT registerSpace = 0,
+                             D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept {
+        rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_SRV;
+        rootParam.ShaderVisibility = visibility;
+        CD3DX12_ROOT_DESCRIPTOR::Init(rootParam.Descriptor, shaderRegister, registerSpace);
+    }
+
+    static inline void  // Static form: makes rootParam a root UAV parameter.
+    InitAsUnorderedAccessView(_Out_ D3D12_ROOT_PARAMETER &rootParam, UINT shaderRegister, UINT registerSpace = 0,
+                              D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept {
+        rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_UAV;
+        rootParam.ShaderVisibility = visibility;
+        CD3DX12_ROOT_DESCRIPTOR::Init(rootParam.Descriptor, shaderRegister, registerSpace);
+    }
+
+    inline void InitAsDescriptorTable(UINT numDescriptorRanges,  // Member form: applies the static overload to *this.
+                                      _In_reads_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE *pDescriptorRanges,
+                                      D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept {
+        InitAsDescriptorTable(*this, numDescriptorRanges, pDescriptorRanges, visibility);
+    }
+
+    inline void InitAsConstants(UINT num32BitValues, UINT shaderRegister, UINT registerSpace = 0,  // Member form: applies the static overload to *this.
+                                D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept {
+        InitAsConstants(*this, num32BitValues, shaderRegister, registerSpace, visibility);
+    }
+
+    inline void InitAsConstantBufferView(UINT shaderRegister, UINT registerSpace = 0,  // Member form: applies the static overload to *this.
+                                         D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept {
+        InitAsConstantBufferView(*this, shaderRegister, registerSpace, visibility);
+    }
+
+    inline void InitAsShaderResourceView(UINT shaderRegister, UINT registerSpace = 0,  // Member form: applies the static overload to *this.
+                                         D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept {
+        InitAsShaderResourceView(*this, shaderRegister, registerSpace, visibility);
+    }
+
+    inline void InitAsUnorderedAccessView(UINT shaderRegister, UINT registerSpace = 0,  // Member form: applies the static overload to *this.
+                                          D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept {
+        InitAsUnorderedAccessView(*this, shaderRegister, registerSpace, visibility);
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_STATIC_SAMPLER_DESC : public D3D12_STATIC_SAMPLER_DESC {  // Constructor and Init helpers over D3D12_STATIC_SAMPLER_DESC; only shaderRegister is mandatory.
+    CD3DX12_STATIC_SAMPLER_DESC() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_STATIC_SAMPLER_DESC(const D3D12_STATIC_SAMPLER_DESC &o) noexcept : D3D12_STATIC_SAMPLER_DESC(o) {}  // Copies all fields from an existing plain desc.
+    CD3DX12_STATIC_SAMPLER_DESC(UINT shaderRegister, D3D12_FILTER filter = D3D12_FILTER_ANISOTROPIC,  // Non-explicit, so a bare UINT converts implicitly; defaults mirror those of Init().
+                                D3D12_TEXTURE_ADDRESS_MODE addressU = D3D12_TEXTURE_ADDRESS_MODE_WRAP,
+                                D3D12_TEXTURE_ADDRESS_MODE addressV = D3D12_TEXTURE_ADDRESS_MODE_WRAP,
+                                D3D12_TEXTURE_ADDRESS_MODE addressW = D3D12_TEXTURE_ADDRESS_MODE_WRAP,
+                                FLOAT mipLODBias = 0, UINT maxAnisotropy = 16,
+                                D3D12_COMPARISON_FUNC comparisonFunc = D3D12_COMPARISON_FUNC_LESS_EQUAL,
+                                D3D12_STATIC_BORDER_COLOR borderColor = D3D12_STATIC_BORDER_COLOR_OPAQUE_WHITE,
+                                FLOAT minLOD = 0.f, FLOAT maxLOD = D3D12_FLOAT32_MAX,
+                                D3D12_SHADER_VISIBILITY shaderVisibility = D3D12_SHADER_VISIBILITY_ALL,
+                                UINT registerSpace = 0)
+    noexcept {
+        Init(shaderRegister, filter, addressU, addressV, addressW, mipLODBias, maxAnisotropy, comparisonFunc,  // Forwards to the member Init.
+             borderColor, minLOD, maxLOD, shaderVisibility, registerSpace);
+    }
+
+    static inline void Init(_Out_ D3D12_STATIC_SAMPLER_DESC &samplerDesc, UINT shaderRegister,  // Static form: writes all thirteen fields of any plain desc from the same-named arguments.
+                            D3D12_FILTER filter = D3D12_FILTER_ANISOTROPIC,
+                            D3D12_TEXTURE_ADDRESS_MODE addressU = D3D12_TEXTURE_ADDRESS_MODE_WRAP,
+                            D3D12_TEXTURE_ADDRESS_MODE addressV = D3D12_TEXTURE_ADDRESS_MODE_WRAP,
+                            D3D12_TEXTURE_ADDRESS_MODE addressW = D3D12_TEXTURE_ADDRESS_MODE_WRAP, FLOAT mipLODBias = 0,
+                            UINT maxAnisotropy = 16,
+                            D3D12_COMPARISON_FUNC comparisonFunc = D3D12_COMPARISON_FUNC_LESS_EQUAL,
+                            D3D12_STATIC_BORDER_COLOR borderColor = D3D12_STATIC_BORDER_COLOR_OPAQUE_WHITE,
+                            FLOAT minLOD = 0.f, FLOAT maxLOD = D3D12_FLOAT32_MAX,
+                            D3D12_SHADER_VISIBILITY shaderVisibility = D3D12_SHADER_VISIBILITY_ALL,
+                            UINT registerSpace = 0) noexcept {
+        samplerDesc.ShaderRegister = shaderRegister;
+        samplerDesc.Filter = filter;
+        samplerDesc.AddressU = addressU;
+        samplerDesc.AddressV = addressV;
+        samplerDesc.AddressW = addressW;
+        samplerDesc.MipLODBias = mipLODBias;
+        samplerDesc.MaxAnisotropy = maxAnisotropy;
+        samplerDesc.ComparisonFunc = comparisonFunc;
+        samplerDesc.BorderColor = borderColor;
+        samplerDesc.MinLOD = minLOD;
+        samplerDesc.MaxLOD = maxLOD;
+        samplerDesc.ShaderVisibility = shaderVisibility;
+        samplerDesc.RegisterSpace = registerSpace;
+    }
+    inline void Init(UINT shaderRegister, D3D12_FILTER filter = D3D12_FILTER_ANISOTROPIC,  // Member form: (re)initializes *this via the static Init.
+                     D3D12_TEXTURE_ADDRESS_MODE addressU = D3D12_TEXTURE_ADDRESS_MODE_WRAP,
+                     D3D12_TEXTURE_ADDRESS_MODE addressV = D3D12_TEXTURE_ADDRESS_MODE_WRAP,
+                     D3D12_TEXTURE_ADDRESS_MODE addressW = D3D12_TEXTURE_ADDRESS_MODE_WRAP, FLOAT mipLODBias = 0,
+                     UINT maxAnisotropy = 16, D3D12_COMPARISON_FUNC comparisonFunc = D3D12_COMPARISON_FUNC_LESS_EQUAL,
+                     D3D12_STATIC_BORDER_COLOR borderColor = D3D12_STATIC_BORDER_COLOR_OPAQUE_WHITE, FLOAT minLOD = 0.f,
+                     FLOAT maxLOD = D3D12_FLOAT32_MAX,
+                     D3D12_SHADER_VISIBILITY shaderVisibility = D3D12_SHADER_VISIBILITY_ALL,
+                     UINT registerSpace = 0) noexcept {
+        Init(*this, shaderRegister, filter, addressU, addressV, addressW, mipLODBias, maxAnisotropy, comparisonFunc,
+             borderColor, minLOD, maxLOD, shaderVisibility, registerSpace);
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_ROOT_SIGNATURE_DESC : public D3D12_ROOT_SIGNATURE_DESC {  // Constructor and Init helpers over D3D12_ROOT_SIGNATURE_DESC.
+    CD3DX12_ROOT_SIGNATURE_DESC() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_ROOT_SIGNATURE_DESC(const D3D12_ROOT_SIGNATURE_DESC &o) noexcept : D3D12_ROOT_SIGNATURE_DESC(o) {}  // Copies counts, pointers and flags from an existing plain desc.
+    CD3DX12_ROOT_SIGNATURE_DESC(UINT numParameters,  // Same parameters and defaults as Init(); forwards to the member Init.
+                                _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER *_pParameters,
+                                UINT numStaticSamplers = 0,
+                                _In_reads_opt_(numStaticSamplers)
+                                    const D3D12_STATIC_SAMPLER_DESC *_pStaticSamplers = nullptr,
+                                D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE)
+    noexcept {
+        Init(numParameters, _pParameters, numStaticSamplers, _pStaticSamplers, flags);
+    }
+    CD3DX12_ROOT_SIGNATURE_DESC(CD3DX12_DEFAULT) noexcept {  // Tag-selected form: no parameters, no static samplers, FLAG_NONE.
+        Init(0, nullptr, 0, nullptr, D3D12_ROOT_SIGNATURE_FLAG_NONE);
+    }
+
+    inline void Init(UINT numParameters, _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER *_pParameters,  // Member form: (re)initializes *this via the static Init.
+                     UINT numStaticSamplers = 0,
+                     _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC *_pStaticSamplers = nullptr,
+                     D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) noexcept {
+        Init(*this, numParameters, _pParameters, numStaticSamplers, _pStaticSamplers, flags);
+    }
+
+    static inline void  // Static form: writes all five fields of any plain desc.
+    Init(_Out_ D3D12_ROOT_SIGNATURE_DESC &desc, UINT numParameters,
+         _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER *_pParameters, UINT numStaticSamplers = 0,
+         _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC *_pStaticSamplers = nullptr,
+         D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) noexcept {
+        desc.NumParameters = numParameters;
+        desc.pParameters = _pParameters;  // Only the pointer is stored; the parameter array is not copied and must outlive the desc.
+        desc.NumStaticSamplers = numStaticSamplers;
+        desc.pStaticSamplers = _pStaticSamplers;  // Likewise stored by pointer, not copied.
+        desc.Flags = flags;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_DESCRIPTOR_RANGE1 : public D3D12_DESCRIPTOR_RANGE1 {  // Constructor and Init helpers over D3D12_DESCRIPTOR_RANGE1; like CD3DX12_DESCRIPTOR_RANGE plus a Flags field.
+    CD3DX12_DESCRIPTOR_RANGE1() = default;  // Defaulted: assigns no field values itself.
+    explicit CD3DX12_DESCRIPTOR_RANGE1(const D3D12_DESCRIPTOR_RANGE1 &o) noexcept : D3D12_DESCRIPTOR_RANGE1(o) {}  // Copies all fields from an existing plain range.
+    CD3DX12_DESCRIPTOR_RANGE1(D3D12_DESCRIPTOR_RANGE_TYPE rangeType, UINT numDescriptors, UINT baseShaderRegister,  // Same parameters and defaults as Init(); note flags precedes the offset argument.
+                              UINT registerSpace = 0,
+                              D3D12_DESCRIPTOR_RANGE_FLAGS flags = D3D12_DESCRIPTOR_RANGE_FLAG_NONE,
+                              UINT offsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND)
+    noexcept {
+        Init(rangeType, numDescriptors, baseShaderRegister, registerSpace, flags, offsetInDescriptorsFromTableStart);
+    }
+
+    inline void Init(D3D12_DESCRIPTOR_RANGE_TYPE rangeType, UINT numDescriptors, UINT baseShaderRegister,  // Member form: (re)initializes *this via the static Init.
+                     UINT registerSpace = 0, D3D12_DESCRIPTOR_RANGE_FLAGS flags = D3D12_DESCRIPTOR_RANGE_FLAG_NONE,
+                     UINT offsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND) noexcept {
+        Init(*this, rangeType, numDescriptors, baseShaderRegister, registerSpace, flags,
+             offsetInDescriptorsFromTableStart);
+    }
+
+    static inline void Init(_Out_ D3D12_DESCRIPTOR_RANGE1 &range, D3D12_DESCRIPTOR_RANGE_TYPE rangeType,  // Static form: writes all six fields of any plain D3D12_DESCRIPTOR_RANGE1.
+                            UINT numDescriptors, UINT baseShaderRegister, UINT registerSpace = 0,
+                            D3D12_DESCRIPTOR_RANGE_FLAGS flags = D3D12_DESCRIPTOR_RANGE_FLAG_NONE,
+                            UINT offsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND) noexcept {
+        range.RangeType = rangeType;
+        range.NumDescriptors = numDescriptors;
+        range.BaseShaderRegister = baseShaderRegister;
+        range.RegisterSpace = registerSpace;
+        range.Flags = flags;
+        range.OffsetInDescriptorsFromTableStart = offsetInDescriptorsFromTableStart;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_ROOT_DESCRIPTOR_TABLE1 : public D3D12_ROOT_DESCRIPTOR_TABLE1 { // base struct plus ctor/Init helpers
+    CD3DX12_ROOT_DESCRIPTOR_TABLE1() = default;
+    explicit CD3DX12_ROOT_DESCRIPTOR_TABLE1(const D3D12_ROOT_DESCRIPTOR_TABLE1 &o) noexcept
+        : D3D12_ROOT_DESCRIPTOR_TABLE1(o) {}
+    CD3DX12_ROOT_DESCRIPTOR_TABLE1(UINT numDescriptorRanges, _In_reads_opt_(numDescriptorRanges)
+                                                                 const D3D12_DESCRIPTOR_RANGE1 *_pDescriptorRanges)
+    noexcept {
+        Init(*this, numDescriptorRanges, _pDescriptorRanges); // three arguments select the static overload
+    }
+
+    inline void Init(UINT numDescriptorRanges,
+                     _In_reads_opt_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE1 *_pDescriptorRanges) noexcept {
+        CD3DX12_ROOT_DESCRIPTOR_TABLE1::Init(*this, numDescriptorRanges, _pDescriptorRanges);
+    }
+
+    static inline void Init(_Out_ D3D12_ROOT_DESCRIPTOR_TABLE1 &rootDescriptorTable, UINT numDescriptorRanges,
+                            _In_reads_opt_(numDescriptorRanges)
+                                const D3D12_DESCRIPTOR_RANGE1 *_pDescriptorRanges) noexcept {
+        rootDescriptorTable.pDescriptorRanges = _pDescriptorRanges; // pointer is stored as-is, not copied
+        rootDescriptorTable.NumDescriptorRanges = numDescriptorRanges;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_ROOT_DESCRIPTOR1 : public D3D12_ROOT_DESCRIPTOR1 { // base struct plus ctor/Init helpers
+    CD3DX12_ROOT_DESCRIPTOR1() = default;
+    explicit CD3DX12_ROOT_DESCRIPTOR1(const D3D12_ROOT_DESCRIPTOR1 &o) noexcept : D3D12_ROOT_DESCRIPTOR1(o) {}
+    CD3DX12_ROOT_DESCRIPTOR1(UINT shaderRegister, UINT registerSpace = 0,
+                             D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE)
+    noexcept {
+        Init(*this, shaderRegister, registerSpace, flags); // four arguments select the static overload
+    }
+
+    inline void Init(UINT shaderRegister, UINT registerSpace = 0,
+                     D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE) noexcept {
+        CD3DX12_ROOT_DESCRIPTOR1::Init(*this, shaderRegister, registerSpace, flags);
+    }
+
+    static inline void Init(_Out_ D3D12_ROOT_DESCRIPTOR1 &table, UINT shaderRegister, UINT registerSpace = 0,
+                            D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE) noexcept {
+        table.Flags = flags; // all three fields of the descriptor are written
+        table.RegisterSpace = registerSpace;
+        table.ShaderRegister = shaderRegister;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_ROOT_PARAMETER1 : public D3D12_ROOT_PARAMETER1 { // adds InitAs* helpers; no new data members
+    CD3DX12_ROOT_PARAMETER1() = default;
+    explicit CD3DX12_ROOT_PARAMETER1(const D3D12_ROOT_PARAMETER1 &o) noexcept : D3D12_ROOT_PARAMETER1(o) {}
+
+    static inline void
+    InitAsDescriptorTable(_Out_ D3D12_ROOT_PARAMETER1 &rootParam, UINT numDescriptorRanges,
+                          _In_reads_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE1 *pDescriptorRanges,
+                          D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept {
+        rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; // tag first, then the matching payload
+        rootParam.ShaderVisibility = visibility;
+        CD3DX12_ROOT_DESCRIPTOR_TABLE1::Init(rootParam.DescriptorTable, numDescriptorRanges, pDescriptorRanges);
+    }
+
+    static inline void InitAsConstants(_Out_ D3D12_ROOT_PARAMETER1 &rootParam, UINT num32BitValues, UINT shaderRegister,
+                                       UINT registerSpace = 0,
+                                       D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept {
+        rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS; // payload goes into rootParam.Constants
+        rootParam.ShaderVisibility = visibility;
+        CD3DX12_ROOT_CONSTANTS::Init(rootParam.Constants, num32BitValues, shaderRegister, registerSpace);
+    }
+
+    static inline void
+    InitAsConstantBufferView(_Out_ D3D12_ROOT_PARAMETER1 &rootParam, UINT shaderRegister, UINT registerSpace = 0,
+                             D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE,
+                             D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept {
+        rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV; // CBV/SRV/UAV differ only in this tag
+        rootParam.ShaderVisibility = visibility;
+        CD3DX12_ROOT_DESCRIPTOR1::Init(rootParam.Descriptor, shaderRegister, registerSpace, flags);
+    }
+
+    static inline void
+    InitAsShaderResourceView(_Out_ D3D12_ROOT_PARAMETER1 &rootParam, UINT shaderRegister, UINT registerSpace = 0,
+                             D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE,
+                             D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept {
+        rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_SRV; // payload goes into rootParam.Descriptor
+        rootParam.ShaderVisibility = visibility;
+        CD3DX12_ROOT_DESCRIPTOR1::Init(rootParam.Descriptor, shaderRegister, registerSpace, flags);
+    }
+
+    static inline void
+    InitAsUnorderedAccessView(_Out_ D3D12_ROOT_PARAMETER1 &rootParam, UINT shaderRegister, UINT registerSpace = 0,
+                              D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE,
+                              D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept {
+        rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_UAV; // payload goes into rootParam.Descriptor
+        rootParam.ShaderVisibility = visibility;
+        CD3DX12_ROOT_DESCRIPTOR1::Init(rootParam.Descriptor, shaderRegister, registerSpace, flags);
+    }
+
+    inline void InitAsDescriptorTable(UINT numDescriptorRanges,
+                                      _In_reads_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE1 *pDescriptorRanges,
+                                      D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept {
+        InitAsDescriptorTable(*this, numDescriptorRanges, pDescriptorRanges, visibility); // instance form: targets *this
+    }
+
+    inline void InitAsConstants(UINT num32BitValues, UINT shaderRegister, UINT registerSpace = 0,
+                                D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept {
+        InitAsConstants(*this, num32BitValues, shaderRegister, registerSpace, visibility); // forwards to static form
+    }
+
+    inline void InitAsConstantBufferView(UINT shaderRegister, UINT registerSpace = 0,
+                                         D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE,
+                                         D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept {
+        InitAsConstantBufferView(*this, shaderRegister, registerSpace, flags, visibility); // forwards to static form
+    }
+
+    inline void InitAsShaderResourceView(UINT shaderRegister, UINT registerSpace = 0,
+                                         D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE,
+                                         D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept {
+        InitAsShaderResourceView(*this, shaderRegister, registerSpace, flags, visibility); // forwards to static form
+    }
+
+    inline void InitAsUnorderedAccessView(UINT shaderRegister, UINT registerSpace = 0,
+                                          D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE,
+                                          D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept {
+        InitAsUnorderedAccessView(*this, shaderRegister, registerSpace, flags, visibility); // forwards to static form
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC : public D3D12_VERSIONED_ROOT_SIGNATURE_DESC { // Version tag + Desc_1_x
+    CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC() = default;
+    explicit CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC(const D3D12_VERSIONED_ROOT_SIGNATURE_DESC &o) noexcept
+        : D3D12_VERSIONED_ROOT_SIGNATURE_DESC(o) {}
+    explicit CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC(const D3D12_ROOT_SIGNATURE_DESC &o) noexcept {
+        Version = D3D_ROOT_SIGNATURE_VERSION_1_0; // wrap an unversioned 1.0 description
+        Desc_1_0 = o;
+    }
+    explicit CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC(const D3D12_ROOT_SIGNATURE_DESC1 &o) noexcept {
+        Version = D3D_ROOT_SIGNATURE_VERSION_1_1; // wrap a 1.1 description
+        Desc_1_1 = o;
+    }
+    CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC(UINT numParameters,
+                                          _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER *_pParameters,
+                                          UINT numStaticSamplers = 0,
+                                          _In_reads_opt_(numStaticSamplers)
+                                              const D3D12_STATIC_SAMPLER_DESC *_pStaticSamplers = nullptr,
+                                          D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE)
+    noexcept { // D3D12_ROOT_PARAMETER pointer type selects the 1.0 layout
+        Init_1_0(numParameters, _pParameters, numStaticSamplers, _pStaticSamplers, flags);
+    }
+    CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC(UINT numParameters,
+                                          _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER1 *_pParameters,
+                                          UINT numStaticSamplers = 0,
+                                          _In_reads_opt_(numStaticSamplers)
+                                              const D3D12_STATIC_SAMPLER_DESC *_pStaticSamplers = nullptr,
+                                          D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE)
+    noexcept { // D3D12_ROOT_PARAMETER1 pointer type selects the 1.1 layout
+        Init_1_1(numParameters, _pParameters, numStaticSamplers, _pStaticSamplers, flags);
+    }
+    CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC(CD3DX12_DEFAULT) noexcept {
+        Init_1_1(0, nullptr, 0, nullptr, D3D12_ROOT_SIGNATURE_FLAG_NONE); // default tag: empty 1.1 description
+    }
+
+    inline void Init_1_0(UINT numParameters, _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER *_pParameters,
+                         UINT numStaticSamplers = 0,
+                         _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC *_pStaticSamplers = nullptr,
+                         D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) noexcept {
+        Init_1_0(*this, numParameters, _pParameters, numStaticSamplers, _pStaticSamplers, flags); // targets *this
+    }
+
+    static inline void
+    Init_1_0(_Out_ D3D12_VERSIONED_ROOT_SIGNATURE_DESC &desc, UINT numParameters,
+             _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER *_pParameters, UINT numStaticSamplers = 0,
+             _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC *_pStaticSamplers = nullptr,
+             D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) noexcept {
+        desc.Version = D3D_ROOT_SIGNATURE_VERSION_1_0; // tag and Desc_1_0 fields are set together
+        desc.Desc_1_0.NumParameters = numParameters;
+        desc.Desc_1_0.pParameters = _pParameters; // pointers are stored as-is, not copied
+        desc.Desc_1_0.NumStaticSamplers = numStaticSamplers;
+        desc.Desc_1_0.pStaticSamplers = _pStaticSamplers;
+        desc.Desc_1_0.Flags = flags;
+    }
+
+    inline void Init_1_1(UINT numParameters, _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER1 *_pParameters,
+                         UINT numStaticSamplers = 0,
+                         _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC *_pStaticSamplers = nullptr,
+                         D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) noexcept {
+        Init_1_1(*this, numParameters, _pParameters, numStaticSamplers, _pStaticSamplers, flags); // targets *this
+    }
+
+    static inline void
+    Init_1_1(_Out_ D3D12_VERSIONED_ROOT_SIGNATURE_DESC &desc, UINT numParameters,
+             _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER1 *_pParameters, UINT numStaticSamplers = 0,
+             _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC *_pStaticSamplers = nullptr,
+             D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) noexcept {
+        desc.Version = D3D_ROOT_SIGNATURE_VERSION_1_1; // tag and Desc_1_1 fields are set together
+        desc.Desc_1_1.NumParameters = numParameters;
+        desc.Desc_1_1.pParameters = _pParameters; // pointers are stored as-is, not copied
+        desc.Desc_1_1.NumStaticSamplers = numStaticSamplers;
+        desc.Desc_1_1.pStaticSamplers = _pStaticSamplers;
+        desc.Desc_1_1.Flags = flags;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_CPU_DESCRIPTOR_HANDLE : public D3D12_CPU_DESCRIPTOR_HANDLE { // offset helpers over the base's ptr
+    CD3DX12_CPU_DESCRIPTOR_HANDLE() = default;
+    explicit CD3DX12_CPU_DESCRIPTOR_HANDLE(const D3D12_CPU_DESCRIPTOR_HANDLE &o) noexcept
+        : D3D12_CPU_DESCRIPTOR_HANDLE(o) {}
+    CD3DX12_CPU_DESCRIPTOR_HANDLE(CD3DX12_DEFAULT) noexcept { ptr = 0; } // default tag: ptr is zeroed
+    CD3DX12_CPU_DESCRIPTOR_HANDLE(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &other, INT offsetScaledByIncrementSize)
+    noexcept { // other.ptr plus an already-scaled offset
+        InitOffsetted(other, offsetScaledByIncrementSize);
+    }
+    CD3DX12_CPU_DESCRIPTOR_HANDLE(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &other, INT offsetInDescriptors,
+                                  UINT descriptorIncrementSize)
+    noexcept { // other.ptr plus offsetInDescriptors * descriptorIncrementSize
+        InitOffsetted(other, offsetInDescriptors, descriptorIncrementSize);
+    }
+    CD3DX12_CPU_DESCRIPTOR_HANDLE &Offset(INT offsetInDescriptors, UINT descriptorIncrementSize) noexcept {
+        ptr = SIZE_T(INT64(ptr) + INT64(offsetInDescriptors) * INT64(descriptorIncrementSize)); // signed 64-bit math
+        return *this; // returns the handle itself, so calls can be chained
+    }
+    CD3DX12_CPU_DESCRIPTOR_HANDLE &Offset(INT offsetScaledByIncrementSize) noexcept {
+        ptr = SIZE_T(INT64(ptr) + INT64(offsetScaledByIncrementSize)); // offset is signed, so it may be negative
+        return *this;
+    }
+    bool operator==(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &other) const noexcept { return (ptr == other.ptr); }
+    bool operator!=(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &other) const noexcept { return (ptr != other.ptr); }
+    CD3DX12_CPU_DESCRIPTOR_HANDLE &operator=(const D3D12_CPU_DESCRIPTOR_HANDLE &other) noexcept {
+        ptr = other.ptr; // assignment from the plain base handle type
+        return *this;
+    }
+
+    inline void InitOffsetted(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &base, INT offsetScaledByIncrementSize) noexcept {
+        InitOffsetted(*this, base, offsetScaledByIncrementSize); // instance form: writes this handle
+    }
+
+    inline void InitOffsetted(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &base, INT offsetInDescriptors,
+                              UINT descriptorIncrementSize) noexcept {
+        InitOffsetted(*this, base, offsetInDescriptors, descriptorIncrementSize); // instance form: writes this handle
+    }
+
+    static inline void InitOffsetted(_Out_ D3D12_CPU_DESCRIPTOR_HANDLE &handle,
+                                     _In_ const D3D12_CPU_DESCRIPTOR_HANDLE &base,
+                                     INT offsetScaledByIncrementSize) noexcept {
+        handle.ptr = SIZE_T(INT64(base.ptr) + INT64(offsetScaledByIncrementSize)); // base is left untouched
+    }
+
+    static inline void InitOffsetted(_Out_ D3D12_CPU_DESCRIPTOR_HANDLE &handle,
+                                     _In_ const D3D12_CPU_DESCRIPTOR_HANDLE &base, INT offsetInDescriptors,
+                                     UINT descriptorIncrementSize) noexcept {
+        handle.ptr = SIZE_T(INT64(base.ptr) + INT64(offsetInDescriptors) * INT64(descriptorIncrementSize)); // count * size
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_GPU_DESCRIPTOR_HANDLE : public D3D12_GPU_DESCRIPTOR_HANDLE { // offset helpers over the base's ptr
+    CD3DX12_GPU_DESCRIPTOR_HANDLE() = default;
+    explicit CD3DX12_GPU_DESCRIPTOR_HANDLE(const D3D12_GPU_DESCRIPTOR_HANDLE &o) noexcept
+        : D3D12_GPU_DESCRIPTOR_HANDLE(o) {}
+    CD3DX12_GPU_DESCRIPTOR_HANDLE(CD3DX12_DEFAULT) noexcept { ptr = 0; } // default tag: ptr is zeroed
+    CD3DX12_GPU_DESCRIPTOR_HANDLE(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &other, INT offsetScaledByIncrementSize)
+    noexcept { // other.ptr plus an already-scaled offset
+        InitOffsetted(other, offsetScaledByIncrementSize);
+    }
+    CD3DX12_GPU_DESCRIPTOR_HANDLE(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &other, INT offsetInDescriptors,
+                                  UINT descriptorIncrementSize)
+    noexcept { // other.ptr plus offsetInDescriptors * descriptorIncrementSize
+        InitOffsetted(other, offsetInDescriptors, descriptorIncrementSize);
+    }
+    CD3DX12_GPU_DESCRIPTOR_HANDLE &Offset(INT offsetInDescriptors, UINT descriptorIncrementSize) noexcept {
+        ptr = UINT64(INT64(ptr) + INT64(offsetInDescriptors) * INT64(descriptorIncrementSize)); // signed 64-bit math
+        return *this; // returns the handle itself, so calls can be chained
+    }
+    CD3DX12_GPU_DESCRIPTOR_HANDLE &Offset(INT offsetScaledByIncrementSize) noexcept {
+        ptr = UINT64(INT64(ptr) + INT64(offsetScaledByIncrementSize)); // offset is signed, so it may be negative
+        return *this;
+    }
+    inline bool operator==(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &other) const noexcept { return (ptr == other.ptr); }
+    inline bool operator!=(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &other) const noexcept { return (ptr != other.ptr); }
+    CD3DX12_GPU_DESCRIPTOR_HANDLE &operator=(const D3D12_GPU_DESCRIPTOR_HANDLE &other) noexcept {
+        ptr = other.ptr; // assignment from the plain base handle type
+        return *this;
+    }
+
+    inline void InitOffsetted(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &base, INT offsetScaledByIncrementSize) noexcept {
+        InitOffsetted(*this, base, offsetScaledByIncrementSize); // instance form: writes this handle
+    }
+
+    inline void InitOffsetted(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &base, INT offsetInDescriptors,
+                              UINT descriptorIncrementSize) noexcept {
+        InitOffsetted(*this, base, offsetInDescriptors, descriptorIncrementSize); // instance form: writes this handle
+    }
+
+    static inline void InitOffsetted(_Out_ D3D12_GPU_DESCRIPTOR_HANDLE &handle,
+                                     _In_ const D3D12_GPU_DESCRIPTOR_HANDLE &base,
+                                     INT offsetScaledByIncrementSize) noexcept {
+        handle.ptr = UINT64(INT64(base.ptr) + INT64(offsetScaledByIncrementSize)); // base is left untouched
+    }
+
+    static inline void InitOffsetted(_Out_ D3D12_GPU_DESCRIPTOR_HANDLE &handle,
+                                     _In_ const D3D12_GPU_DESCRIPTOR_HANDLE &base, INT offsetInDescriptors,
+                                     UINT descriptorIncrementSize) noexcept {
+        handle.ptr = UINT64(INT64(base.ptr) + INT64(offsetInDescriptors) * INT64(descriptorIncrementSize)); // count * size
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+inline constexpr UINT D3D12CalcSubresource(UINT MipSlice, UINT ArraySlice, UINT PlaneSlice, UINT MipLevels,
+                                           UINT ArraySize) noexcept {
+    return MipSlice + MipLevels * (ArraySlice + PlaneSlice * ArraySize); // flat index: mip, then array, then plane
+}
+
+//------------------------------------------------------------------------------------------------
+template <typename T, typename U, typename V> // splits a flat subresource index into mip / array / plane parts
+inline void D3D12DecomposeSubresource(UINT Subresource, UINT MipLevels, UINT ArraySize, _Out_ T &MipSlice,
+                                      _Out_ U &ArraySlice, _Out_ V &PlaneSlice) noexcept {
+    MipSlice = static_cast<T>(Subresource % MipLevels); // NOTE(review): MipLevels == 0 divides by zero; no guard here
+    ArraySlice = static_cast<U>((Subresource / MipLevels) % ArraySize); // NOTE(review): ArraySize == 0 likewise
+    PlaneSlice = static_cast<V>(Subresource / (MipLevels * ArraySize)); // one plane spans MipLevels * ArraySize entries
+}
+
+//------------------------------------------------------------------------------------------------
+inline UINT8 D3D12GetFormatPlaneCount(_In_ ID3D12Device *pDevice, DXGI_FORMAT Format) noexcept {
+    // Ask the device for the format info record of Format.
+    D3D12_FEATURE_DATA_FORMAT_INFO query = {Format, 0};
+    const auto hr = pDevice->CheckFeatureSupport(D3D12_FEATURE_FORMAT_INFO, &query, sizeof(query));
+    // A failed query is reported to the caller as a plane count of 0.
+    return FAILED(hr) ? UINT8(0) : query.PlaneCount;
+}
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_RESOURCE_DESC : public D3D12_RESOURCE_DESC { // base struct plus factories and derived-value queries
+    CD3DX12_RESOURCE_DESC() = default;
+    explicit CD3DX12_RESOURCE_DESC(const D3D12_RESOURCE_DESC &o) noexcept : D3D12_RESOURCE_DESC(o) {}
+    CD3DX12_RESOURCE_DESC(D3D12_RESOURCE_DIMENSION dimension, UINT64 alignment, UINT64 width, UINT height,
+                          UINT16 depthOrArraySize, UINT16 mipLevels, DXGI_FORMAT format, UINT sampleCount,
+                          UINT sampleQuality, D3D12_TEXTURE_LAYOUT layout, D3D12_RESOURCE_FLAGS flags)
+    noexcept { // general constructor: every argument is copied into the matching base field
+        Dimension = dimension;
+        Alignment = alignment;
+        Width = width;
+        Height = height;
+        DepthOrArraySize = depthOrArraySize;
+        MipLevels = mipLevels;
+        Format = format;
+        SampleDesc.Count = sampleCount;
+        SampleDesc.Quality = sampleQuality;
+        Layout = layout;
+        Flags = flags;
+    }
+    static inline CD3DX12_RESOURCE_DESC Buffer(const D3D12_RESOURCE_ALLOCATION_INFO &resAllocInfo,
+                                               D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE) noexcept {
+        return CD3DX12_RESOURCE_DESC(D3D12_RESOURCE_DIMENSION_BUFFER, resAllocInfo.Alignment, resAllocInfo.SizeInBytes,
+                                     1, 1, 1, DXGI_FORMAT_UNKNOWN, 1, 0, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, flags);
+    } // buffer sized and aligned from an allocation-info record
+    static inline CD3DX12_RESOURCE_DESC Buffer(UINT64 width, D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE,
+                                               UINT64 alignment = 0) noexcept {
+        return CD3DX12_RESOURCE_DESC(D3D12_RESOURCE_DIMENSION_BUFFER, alignment, width, 1, 1, 1, DXGI_FORMAT_UNKNOWN, 1,
+                                     0, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, flags);
+    } // buffer of `width` bytes-as-Width; height, depth and mip count are fixed to 1
+    static inline CD3DX12_RESOURCE_DESC Tex1D(DXGI_FORMAT format, UINT64 width, UINT16 arraySize = 1,
+                                              UINT16 mipLevels = 0,
+                                              D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE,
+                                              D3D12_TEXTURE_LAYOUT layout = D3D12_TEXTURE_LAYOUT_UNKNOWN,
+                                              UINT64 alignment = 0) noexcept {
+        return CD3DX12_RESOURCE_DESC(D3D12_RESOURCE_DIMENSION_TEXTURE1D, alignment, width, 1, arraySize, mipLevels,
+                                     format, 1, 0, layout, flags);
+    } // 1D texture: height 1, single sample
+    static inline CD3DX12_RESOURCE_DESC Tex2D(DXGI_FORMAT format, UINT64 width, UINT height, UINT16 arraySize = 1,
+                                              UINT16 mipLevels = 0, UINT sampleCount = 1, UINT sampleQuality = 0,
+                                              D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE,
+                                              D3D12_TEXTURE_LAYOUT layout = D3D12_TEXTURE_LAYOUT_UNKNOWN,
+                                              UINT64 alignment = 0) noexcept {
+        return CD3DX12_RESOURCE_DESC(D3D12_RESOURCE_DIMENSION_TEXTURE2D, alignment, width, height, arraySize, mipLevels,
+                                     format, sampleCount, sampleQuality, layout, flags);
+    } // 2D texture: the only factory that exposes the sample count/quality
+    static inline CD3DX12_RESOURCE_DESC Tex3D(DXGI_FORMAT format, UINT64 width, UINT height, UINT16 depth,
+                                              UINT16 mipLevels = 0,
+                                              D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE,
+                                              D3D12_TEXTURE_LAYOUT layout = D3D12_TEXTURE_LAYOUT_UNKNOWN,
+                                              UINT64 alignment = 0) noexcept {
+        return CD3DX12_RESOURCE_DESC(D3D12_RESOURCE_DIMENSION_TEXTURE3D, alignment, width, height, depth, mipLevels,
+                                     format, 1, 0, layout, flags);
+    } // 3D texture: `depth` is stored in DepthOrArraySize
+    inline UINT16 Depth() const noexcept { // DepthOrArraySize is the depth only for 3D textures; otherwise 1
+        return (Dimension == D3D12_RESOURCE_DIMENSION_TEXTURE3D ? DepthOrArraySize : 1);
+    }
+    inline UINT16 ArraySize() const noexcept { // DepthOrArraySize is the array size for everything but 3D textures
+        return (Dimension != D3D12_RESOURCE_DIMENSION_TEXTURE3D ? DepthOrArraySize : 1);
+    }
+    inline UINT8 PlaneCount(_In_ ID3D12Device *pDevice) const noexcept { // queries pDevice; 0 if the query fails
+        return D3D12GetFormatPlaneCount(pDevice, Format);
+    }
+    inline UINT Subresources(_In_ ID3D12Device *pDevice) const noexcept { // MipLevels * ArraySize() * PlaneCount()
+        return static_cast<UINT>(MipLevels) * ArraySize() * PlaneCount(pDevice); // widen first: avoids signed int overflow
+    }
+    inline UINT CalcSubresource(UINT MipSlice, UINT ArraySlice, UINT PlaneSlice) const noexcept { // read-only, so const
+        return D3D12CalcSubresource(MipSlice, ArraySlice, PlaneSlice, MipLevels, ArraySize());
+    }
+};
+inline bool operator==(const D3D12_RESOURCE_DESC &l, const D3D12_RESOURCE_DESC &r) noexcept {
+    const bool sameShape = l.Dimension == r.Dimension && l.Alignment == r.Alignment && l.Width == r.Width &&
+                           l.Height == r.Height && l.DepthOrArraySize == r.DepthOrArraySize && l.MipLevels == r.MipLevels;
+    const bool sameSampling = l.SampleDesc.Count == r.SampleDesc.Count && l.SampleDesc.Quality == r.SampleDesc.Quality;
+    return sameShape && l.Format == r.Format && sameSampling && l.Layout == r.Layout && l.Flags == r.Flags;
+}
+inline bool operator!=(const D3D12_RESOURCE_DESC &l, const D3D12_RESOURCE_DESC &r) noexcept { return !(l == r); } // negated ==
+
+//------------------------------------------------------------------------------------------------
+struct CD3DX12_RESOURCE_DESC1 : public D3D12_RESOURCE_DESC1 { // base struct plus factories and derived-value queries
+    CD3DX12_RESOURCE_DESC1() = default;
+    explicit CD3DX12_RESOURCE_DESC1(const D3D12_RESOURCE_DESC1 &o) noexcept : D3D12_RESOURCE_DESC1(o) {}
+    CD3DX12_RESOURCE_DESC1(D3D12_RESOURCE_DIMENSION dimension, UINT64 alignment, UINT64 width, UINT height,
+                           UINT16 depthOrArraySize, UINT16 mipLevels, DXGI_FORMAT format, UINT sampleCount,
+                           UINT sampleQuality, D3D12_TEXTURE_LAYOUT layout, D3D12_RESOURCE_FLAGS flags,
+                           UINT samplerFeedbackMipRegionWidth = 0, UINT samplerFeedbackMipRegionHeight = 0,
+                           UINT samplerFeedbackMipRegionDepth = 0)
+    noexcept { // general constructor: every argument is copied into the matching base field
+        Dimension = dimension;
+        Alignment = alignment;
+        Width = width;
+        Height = height;
+        DepthOrArraySize = depthOrArraySize;
+        MipLevels = mipLevels;
+        Format = format;
+        SampleDesc.Count = sampleCount;
+        SampleDesc.Quality = sampleQuality;
+        Layout = layout;
+        Flags = flags;
+        SamplerFeedbackMipRegion.Width = samplerFeedbackMipRegionWidth; // region defaults to 0 x 0 x 0
+        SamplerFeedbackMipRegion.Height = samplerFeedbackMipRegionHeight;
+        SamplerFeedbackMipRegion.Depth = samplerFeedbackMipRegionDepth;
+    }
+    static inline CD3DX12_RESOURCE_DESC1 Buffer(const D3D12_RESOURCE_ALLOCATION_INFO &resAllocInfo,
+                                                D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE) noexcept {
+        return CD3DX12_RESOURCE_DESC1(D3D12_RESOURCE_DIMENSION_BUFFER, resAllocInfo.Alignment, resAllocInfo.SizeInBytes,
+                                      1, 1, 1, DXGI_FORMAT_UNKNOWN, 1, 0, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, flags, 0, 0,
+                                      0); // buffer sized and aligned from an allocation-info record
+    }
+    static inline CD3DX12_RESOURCE_DESC1 Buffer(UINT64 width, D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE,
+                                                UINT64 alignment = 0) noexcept {
+        return CD3DX12_RESOURCE_DESC1(D3D12_RESOURCE_DIMENSION_BUFFER, alignment, width, 1, 1, 1, DXGI_FORMAT_UNKNOWN,
+                                      1, 0, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, flags, 0, 0, 0);
+    } // buffer with Width = width; height, depth and mip count are fixed to 1
+    static inline CD3DX12_RESOURCE_DESC1 Tex1D(DXGI_FORMAT format, UINT64 width, UINT16 arraySize = 1,
+                                               UINT16 mipLevels = 0,
+                                               D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE,
+                                               D3D12_TEXTURE_LAYOUT layout = D3D12_TEXTURE_LAYOUT_UNKNOWN,
+                                               UINT64 alignment = 0) noexcept {
+        return CD3DX12_RESOURCE_DESC1(D3D12_RESOURCE_DIMENSION_TEXTURE1D, alignment, width, 1, arraySize, mipLevels,
+                                      format, 1, 0, layout, flags, 0, 0, 0);
+    } // 1D texture: height 1, single sample, zero feedback region
+    static inline CD3DX12_RESOURCE_DESC1 Tex2D(DXGI_FORMAT format, UINT64 width, UINT height, UINT16 arraySize = 1,
+                                               UINT16 mipLevels = 0, UINT sampleCount = 1, UINT sampleQuality = 0,
+                                               D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE,
+                                               D3D12_TEXTURE_LAYOUT layout = D3D12_TEXTURE_LAYOUT_UNKNOWN,
+                                               UINT64 alignment = 0, UINT samplerFeedbackMipRegionWidth = 0,
+                                               UINT samplerFeedbackMipRegionHeight = 0,
+                                               UINT samplerFeedbackMipRegionDepth = 0) noexcept {
+        return CD3DX12_RESOURCE_DESC1(D3D12_RESOURCE_DIMENSION_TEXTURE2D, alignment, width, height, arraySize,
+                                      mipLevels, format, sampleCount, sampleQuality, layout, flags,
+                                      samplerFeedbackMipRegionWidth, samplerFeedbackMipRegionHeight,
+                                      samplerFeedbackMipRegionDepth); // only factory exposing sampling and feedback region
+    }
+    static inline CD3DX12_RESOURCE_DESC1 Tex3D(DXGI_FORMAT format, UINT64 width, UINT height, UINT16 depth,
+                                               UINT16 mipLevels = 0,
+                                               D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE,
+                                               D3D12_TEXTURE_LAYOUT layout = D3D12_TEXTURE_LAYOUT_UNKNOWN,
+                                               UINT64 alignment = 0) noexcept {
+        return CD3DX12_RESOURCE_DESC1(D3D12_RESOURCE_DIMENSION_TEXTURE3D, alignment, width, height, depth, mipLevels,
+                                      format, 1, 0, layout, flags, 0, 0, 0);
+    } // 3D texture: `depth` is stored in DepthOrArraySize
+    inline UINT16 Depth() const noexcept { // DepthOrArraySize is the depth only for 3D textures; otherwise 1
+        return (Dimension == D3D12_RESOURCE_DIMENSION_TEXTURE3D ? DepthOrArraySize : 1);
+    }
+    inline UINT16 ArraySize() const noexcept { // DepthOrArraySize is the array size for everything but 3D textures
+        return (Dimension != D3D12_RESOURCE_DIMENSION_TEXTURE3D ? DepthOrArraySize : 1);
+    }
+    inline UINT8 PlaneCount(_In_ ID3D12Device *pDevice) const noexcept { // queries pDevice; 0 if the query fails
+        return D3D12GetFormatPlaneCount(pDevice, Format);
+    }
+    inline UINT Subresources(_In_ ID3D12Device *pDevice) const noexcept { // MipLevels * ArraySize() * PlaneCount()
+        return static_cast<UINT>(MipLevels) * ArraySize() * PlaneCount(pDevice); // widen first: avoids signed int overflow
+    }
+    inline UINT CalcSubresource(UINT MipSlice, UINT ArraySlice, UINT PlaneSlice) const noexcept { // read-only, so const
+        return D3D12CalcSubresource(MipSlice, ArraySlice, PlaneSlice, MipLevels, ArraySize());
+    }
+};
+// Member-wise equality of two D3D12_RESOURCE_DESC1 values, including the sample description and
+// the sampler-feedback mip region.
+inline bool operator==(const D3D12_RESOURCE_DESC1 &l, const D3D12_RESOURCE_DESC1 &r) noexcept {
+    // Dimension, extents, mip count and format.
+    if (l.Dimension != r.Dimension || l.Alignment != r.Alignment || l.Width != r.Width || l.Height != r.Height ||
+        l.DepthOrArraySize != r.DepthOrArraySize || l.MipLevels != r.MipLevels || l.Format != r.Format) {
+        return false;
+    }
+    // Sample description, layout and flags.
+    if (l.SampleDesc.Count != r.SampleDesc.Count || l.SampleDesc.Quality != r.SampleDesc.Quality ||
+        l.Layout != r.Layout || l.Flags != r.Flags) {
+        return false;
+    }
+    // Sampler-feedback mip region.
+    return l.SamplerFeedbackMipRegion.Width == r.SamplerFeedbackMipRegion.Width &&
+           l.SamplerFeedbackMipRegion.Height == r.SamplerFeedbackMipRegion.Height &&
+           l.SamplerFeedbackMipRegion.Depth == r.SamplerFeedbackMipRegion.Depth;
+}
+// Inequality is the negation of operator== for D3D12_RESOURCE_DESC1.
+inline bool operator!=(const D3D12_RESOURCE_DESC1 &l, const D3D12_RESOURCE_DESC1 &r) noexcept {
+    const bool same = (l == r);
+    return !same;
+}
+
+//------------------------------------------------------------------------------------------------
+// Convenience wrapper over D3D12_VIEW_INSTANCING_DESC that adds constructors only.
+struct CD3DX12_VIEW_INSTANCING_DESC : public D3D12_VIEW_INSTANCING_DESC {
+    // Leaves the members uninitialized, like the underlying plain struct.
+    CD3DX12_VIEW_INSTANCING_DESC() = default;
+
+    // Copies an existing plain descriptor.
+    explicit CD3DX12_VIEW_INSTANCING_DESC(const D3D12_VIEW_INSTANCING_DESC &o) noexcept
+        : D3D12_VIEW_INSTANCING_DESC(o) {}
+
+    // Default state: zero view instances, no location array, no flags.
+    explicit CD3DX12_VIEW_INSTANCING_DESC(CD3DX12_DEFAULT) noexcept
+        : CD3DX12_VIEW_INSTANCING_DESC(0, nullptr, D3D12_VIEW_INSTANCING_FLAG_NONE) {}
+
+    // Fills every member from the arguments; the location array pointer is stored, not copied.
+    explicit CD3DX12_VIEW_INSTANCING_DESC(UINT InViewInstanceCount,
+                                          const D3D12_VIEW_INSTANCE_LOCATION *InViewInstanceLocations,
+                                          D3D12_VIEW_INSTANCING_FLAGS InFlags) noexcept {
+        this->ViewInstanceCount = InViewInstanceCount;
+        this->pViewInstanceLocations = InViewInstanceLocations;
+        this->Flags = InFlags;
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+// Row-by-row memcpy
+// Copies NumSlices slices of NumRows rows, RowSizeInBytes bytes per row, from pSrc to pDest.
+// Each side is addressed with its own RowPitch and SlicePitch, so the two layouts may differ.
+inline void MemcpySubresource(_In_ const D3D12_MEMCPY_DEST *pDest, _In_ const D3D12_SUBRESOURCE_DATA *pSrc,
+                              SIZE_T RowSizeInBytes, UINT NumRows, UINT NumSlices) noexcept {
+    for (UINT slice = 0; slice < NumSlices; ++slice) {
+        BYTE *const pDstSlice = static_cast<BYTE *>(pDest->pData) + pDest->SlicePitch * slice;
+        const BYTE *const pSrcSlice = static_cast<const BYTE *>(pSrc->pData) + pSrc->SlicePitch * LONG_PTR(slice);
+        for (UINT row = 0; row < NumRows; ++row) {
+            BYTE *const pDstRow = pDstSlice + pDest->RowPitch * row;
+            const BYTE *const pSrcRow = pSrcSlice + pSrc->RowPitch * LONG_PTR(row);
+            memcpy(pDstRow, pSrcRow, RowSizeInBytes);
+        }
+    }
+}
+
+//------------------------------------------------------------------------------------------------
+// Row-by-row memcpy
+// Same copy as the D3D12_SUBRESOURCE_DATA overload, but the source is described by an offset and
+// pitches (pSrc) relative to a base pointer (pResourceData).
+inline void MemcpySubresource(_In_ const D3D12_MEMCPY_DEST *pDest, _In_ const void *pResourceData,
+                              _In_ const D3D12_SUBRESOURCE_INFO *pSrc, SIZE_T RowSizeInBytes, UINT NumRows,
+                              UINT NumSlices) noexcept {
+    for (UINT slice = 0; slice < NumSlices; ++slice) {
+        BYTE *const pDstSlice = static_cast<BYTE *>(pDest->pData) + pDest->SlicePitch * slice;
+        const BYTE *const pSrcSlice =
+            (static_cast<const BYTE *>(pResourceData) + pSrc->Offset) + pSrc->DepthPitch * ULONG_PTR(slice);
+        for (UINT row = 0; row < NumRows; ++row) {
+            BYTE *const pDstRow = pDstSlice + pDest->RowPitch * row;
+            const BYTE *const pSrcRow = pSrcSlice + pSrc->RowPitch * ULONG_PTR(row);
+            memcpy(pDstRow, pSrcRow, RowSizeInBytes);
+        }
+    }
+}
+
+//------------------------------------------------------------------------------------------------
+// Returns required size of a buffer to be used for data upload
+// Asks the destination resource's device for the total byte size of the copyable footprints of
+// NumSubresources subresources starting at FirstSubresource (base offset 0).
+inline UINT64 GetRequiredIntermediateSize(_In_ ID3D12Resource *pDestinationResource,
+                                          _In_range_(0, D3D12_REQ_SUBRESOURCES) UINT FirstSubresource,
+                                          _In_range_(0, D3D12_REQ_SUBRESOURCES - FirstSubresource)
+                                              UINT NumSubresources) noexcept {
+    auto destDesc = pDestinationResource->GetDesc();
+    UINT64 totalBytes = 0;
+
+    // GetDevice hands back a referenced device pointer; it is released once the query is done.
+    ID3D12Device *pOwner = nullptr;
+    pDestinationResource->GetDevice(IID_ID3D12Device, reinterpret_cast<void **>(&pOwner));
+    // Only the total size is wanted, so the per-subresource output arrays are passed as null.
+    pOwner->GetCopyableFootprints(&destDesc, FirstSubresource, NumSubresources, 0, nullptr, nullptr, nullptr,
+                                  &totalBytes);
+    pOwner->Release();
+
+    return totalBytes;
+}
+
+//------------------------------------------------------------------------------------------------
+// All arrays must be populated (e.g. by calling GetCopyableFootprints)
+// Copies each pSrcData[i] into the mapped intermediate buffer at the placement described by
+// pLayouts[i], then records the copy from the intermediate buffer into pDestinationResource on
+// pCmdList.
+// Returns RequiredSize on success. Returns 0 when validation fails, when Map fails, or when a
+// row size does not fit in SIZE_T.
+inline UINT64 UpdateSubresources(_In_ ID3D12GraphicsCommandList *pCmdList, _In_ ID3D12Resource *pDestinationResource,
+                                 _In_ ID3D12Resource *pIntermediate,
+                                 _In_range_(0, D3D12_REQ_SUBRESOURCES) UINT FirstSubresource,
+                                 _In_range_(0, D3D12_REQ_SUBRESOURCES - FirstSubresource) UINT NumSubresources,
+                                 UINT64 RequiredSize,
+                                 _In_reads_(NumSubresources) const D3D12_PLACED_SUBRESOURCE_FOOTPRINT *pLayouts,
+                                 _In_reads_(NumSubresources) const UINT *pNumRows,
+                                 _In_reads_(NumSubresources) const UINT64 *pRowSizesInBytes,
+                                 _In_reads_(NumSubresources) const D3D12_SUBRESOURCE_DATA *pSrcData) noexcept {
+    // Minor validation: the intermediate must be a buffer large enough for the upload, the size
+    // must be addressable, and a buffer destination accepts exactly subresource 0.
+    auto IntermediateDesc = pIntermediate->GetDesc();
+    auto DestinationDesc = pDestinationResource->GetDesc();
+    if (IntermediateDesc.Dimension != D3D12_RESOURCE_DIMENSION_BUFFER ||
+        IntermediateDesc.Width < RequiredSize + pLayouts[0].Offset || RequiredSize > SIZE_T(-1) ||
+        (DestinationDesc.Dimension == D3D12_RESOURCE_DIMENSION_BUFFER &&
+         (FirstSubresource != 0 || NumSubresources != 1))) {
+        return 0;
+    }
+
+    BYTE *pData;
+    HRESULT hr = pIntermediate->Map(0, nullptr, reinterpret_cast<void **>(&pData));
+    if (FAILED(hr)) {
+        return 0;
+    }
+
+    for (UINT i = 0; i < NumSubresources; ++i) {
+        if (pRowSizesInBytes[i] > SIZE_T(-1)) {
+            // Balance the successful Map above before bailing out.
+            pIntermediate->Unmap(0, nullptr);
+            return 0;
+        }
+        D3D12_MEMCPY_DEST DestData = {pData + pLayouts[i].Offset, pLayouts[i].Footprint.RowPitch,
+                                      SIZE_T(pLayouts[i].Footprint.RowPitch) * SIZE_T(pNumRows[i])};
+        MemcpySubresource(&DestData, &pSrcData[i], static_cast<SIZE_T>(pRowSizesInBytes[i]), pNumRows[i],
+                          pLayouts[i].Footprint.Depth);
+    }
+    pIntermediate->Unmap(0, nullptr);
+
+    if (DestinationDesc.Dimension == D3D12_RESOURCE_DIMENSION_BUFFER) {
+        // Buffer destination: one linear region copy; Footprint.Width is used as the byte count.
+        pCmdList->CopyBufferRegion(pDestinationResource, 0, pIntermediate, pLayouts[0].Offset,
+                                   pLayouts[0].Footprint.Width);
+    } else {
+        // Texture destination: one placed-footprint copy per subresource.
+        for (UINT i = 0; i < NumSubresources; ++i) {
+            CD3DX12_TEXTURE_COPY_LOCATION Dst(pDestinationResource, i + FirstSubresource);
+            CD3DX12_TEXTURE_COPY_LOCATION Src(pIntermediate, pLayouts[i]);
+            pCmdList->CopyTextureRegion(&Dst, 0, 0, 0, &Src, nullptr);
+        }
+    }
+    return RequiredSize;
+}
+
+//------------------------------------------------------------------------------------------------
+// All arrays must be populated (e.g. by calling GetCopyableFootprints)
+// Variant whose source is a base pointer (pResourceData) plus per-subresource offset and pitches
+// (pSrcData). Copies each subresource into the mapped intermediate buffer at the placement
+// described by pLayouts[i], then records the copy into pDestinationResource on pCmdList.
+// Returns RequiredSize on success. Returns 0 when validation fails, when Map fails, or when a
+// row size does not fit in SIZE_T.
+inline UINT64
+UpdateSubresources(_In_ ID3D12GraphicsCommandList *pCmdList, _In_ ID3D12Resource *pDestinationResource,
+                   _In_ ID3D12Resource *pIntermediate, _In_range_(0, D3D12_REQ_SUBRESOURCES) UINT FirstSubresource,
+                   _In_range_(0, D3D12_REQ_SUBRESOURCES - FirstSubresource) UINT NumSubresources, UINT64 RequiredSize,
+                   _In_reads_(NumSubresources) const D3D12_PLACED_SUBRESOURCE_FOOTPRINT *pLayouts,
+                   _In_reads_(NumSubresources) const UINT *pNumRows,
+                   _In_reads_(NumSubresources) const UINT64 *pRowSizesInBytes, _In_ const void *pResourceData,
+                   _In_reads_(NumSubresources) const D3D12_SUBRESOURCE_INFO *pSrcData) noexcept {
+    // Minor validation: the intermediate must be a buffer large enough for the upload, the size
+    // must be addressable, and a buffer destination accepts exactly subresource 0.
+    auto IntermediateDesc = pIntermediate->GetDesc();
+    auto DestinationDesc = pDestinationResource->GetDesc();
+    if (IntermediateDesc.Dimension != D3D12_RESOURCE_DIMENSION_BUFFER ||
+        IntermediateDesc.Width < RequiredSize + pLayouts[0].Offset || RequiredSize > SIZE_T(-1) ||
+        (DestinationDesc.Dimension == D3D12_RESOURCE_DIMENSION_BUFFER &&
+         (FirstSubresource != 0 || NumSubresources != 1))) {
+        return 0;
+    }
+
+    BYTE *pData;
+    HRESULT hr = pIntermediate->Map(0, nullptr, reinterpret_cast<void **>(&pData));
+    if (FAILED(hr)) {
+        return 0;
+    }
+
+    for (UINT i = 0; i < NumSubresources; ++i) {
+        if (pRowSizesInBytes[i] > SIZE_T(-1)) {
+            // Balance the successful Map above before bailing out.
+            pIntermediate->Unmap(0, nullptr);
+            return 0;
+        }
+        D3D12_MEMCPY_DEST DestData = {pData + pLayouts[i].Offset, pLayouts[i].Footprint.RowPitch,
+                                      SIZE_T(pLayouts[i].Footprint.RowPitch) * SIZE_T(pNumRows[i])};
+        MemcpySubresource(&DestData, pResourceData, &pSrcData[i], static_cast<SIZE_T>(pRowSizesInBytes[i]), pNumRows[i],
+                          pLayouts[i].Footprint.Depth);
+    }
+    pIntermediate->Unmap(0, nullptr);
+
+    if (DestinationDesc.Dimension == D3D12_RESOURCE_DIMENSION_BUFFER) {
+        // Buffer destination: one linear region copy; Footprint.Width is used as the byte count.
+        pCmdList->CopyBufferRegion(pDestinationResource, 0, pIntermediate, pLayouts[0].Offset,
+                                   pLayouts[0].Footprint.Width);
+    } else {
+        // Texture destination: one placed-footprint copy per subresource.
+        for (UINT i = 0; i < NumSubresources; ++i) {
+            CD3DX12_TEXTURE_COPY_LOCATION Dst(pDestinationResource, i + FirstSubresource);
+            CD3DX12_TEXTURE_COPY_LOCATION Src(pIntermediate, pLayouts[i]);
+            pCmdList->CopyTextureRegion(&Dst, 0, 0, 0, &Src, nullptr);
+        }
+    }
+    return RequiredSize;
+}
+
+//------------------------------------------------------------------------------------------------
+// Heap-allocating UpdateSubresources implementation
+// Allocates scratch arrays on the process heap, fills them with GetCopyableFootprints starting at
+// IntermediateOffset, and forwards to the array-based overload.
+// Returns that overload's result, or 0 if the scratch size overflows SIZE_MAX or the allocation
+// fails.
+inline UINT64 UpdateSubresources(_In_ ID3D12GraphicsCommandList *pCmdList, _In_ ID3D12Resource *pDestinationResource,
+                                 _In_ ID3D12Resource *pIntermediate, UINT64 IntermediateOffset,
+                                 _In_range_(0, D3D12_REQ_SUBRESOURCES) UINT FirstSubresource,
+                                 _In_range_(0, D3D12_REQ_SUBRESOURCES - FirstSubresource) UINT NumSubresources,
+                                 _In_reads_(NumSubresources) const D3D12_SUBRESOURCE_DATA *pSrcData) noexcept {
+    UINT64 totalBytes = 0;
+    // Per subresource: one placed footprint, one row count and one row size.
+    const UINT64 scratchBytes =
+        static_cast<UINT64>(sizeof(D3D12_PLACED_SUBRESOURCE_FOOTPRINT) + sizeof(UINT) + sizeof(UINT64)) *
+        NumSubresources;
+    if (scratchBytes > SIZE_MAX) {
+        return 0;
+    }
+    void *const pScratch = HeapAlloc(GetProcessHeap(), 0, static_cast<SIZE_T>(scratchBytes));
+    if (!pScratch) {
+        return 0;
+    }
+
+    // Carve the single allocation into three consecutive arrays:
+    // [footprints][row sizes][row counts].
+    auto *const pFootprints = static_cast<D3D12_PLACED_SUBRESOURCE_FOOTPRINT *>(pScratch);
+    auto *const pRowBytes = reinterpret_cast<UINT64 *>(pFootprints + NumSubresources);
+    auto *const pRowCounts = reinterpret_cast<UINT *>(pRowBytes + NumSubresources);
+
+    auto destDesc = pDestinationResource->GetDesc();
+    ID3D12Device *pOwner = nullptr;
+    pDestinationResource->GetDevice(IID_ID3D12Device, reinterpret_cast<void **>(&pOwner));
+    pOwner->GetCopyableFootprints(&destDesc, FirstSubresource, NumSubresources, IntermediateOffset, pFootprints,
+                                  pRowCounts, pRowBytes, &totalBytes);
+    pOwner->Release();
+
+    const UINT64 uploaded = UpdateSubresources(pCmdList, pDestinationResource, pIntermediate, FirstSubresource,
+                                               NumSubresources, totalBytes, pFootprints, pRowCounts, pRowBytes,
+                                               pSrcData);
+    HeapFree(GetProcessHeap(), 0, pScratch);
+    return uploaded;
+}
+
+//------------------------------------------------------------------------------------------------
+// Heap-allocating UpdateSubresources implementation
+// D3D12_SUBRESOURCE_INFO variant: allocates scratch arrays on the process heap, fills them with
+// GetCopyableFootprints starting at IntermediateOffset, and forwards to the array-based overload
+// that takes pResourceData.
+// Returns that overload's result, or 0 if the scratch size overflows SIZE_MAX or the allocation
+// fails.
+inline UINT64 UpdateSubresources(_In_ ID3D12GraphicsCommandList *pCmdList, _In_ ID3D12Resource *pDestinationResource,
+                                 _In_ ID3D12Resource *pIntermediate, UINT64 IntermediateOffset,
+                                 _In_range_(0, D3D12_REQ_SUBRESOURCES) UINT FirstSubresource,
+                                 _In_range_(0, D3D12_REQ_SUBRESOURCES - FirstSubresource) UINT NumSubresources,
+                                 _In_ const void *pResourceData,
+                                 _In_reads_(NumSubresources) D3D12_SUBRESOURCE_INFO *pSrcData) noexcept {
+    UINT64 totalBytes = 0;
+    // Per subresource: one placed footprint, one row count and one row size.
+    const UINT64 scratchBytes =
+        static_cast<UINT64>(sizeof(D3D12_PLACED_SUBRESOURCE_FOOTPRINT) + sizeof(UINT) + sizeof(UINT64)) *
+        NumSubresources;
+    if (scratchBytes > SIZE_MAX) {
+        return 0;
+    }
+    void *const pScratch = HeapAlloc(GetProcessHeap(), 0, static_cast<SIZE_T>(scratchBytes));
+    if (!pScratch) {
+        return 0;
+    }
+
+    // Carve the single allocation into three consecutive arrays:
+    // [footprints][row sizes][row counts].
+    auto *const pFootprints = static_cast<D3D12_PLACED_SUBRESOURCE_FOOTPRINT *>(pScratch);
+    auto *const pRowBytes = reinterpret_cast<UINT64 *>(pFootprints + NumSubresources);
+    auto *const pRowCounts = reinterpret_cast<UINT *>(pRowBytes + NumSubresources);
+
+    auto destDesc = pDestinationResource->GetDesc();
+    ID3D12Device *pOwner = nullptr;
+    pDestinationResource->GetDevice(IID_ID3D12Device, reinterpret_cast<void **>(&pOwner));
+    pOwner->GetCopyableFootprints(&destDesc, FirstSubresource, NumSubresources, IntermediateOffset, pFootprints,
+                                  pRowCounts, pRowBytes, &totalBytes);
+    pOwner->Release();
+
+    const UINT64 uploaded = UpdateSubresources(pCmdList, pDestinationResource, pIntermediate, FirstSubresource,
+                                               NumSubresources, totalBytes, pFootprints, pRowCounts, pRowBytes,
+                                               pResourceData, pSrcData);
+    HeapFree(GetProcessHeap(), 0, pScratch);
+    return uploaded;
+}
+
+//------------------------------------------------------------------------------------------------
+// Stack-allocating UpdateSubresources implementation
+// Keeps the footprint scratch arrays (MaxSubresources entries each) on the stack, fills them with
+// GetCopyableFootprints starting at IntermediateOffset, and forwards to the array-based overload.
+template <UINT MaxSubresources>
+inline UINT64 UpdateSubresources(_In_ ID3D12GraphicsCommandList *pCmdList, _In_ ID3D12Resource *pDestinationResource,
+                                 _In_ ID3D12Resource *pIntermediate, UINT64 IntermediateOffset,
+                                 _In_range_(0, MaxSubresources) UINT FirstSubresource,
+                                 _In_range_(1, MaxSubresources - FirstSubresource) UINT NumSubresources,
+                                 _In_reads_(NumSubresources) const D3D12_SUBRESOURCE_DATA *pSrcData) noexcept {
+    UINT64 totalBytes = 0;
+    D3D12_PLACED_SUBRESOURCE_FOOTPRINT footprints[MaxSubresources];
+    UINT rowCounts[MaxSubresources];
+    UINT64 rowBytes[MaxSubresources];
+
+    auto destDesc = pDestinationResource->GetDesc();
+    ID3D12Device *pOwner = nullptr;
+    pDestinationResource->GetDevice(IID_ID3D12Device, reinterpret_cast<void **>(&pOwner));
+    pOwner->GetCopyableFootprints(&destDesc, FirstSubresource, NumSubresources, IntermediateOffset, footprints,
+                                  rowCounts, rowBytes, &totalBytes);
+    pOwner->Release();
+
+    return UpdateSubresources(pCmdList, pDestinationResource, pIntermediate, FirstSubresource, NumSubresources,
+                              totalBytes, footprints, rowCounts, rowBytes, pSrcData);
+}
+
+//------------------------------------------------------------------------------------------------
+// Stack-allocating UpdateSubresources implementation
+// D3D12_SUBRESOURCE_INFO variant: keeps the footprint scratch arrays (MaxSubresources entries
+// each) on the stack, fills them with GetCopyableFootprints starting at IntermediateOffset, and
+// forwards to the array-based overload that takes pResourceData.
+template <UINT MaxSubresources>
+inline UINT64 UpdateSubresources(_In_ ID3D12GraphicsCommandList *pCmdList, _In_ ID3D12Resource *pDestinationResource,
+                                 _In_ ID3D12Resource *pIntermediate, UINT64 IntermediateOffset,
+                                 _In_range_(0, MaxSubresources) UINT FirstSubresource,
+                                 _In_range_(1, MaxSubresources - FirstSubresource) UINT NumSubresources,
+                                 _In_ const void *pResourceData,
+                                 _In_reads_(NumSubresources) D3D12_SUBRESOURCE_INFO *pSrcData) noexcept {
+    UINT64 totalBytes = 0;
+    D3D12_PLACED_SUBRESOURCE_FOOTPRINT footprints[MaxSubresources];
+    UINT rowCounts[MaxSubresources];
+    UINT64 rowBytes[MaxSubresources];
+
+    auto destDesc = pDestinationResource->GetDesc();
+    ID3D12Device *pOwner = nullptr;
+    pDestinationResource->GetDevice(IID_ID3D12Device, reinterpret_cast<void **>(&pOwner));
+    pOwner->GetCopyableFootprints(&destDesc, FirstSubresource, NumSubresources, IntermediateOffset, footprints,
+                                  rowCounts, rowBytes, &totalBytes);
+    pOwner->Release();
+
+    return UpdateSubresources(pCmdList, pDestinationResource, pIntermediate, FirstSubresource, NumSubresources,
+                              totalBytes, footprints, rowCounts, rowBytes, pResourceData, pSrcData);
+}
+
+//------------------------------------------------------------------------------------------------
+// True when Layout is D3D12_TEXTURE_LAYOUT_UNKNOWN or D3D12_TEXTURE_LAYOUT_64KB_UNDEFINED_SWIZZLE.
+// Kept as a single return expression so it stays a valid C++11 constexpr function.
+inline constexpr bool D3D12IsLayoutOpaque(D3D12_TEXTURE_LAYOUT Layout) noexcept {
+    return !(Layout != D3D12_TEXTURE_LAYOUT_UNKNOWN && Layout != D3D12_TEXTURE_LAYOUT_64KB_UNDEFINED_SWIZZLE);
+}
+
+//------------------------------------------------------------------------------------------------
+// Reinterprets an array of strongly typed command-list pointers as an array of
+// ID3D12CommandList pointers.
+template <typename t_CommandListType>
+inline ID3D12CommandList *const *CommandListCast(t_CommandListType *const *pp) noexcept {
+    // Handy for handing typed command lists to ExecuteCommandLists.
+    // The cast is valid provided const-ness is respected, and D3D12 APIs do respect the
+    // const-ness of their arguments.
+    using BaseListArray = ID3D12CommandList *const *;
+    return reinterpret_cast<BaseListArray>(pp);
+}
+
+//------------------------------------------------------------------------------------------------
+// D3D12 exports a new method for serializing root signatures in the Windows 10 Anniversary Update.
+// To help enable root signature 1.1 features when they are available and not require maintaining
+// two code paths for building root signatures, this helper method reconstructs a 1.0 signature when
+// 1.1 is not supported.
+//
+// Parameters:
+//   pRootSignatureDesc - versioned description to serialize (its Version selects Desc_1_0/Desc_1_1).
+//   MaxVersion         - highest root signature version the caller may emit.
+//   ppBlob             - receives the serialized blob from the underlying D3D12 serializer.
+//   ppErrorBlob        - optional; cleared to nullptr up front, then passed to the serializer.
+// Returns the serializer's HRESULT, E_OUTOFMEMORY if a temporary heap allocation fails, or
+// E_INVALIDARG when the MaxVersion / description-version combination is not handled below.
+inline HRESULT D3DX12SerializeVersionedRootSignature(_In_ const D3D12_VERSIONED_ROOT_SIGNATURE_DESC *pRootSignatureDesc,
+                                                     D3D_ROOT_SIGNATURE_VERSION MaxVersion, _Outptr_ ID3DBlob **ppBlob,
+                                                     _Always_(_Outptr_opt_result_maybenull_)
+                                                         ID3DBlob **ppErrorBlob) noexcept {
+    if (ppErrorBlob != nullptr) {
+        *ppErrorBlob = nullptr;
+    }
+
+    switch (MaxVersion) {
+    case D3D_ROOT_SIGNATURE_VERSION_1_0:
+        switch (pRootSignatureDesc->Version) {
+        case D3D_ROOT_SIGNATURE_VERSION_1_0:
+            // Already a 1.0 description: serialize it directly.
+            return D3D12SerializeRootSignature(&pRootSignatureDesc->Desc_1_0, D3D_ROOT_SIGNATURE_VERSION_1, ppBlob,
+                                               ppErrorBlob);
+
+        case D3D_ROOT_SIGNATURE_VERSION_1_1: {
+            // 1.1 description but only 1.0 allowed: rebuild it as 1.0 in temporary heap memory.
+            HRESULT hr = S_OK;
+            const D3D12_ROOT_SIGNATURE_DESC1 &desc_1_1 = pRootSignatureDesc->Desc_1_1;
+
+            const SIZE_T ParametersSize = sizeof(D3D12_ROOT_PARAMETER) * desc_1_1.NumParameters;
+            void *pParameters = (ParametersSize > 0) ? HeapAlloc(GetProcessHeap(), 0, ParametersSize) : nullptr;
+            if (ParametersSize > 0 && pParameters == nullptr) {
+                hr = E_OUTOFMEMORY;
+            }
+            auto pParameters_1_0 = static_cast<D3D12_ROOT_PARAMETER *>(pParameters);
+
+            if (SUCCEEDED(hr)) {
+                // Note: this loop does not stop on a failed range allocation; it keeps going so
+                // every descriptor-table entry gets a range pointer (possibly null) for cleanup.
+                for (UINT n = 0; n < desc_1_1.NumParameters; n++) {
+                    __analysis_assume(ParametersSize == sizeof(D3D12_ROOT_PARAMETER) * desc_1_1.NumParameters);
+                    pParameters_1_0[n].ParameterType = desc_1_1.pParameters[n].ParameterType;
+                    pParameters_1_0[n].ShaderVisibility = desc_1_1.pParameters[n].ShaderVisibility;
+
+                    switch (desc_1_1.pParameters[n].ParameterType) {
+                    case D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS:
+                        pParameters_1_0[n].Constants.Num32BitValues = desc_1_1.pParameters[n].Constants.Num32BitValues;
+                        pParameters_1_0[n].Constants.RegisterSpace = desc_1_1.pParameters[n].Constants.RegisterSpace;
+                        pParameters_1_0[n].Constants.ShaderRegister = desc_1_1.pParameters[n].Constants.ShaderRegister;
+                        break;
+
+                    case D3D12_ROOT_PARAMETER_TYPE_CBV:
+                    case D3D12_ROOT_PARAMETER_TYPE_SRV:
+                    case D3D12_ROOT_PARAMETER_TYPE_UAV:
+                        // Only the register space and shader register are copied for root descriptors.
+                        pParameters_1_0[n].Descriptor.RegisterSpace = desc_1_1.pParameters[n].Descriptor.RegisterSpace;
+                        pParameters_1_0[n].Descriptor.ShaderRegister =
+                            desc_1_1.pParameters[n].Descriptor.ShaderRegister;
+                        break;
+
+                    case D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE:
+                        const D3D12_ROOT_DESCRIPTOR_TABLE1 &table_1_1 = desc_1_1.pParameters[n].DescriptorTable;
+
+                        const SIZE_T DescriptorRangesSize =
+                            sizeof(D3D12_DESCRIPTOR_RANGE) * table_1_1.NumDescriptorRanges;
+                        // No allocation is attempted once an earlier step has already failed.
+                        void *pDescriptorRanges = (DescriptorRangesSize > 0 && SUCCEEDED(hr))
+                                                      ? HeapAlloc(GetProcessHeap(), 0, DescriptorRangesSize)
+                                                      : nullptr;
+                        if (DescriptorRangesSize > 0 && pDescriptorRanges == nullptr) {
+                            hr = E_OUTOFMEMORY;
+                        }
+                        auto pDescriptorRanges_1_0 = static_cast<D3D12_DESCRIPTOR_RANGE *>(pDescriptorRanges);
+
+                        if (SUCCEEDED(hr)) {
+                            for (UINT x = 0; x < table_1_1.NumDescriptorRanges; x++) {
+                                __analysis_assume(DescriptorRangesSize ==
+                                                  sizeof(D3D12_DESCRIPTOR_RANGE) * table_1_1.NumDescriptorRanges);
+                                pDescriptorRanges_1_0[x].BaseShaderRegister =
+                                    table_1_1.pDescriptorRanges[x].BaseShaderRegister;
+                                pDescriptorRanges_1_0[x].NumDescriptors = table_1_1.pDescriptorRanges[x].NumDescriptors;
+                                pDescriptorRanges_1_0[x].OffsetInDescriptorsFromTableStart =
+                                    table_1_1.pDescriptorRanges[x].OffsetInDescriptorsFromTableStart;
+                                pDescriptorRanges_1_0[x].RangeType = table_1_1.pDescriptorRanges[x].RangeType;
+                                pDescriptorRanges_1_0[x].RegisterSpace = table_1_1.pDescriptorRanges[x].RegisterSpace;
+                            }
+                        }
+
+                        // Stored even on failure (as nullptr) so the cleanup loop below reads a
+                        // defined pointer for this table.
+                        D3D12_ROOT_DESCRIPTOR_TABLE &table_1_0 = pParameters_1_0[n].DescriptorTable;
+                        table_1_0.NumDescriptorRanges = table_1_1.NumDescriptorRanges;
+                        table_1_0.pDescriptorRanges = pDescriptorRanges_1_0;
+                    }
+                }
+            }
+
+            if (SUCCEEDED(hr)) {
+                // Static samplers and flags are passed through from the 1.1 description unchanged.
+                CD3DX12_ROOT_SIGNATURE_DESC desc_1_0(desc_1_1.NumParameters, pParameters_1_0,
+                                                     desc_1_1.NumStaticSamplers, desc_1_1.pStaticSamplers,
+                                                     desc_1_1.Flags);
+                hr = D3D12SerializeRootSignature(&desc_1_0, D3D_ROOT_SIGNATURE_VERSION_1, ppBlob, ppErrorBlob);
+            }
+
+            // Release the temporaries: each table's range array first, then the parameter array.
+            if (pParameters) {
+                for (UINT n = 0; n < desc_1_1.NumParameters; n++) {
+                    if (desc_1_1.pParameters[n].ParameterType == D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE) {
+                        auto pDescriptorRanges_1_0 = pParameters_1_0[n].DescriptorTable.pDescriptorRanges;
+                        HeapFree(GetProcessHeap(), 0,
+                                 reinterpret_cast<void *>(const_cast<D3D12_DESCRIPTOR_RANGE *>(pDescriptorRanges_1_0)));
+                    }
+                }
+                HeapFree(GetProcessHeap(), 0, pParameters);
+            }
+            return hr;
+        }
+        }
+        break;
+
+    case D3D_ROOT_SIGNATURE_VERSION_1_1:
+        // 1.1 is allowed: hand the versioned description to the versioned serializer as-is.
+        return D3D12SerializeVersionedRootSignature(pRootSignatureDesc, ppBlob, ppErrorBlob);
+    }
+
+    // Reached when no case above returned (unhandled MaxVersion or description version).
+    return E_INVALIDARG;
+}
+
+//------------------------------------------------------------------------------------------------
+// Convenience wrapper over D3D12_RT_FORMAT_ARRAY that adds constructors only.
+struct CD3DX12_RT_FORMAT_ARRAY : public D3D12_RT_FORMAT_ARRAY {
+    // Leaves the members uninitialized, like the underlying plain struct.
+    CD3DX12_RT_FORMAT_ARRAY() = default;
+    // Copies an existing plain array descriptor.
+    explicit CD3DX12_RT_FORMAT_ARRAY(const D3D12_RT_FORMAT_ARRAY &o) noexcept : D3D12_RT_FORMAT_ARRAY(o) {}
+    // Stores NumFormats as the render-target count and copies the formats.
+    // Only the first NumFormats entries of pFormats are read (capped at the capacity of
+    // RTFormats), matching the _In_reads_(NumFormats) contract; remaining slots are set to
+    // DXGI_FORMAT_UNKNOWN. The previous memcpy always read the full capacity and could run past
+    // the end of a shorter caller array.
+    explicit CD3DX12_RT_FORMAT_ARRAY(_In_reads_(NumFormats) const DXGI_FORMAT *pFormats, UINT NumFormats) noexcept {
+        NumRenderTargets = NumFormats;
+        constexpr UINT Capacity = sizeof(RTFormats) / sizeof(RTFormats[0]);
+        for (UINT i = 0; i < Capacity; ++i) {
+            RTFormats[i] = (i < NumFormats) ? pFormats[i] : DXGI_FORMAT_UNKNOWN;
+        }
+    }
+};
+
+//------------------------------------------------------------------------------------------------
+// Pipeline State Stream Helpers
+//------------------------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------------------------
+// Stream Subobjects, i.e. elements of a stream
+
+// Tag type that converts to UINT_MAX; used below as the default-value provider for the
+// sample-mask stream subobject.
+struct DefaultSampleMask {
+    operator UINT() noexcept {
+        const UINT allBitsSet = UINT_MAX;
+        return allBitsSet;
+    }
+};
+// Tag type that converts to a DXGI_SAMPLE_DESC of {1, 0}; used below as the default-value
+// provider for the sample-desc stream subobject.
+struct DefaultSampleDesc {
+    operator DXGI_SAMPLE_DESC() noexcept {
+        const DXGI_SAMPLE_DESC singleSample = {1, 0};
+        return singleSample;
+    }
+};
+
+// MSVC warning C4324 (structure padded due to alignment specifier) is silenced for this class.
+#pragma warning(push)
+#pragma warning(disable : 4324)
+// One element of a pipeline state stream: a subobject type tag followed by its payload,
+// aligned to pointer size.
+//   InnerStructType - payload type stored in the element.
+//   Type            - tag value written in front of the payload.
+//   DefaultArg      - type whose default-constructed value initializes the payload in the
+//                     default constructor.
+template <typename InnerStructType, D3D12_PIPELINE_STATE_SUBOBJECT_TYPE Type, typename DefaultArg = InnerStructType>
+class alignas(void *) CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT {
+    // Declaration order is part of the layout: tag first, payload second.
+    D3D12_PIPELINE_STATE_SUBOBJECT_TYPE _Type;
+    InnerStructType _Inner;
+
+  public:
+    // Construction always stamps the tag; the payload comes from DefaultArg or from the caller.
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT() noexcept : _Type(Type), _Inner(DefaultArg()) {}
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT(InnerStructType const &i) noexcept : _Type(Type), _Inner(i) {}
+
+    // Assigning a payload re-stamps the tag as well.
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT &operator=(InnerStructType const &i) noexcept {
+        _Type = Type;
+        _Inner = i;
+        return *this;
+    }
+
+    // Address-of yields the payload, not the wrapper.
+    InnerStructType *operator&() noexcept { return &_Inner; }
+    InnerStructType const *operator&() const noexcept { return &_Inner; }
+
+    // Implicit access to the payload by reference.
+    operator InnerStructType &() noexcept { return _Inner; }
+    operator InnerStructType const &() const noexcept { return _Inner; }
+};
+#pragma warning(pop)
+// Concrete stream subobject types: each alias pairs a payload type with its subobject type tag,
+// plus an optional default-value provider.
+using CD3DX12_PIPELINE_STATE_STREAM_FLAGS =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<D3D12_PIPELINE_STATE_FLAGS, D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_FLAGS>;
+using CD3DX12_PIPELINE_STATE_STREAM_NODE_MASK =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<UINT, D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_NODE_MASK>;
+using CD3DX12_PIPELINE_STATE_STREAM_ROOT_SIGNATURE =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<ID3D12RootSignature *,
+                                            D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_ROOT_SIGNATURE>;
+using CD3DX12_PIPELINE_STATE_STREAM_INPUT_LAYOUT =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<D3D12_INPUT_LAYOUT_DESC,
+                                            D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_INPUT_LAYOUT>;
+using CD3DX12_PIPELINE_STATE_STREAM_IB_STRIP_CUT_VALUE =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<D3D12_INDEX_BUFFER_STRIP_CUT_VALUE,
+                                            D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_IB_STRIP_CUT_VALUE>;
+using CD3DX12_PIPELINE_STATE_STREAM_PRIMITIVE_TOPOLOGY =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<D3D12_PRIMITIVE_TOPOLOGY_TYPE,
+                                            D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_PRIMITIVE_TOPOLOGY>;
+
+// Shader stages.
+using CD3DX12_PIPELINE_STATE_STREAM_VS =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<D3D12_SHADER_BYTECODE, D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_VS>;
+using CD3DX12_PIPELINE_STATE_STREAM_GS =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<D3D12_SHADER_BYTECODE, D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_GS>;
+using CD3DX12_PIPELINE_STATE_STREAM_STREAM_OUTPUT =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<D3D12_STREAM_OUTPUT_DESC,
+                                            D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_STREAM_OUTPUT>;
+using CD3DX12_PIPELINE_STATE_STREAM_HS =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<D3D12_SHADER_BYTECODE, D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_HS>;
+using CD3DX12_PIPELINE_STATE_STREAM_DS =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<D3D12_SHADER_BYTECODE, D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_DS>;
+using CD3DX12_PIPELINE_STATE_STREAM_PS =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<D3D12_SHADER_BYTECODE, D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_PS>;
+using CD3DX12_PIPELINE_STATE_STREAM_AS =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<D3D12_SHADER_BYTECODE, D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_AS>;
+using CD3DX12_PIPELINE_STATE_STREAM_MS =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<D3D12_SHADER_BYTECODE, D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_MS>;
+using CD3DX12_PIPELINE_STATE_STREAM_CS =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<D3D12_SHADER_BYTECODE, D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_CS>;
+
+// Fixed-function state; the CD3DX12_DEFAULT entries start from the helper's default state.
+using CD3DX12_PIPELINE_STATE_STREAM_BLEND_DESC =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<CD3DX12_BLEND_DESC, D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_BLEND,
+                                            CD3DX12_DEFAULT>;
+using CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<CD3DX12_DEPTH_STENCIL_DESC,
+                                            D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_DEPTH_STENCIL, CD3DX12_DEFAULT>;
+using CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL1 =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<CD3DX12_DEPTH_STENCIL_DESC1,
+                                            D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_DEPTH_STENCIL1, CD3DX12_DEFAULT>;
+using CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL_FORMAT =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<DXGI_FORMAT, D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_DEPTH_STENCIL_FORMAT>;
+using CD3DX12_PIPELINE_STATE_STREAM_RASTERIZER =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<CD3DX12_RASTERIZER_DESC, D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_RASTERIZER,
+                                            CD3DX12_DEFAULT>;
+using CD3DX12_PIPELINE_STATE_STREAM_RENDER_TARGET_FORMATS =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<D3D12_RT_FORMAT_ARRAY,
+                                            D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_RENDER_TARGET_FORMATS>;
+using CD3DX12_PIPELINE_STATE_STREAM_SAMPLE_DESC =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<DXGI_SAMPLE_DESC, D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_SAMPLE_DESC,
+                                            DefaultSampleDesc>;
+using CD3DX12_PIPELINE_STATE_STREAM_SAMPLE_MASK =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<UINT, D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_SAMPLE_MASK,
+                                            DefaultSampleMask>;
+using CD3DX12_PIPELINE_STATE_STREAM_CACHED_PSO =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<D3D12_CACHED_PIPELINE_STATE,
+                                            D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_CACHED_PSO>;
+using CD3DX12_PIPELINE_STATE_STREAM_VIEW_INSTANCING =
+    CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT<CD3DX12_VIEW_INSTANCING_DESC,
+                                            D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_VIEW_INSTANCING, CD3DX12_DEFAULT>;
+
+//------------------------------------------------------------------------------------------------
+// Stream Parser Helpers
+
+// Callback interface with one virtual method per pipeline state subobject kind, plus three
+// error callbacks. Every method has an empty default body, so a derived type overrides only the
+// ones it cares about.
+// NOTE(review): presumably invoked by the stream parser helper defined elsewhere in this header;
+// the call sites are not visible here.
+struct ID3DX12PipelineParserCallbacks {
+    // Subobject Callbacks
+    virtual void FlagsCb(D3D12_PIPELINE_STATE_FLAGS) {}
+    virtual void NodeMaskCb(UINT) {}
+    virtual void RootSignatureCb(ID3D12RootSignature *) {}
+    virtual void InputLayoutCb(const D3D12_INPUT_LAYOUT_DESC &) {}
+    virtual void IBStripCutValueCb(D3D12_INDEX_BUFFER_STRIP_CUT_VALUE) {}
+    virtual void PrimitiveTopologyTypeCb(D3D12_PRIMITIVE_TOPOLOGY_TYPE) {}
+    virtual void VSCb(const D3D12_SHADER_BYTECODE &) {}
+    virtual void GSCb(const D3D12_SHADER_BYTECODE &) {}
+    virtual void StreamOutputCb(const D3D12_STREAM_OUTPUT_DESC &) {}
+    virtual void HSCb(const D3D12_SHADER_BYTECODE &) {}
+    virtual void DSCb(const D3D12_SHADER_BYTECODE &) {}
+    virtual void PSCb(const D3D12_SHADER_BYTECODE &) {}
+    virtual void CSCb(const D3D12_SHADER_BYTECODE &) {}
+    virtual void ASCb(const D3D12_SHADER_BYTECODE &) {}
+    virtual void MSCb(const D3D12_SHADER_BYTECODE &) {}
+    virtual void BlendStateCb(const D3D12_BLEND_DESC &) {}
+    virtual void DepthStencilStateCb(const D3D12_DEPTH_STENCIL_DESC &) {}
+    virtual void DepthStencilState1Cb(const D3D12_DEPTH_STENCIL_DESC1 &) {}
+    virtual void DSVFormatCb(DXGI_FORMAT) {}
+    virtual void RasterizerStateCb(const D3D12_RASTERIZER_DESC &) {}
+    virtual void RTVFormatsCb(const D3D12_RT_FORMAT_ARRAY &) {}
+    virtual void SampleDescCb(const DXGI_SAMPLE_DESC &) {}
+    virtual void SampleMaskCb(UINT) {}
+    virtual void ViewInstancingCb(const D3D12_VIEW_INSTANCING_DESC &) {}
+    virtual void CachedPSOCb(const D3D12_CACHED_PIPELINE_STATE &) {}
+
+    // Error Callbacks
+    virtual void ErrorBadInputParameter(UINT /*ParameterIndex*/) {}
+    virtual void ErrorDuplicateSubobject(D3D12_PIPELINE_STATE_SUBOBJECT_TYPE /*DuplicateType*/) {}
+    virtual void ErrorUnknownSubobject(UINT /*UnknownTypeValue*/) {}
+
+    // Virtual destructor so implementations can be destroyed through this interface.
+    virtual ~ID3DX12PipelineParserCallbacks() = default;
+};
+
+struct D3DX12_MESH_SHADER_PIPELINE_STATE_DESC { // flat pipeline description with AS/MS/PS stages; input to the stream ctors
+    ID3D12RootSignature *pRootSignature;
+    D3D12_SHADER_BYTECODE AS; // amplification shader bytecode
+    D3D12_SHADER_BYTECODE MS; // mesh shader bytecode
+    D3D12_SHADER_BYTECODE PS;
+    D3D12_BLEND_DESC BlendState;
+    UINT SampleMask;
+    D3D12_RASTERIZER_DESC RasterizerState;
+    D3D12_DEPTH_STENCIL_DESC DepthStencilState;
+    D3D12_PRIMITIVE_TOPOLOGY_TYPE PrimitiveTopologyType;
+    UINT NumRenderTargets; // passed together with RTVFormats to CD3DX12_RT_FORMAT_ARRAY by the stream constructors
+    DXGI_FORMAT RTVFormats[D3D12_SIMULTANEOUS_RENDER_TARGET_COUNT];
+    DXGI_FORMAT DSVFormat;
+    DXGI_SAMPLE_DESC SampleDesc;
+    UINT NodeMask;
+    D3D12_CACHED_PIPELINE_STATE CachedPSO;
+    D3D12_PIPELINE_STATE_FLAGS Flags;
+};
+
+// CD3DX12_PIPELINE_STATE_STREAM2 works on OS Build 19041+ (where there is a new mesh shader pipeline).
+// Use CD3DX12_PIPELINE_STATE_STREAM1 for OS Build 16299+ (where there is a new view instancing subobject).
+// Use CD3DX12_PIPELINE_STATE_STREAM for OS Build 15063+ support.
+struct CD3DX12_PIPELINE_STATE_STREAM2 {
+    CD3DX12_PIPELINE_STATE_STREAM2() = default;
+    // Mesh and amplification shaders must be set manually, since they do not have representation in
+    // D3D12_GRAPHICS_PIPELINE_STATE_DESC
+    CD3DX12_PIPELINE_STATE_STREAM2(const D3D12_GRAPHICS_PIPELINE_STATE_DESC &Desc)
+    noexcept
+        : Flags(Desc.Flags), NodeMask(Desc.NodeMask), pRootSignature(Desc.pRootSignature),
+          InputLayout(Desc.InputLayout), IBStripCutValue(Desc.IBStripCutValue),
+          PrimitiveTopologyType(Desc.PrimitiveTopologyType), VS(Desc.VS), GS(Desc.GS), StreamOutput(Desc.StreamOutput),
+          HS(Desc.HS), DS(Desc.DS), PS(Desc.PS), BlendState(CD3DX12_BLEND_DESC(Desc.BlendState)),
+          DepthStencilState(CD3DX12_DEPTH_STENCIL_DESC1(Desc.DepthStencilState)), DSVFormat(Desc.DSVFormat),
+          RasterizerState(CD3DX12_RASTERIZER_DESC(Desc.RasterizerState)),
+          RTVFormats(CD3DX12_RT_FORMAT_ARRAY(Desc.RTVFormats, Desc.NumRenderTargets)), SampleDesc(Desc.SampleDesc),
+          SampleMask(Desc.SampleMask), CachedPSO(Desc.CachedPSO),
+          ViewInstancingDesc(CD3DX12_VIEW_INSTANCING_DESC(CD3DX12_DEFAULT())) {} // Desc has no view instancing field
+    CD3DX12_PIPELINE_STATE_STREAM2(const D3DX12_MESH_SHADER_PIPELINE_STATE_DESC &Desc) // takes AS/MS/PS from Desc
+    noexcept
+        : Flags(Desc.Flags), NodeMask(Desc.NodeMask), pRootSignature(Desc.pRootSignature),
+          PrimitiveTopologyType(Desc.PrimitiveTopologyType), PS(Desc.PS), AS(Desc.AS), MS(Desc.MS),
+          BlendState(CD3DX12_BLEND_DESC(Desc.BlendState)),
+          DepthStencilState(CD3DX12_DEPTH_STENCIL_DESC1(Desc.DepthStencilState)), DSVFormat(Desc.DSVFormat),
+          RasterizerState(CD3DX12_RASTERIZER_DESC(Desc.RasterizerState)),
+          RTVFormats(CD3DX12_RT_FORMAT_ARRAY(Desc.RTVFormats, Desc.NumRenderTargets)), SampleDesc(Desc.SampleDesc),
+          SampleMask(Desc.SampleMask), CachedPSO(Desc.CachedPSO),
+          ViewInstancingDesc(CD3DX12_VIEW_INSTANCING_DESC(CD3DX12_DEFAULT())) {}
+    CD3DX12_PIPELINE_STATE_STREAM2(const D3D12_COMPUTE_PIPELINE_STATE_DESC &Desc) // only five members come from Desc
+    noexcept
+        : Flags(Desc.Flags), NodeMask(Desc.NodeMask), pRootSignature(Desc.pRootSignature),
+          CS(CD3DX12_SHADER_BYTECODE(Desc.CS)), CachedPSO(Desc.CachedPSO) {
+        static_cast<D3D12_DEPTH_STENCIL_DESC1 &>(DepthStencilState).DepthEnable = false; // turn depth off in the default state
+    }
+    CD3DX12_PIPELINE_STATE_STREAM_FLAGS Flags;
+    CD3DX12_PIPELINE_STATE_STREAM_NODE_MASK NodeMask;
+    CD3DX12_PIPELINE_STATE_STREAM_ROOT_SIGNATURE pRootSignature;
+    CD3DX12_PIPELINE_STATE_STREAM_INPUT_LAYOUT InputLayout;
+    CD3DX12_PIPELINE_STATE_STREAM_IB_STRIP_CUT_VALUE IBStripCutValue;
+    CD3DX12_PIPELINE_STATE_STREAM_PRIMITIVE_TOPOLOGY PrimitiveTopologyType;
+    CD3DX12_PIPELINE_STATE_STREAM_VS VS;
+    CD3DX12_PIPELINE_STATE_STREAM_GS GS;
+    CD3DX12_PIPELINE_STATE_STREAM_STREAM_OUTPUT StreamOutput;
+    CD3DX12_PIPELINE_STATE_STREAM_HS HS;
+    CD3DX12_PIPELINE_STATE_STREAM_DS DS;
+    CD3DX12_PIPELINE_STATE_STREAM_PS PS;
+    CD3DX12_PIPELINE_STATE_STREAM_AS AS;
+    CD3DX12_PIPELINE_STATE_STREAM_MS MS;
+    CD3DX12_PIPELINE_STATE_STREAM_CS CS;
+    CD3DX12_PIPELINE_STATE_STREAM_BLEND_DESC BlendState;
+    CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL1 DepthStencilState;
+    CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL_FORMAT DSVFormat;
+    CD3DX12_PIPELINE_STATE_STREAM_RASTERIZER RasterizerState;
+    CD3DX12_PIPELINE_STATE_STREAM_RENDER_TARGET_FORMATS RTVFormats;
+    CD3DX12_PIPELINE_STATE_STREAM_SAMPLE_DESC SampleDesc;
+    CD3DX12_PIPELINE_STATE_STREAM_SAMPLE_MASK SampleMask;
+    CD3DX12_PIPELINE_STATE_STREAM_CACHED_PSO CachedPSO;
+    CD3DX12_PIPELINE_STATE_STREAM_VIEW_INSTANCING ViewInstancingDesc;
+    D3D12_GRAPHICS_PIPELINE_STATE_DESC GraphicsDescV0() const noexcept { // AS, MS, CS and ViewInstancingDesc are not copied
+        D3D12_GRAPHICS_PIPELINE_STATE_DESC D;
+        D.Flags = this->Flags;
+        D.NodeMask = this->NodeMask;
+        D.pRootSignature = this->pRootSignature;
+        D.InputLayout = this->InputLayout;
+        D.IBStripCutValue = this->IBStripCutValue;
+        D.PrimitiveTopologyType = this->PrimitiveTopologyType;
+        D.VS = this->VS;
+        D.GS = this->GS;
+        D.StreamOutput = this->StreamOutput;
+        D.HS = this->HS;
+        D.DS = this->DS;
+        D.PS = this->PS;
+        D.BlendState = this->BlendState;
+        D.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC1(D3D12_DEPTH_STENCIL_DESC1(this->DepthStencilState));
+        D.DSVFormat = this->DSVFormat;
+        D.RasterizerState = this->RasterizerState;
+        D.NumRenderTargets = D3D12_RT_FORMAT_ARRAY(this->RTVFormats).NumRenderTargets;
+        memcpy(D.RTVFormats, D3D12_RT_FORMAT_ARRAY(this->RTVFormats).RTFormats, sizeof(D.RTVFormats)); // whole array
+        D.SampleDesc = this->SampleDesc;
+        D.SampleMask = this->SampleMask;
+        D.CachedPSO = this->CachedPSO;
+        return D;
+    }
+    D3D12_COMPUTE_PIPELINE_STATE_DESC ComputeDescV0() const noexcept { // copies Flags, NodeMask, root signature, CS, CachedPSO
+        D3D12_COMPUTE_PIPELINE_STATE_DESC D;
+        D.Flags = this->Flags;
+        D.NodeMask = this->NodeMask;
+        D.pRootSignature = this->pRootSignature;
+        D.CS = this->CS;
+        D.CachedPSO = this->CachedPSO;
+        return D;
+    }
+};
+
+// CD3DX12_PIPELINE_STATE_STREAM1 works on OS Build 16299+ (where there is a new view instancing subobject).
+// Use CD3DX12_PIPELINE_STATE_STREAM for OS Build 15063+ support.
+struct CD3DX12_PIPELINE_STATE_STREAM1 {
+    CD3DX12_PIPELINE_STATE_STREAM1() = default;
+    // This stream has no AS/MS members: amplification and mesh shaders need CD3DX12_PIPELINE_STATE_STREAM2 or
+    // CD3DX12_PIPELINE_MESH_STATE_STREAM instead.
+    CD3DX12_PIPELINE_STATE_STREAM1(const D3D12_GRAPHICS_PIPELINE_STATE_DESC &Desc)
+    noexcept
+        : Flags(Desc.Flags), NodeMask(Desc.NodeMask), pRootSignature(Desc.pRootSignature),
+          InputLayout(Desc.InputLayout), IBStripCutValue(Desc.IBStripCutValue),
+          PrimitiveTopologyType(Desc.PrimitiveTopologyType), VS(Desc.VS), GS(Desc.GS), StreamOutput(Desc.StreamOutput),
+          HS(Desc.HS), DS(Desc.DS), PS(Desc.PS), BlendState(CD3DX12_BLEND_DESC(Desc.BlendState)),
+          DepthStencilState(CD3DX12_DEPTH_STENCIL_DESC1(Desc.DepthStencilState)), DSVFormat(Desc.DSVFormat),
+          RasterizerState(CD3DX12_RASTERIZER_DESC(Desc.RasterizerState)),
+          RTVFormats(CD3DX12_RT_FORMAT_ARRAY(Desc.RTVFormats, Desc.NumRenderTargets)), SampleDesc(Desc.SampleDesc),
+          SampleMask(Desc.SampleMask), CachedPSO(Desc.CachedPSO),
+          ViewInstancingDesc(CD3DX12_VIEW_INSTANCING_DESC(CD3DX12_DEFAULT())) {} // Desc has no view instancing field
+    CD3DX12_PIPELINE_STATE_STREAM1(const D3DX12_MESH_SHADER_PIPELINE_STATE_DESC &Desc) // Desc.AS and Desc.MS are dropped
+    noexcept
+        : Flags(Desc.Flags), NodeMask(Desc.NodeMask), pRootSignature(Desc.pRootSignature),
+          PrimitiveTopologyType(Desc.PrimitiveTopologyType), PS(Desc.PS),
+          BlendState(CD3DX12_BLEND_DESC(Desc.BlendState)),
+          DepthStencilState(CD3DX12_DEPTH_STENCIL_DESC1(Desc.DepthStencilState)), DSVFormat(Desc.DSVFormat),
+          RasterizerState(CD3DX12_RASTERIZER_DESC(Desc.RasterizerState)),
+          RTVFormats(CD3DX12_RT_FORMAT_ARRAY(Desc.RTVFormats, Desc.NumRenderTargets)), SampleDesc(Desc.SampleDesc),
+          SampleMask(Desc.SampleMask), CachedPSO(Desc.CachedPSO),
+          ViewInstancingDesc(CD3DX12_VIEW_INSTANCING_DESC(CD3DX12_DEFAULT())) {}
+    CD3DX12_PIPELINE_STATE_STREAM1(const D3D12_COMPUTE_PIPELINE_STATE_DESC &Desc) // only five members come from Desc
+    noexcept
+        : Flags(Desc.Flags), NodeMask(Desc.NodeMask), pRootSignature(Desc.pRootSignature),
+          CS(CD3DX12_SHADER_BYTECODE(Desc.CS)), CachedPSO(Desc.CachedPSO) {
+        static_cast<D3D12_DEPTH_STENCIL_DESC1 &>(DepthStencilState).DepthEnable = false; // turn depth off in the default state
+    }
+    CD3DX12_PIPELINE_STATE_STREAM_FLAGS Flags;
+    CD3DX12_PIPELINE_STATE_STREAM_NODE_MASK NodeMask;
+    CD3DX12_PIPELINE_STATE_STREAM_ROOT_SIGNATURE pRootSignature;
+    CD3DX12_PIPELINE_STATE_STREAM_INPUT_LAYOUT InputLayout;
+    CD3DX12_PIPELINE_STATE_STREAM_IB_STRIP_CUT_VALUE IBStripCutValue;
+    CD3DX12_PIPELINE_STATE_STREAM_PRIMITIVE_TOPOLOGY PrimitiveTopologyType;
+    CD3DX12_PIPELINE_STATE_STREAM_VS VS;
+    CD3DX12_PIPELINE_STATE_STREAM_GS GS;
+    CD3DX12_PIPELINE_STATE_STREAM_STREAM_OUTPUT StreamOutput;
+    CD3DX12_PIPELINE_STATE_STREAM_HS HS;
+    CD3DX12_PIPELINE_STATE_STREAM_DS DS;
+    CD3DX12_PIPELINE_STATE_STREAM_PS PS;
+    CD3DX12_PIPELINE_STATE_STREAM_CS CS;
+    CD3DX12_PIPELINE_STATE_STREAM_BLEND_DESC BlendState;
+    CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL1 DepthStencilState;
+    CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL_FORMAT DSVFormat;
+    CD3DX12_PIPELINE_STATE_STREAM_RASTERIZER RasterizerState;
+    CD3DX12_PIPELINE_STATE_STREAM_RENDER_TARGET_FORMATS RTVFormats;
+    CD3DX12_PIPELINE_STATE_STREAM_SAMPLE_DESC SampleDesc;
+    CD3DX12_PIPELINE_STATE_STREAM_SAMPLE_MASK SampleMask;
+    CD3DX12_PIPELINE_STATE_STREAM_CACHED_PSO CachedPSO;
+    CD3DX12_PIPELINE_STATE_STREAM_VIEW_INSTANCING ViewInstancingDesc;
+    D3D12_GRAPHICS_PIPELINE_STATE_DESC GraphicsDescV0() const noexcept { // CS and ViewInstancingDesc are not copied
+        D3D12_GRAPHICS_PIPELINE_STATE_DESC D;
+        D.Flags = this->Flags;
+        D.NodeMask = this->NodeMask;
+        D.pRootSignature = this->pRootSignature;
+        D.InputLayout = this->InputLayout;
+        D.IBStripCutValue = this->IBStripCutValue;
+        D.PrimitiveTopologyType = this->PrimitiveTopologyType;
+        D.VS = this->VS;
+        D.GS = this->GS;
+        D.StreamOutput = this->StreamOutput;
+        D.HS = this->HS;
+        D.DS = this->DS;
+        D.PS = this->PS;
+        D.BlendState = this->BlendState;
+        D.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC1(D3D12_DEPTH_STENCIL_DESC1(this->DepthStencilState));
+        D.DSVFormat = this->DSVFormat;
+        D.RasterizerState = this->RasterizerState;
+        D.NumRenderTargets = D3D12_RT_FORMAT_ARRAY(this->RTVFormats).NumRenderTargets;
+        memcpy(D.RTVFormats, D3D12_RT_FORMAT_ARRAY(this->RTVFormats).RTFormats, sizeof(D.RTVFormats)); // whole array
+        D.SampleDesc = this->SampleDesc;
+        D.SampleMask = this->SampleMask;
+        D.CachedPSO = this->CachedPSO;
+        return D;
+    }
+    D3D12_COMPUTE_PIPELINE_STATE_DESC ComputeDescV0() const noexcept { // copies Flags, NodeMask, root signature, CS, CachedPSO
+        D3D12_COMPUTE_PIPELINE_STATE_DESC D;
+        D.Flags = this->Flags;
+        D.NodeMask = this->NodeMask;
+        D.pRootSignature = this->pRootSignature;
+        D.CS = this->CS;
+        D.CachedPSO = this->CachedPSO;
+        return D;
+    }
+};
+
+struct CD3DX12_PIPELINE_MESH_STATE_STREAM { // stream with AS/MS/PS stages; no VS/GS/HS/DS/CS or input-layout members
+    CD3DX12_PIPELINE_MESH_STATE_STREAM() = default;
+    CD3DX12_PIPELINE_MESH_STATE_STREAM(const D3DX12_MESH_SHADER_PIPELINE_STATE_DESC &Desc) // topology type is not stored
+    noexcept
+        : Flags(Desc.Flags), NodeMask(Desc.NodeMask), pRootSignature(Desc.pRootSignature), PS(Desc.PS), AS(Desc.AS),
+          MS(Desc.MS), BlendState(CD3DX12_BLEND_DESC(Desc.BlendState)),
+          DepthStencilState(CD3DX12_DEPTH_STENCIL_DESC1(Desc.DepthStencilState)), DSVFormat(Desc.DSVFormat),
+          RasterizerState(CD3DX12_RASTERIZER_DESC(Desc.RasterizerState)),
+          RTVFormats(CD3DX12_RT_FORMAT_ARRAY(Desc.RTVFormats, Desc.NumRenderTargets)), SampleDesc(Desc.SampleDesc),
+          SampleMask(Desc.SampleMask), CachedPSO(Desc.CachedPSO),
+          ViewInstancingDesc(CD3DX12_VIEW_INSTANCING_DESC(CD3DX12_DEFAULT())) {} // Desc has no view instancing field
+    CD3DX12_PIPELINE_STATE_STREAM_FLAGS Flags;
+    CD3DX12_PIPELINE_STATE_STREAM_NODE_MASK NodeMask;
+    CD3DX12_PIPELINE_STATE_STREAM_ROOT_SIGNATURE pRootSignature;
+    CD3DX12_PIPELINE_STATE_STREAM_PS PS;
+    CD3DX12_PIPELINE_STATE_STREAM_AS AS;
+    CD3DX12_PIPELINE_STATE_STREAM_MS MS;
+    CD3DX12_PIPELINE_STATE_STREAM_BLEND_DESC BlendState;
+    CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL1 DepthStencilState;
+    CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL_FORMAT DSVFormat;
+    CD3DX12_PIPELINE_STATE_STREAM_RASTERIZER RasterizerState;
+    CD3DX12_PIPELINE_STATE_STREAM_RENDER_TARGET_FORMATS RTVFormats;
+    CD3DX12_PIPELINE_STATE_STREAM_SAMPLE_DESC SampleDesc;
+    CD3DX12_PIPELINE_STATE_STREAM_SAMPLE_MASK SampleMask;
+    CD3DX12_PIPELINE_STATE_STREAM_CACHED_PSO CachedPSO;
+    CD3DX12_PIPELINE_STATE_STREAM_VIEW_INSTANCING ViewInstancingDesc;
+    D3DX12_MESH_SHADER_PIPELINE_STATE_DESC MeshShaderDescV0() const noexcept { // ViewInstancingDesc is not copied
+        D3DX12_MESH_SHADER_PIPELINE_STATE_DESC D = {}; // zeroed: PrimitiveTopologyType has no source member in this stream
+        D.Flags = this->Flags;
+        D.NodeMask = this->NodeMask;
+        D.pRootSignature = this->pRootSignature;
+        D.PS = this->PS;
+        D.AS = this->AS;
+        D.MS = this->MS;
+        D.BlendState = this->BlendState;
+        D.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC1(D3D12_DEPTH_STENCIL_DESC1(this->DepthStencilState));
+        D.DSVFormat = this->DSVFormat;
+        D.RasterizerState = this->RasterizerState;
+        D.NumRenderTargets = D3D12_RT_FORMAT_ARRAY(this->RTVFormats).NumRenderTargets;
+        memcpy(D.RTVFormats, D3D12_RT_FORMAT_ARRAY(this->RTVFormats).RTFormats, sizeof(D.RTVFormats)); // whole array
+        D.SampleDesc = this->SampleDesc;
+        D.SampleMask = this->SampleMask;
+        D.CachedPSO = this->CachedPSO;
+        return D;
+    }
+};
+
+// CD3DX12_PIPELINE_STATE_STREAM works on OS Build 15063+ but does not support new subobject(s) added in OS Build
+// 16299+. See CD3DX12_PIPELINE_STATE_STREAM1 for instance.
+struct CD3DX12_PIPELINE_STATE_STREAM { // no AS, MS or ViewInstancingDesc members
+    CD3DX12_PIPELINE_STATE_STREAM() = default;
+    CD3DX12_PIPELINE_STATE_STREAM(const D3D12_GRAPHICS_PIPELINE_STATE_DESC &Desc) // CS is not initialised from Desc
+    noexcept
+        : Flags(Desc.Flags), NodeMask(Desc.NodeMask), pRootSignature(Desc.pRootSignature),
+          InputLayout(Desc.InputLayout), IBStripCutValue(Desc.IBStripCutValue),
+          PrimitiveTopologyType(Desc.PrimitiveTopologyType), VS(Desc.VS), GS(Desc.GS), StreamOutput(Desc.StreamOutput),
+          HS(Desc.HS), DS(Desc.DS), PS(Desc.PS), BlendState(CD3DX12_BLEND_DESC(Desc.BlendState)),
+          DepthStencilState(CD3DX12_DEPTH_STENCIL_DESC1(Desc.DepthStencilState)), DSVFormat(Desc.DSVFormat),
+          RasterizerState(CD3DX12_RASTERIZER_DESC(Desc.RasterizerState)),
+          RTVFormats(CD3DX12_RT_FORMAT_ARRAY(Desc.RTVFormats, Desc.NumRenderTargets)), SampleDesc(Desc.SampleDesc),
+          SampleMask(Desc.SampleMask), CachedPSO(Desc.CachedPSO) {}
+    CD3DX12_PIPELINE_STATE_STREAM(const D3D12_COMPUTE_PIPELINE_STATE_DESC &Desc) // only five members come from Desc
+    noexcept
+        : Flags(Desc.Flags), NodeMask(Desc.NodeMask), pRootSignature(Desc.pRootSignature),
+          CS(CD3DX12_SHADER_BYTECODE(Desc.CS)), CachedPSO(Desc.CachedPSO) {} // NOTE(review): STREAM1/2 also clear DepthEnable here - confirm
+    CD3DX12_PIPELINE_STATE_STREAM_FLAGS Flags;
+    CD3DX12_PIPELINE_STATE_STREAM_NODE_MASK NodeMask;
+    CD3DX12_PIPELINE_STATE_STREAM_ROOT_SIGNATURE pRootSignature;
+    CD3DX12_PIPELINE_STATE_STREAM_INPUT_LAYOUT InputLayout;
+    CD3DX12_PIPELINE_STATE_STREAM_IB_STRIP_CUT_VALUE IBStripCutValue;
+    CD3DX12_PIPELINE_STATE_STREAM_PRIMITIVE_TOPOLOGY PrimitiveTopologyType;
+    CD3DX12_PIPELINE_STATE_STREAM_VS VS;
+    CD3DX12_PIPELINE_STATE_STREAM_GS GS;
+    CD3DX12_PIPELINE_STATE_STREAM_STREAM_OUTPUT StreamOutput;
+    CD3DX12_PIPELINE_STATE_STREAM_HS HS;
+    CD3DX12_PIPELINE_STATE_STREAM_DS DS;
+    CD3DX12_PIPELINE_STATE_STREAM_PS PS;
+    CD3DX12_PIPELINE_STATE_STREAM_CS CS;
+    CD3DX12_PIPELINE_STATE_STREAM_BLEND_DESC BlendState;
+    CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL1 DepthStencilState;
+    CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL_FORMAT DSVFormat;
+    CD3DX12_PIPELINE_STATE_STREAM_RASTERIZER RasterizerState;
+    CD3DX12_PIPELINE_STATE_STREAM_RENDER_TARGET_FORMATS RTVFormats;
+    CD3DX12_PIPELINE_STATE_STREAM_SAMPLE_DESC SampleDesc;
+    CD3DX12_PIPELINE_STATE_STREAM_SAMPLE_MASK SampleMask;
+    CD3DX12_PIPELINE_STATE_STREAM_CACHED_PSO CachedPSO;
+    D3D12_GRAPHICS_PIPELINE_STATE_DESC GraphicsDescV0() const noexcept { // every member except CS is copied
+        D3D12_GRAPHICS_PIPELINE_STATE_DESC D;
+        D.Flags = this->Flags;
+        D.NodeMask = this->NodeMask;
+        D.pRootSignature = this->pRootSignature;
+        D.InputLayout = this->InputLayout;
+        D.IBStripCutValue = this->IBStripCutValue;
+        D.PrimitiveTopologyType = this->PrimitiveTopologyType;
+        D.VS = this->VS;
+        D.GS = this->GS;
+        D.StreamOutput = this->StreamOutput;
+        D.HS = this->HS;
+        D.DS = this->DS;
+        D.PS = this->PS;
+        D.BlendState = this->BlendState;
+        D.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC1(D3D12_DEPTH_STENCIL_DESC1(this->DepthStencilState));
+        D.DSVFormat = this->DSVFormat;
+        D.RasterizerState = this->RasterizerState;
+        D.NumRenderTargets = D3D12_RT_FORMAT_ARRAY(this->RTVFormats).NumRenderTargets;
+        memcpy(D.RTVFormats, D3D12_RT_FORMAT_ARRAY(this->RTVFormats).RTFormats, sizeof(D.RTVFormats)); // whole array
+        D.SampleDesc = this->SampleDesc;
+        D.SampleMask = this->SampleMask;
+        D.CachedPSO = this->CachedPSO;
+        return D;
+    }
+    D3D12_COMPUTE_PIPELINE_STATE_DESC ComputeDescV0() const noexcept { // copies Flags, NodeMask, root signature, CS, CachedPSO
+        D3D12_COMPUTE_PIPELINE_STATE_DESC D;
+        D.Flags = this->Flags;
+        D.NodeMask = this->NodeMask;
+        D.pRootSignature = this->pRootSignature;
+        D.CS = this->CS;
+        D.CachedPSO = this->CachedPSO;
+        return D;
+    }
+};
+
+struct CD3DX12_PIPELINE_STATE_STREAM2_PARSE_HELPER : public ID3DX12PipelineParserCallbacks {
+    CD3DX12_PIPELINE_STATE_STREAM2 PipelineStream; // filled in by the callbacks below; read it after parsing
+    CD3DX12_PIPELINE_STATE_STREAM2_PARSE_HELPER() noexcept : SeenDSS(false) {
+        // Adjust defaults to account for absent members.
+        PipelineStream.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE;
+
+        // Depth disabled if no DSV format specified.
+        static_cast<D3D12_DEPTH_STENCIL_DESC1 &>(PipelineStream.DepthStencilState).DepthEnable = false;
+    }
+
+    // ID3DX12PipelineParserCallbacks: each override stores its argument in the matching PipelineStream member.
+    void FlagsCb(D3D12_PIPELINE_STATE_FLAGS Flags) override { PipelineStream.Flags = Flags; }
+    void NodeMaskCb(UINT NodeMask) override { PipelineStream.NodeMask = NodeMask; }
+    void RootSignatureCb(ID3D12RootSignature *pRootSignature) override {
+        PipelineStream.pRootSignature = pRootSignature;
+    }
+    void InputLayoutCb(const D3D12_INPUT_LAYOUT_DESC &InputLayout) override {
+        PipelineStream.InputLayout = InputLayout;
+    }
+    void IBStripCutValueCb(D3D12_INDEX_BUFFER_STRIP_CUT_VALUE IBStripCutValue) override {
+        PipelineStream.IBStripCutValue = IBStripCutValue;
+    }
+    void PrimitiveTopologyTypeCb(D3D12_PRIMITIVE_TOPOLOGY_TYPE PrimitiveTopologyType) override {
+        PipelineStream.PrimitiveTopologyType = PrimitiveTopologyType;
+    }
+    void VSCb(const D3D12_SHADER_BYTECODE &VS) override { PipelineStream.VS = VS; }
+    void GSCb(const D3D12_SHADER_BYTECODE &GS) override { PipelineStream.GS = GS; }
+    void StreamOutputCb(const D3D12_STREAM_OUTPUT_DESC &StreamOutput) override {
+        PipelineStream.StreamOutput = StreamOutput;
+    }
+    void HSCb(const D3D12_SHADER_BYTECODE &HS) override { PipelineStream.HS = HS; }
+    void DSCb(const D3D12_SHADER_BYTECODE &DS) override { PipelineStream.DS = DS; }
+    void PSCb(const D3D12_SHADER_BYTECODE &PS) override { PipelineStream.PS = PS; }
+    void CSCb(const D3D12_SHADER_BYTECODE &CS) override { PipelineStream.CS = CS; }
+    void ASCb(const D3D12_SHADER_BYTECODE &AS) override { PipelineStream.AS = AS; }
+    void MSCb(const D3D12_SHADER_BYTECODE &MS) override { PipelineStream.MS = MS; }
+    void BlendStateCb(const D3D12_BLEND_DESC &BlendState) override {
+        PipelineStream.BlendState = CD3DX12_BLEND_DESC(BlendState);
+    }
+    void DepthStencilStateCb(const D3D12_DEPTH_STENCIL_DESC &DepthStencilState) override {
+        PipelineStream.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC1(DepthStencilState);
+        SeenDSS = true; // from now on DSVFormatCb must not touch DepthEnable
+    }
+    void DepthStencilState1Cb(const D3D12_DEPTH_STENCIL_DESC1 &DepthStencilState) override {
+        PipelineStream.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC1(DepthStencilState);
+        SeenDSS = true; // from now on DSVFormatCb must not touch DepthEnable
+    }
+    void DSVFormatCb(DXGI_FORMAT DSVFormat) override {
+        PipelineStream.DSVFormat = DSVFormat;
+        if (!SeenDSS && DSVFormat != DXGI_FORMAT_UNKNOWN) {
+            // Re-enable depth for the default state.
+            static_cast<D3D12_DEPTH_STENCIL_DESC1 &>(PipelineStream.DepthStencilState).DepthEnable = true;
+        }
+    }
+    void RasterizerStateCb(const D3D12_RASTERIZER_DESC &RasterizerState) override {
+        PipelineStream.RasterizerState = CD3DX12_RASTERIZER_DESC(RasterizerState);
+    }
+    void RTVFormatsCb(const D3D12_RT_FORMAT_ARRAY &RTVFormats) override { PipelineStream.RTVFormats = RTVFormats; }
+    void SampleDescCb(const DXGI_SAMPLE_DESC &SampleDesc) override { PipelineStream.SampleDesc = SampleDesc; }
+    void SampleMaskCb(UINT SampleMask) override { PipelineStream.SampleMask = SampleMask; }
+    void ViewInstancingCb(const D3D12_VIEW_INSTANCING_DESC &ViewInstancingDesc) override {
+        PipelineStream.ViewInstancingDesc = CD3DX12_VIEW_INSTANCING_DESC(ViewInstancingDesc);
+    }
+    void CachedPSOCb(const D3D12_CACHED_PIPELINE_STATE &CachedPSO) override { PipelineStream.CachedPSO = CachedPSO; }
+
+  private:
+    bool SeenDSS; // true once either depth-stencil callback has run
+};
+
+struct CD3DX12_PIPELINE_STATE_STREAM_PARSE_HELPER : public ID3DX12PipelineParserCallbacks {
+    CD3DX12_PIPELINE_STATE_STREAM1 PipelineStream; // filled in by the callbacks below; note the STREAM1 type
+    CD3DX12_PIPELINE_STATE_STREAM_PARSE_HELPER() noexcept : SeenDSS(false) {
+        // Adjust defaults to account for absent members.
+        PipelineStream.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE;
+
+        // Depth disabled if no DSV format specified.
+        static_cast<D3D12_DEPTH_STENCIL_DESC1 &>(PipelineStream.DepthStencilState).DepthEnable = false;
+    }
+
+    // ID3DX12PipelineParserCallbacks: ASCb/MSCb are not overridden here, so they keep the empty base versions.
+    void FlagsCb(D3D12_PIPELINE_STATE_FLAGS Flags) override { PipelineStream.Flags = Flags; }
+    void NodeMaskCb(UINT NodeMask) override { PipelineStream.NodeMask = NodeMask; }
+    void RootSignatureCb(ID3D12RootSignature *pRootSignature) override {
+        PipelineStream.pRootSignature = pRootSignature;
+    }
+    void InputLayoutCb(const D3D12_INPUT_LAYOUT_DESC &InputLayout) override {
+        PipelineStream.InputLayout = InputLayout;
+    }
+    void IBStripCutValueCb(D3D12_INDEX_BUFFER_STRIP_CUT_VALUE IBStripCutValue) override {
+        PipelineStream.IBStripCutValue = IBStripCutValue;
+    }
+    void PrimitiveTopologyTypeCb(D3D12_PRIMITIVE_TOPOLOGY_TYPE PrimitiveTopologyType) override {
+        PipelineStream.PrimitiveTopologyType = PrimitiveTopologyType;
+    }
+    void VSCb(const D3D12_SHADER_BYTECODE &VS) override { PipelineStream.VS = VS; }
+    void GSCb(const D3D12_SHADER_BYTECODE &GS) override { PipelineStream.GS = GS; }
+    void StreamOutputCb(const D3D12_STREAM_OUTPUT_DESC &StreamOutput) override {
+        PipelineStream.StreamOutput = StreamOutput;
+    }
+    void HSCb(const D3D12_SHADER_BYTECODE &HS) override { PipelineStream.HS = HS; }
+    void DSCb(const D3D12_SHADER_BYTECODE &DS) override { PipelineStream.DS = DS; }
+    void PSCb(const D3D12_SHADER_BYTECODE &PS) override { PipelineStream.PS = PS; }
+    void CSCb(const D3D12_SHADER_BYTECODE &CS) override { PipelineStream.CS = CS; }
+    void BlendStateCb(const D3D12_BLEND_DESC &BlendState) override {
+        PipelineStream.BlendState = CD3DX12_BLEND_DESC(BlendState);
+    }
+    void DepthStencilStateCb(const D3D12_DEPTH_STENCIL_DESC &DepthStencilState) override {
+        PipelineStream.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC1(DepthStencilState);
+        SeenDSS = true; // from now on DSVFormatCb must not touch DepthEnable
+    }
+    void DepthStencilState1Cb(const D3D12_DEPTH_STENCIL_DESC1 &DepthStencilState) override {
+        PipelineStream.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC1(DepthStencilState);
+        SeenDSS = true; // from now on DSVFormatCb must not touch DepthEnable
+    }
+    void DSVFormatCb(DXGI_FORMAT DSVFormat) override {
+        PipelineStream.DSVFormat = DSVFormat;
+        if (!SeenDSS && DSVFormat != DXGI_FORMAT_UNKNOWN) {
+            // Re-enable depth for the default state.
+            static_cast<D3D12_DEPTH_STENCIL_DESC1 &>(PipelineStream.DepthStencilState).DepthEnable = true;
+        }
+    }
+    void RasterizerStateCb(const D3D12_RASTERIZER_DESC &RasterizerState) override {
+        PipelineStream.RasterizerState = CD3DX12_RASTERIZER_DESC(RasterizerState);
+    }
+    void RTVFormatsCb(const D3D12_RT_FORMAT_ARRAY &RTVFormats) override { PipelineStream.RTVFormats = RTVFormats; }
+    void SampleDescCb(const DXGI_SAMPLE_DESC &SampleDesc) override { PipelineStream.SampleDesc = SampleDesc; }
+    void SampleMaskCb(UINT SampleMask) override { PipelineStream.SampleMask = SampleMask; }
+    void ViewInstancingCb(const D3D12_VIEW_INSTANCING_DESC &ViewInstancingDesc) override {
+        PipelineStream.ViewInstancingDesc = CD3DX12_VIEW_INSTANCING_DESC(ViewInstancingDesc);
+    }
+    void CachedPSOCb(const D3D12_CACHED_PIPELINE_STATE &CachedPSO) override { PipelineStream.CachedPSO = CachedPSO; }
+
+  private:
+    bool SeenDSS; // true once either depth-stencil callback has run
+};
+
+inline D3D12_PIPELINE_STATE_SUBOBJECT_TYPE
+D3DX12GetBaseSubobjectType(D3D12_PIPELINE_STATE_SUBOBJECT_TYPE SubobjectType) noexcept {
+    // DEPTH_STENCIL1 maps onto DEPTH_STENCIL; every other subobject type is returned unchanged.
+    if (SubobjectType == D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_DEPTH_STENCIL1) {
+        return D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_DEPTH_STENCIL;
+    }
+    // No remapping applies: the type is its own base.
+    return SubobjectType;
+}
+
+inline HRESULT D3DX12ParsePipelineStream(const D3D12_PIPELINE_STATE_STREAM_DESC &Desc,
+                                         ID3DX12PipelineParserCallbacks *pCallbacks) {
+    if (pCallbacks == nullptr) {
+        return E_INVALIDARG;
+    }
+
+    if (Desc.SizeInBytes == 0 || Desc.pPipelineStateSubobjectStream == nullptr) {
+        pCallbacks->ErrorBadInputParameter(1); // first parameter issue
+        return E_INVALIDARG;
+    }
+
+    bool SubobjectSeen[D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_MAX_VALID] = {};
+    for (SIZE_T CurOffset = 0, SizeOfSubobject = 0; CurOffset < Desc.SizeInBytes; CurOffset += SizeOfSubobject) {
+        BYTE *pStream = static_cast<BYTE *>(Desc.pPipelineStateSubobjectStream) + CurOffset;
+        auto SubobjectType = *reinterpret_cast<D3D12_PIPELINE_STATE_SUBOBJECT_TYPE *>(pStream);
+        if (SubobjectType < 0 || SubobjectType >= D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_MAX_VALID) {
+            pCallbacks->ErrorUnknownSubobject(SubobjectType);
+            return E_INVALIDARG;
+        }
+        if (SubobjectSeen[D3DX12GetBaseSubobjectType(SubobjectType)]) {
+            pCallbacks->ErrorDuplicateSubobject(SubobjectType);
+            return E_INVALIDARG; // disallow subobject duplicates in a stream
+        }
+        SubobjectSeen[SubobjectType] = true;
+        switch (SubobjectType) {
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_ROOT_SIGNATURE:
+            pCallbacks->RootSignatureCb(
+                *reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM::pRootSignature) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::pRootSignature);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_VS:
+            pCallbacks->VSCb(*reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM::VS) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::VS);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_PS:
+            pCallbacks->PSCb(*reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM::PS) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::PS);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_DS:
+            pCallbacks->DSCb(*reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM::DS) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::DS);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_HS:
+            pCallbacks->HSCb(*reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM::HS) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::HS);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_GS:
+            pCallbacks->GSCb(*reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM::GS) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::GS);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_CS:
+            pCallbacks->CSCb(*reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM::CS) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::CS);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_AS:
+            pCallbacks->ASCb(*reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM2::AS) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM2::AS);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_MS:
+            pCallbacks->MSCb(*reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM2::MS) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM2::MS);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_STREAM_OUTPUT:
+            pCallbacks->StreamOutputCb(
+                *reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM::StreamOutput) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::StreamOutput);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_BLEND:
+            pCallbacks->BlendStateCb(*reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM::BlendState) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::BlendState);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_SAMPLE_MASK:
+            pCallbacks->SampleMaskCb(*reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM::SampleMask) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::SampleMask);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_RASTERIZER:
+            pCallbacks->RasterizerStateCb(
+                *reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM::RasterizerState) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::RasterizerState);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_DEPTH_STENCIL:
+            pCallbacks->DepthStencilStateCb(*reinterpret_cast<CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_DEPTH_STENCIL1:
+            pCallbacks->DepthStencilState1Cb(
+                *reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM::DepthStencilState) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::DepthStencilState);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_INPUT_LAYOUT:
+            pCallbacks->InputLayoutCb(
+                *reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM::InputLayout) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::InputLayout);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_IB_STRIP_CUT_VALUE:
+            pCallbacks->IBStripCutValueCb(
+                *reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM::IBStripCutValue) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::IBStripCutValue);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_PRIMITIVE_TOPOLOGY:
+            pCallbacks->PrimitiveTopologyTypeCb(
+                *reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM::PrimitiveTopologyType) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::PrimitiveTopologyType);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_RENDER_TARGET_FORMATS:
+            pCallbacks->RTVFormatsCb(*reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM::RTVFormats) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::RTVFormats);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_DEPTH_STENCIL_FORMAT:
+            pCallbacks->DSVFormatCb(*reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM::DSVFormat) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::DSVFormat);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_SAMPLE_DESC:
+            pCallbacks->SampleDescCb(*reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM::SampleDesc) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::SampleDesc);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_NODE_MASK:
+            pCallbacks->NodeMaskCb(*reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM::NodeMask) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::NodeMask);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_CACHED_PSO:
+            pCallbacks->CachedPSOCb(*reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM::CachedPSO) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::CachedPSO);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_FLAGS:
+            pCallbacks->FlagsCb(*reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM::Flags) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::Flags);
+            break;
+        case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_VIEW_INSTANCING:
+            pCallbacks->ViewInstancingCb(
+                *reinterpret_cast<decltype(CD3DX12_PIPELINE_STATE_STREAM1::ViewInstancingDesc) *>(pStream));
+            SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM1::ViewInstancingDesc);
+            break;
+        default:
+            pCallbacks->ErrorUnknownSubobject(SubobjectType);
+            return E_INVALIDARG;
+        }
+    }
+
+    return S_OK;
+}
+
+//------------------------------------------------------------------------------------------------
+// Equal when the formats match and the payload that format uses (DepthStencil or Color) matches.
+inline bool operator==(const D3D12_CLEAR_VALUE &a, const D3D12_CLEAR_VALUE &b) noexcept {
+    if (a.Format != b.Format)
+        return false;
+    const bool isDepthFormat = a.Format == DXGI_FORMAT_D24_UNORM_S8_UINT || a.Format == DXGI_FORMAT_D16_UNORM ||
+                               a.Format == DXGI_FORMAT_D32_FLOAT || a.Format == DXGI_FORMAT_D32_FLOAT_S8X24_UINT;
+    if (isDepthFormat)
+        return a.DepthStencil.Depth == b.DepthStencil.Depth && a.DepthStencil.Stencil == b.DepthStencil.Stencil;
+    return a.Color[0] == b.Color[0] && a.Color[1] == b.Color[1] && a.Color[2] == b.Color[2] &&
+           a.Color[3] == b.Color[3];
+}
+inline bool operator==(const D3D12_RENDER_PASS_BEGINNING_ACCESS_CLEAR_PARAMETERS &a,
+                       const D3D12_RENDER_PASS_BEGINNING_ACCESS_CLEAR_PARAMETERS &b) noexcept {
+    return operator==(a.ClearValue, b.ClearValue); // compares the ClearValue members of both operands
+}
+// Member-wise equality for render-pass resolve parameters.
+// Equal when the source/destination resource pointers, SubresourceCount, Format,
+// ResolveMode and PreserveResolveSource all match.
+// Only these six members are examined.
+inline bool operator==(const D3D12_RENDER_PASS_ENDING_ACCESS_RESOLVE_PARAMETERS &a,
+                       const D3D12_RENDER_PASS_ENDING_ACCESS_RESOLVE_PARAMETERS &b) noexcept {
+    // Resource identity first (plain pointer comparison).
+    const bool sameResources = (a.pSrcResource == b.pSrcResource) && (a.pDstResource == b.pDstResource);
+    if (!sameResources)
+        return false;
+    // Then the remaining scalar members.
+    return (a.SubresourceCount == b.SubresourceCount) &&
+           (a.Format == b.Format) &&
+           (a.ResolveMode == b.ResolveMode) &&
+           (a.PreserveResolveSource == b.PreserveResolveSource);
+}
+// Equal when Type matches; the Clear payload is compared only for the CLEAR access type.
+inline bool operator==(const D3D12_RENDER_PASS_BEGINNING_ACCESS &a,
+                       const D3D12_RENDER_PASS_BEGINNING_ACCESS &b) noexcept {
+    if (a.Type != b.Type)
+        return false;
+    const bool needsPayloadCheck = (a.Type == D3D12_RENDER_PASS_BEGINNING_ACCESS_TYPE_CLEAR);
+    return !needsPayloadCheck || (a.Clear == b.Clear);
+}
+// Equal when Type matches; the Resolve payload is compared only for the RESOLVE access type.
+inline bool operator==(const D3D12_RENDER_PASS_ENDING_ACCESS &a, const D3D12_RENDER_PASS_ENDING_ACCESS &b) noexcept {
+    if (a.Type != b.Type)
+        return false;
+    const bool needsPayloadCheck = (a.Type == D3D12_RENDER_PASS_ENDING_ACCESS_TYPE_RESOLVE);
+    return !needsPayloadCheck || (a.Resolve == b.Resolve);
+}
+// Equal when the CPU descriptor handle and both the beginning and ending access descriptions match.
+inline bool operator==(const D3D12_RENDER_PASS_RENDER_TARGET_DESC &a,
+                       const D3D12_RENDER_PASS_RENDER_TARGET_DESC &b) noexcept {
+    // Handle comparison first, before the nested access comparisons.
+    if (a.cpuDescriptor.ptr != b.cpuDescriptor.ptr)
+        return false;
+    const bool beginMatches = (a.BeginningAccess == b.BeginningAccess);
+    // && short-circuits, so EndingAccess is only compared when BeginningAccess matched.
+    return beginMatches && (a.EndingAccess == b.EndingAccess);
+}
+// Equal when the CPU descriptor handle and all four depth/stencil access descriptions match.
+inline bool operator==(const D3D12_RENDER_PASS_DEPTH_STENCIL_DESC &a,
+                       const D3D12_RENDER_PASS_DEPTH_STENCIL_DESC &b) noexcept {
+    // Handle comparison first, before the nested access comparisons.
+    if (a.cpuDescriptor.ptr != b.cpuDescriptor.ptr)
+        return false;
+    // Beginning accesses (depth, then stencil).
+    const bool beginMatches =
+        (a.DepthBeginningAccess == b.DepthBeginningAccess) && (a.StencilBeginningAccess == b.StencilBeginningAccess);
+    if (!beginMatches)
+        return false;
+    // Ending accesses (depth, then stencil).
+    return (a.DepthEndingAccess == b.DepthEndingAccess) && (a.StencilEndingAccess == b.StencilEndingAccess);
+}
+
+#ifndef D3DX12_NO_STATE_OBJECT_HELPERS
+
+//================================================================================================
+// D3DX12 State Object Creation Helpers
+//
+// Helper classes for creating new style state objects out of an arbitrary set of subobjects.
+// Uses STL
+//
+// Start by instantiating CD3DX12_STATE_OBJECT_DESC (see its public methods).
+// One of its methods is CreateSubobject(), which has a comment showing a couple of options for
+// defining subobjects using the helper classes for each subobject (CD3DX12_DXIL_LIBRARY_SUBOBJECT
+// etc.). The subobject helpers each have methods specific to the subobject for configuring its
+// contents.
+//
+//================================================================================================
+#include <list>
+#include <memory>
+#include <string>
+#include <vector>
+#ifndef D3DX12_USE_ATL
+#include <wrl/client.h>
+#define D3DX12_COM_PTR Microsoft::WRL::ComPtr
+#define D3DX12_COM_PTR_GET(x) x.Get()
+#define D3DX12_COM_PTR_ADDRESSOF(x) x.GetAddressOf()
+#else
+#include <atlbase.h>
+#define D3DX12_COM_PTR ATL::CComPtr
+#define D3DX12_COM_PTR_GET(x) x.p
+#define D3DX12_COM_PTR_ADDRESSOF(x) &x.p
+#endif
+
+//------------------------------------------------------------------------------------------------
+class CD3DX12_STATE_OBJECT_DESC { // accumulates subobjects; flattens them into a D3D12_STATE_OBJECT_DESC on conversion
+  public:
+    CD3DX12_STATE_OBJECT_DESC() noexcept { Init(D3D12_STATE_OBJECT_TYPE_COLLECTION); } // defaults to a collection
+    CD3DX12_STATE_OBJECT_DESC(D3D12_STATE_OBJECT_TYPE Type) noexcept { Init(Type); }
+    void SetStateObjectType(D3D12_STATE_OBJECT_TYPE Type) noexcept { m_Desc.Type = Type; }
+    operator const D3D12_STATE_OBJECT_DESC &() { // non-const: rebuilds the flattened array on every conversion
+        // Do final preparation work
+        m_RepointedAssociations.clear();
+        m_SubobjectArray.clear();
+        m_SubobjectArray.reserve(m_Desc.NumSubobjects); // reserve up front so the &back() addresses below stay valid
+        // Flatten subobjects into an array (each flattened subobject still has a
+        // member that's a pointer to its desc that's not flattened)
+        for (auto Iter = m_SubobjectList.begin(); Iter != m_SubobjectList.end(); Iter++) {
+            m_SubobjectArray.push_back(*Iter);
+            // Store new location in array so we can redirect pointers contained in subobjects
+            Iter->pSubobjectArrayLocation = &m_SubobjectArray.back();
+        }
+        // For subobjects with pointer fields, create a new copy of those subobject definitions
+        // with fixed pointers
+        for (UINT i = 0; i < m_Desc.NumSubobjects; i++) {
+            if (m_SubobjectArray[i].Type == D3D12_STATE_SUBOBJECT_TYPE_SUBOBJECT_TO_EXPORTS_ASSOCIATION) {
+                auto pOriginalSubobjectAssociation =
+                    static_cast<const D3D12_SUBOBJECT_TO_EXPORTS_ASSOCIATION *>(m_SubobjectArray[i].pDesc);
+                D3D12_SUBOBJECT_TO_EXPORTS_ASSOCIATION Repointed = *pOriginalSubobjectAssociation;
+                auto pWrapper =
+                    static_cast<const SUBOBJECT_WRAPPER *>(pOriginalSubobjectAssociation->pSubobjectToAssociate);
+                Repointed.pSubobjectToAssociate = pWrapper->pSubobjectArrayLocation; // address recorded by the loop above
+                m_RepointedAssociations.push_back(Repointed);
+                m_SubobjectArray[i].pDesc = &m_RepointedAssociations.back();
+            }
+        }
+        // Below: using ugly way to get pointer in case .data() is not defined
+        m_Desc.pSubobjects = m_Desc.NumSubobjects ? &m_SubobjectArray[0] : nullptr;
+        return m_Desc;
+    }
+    operator const D3D12_STATE_OBJECT_DESC *() {
+        // Cast calls the above final preparation work
+        return &static_cast<const D3D12_STATE_OBJECT_DESC &>(*this);
+    }
+
+    // CreateSubobject creates a subobject helper (e.g. CD3DX12_HIT_GROUP_SUBOBJECT)
+    // whose lifetime is owned by this class.
+    // e.g.
+    //
+    //    CD3DX12_STATE_OBJECT_DESC Collection1(D3D12_STATE_OBJECT_TYPE_COLLECTION);
+    //    auto Lib0 = Collection1.CreateSubobject<CD3DX12_DXIL_LIBRARY_SUBOBJECT>();
+    //    Lib0->SetDXILLibrary(&pMyAppDxilLibs[0]);
+    //    Lib0->DefineExport(L"rayGenShader0"); // in practice these export listings might be
+    //                                          // data/engine driven
+    //    etc.
+    //
+    // Alternatively, users can instantiate subobject helpers explicitly, such as via local
+    // variables instead, passing the state object desc that should point to it into the helper
+    // constructor (or call mySubobjectHelper.AddToStateObject(Collection1)).
+    // In this alternative scenario, the user must keep the subobject alive as long as the state
+    // object it is associated with is alive, else its pointer references will be stale.
+    // e.g.
+    //
+    //    CD3DX12_STATE_OBJECT_DESC RaytracingState2(D3D12_STATE_OBJECT_TYPE_RAYTRACING_PIPELINE);
+    //    CD3DX12_DXIL_LIBRARY_SUBOBJECT LibA(RaytracingState2);
+    //    LibA.SetDXILLibrary(&pMyAppDxilLibs[4]); // not manually specifying exports
+    //                                             // - meaning all exports in the libraries
+    //                                             // are exported
+    //    etc.
+
+    template <typename T> T *CreateSubobject() {
+        T *pSubobject = new T(*this); // T's constructor registers itself with this desc
+        m_OwnedSubobjectHelpers.emplace_back(pSubobject); // container owns the helper and deletes it with this desc
+        return pSubobject;
+    }
+
+  private:
+    D3D12_STATE_SUBOBJECT *TrackSubobject(D3D12_STATE_SUBOBJECT_TYPE Type, void *pDesc) {
+        SUBOBJECT_WRAPPER Subobject;
+        Subobject.pSubobjectArrayLocation = nullptr; // filled in when the desc is flattened
+        Subobject.Type = Type;
+        Subobject.pDesc = pDesc;
+        m_SubobjectList.push_back(Subobject);
+        m_Desc.NumSubobjects++;
+        return &m_SubobjectList.back(); // std::list nodes do not move, so this pointer stays valid
+    }
+    void Init(D3D12_STATE_OBJECT_TYPE Type) noexcept {
+        SetStateObjectType(Type);
+        m_Desc.pSubobjects = nullptr;
+        m_Desc.NumSubobjects = 0;
+        m_SubobjectList.clear();
+        m_SubobjectArray.clear();
+        m_RepointedAssociations.clear();
+    }
+    typedef struct SUBOBJECT_WRAPPER : public D3D12_STATE_SUBOBJECT {
+        D3D12_STATE_SUBOBJECT *pSubobjectArrayLocation; // new location when flattened into array
+        // for repointing pointers in subobjects
+    } SUBOBJECT_WRAPPER;
+    D3D12_STATE_OBJECT_DESC m_Desc;
+    std::list<SUBOBJECT_WRAPPER> m_SubobjectList; // Pointers to list nodes handed out so
+    // these can be edited live
+    std::vector<D3D12_STATE_SUBOBJECT> m_SubobjectArray; // Built at the end, copying list contents
+
+    std::list<D3D12_SUBOBJECT_TO_EXPORTS_ASSOCIATION>
+        m_RepointedAssociations; // subobject type that contains pointers to other subobjects,
+    // repointed to flattened array
+
+    class StringContainer {
+      public:
+        LPCWSTR LocalCopy(LPCWSTR string, bool bSingleString = false) { // returns a pointer to an owned copy
+            if (string) {
+                if (bSingleString) {
+                    m_Strings.clear(); // single-string mode keeps only the latest string
+                    m_Strings.push_back(string);
+                } else {
+                    m_Strings.push_back(string);
+                }
+                return m_Strings.back().c_str();
+            } else {
+                return nullptr; // null input is passed through; nothing is stored
+            }
+        }
+        void clear() noexcept { m_Strings.clear(); }
+
+      private:
+        std::list<std::wstring> m_Strings;
+    };
+
+    class SUBOBJECT_HELPER_BASE {
+      public:
+        SUBOBJECT_HELPER_BASE() noexcept { Init(); }
+        virtual ~SUBOBJECT_HELPER_BASE() = default;
+        virtual D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept = 0;
+        void AddToStateObject(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) {
+            m_pSubobject = ContainingStateObject.TrackSubobject(Type(), Data()); // remembers the tracked list node
+        }
+
+      protected:
+        virtual void *Data() noexcept = 0; // pointer stored as the subobject's pDesc
+        void Init() noexcept { m_pSubobject = nullptr; }
+        D3D12_STATE_SUBOBJECT *m_pSubobject; // nullptr until AddToStateObject() is called
+    };
+
+#if (__cplusplus >= 201103L)
+    std::list<std::unique_ptr<const SUBOBJECT_HELPER_BASE>> m_OwnedSubobjectHelpers;
+#else
+    class OWNED_HELPER {
+      public:
+        OWNED_HELPER(const SUBOBJECT_HELPER_BASE *pHelper) noexcept { m_pHelper = pHelper; }
+        ~OWNED_HELPER() { delete m_pHelper; }
+        const SUBOBJECT_HELPER_BASE *m_pHelper;
+    };
+
+    std::list<OWNED_HELPER> m_OwnedSubobjectHelpers;
+#endif
+
+    friend class CD3DX12_DXIL_LIBRARY_SUBOBJECT;
+    friend class CD3DX12_EXISTING_COLLECTION_SUBOBJECT;
+    friend class CD3DX12_SUBOBJECT_TO_EXPORTS_ASSOCIATION_SUBOBJECT;
+    friend class CD3DX12_DXIL_SUBOBJECT_TO_EXPORTS_ASSOCIATION;
+    friend class CD3DX12_HIT_GROUP_SUBOBJECT;
+    friend class CD3DX12_RAYTRACING_SHADER_CONFIG_SUBOBJECT;
+    friend class CD3DX12_RAYTRACING_PIPELINE_CONFIG_SUBOBJECT;
+    friend class CD3DX12_RAYTRACING_PIPELINE_CONFIG1_SUBOBJECT;
+    friend class CD3DX12_GLOBAL_ROOT_SIGNATURE_SUBOBJECT;
+    friend class CD3DX12_LOCAL_ROOT_SIGNATURE_SUBOBJECT;
+    friend class CD3DX12_STATE_OBJECT_CONFIG_SUBOBJECT;
+    friend class CD3DX12_NODE_MASK_SUBOBJECT;
+};
+
+//------------------------------------------------------------------------------------------------
+// Helper for a DXIL_LIBRARY subobject; owns copies of the export name strings it is given.
+class CD3DX12_DXIL_LIBRARY_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE {
+  public:
+    CD3DX12_DXIL_LIBRARY_SUBOBJECT() noexcept { Init(); }
+    CD3DX12_DXIL_LIBRARY_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) {
+        Init();
+        AddToStateObject(ContainingStateObject);
+    }
+    void SetDXILLibrary(const D3D12_SHADER_BYTECODE *pCode) noexcept {
+        static const D3D12_SHADER_BYTECODE Empty = {}; // stored when pCode is null
+        m_Desc.DXILLibrary = (pCode != nullptr) ? *pCode : Empty;
+    }
+    // Appends one export; the name strings are copied, and the desc's pointer/count are refreshed.
+    void DefineExport(LPCWSTR Name, LPCWSTR ExportToRename = nullptr,
+                      D3D12_EXPORT_FLAGS Flags = D3D12_EXPORT_FLAG_NONE) {
+        D3D12_EXPORT_DESC NewExport;
+        NewExport.Name = m_Strings.LocalCopy(Name);
+        NewExport.ExportToRename = m_Strings.LocalCopy(ExportToRename);
+        NewExport.Flags = Flags;
+        m_Exports.push_back(NewExport);
+        m_Desc.NumExports = static_cast<UINT>(m_Exports.size());
+        m_Desc.pExports = &m_Exports[0]; // using ugly way to get pointer in case .data() is not defined
+    }
+    template <size_t N> void DefineExports(LPCWSTR (&Exports)[N]) {
+        for (UINT i = 0; i < N; ++i)
+            DefineExport(Exports[i]);
+    }
+    void DefineExports(const LPCWSTR *Exports, UINT N) {
+        for (UINT i = 0; i < N; ++i)
+            DefineExport(Exports[i]);
+    }
+    D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override { return D3D12_STATE_SUBOBJECT_TYPE_DXIL_LIBRARY; }
+    operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; }
+    operator const D3D12_DXIL_LIBRARY_DESC &() const noexcept { return m_Desc; }
+
+  private:
+    void Init() noexcept {
+        SUBOBJECT_HELPER_BASE::Init();
+        m_Desc = {};
+        m_Strings.clear();
+        m_Exports.clear();
+    }
+    void *Data() noexcept override { return &m_Desc; }
+    D3D12_DXIL_LIBRARY_DESC m_Desc;
+    CD3DX12_STATE_OBJECT_DESC::StringContainer m_Strings;
+    std::vector<D3D12_EXPORT_DESC> m_Exports;
+};
+
+//------------------------------------------------------------------------------------------------
+// Helper for an EXISTING_COLLECTION subobject; holds a COM reference to the collection and owns export names.
+class CD3DX12_EXISTING_COLLECTION_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE {
+  public:
+    CD3DX12_EXISTING_COLLECTION_SUBOBJECT() noexcept { Init(); }
+    CD3DX12_EXISTING_COLLECTION_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) {
+        Init();
+        AddToStateObject(ContainingStateObject);
+    }
+    void SetExistingCollection(ID3D12StateObject *pExistingCollection) noexcept {
+        m_CollectionRef = pExistingCollection; // smart pointer keeps the collection referenced
+        m_Desc.pExistingCollection = pExistingCollection;
+    }
+    // Appends one export; the name strings are copied, and the desc's pointer/count are refreshed.
+    void DefineExport(LPCWSTR Name, LPCWSTR ExportToRename = nullptr,
+                      D3D12_EXPORT_FLAGS Flags = D3D12_EXPORT_FLAG_NONE) {
+        D3D12_EXPORT_DESC NewExport;
+        NewExport.Name = m_Strings.LocalCopy(Name);
+        NewExport.ExportToRename = m_Strings.LocalCopy(ExportToRename);
+        NewExport.Flags = Flags;
+        m_Exports.push_back(NewExport);
+        m_Desc.NumExports = static_cast<UINT>(m_Exports.size());
+        m_Desc.pExports = &m_Exports[0]; // using ugly way to get pointer in case .data() is not defined
+    }
+    template <size_t N> void DefineExports(LPCWSTR (&Exports)[N]) {
+        for (UINT i = 0; i < N; ++i)
+            DefineExport(Exports[i]);
+    }
+    void DefineExports(const LPCWSTR *Exports, UINT N) {
+        for (UINT i = 0; i < N; ++i)
+            DefineExport(Exports[i]);
+    }
+    D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override { return D3D12_STATE_SUBOBJECT_TYPE_EXISTING_COLLECTION; }
+    operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; }
+    operator const D3D12_EXISTING_COLLECTION_DESC &() const noexcept { return m_Desc; }
+
+  private:
+    void Init() noexcept {
+        SUBOBJECT_HELPER_BASE::Init();
+        m_Desc = {};
+        m_CollectionRef = nullptr;
+        m_Strings.clear();
+        m_Exports.clear();
+    }
+    void *Data() noexcept override { return &m_Desc; }
+    D3D12_EXISTING_COLLECTION_DESC m_Desc;
+    D3DX12_COM_PTR<ID3D12StateObject> m_CollectionRef;
+    CD3DX12_STATE_OBJECT_DESC::StringContainer m_Strings;
+    std::vector<D3D12_EXPORT_DESC> m_Exports;
+};
+
+//------------------------------------------------------------------------------------------------
+// Helper that associates another subobject (by reference) with a list of export names it copies locally.
+class CD3DX12_SUBOBJECT_TO_EXPORTS_ASSOCIATION_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE {
+  public:
+    CD3DX12_SUBOBJECT_TO_EXPORTS_ASSOCIATION_SUBOBJECT() noexcept { Init(); }
+    CD3DX12_SUBOBJECT_TO_EXPORTS_ASSOCIATION_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) {
+        Init();
+        AddToStateObject(ContainingStateObject);
+    }
+    void SetSubobjectToAssociate(const D3D12_STATE_SUBOBJECT &SubobjectToAssociate) noexcept {
+        m_Desc.pSubobjectToAssociate = &SubobjectToAssociate;
+    }
+    void AddExport(LPCWSTR Export) {
+        ++m_Desc.NumExports;
+        LPCWSTR OwnedName = m_Strings.LocalCopy(Export); // a null Export stays null
+        m_Exports.push_back(OwnedName);
+        m_Desc.pExports = &m_Exports[0]; // using ugly way to get pointer in case .data() is not defined
+    }
+    template <size_t N> void AddExports(LPCWSTR (&Exports)[N]) {
+        for (UINT i = 0; i < N; ++i)
+            AddExport(Exports[i]);
+    }
+    void AddExports(const LPCWSTR *Exports, UINT N) {
+        for (UINT i = 0; i < N; ++i)
+            AddExport(Exports[i]);
+    }
+    D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override {
+        return D3D12_STATE_SUBOBJECT_TYPE_SUBOBJECT_TO_EXPORTS_ASSOCIATION;
+    }
+    operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; }
+    operator const D3D12_SUBOBJECT_TO_EXPORTS_ASSOCIATION &() const noexcept { return m_Desc; }
+
+  private:
+    void Init() noexcept {
+        SUBOBJECT_HELPER_BASE::Init();
+        m_Desc = {};
+        m_Strings.clear();
+        m_Exports.clear();
+    }
+    void *Data() noexcept override { return &m_Desc; }
+    D3D12_SUBOBJECT_TO_EXPORTS_ASSOCIATION m_Desc;
+    CD3DX12_STATE_OBJECT_DESC::StringContainer m_Strings;
+    std::vector<LPCWSTR> m_Exports;
+};
+
+//------------------------------------------------------------------------------------------------
+// Helper that associates a subobject, identified by name, with a list of export names; all strings are copied.
+class CD3DX12_DXIL_SUBOBJECT_TO_EXPORTS_ASSOCIATION : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE {
+  public:
+    CD3DX12_DXIL_SUBOBJECT_TO_EXPORTS_ASSOCIATION() noexcept { Init(); }
+    CD3DX12_DXIL_SUBOBJECT_TO_EXPORTS_ASSOCIATION(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) {
+        Init();
+        AddToStateObject(ContainingStateObject);
+    }
+    void SetSubobjectNameToAssociate(LPCWSTR SubobjectToAssociate) {
+        m_Desc.SubobjectToAssociate = m_SubobjectName.LocalCopy(SubobjectToAssociate, true);
+    }
+    void AddExport(LPCWSTR Export) {
+        ++m_Desc.NumExports;
+        LPCWSTR OwnedName = m_Strings.LocalCopy(Export); // a null Export stays null
+        m_Exports.push_back(OwnedName);
+        m_Desc.pExports = &m_Exports[0]; // using ugly way to get pointer in case .data() is not defined
+    }
+    template <size_t N> void AddExports(LPCWSTR (&Exports)[N]) {
+        for (UINT i = 0; i < N; ++i)
+            AddExport(Exports[i]);
+    }
+    void AddExports(const LPCWSTR *Exports, UINT N) {
+        for (UINT i = 0; i < N; ++i)
+            AddExport(Exports[i]);
+    }
+    D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override {
+        return D3D12_STATE_SUBOBJECT_TYPE_DXIL_SUBOBJECT_TO_EXPORTS_ASSOCIATION;
+    }
+    operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; }
+    operator const D3D12_DXIL_SUBOBJECT_TO_EXPORTS_ASSOCIATION &() const noexcept { return m_Desc; }
+
+  private:
+    void Init() noexcept {
+        SUBOBJECT_HELPER_BASE::Init();
+        m_Desc = {};
+        m_Strings.clear();
+        m_SubobjectName.clear();
+        m_Exports.clear();
+    }
+    void *Data() noexcept override { return &m_Desc; }
+    D3D12_DXIL_SUBOBJECT_TO_EXPORTS_ASSOCIATION m_Desc;
+    CD3DX12_STATE_OBJECT_DESC::StringContainer m_Strings;
+    CD3DX12_STATE_OBJECT_DESC::StringContainer m_SubobjectName;
+    std::vector<LPCWSTR> m_Exports;
+};
+
+//------------------------------------------------------------------------------------------------
+// Helper for a HIT_GROUP subobject; keeps a private copy of each of the four export/import names.
+class CD3DX12_HIT_GROUP_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE {
+  public:
+    CD3DX12_HIT_GROUP_SUBOBJECT() noexcept { Init(); }
+    CD3DX12_HIT_GROUP_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) {
+        Init();
+        AddToStateObject(ContainingStateObject);
+    }
+    void SetHitGroupExport(LPCWSTR exportName) { m_Desc.HitGroupExport = m_Strings[0].LocalCopy(exportName, true); }
+    void SetHitGroupType(D3D12_HIT_GROUP_TYPE Type) noexcept { m_Desc.Type = Type; }
+    void SetAnyHitShaderImport(LPCWSTR importName) {
+        m_Desc.AnyHitShaderImport = m_Strings[1].LocalCopy(importName, true); // container 1
+    }
+    void SetClosestHitShaderImport(LPCWSTR importName) {
+        m_Desc.ClosestHitShaderImport = m_Strings[2].LocalCopy(importName, true); // container 2
+    }
+    void SetIntersectionShaderImport(LPCWSTR importName) {
+        m_Desc.IntersectionShaderImport = m_Strings[3].LocalCopy(importName, true); // container 3
+    }
+    D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override { return D3D12_STATE_SUBOBJECT_TYPE_HIT_GROUP; }
+    operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; }
+    operator const D3D12_HIT_GROUP_DESC &() const noexcept { return m_Desc; }
+
+  private:
+    void Init() noexcept {
+        SUBOBJECT_HELPER_BASE::Init();
+        m_Desc = {};
+        for (auto &Slot : m_Strings)
+            Slot.clear();
+    }
+    void *Data() noexcept override { return &m_Desc; }
+    D3D12_HIT_GROUP_DESC m_Desc;
+    static const UINT m_NumStrings = 4;
+    CD3DX12_STATE_OBJECT_DESC::StringContainer m_Strings[m_NumStrings]; // one string for every entrypoint name
+};
+
+//------------------------------------------------------------------------------------------------
+class CD3DX12_RAYTRACING_SHADER_CONFIG_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE {
+  public:
+    CD3DX12_RAYTRACING_SHADER_CONFIG_SUBOBJECT() noexcept { Init(); } // detached; attach with AddToStateObject()
+    CD3DX12_RAYTRACING_SHADER_CONFIG_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) {
+        Init();
+        AddToStateObject(ContainingStateObject); // registers &m_Desc with the containing state object desc
+    }
+    void Config(UINT MaxPayloadSizeInBytes, UINT MaxAttributeSizeInBytes) noexcept { // stores both limits
+        m_Desc.MaxPayloadSizeInBytes = MaxPayloadSizeInBytes;
+        m_Desc.MaxAttributeSizeInBytes = MaxAttributeSizeInBytes;
+    }
+    D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override {
+        return D3D12_STATE_SUBOBJECT_TYPE_RAYTRACING_SHADER_CONFIG;
+    }
+    operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; } // requires prior attachment
+    operator const D3D12_RAYTRACING_SHADER_CONFIG &() const noexcept { return m_Desc; }
+
+  private:
+    void Init() noexcept { // clears the tracked-subobject pointer and zeroes the desc
+        SUBOBJECT_HELPER_BASE::Init();
+        m_Desc = {};
+    }
+    void *Data() noexcept override { return &m_Desc; } // payload pointer handed to TrackSubobject()
+    D3D12_RAYTRACING_SHADER_CONFIG m_Desc;
+};
+
+//------------------------------------------------------------------------------------------------
+class CD3DX12_RAYTRACING_PIPELINE_CONFIG_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE {
+  public:
+    CD3DX12_RAYTRACING_PIPELINE_CONFIG_SUBOBJECT() noexcept { Init(); } // detached; attach with AddToStateObject()
+    CD3DX12_RAYTRACING_PIPELINE_CONFIG_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) {
+        Init();
+        AddToStateObject(ContainingStateObject); // registers &m_Desc with the containing state object desc
+    }
+    void Config(UINT MaxTraceRecursionDepth) noexcept { m_Desc.MaxTraceRecursionDepth = MaxTraceRecursionDepth; }
+    D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override {
+        return D3D12_STATE_SUBOBJECT_TYPE_RAYTRACING_PIPELINE_CONFIG;
+    }
+    operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; } // requires prior attachment
+    operator const D3D12_RAYTRACING_PIPELINE_CONFIG &() const noexcept { return m_Desc; }
+
+  private:
+    void Init() noexcept { // clears the tracked-subobject pointer and zeroes the desc
+        SUBOBJECT_HELPER_BASE::Init();
+        m_Desc = {};
+    }
+    void *Data() noexcept override { return &m_Desc; } // payload pointer handed to TrackSubobject()
+    D3D12_RAYTRACING_PIPELINE_CONFIG m_Desc;
+};
+
+//------------------------------------------------------------------------------------------------
+class CD3DX12_RAYTRACING_PIPELINE_CONFIG1_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE {
+  public:
+    CD3DX12_RAYTRACING_PIPELINE_CONFIG1_SUBOBJECT() noexcept { Init(); } // detached; attach with AddToStateObject()
+    CD3DX12_RAYTRACING_PIPELINE_CONFIG1_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) {
+        Init();
+        AddToStateObject(ContainingStateObject); // registers &m_Desc with the containing state object desc
+    }
+    void Config(UINT MaxTraceRecursionDepth, D3D12_RAYTRACING_PIPELINE_FLAGS Flags) noexcept { // stores both values
+        m_Desc.MaxTraceRecursionDepth = MaxTraceRecursionDepth;
+        m_Desc.Flags = Flags;
+    }
+    D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override {
+        return D3D12_STATE_SUBOBJECT_TYPE_RAYTRACING_PIPELINE_CONFIG1;
+    }
+    operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; } // requires prior attachment
+    operator const D3D12_RAYTRACING_PIPELINE_CONFIG1 &() const noexcept { return m_Desc; }
+
+  private:
+    void Init() noexcept { // clears the tracked-subobject pointer and zeroes the desc
+        SUBOBJECT_HELPER_BASE::Init();
+        m_Desc = {};
+    }
+    void *Data() noexcept override { return &m_Desc; } // payload pointer handed to TrackSubobject()
+    D3D12_RAYTRACING_PIPELINE_CONFIG1 m_Desc;
+};
+
+//------------------------------------------------------------------------------------------------
+class CD3DX12_GLOBAL_ROOT_SIGNATURE_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE {
+  public:
+    CD3DX12_GLOBAL_ROOT_SIGNATURE_SUBOBJECT() noexcept { Init(); } // detached; attach with AddToStateObject()
+    CD3DX12_GLOBAL_ROOT_SIGNATURE_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) {
+        Init();
+        AddToStateObject(ContainingStateObject); // registers the root-signature pointer slot with the desc
+    }
+    void SetRootSignature(ID3D12RootSignature *pRootSig) noexcept { m_pRootSig = pRootSig; } // held via smart pointer
+    D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override {
+        return D3D12_STATE_SUBOBJECT_TYPE_GLOBAL_ROOT_SIGNATURE;
+    }
+    operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; } // requires prior attachment
+    operator ID3D12RootSignature *() const noexcept { return D3DX12_COM_PTR_GET(m_pRootSig); }
+
+  private:
+    void Init() noexcept { // clears the tracked-subobject pointer and drops any held root signature
+        SUBOBJECT_HELPER_BASE::Init();
+        m_pRootSig = nullptr;
+    }
+    void *Data() noexcept override { return D3DX12_COM_PTR_ADDRESSOF(m_pRootSig); } // address of the raw pointer
+    D3DX12_COM_PTR<ID3D12RootSignature> m_pRootSig;
+};
+
+//------------------------------------------------------------------------------------------------
+class CD3DX12_LOCAL_ROOT_SIGNATURE_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE {
+  public:
+    CD3DX12_LOCAL_ROOT_SIGNATURE_SUBOBJECT() noexcept { Init(); } // detached; attach with AddToStateObject()
+    CD3DX12_LOCAL_ROOT_SIGNATURE_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) {
+        Init();
+        AddToStateObject(ContainingStateObject); // registers the root-signature pointer slot with the desc
+    }
+    void SetRootSignature(ID3D12RootSignature *pRootSig) noexcept { m_pRootSig = pRootSig; } // held via smart pointer
+    D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override {
+        return D3D12_STATE_SUBOBJECT_TYPE_LOCAL_ROOT_SIGNATURE;
+    }
+    operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; } // requires prior attachment
+    operator ID3D12RootSignature *() const noexcept { return D3DX12_COM_PTR_GET(m_pRootSig); }
+
+  private:
+    void Init() noexcept { // clears the tracked-subobject pointer and drops any held root signature
+        SUBOBJECT_HELPER_BASE::Init();
+        m_pRootSig = nullptr;
+    }
+    void *Data() noexcept override { return D3DX12_COM_PTR_ADDRESSOF(m_pRootSig); } // address of the raw pointer
+    D3DX12_COM_PTR<ID3D12RootSignature> m_pRootSig;
+};
+
+//------------------------------------------------------------------------------------------------
+class CD3DX12_STATE_OBJECT_CONFIG_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE {
+  public:
+    CD3DX12_STATE_OBJECT_CONFIG_SUBOBJECT() noexcept { Init(); } // detached; attach with AddToStateObject()
+    CD3DX12_STATE_OBJECT_CONFIG_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) {
+        Init();
+        AddToStateObject(ContainingStateObject); // registers &m_Desc with the containing state object desc
+    }
+    void SetFlags(D3D12_STATE_OBJECT_FLAGS Flags) noexcept { m_Desc.Flags = Flags; }
+    D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override { return D3D12_STATE_SUBOBJECT_TYPE_STATE_OBJECT_CONFIG; }
+    operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; } // requires prior attachment
+    operator const D3D12_STATE_OBJECT_CONFIG &() const noexcept { return m_Desc; }
+
+  private:
+    void Init() noexcept { // clears the tracked-subobject pointer and zeroes the desc
+        SUBOBJECT_HELPER_BASE::Init();
+        m_Desc = {};
+    }
+    void *Data() noexcept override { return &m_Desc; } // payload pointer handed to TrackSubobject()
+    D3D12_STATE_OBJECT_CONFIG m_Desc;
+};
+
+//------------------------------------------------------------------------------------------------
+class CD3DX12_NODE_MASK_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE {
+  public:
+    CD3DX12_NODE_MASK_SUBOBJECT() noexcept { Init(); } // detached; attach with AddToStateObject()
+    CD3DX12_NODE_MASK_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) {
+        Init();
+        AddToStateObject(ContainingStateObject); // registers &m_Desc with the containing state object desc
+    }
+    void SetNodeMask(UINT NodeMask) noexcept { m_Desc.NodeMask = NodeMask; }
+    D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override { return D3D12_STATE_SUBOBJECT_TYPE_NODE_MASK; }
+    operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; } // requires prior attachment
+    operator const D3D12_NODE_MASK &() const noexcept { return m_Desc; }
+
+  private:
+    void Init() noexcept { // clears the tracked-subobject pointer and zeroes the desc
+        SUBOBJECT_HELPER_BASE::Init();
+        m_Desc = {};
+    }
+    void *Data() noexcept override { return &m_Desc; } // payload pointer handed to TrackSubobject()
+    D3D12_NODE_MASK m_Desc;
+};
+
+#undef D3DX12_COM_PTR
+#undef D3DX12_COM_PTR_GET
+#undef D3DX12_COM_PTR_ADDRESSOF
+#endif // #ifndef D3DX12_NO_STATE_OBJECT_HELPERS
+
+#endif // defined( __cplusplus )
+
+#endif //__D3DX12_H__
diff --git a/superbench/benchmarks/micro_benchmarks/directx_utils/D3D12Timer.cpp b/superbench/benchmarks/micro_benchmarks/directx_utils/D3D12Timer.cpp
new file mode 100644
index 000000000..af3472f85
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_utils/D3D12Timer.cpp
@@ -0,0 +1,80 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include "D3D12Timer.h"
+#include "../directx_third_party/DXSampleHelper.h"
+#include "../directx_third_party/d3dx12.h"
+#include <cassert>
+
+namespace D3D12 {
+D3D12Timer::D3D12Timer() {}
+
+// Destructor: release the query heap and the readback buffer if they were created.
+D3D12Timer::~D3D12Timer() {
+    if (m_queryHeap)
+        m_queryHeap->Release();
+    if (m_queryResourceCPU)
+        m_queryResourceCPU->Release();
+}
+
+void D3D12Timer::init(ID3D12Device *pDevice, ID3D12CommandQueue *pCommandQueue, UINT numTimers, QueueType type) {
+    assert(pDevice != nullptr && pCommandQueue != nullptr);
+    m_device = pDevice;
+    m_timerCount = numTimers;
+
+    // Cache milliseconds-per-tick so elapsed ticks convert to ms with a single multiply.
+    UINT64 gpuFreq = 0;
+    ThrowIfFailed(pCommandQueue->GetTimestampFrequency(&gpuFreq));
+    m_gpuFreqInv = 1000.0 / double(gpuFreq);
+
+    // Two timestamp slots (start, stop) per timer. Zero-initialize so no descriptor field is left indeterminate.
+    D3D12_QUERY_HEAP_DESC queryHeapDesc = {};
+    queryHeapDesc.Count = m_timerCount * 2;
+    // Copy queues need the copy-queue timestamp heap; every other queue type uses the regular timestamp heap.
+    queryHeapDesc.Type = (type == QueueType::copy) ? D3D12_QUERY_HEAP_TYPE_COPY_QUEUE_TIMESTAMP
+                                                   : D3D12_QUERY_HEAP_TYPE_TIMESTAMP;
+    ThrowIfFailed(m_device->CreateQueryHeap(&queryHeapDesc, IID_PPV_ARGS(&m_queryHeap)));
+
+    // Readback buffer that receives the resolved timestamp pairs for CPU access.
+    D3D12_HEAP_PROPERTIES heapProp = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_READBACK);
+    D3D12_RESOURCE_DESC resouceDesc = CD3DX12_RESOURCE_DESC::Buffer(m_timerCount * sizeof(GPUTimestampPair));
+    ThrowIfFailed(m_device->CreateCommittedResource(&heapProp, D3D12_HEAP_FLAG_NONE, &resouceDesc,
+                                                    D3D12_RESOURCE_STATE_COPY_DEST, nullptr,
+                                                    IID_PPV_ARGS(&m_queryResourceCPU)));
+}
+
+// Record the start timestamp of a pair on the command list; returns false if the index is out of range.
+bool D3D12Timer::start(ID3D12GraphicsCommandList *pCommandList, UINT timestampPairIndex) {
+    if (timestampPairIndex >= m_timerCount)
+        return false;
+    pCommandList->EndQuery(m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, getStartIndex(timestampPairIndex));
+    return true;
+}
+
+// Record the stop timestamp of a pair on the command list; returns false if the index is out of range.
+bool D3D12Timer::stop(ID3D12GraphicsCommandList *pCommandList, UINT timestampPairIndex) {
+    if (timestampPairIndex >= m_timerCount)
+        return false;
+    pCommandList->EndQuery(m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, getEndIndex(timestampPairIndex));
+    return true;
+}
+
+// Resolve a timestamp pair into m_queryResourceCPU (index not range-checked). Wait for the GPU before reading it.
+void D3D12Timer::resolveQueryToCPU(ID3D12GraphicsCommandList *pCommandList, UINT timestampPairIndex) {
+    pCommandList->ResolveQueryData(m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, getStartIndex(timestampPairIndex), 2,
+                                   m_queryResourceCPU, sizeof(GPUTimestampPair) * timestampPairIndex);
+}
+
+// Get elapsed ms of a timestamp pair (-1 if Map fails); data is copied out before Unmap() invalidates it.
+double D3D12Timer::getElapsedMsByTimestampPair(UINT timestampPairIndex) {
+    GPUTimestampPair *timingData = nullptr;
+    const SIZE_T offset = sizeof(GPUTimestampPair) * timestampPairIndex;
+    D3D12_RANGE readRange{offset, offset + sizeof(GPUTimestampPair)}, writeRange{0, 0};
+    if (SUCCEEDED(m_queryResourceCPU->Map(0, &readRange, (void **)&timingData))) {
+        const GPUTimestampPair pair = timingData[timestampPairIndex]; // Map() is not offset by readRange.
+        m_queryResourceCPU->Unmap(0, &writeRange);
+        return (pair.Stop - pair.Start) * m_gpuFreqInv;
+    }
+    return -1;
+}
+} // namespace D3D12
diff --git a/superbench/benchmarks/micro_benchmarks/directx_utils/D3D12Timer.h b/superbench/benchmarks/micro_benchmarks/directx_utils/D3D12Timer.h
new file mode 100644
index 000000000..e7308a5fe
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_utils/D3D12Timer.h
@@ -0,0 +1,54 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#pragma once
+#include <d3d12.h>
+
+namespace D3D12 {
+struct GPUTimestampPair {
+    UINT64 Start;
+    UINT64 Stop;
+};
+
+enum QueueType { compute = 0, copy = 1 };
+
+// D3D12 timer: records GPU timestamp pairs on a command list and reads them back on the CPU.
+class D3D12Timer {
+  public:
+    // Constructor.
+    D3D12Timer();
+
+    // Destructor: releases the query heap and the readback buffer.
+    ~D3D12Timer();
+
+    void init(ID3D12Device *pDevice, ID3D12CommandQueue *pCommandQueue, UINT numTimers, QueueType type);
+
+    // Record the start timestamp of a pair; returns false if the index is out of range.
+    bool start(ID3D12GraphicsCommandList *pCommandList, UINT timestampPairIndex);
+
+    // Record the stop timestamp of a pair; returns false if the index is out of range.
+    bool stop(ID3D12GraphicsCommandList *pCommandList, UINT timestampPairIndex);
+
+    // Resolve query data into the readback buffer. Make sure to wait for query to finish before reading the data.
+    void resolveQueryToCPU(ID3D12GraphicsCommandList *pCommandList, UINT timestampPairIndex);
+
+    // Get the elapsed time in ms between the start and stop timestamps of a pair (-1 if mapping fails).
+    double getElapsedMsByTimestampPair(UINT timestampPairIndex);
+
+    // Get the cached tick-to-ms factor (1000 / GPU timestamp frequency), not the frequency itself.
+    double getGPUFrequecy() { return m_gpuFreqInv; }
+
+    // Get start index of the selected timestamp pair
+    UINT getStartIndex(UINT timestampPairIndex) { return timestampPairIndex * 2; }
+
+    // Get end index of the selected timestamp pair
+    UINT getEndIndex(UINT timestampPairIndex) { return timestampPairIndex * 2 + 1; }
+
+  private:
+    ID3D12Device *m_device = nullptr;
+    ID3D12QueryHeap *m_queryHeap = nullptr;
+    ID3D12Resource *m_queryResourceCPU = nullptr;
+    UINT m_timerCount = 0;
+    double m_gpuFreqInv = 0.0;
+};
+} // namespace D3D12
diff --git a/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h b/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h
new file mode 100644
index 000000000..ce384272a
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h
@@ -0,0 +1,113 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <string>
+
+class Options {
+  protected:
+    char **begin;
+    char **end;
+
+    /**
+     * @brief Get the char* value of the cmd line argument.
+     * @param option the argument in cmd.
+     * @return char*
+     */
+    char *get_cmd_option(const std::string &option) {
+        char **itr = std::find(begin, end, option);
+        if (itr != end && ++itr != end) {
+            return *itr;
+        }
+        return 0;
+    }
+
+    /**
+     * @brief Get the int type value of cmd line argument.
+     * @param option the cmd line argument.
+     * @param defaults the default value.
+     * @return int the int type value of cmd line argument 'option'.
+     */
+    int get_cmd_line_argument_int(const std::string &option, int defaults) {
+        if (char *value = get_cmd_option(option)) {
+            try {
+                return std::stoi(value);
+            } catch (const std::exception &e) {
+                std::cerr << "Error: Invalid argument - " << option << " should be INT " << e.what() << '\n';
+                exit(1);
+            }
+        }
+        return defaults;
+    }
+
+    /**
+     * @brief Get the string type value of cmd line argument.
+     * @param  option the cmd line argument.
+     * @return std::string the string value of cmd line argument 'option', or an empty string if it is absent.
+     */
+    std::string get_cmd_line_argument_string(const std::string &option) {
+        if (char *value = get_cmd_option(option)) {
+            return std::string(value);
+        }
+        return "";
+    }
+
+    /**
+     * @brief Get the boolean type value of cmd line argument.
+     * @param  option the cmd line argument.
+     * @return bool the boolean value.
+     */
+    bool get_cmd_line_argument_bool(const std::string &option) {
+        if (cmd_option_exists(option)) {
+            return true;
+        }
+        return false;
+    }
+
+    /**
+     * @brief Check if an argument exists.
+     * @param  option the cmd line argument.
+     * @return bool if an argument exists.
+     */
+    bool cmd_option_exists(const std::string &option) { return std::find(begin, end, option) != end; }
+
+    /**
+     * @brief Get the option usage.
+     */
+    virtual void get_option_usage(){};
+
+    /**
+     * @brief Parse the arguments.
+     */
+    virtual void parse_arguments(){};
+
+  public:
+    /**
+     * @brief Construct a new Command Line object.
+     * @param argc the number of command line arguments.
+     * @param argv the string array of command line arguments.
+     */
+    Options(int argc, char *argv[]) {
+        begin = argv;
+        end = argv + argc;
+    }
+
+    /**
+     * @brief Init and parse the arguments.
+     */
+    virtual void init() {
+        if (cmd_option_exists("--help")) {
+            get_option_usage();
+            exit(0);
+        }
+        try {
+            parse_arguments();
+        } catch (const std::exception &e) {
+            std::cerr << "Error: Invalid argument - " << e.what() << '\n';
+            exit(1);
+        }
+    };
+};

From ed027e4c8ef8d15a1238342c2e5f165510ad91b6 Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Thu, 29 Jun 2023 06:09:44 +0000
Subject: [PATCH 12/33] Tools - Add runner for sys info and update docs (#532)

**Description**
Add runner for sys info to automatically collect on multiple nodes and
update related docs.

**Major Revision**
- add runner for sys info which will check docker status and run `sb
node info` on all nodes' docker and fetch results from all nodes

**Minor Revision**
- update cli and system-info doc
- update sb node info to save output info output-dir/sys-info.json
---
 docs/cli.md                         | 32 +++++++++++++++++++++++++++++
 docs/user-tutorial/system-config.md | 30 ++++++++++++++++++++++++++-
 superbench/cli/_commands.py         |  3 +++
 superbench/cli/_handler.py          |  8 +++++++-
 superbench/cli/_help.py             |  4 ++++
 superbench/cli/_node_handler.py     | 13 +++++++++++-
 superbench/runner/runner.py         | 18 ++++++++++++++++
 7 files changed, 105 insertions(+), 3 deletions(-)

diff --git a/docs/cli.md b/docs/cli.md
index df1c1ca4d..1f6b13a7a 100644
--- a/docs/cli.md
+++ b/docs/cli.md
@@ -165,6 +165,26 @@ Execute GPT2 model benchmark in default configuration:
 sb exec --config-override superbench.enable="['gpt2_models']"
 ```
 
+### `sb node info`
+Get system info on the local node.
+
+```bash title="SB CLI"
+sb node info [--output-dir]
+```
+
+#### Optional arguments
+
+| Name           | Default | Description                                                                 |
+|----------------|---------|-----------------------------------------------------------------------------|
+| `--output-dir` | `None`  | Path to output directory, outputs/{datetime} will be used if not specified. |
+
+#### Examples
+
+Get system info on the local node and save it into the `outputs` dir:
+```bash title="SB CLI"
+sb node info --output-dir outputs
+```
+
 ### `sb result diagnosis`
 
 Filter the defective machines automatically from benchmarking results according to rules defined in rule file.
@@ -284,6 +304,7 @@ sb run [--config-file]
        [--docker-image]
        [--docker-password]
        [--docker-username]
+       [--get-info]
        [--host-file]
        [--host-list]
        [--host-password]
@@ -302,6 +323,7 @@ sb run [--config-file]
 | `--docker-image` `-i`    | `superbench/superbench` | Docker image URI.                                                           |
 | `--docker-password`      | `None`                  | Docker registry password if authentication is needed.                       |
 | `--docker-username`      | `None`                  | Docker registry username if authentication is needed.                       |
+| `--get-info` `-g`        | `False`                 | Collect system info.                                                        |
 | `--host-file` `-f`       | `None`                  | Path to Ansible inventory host file.                                        |
 | `--host-list` `-l`       | `None`                  | Comma separated host list.                                                  |
 | `--host-password`        | `None`                  | Host password or key passphase if needed.                                   |
@@ -335,6 +357,16 @@ sb run --no-docker --host-list localhost --config-override \
   superbench.enable=kernel-launch superbench.env.SB_MICRO_PATH=/path/to/superbenchmark
 ```
 
+Collect system info distributedly on all nodes in `./host.ini` without running benchmarks:
+```bash title="SB CLI"
+sb run --get-info --host-file ./host.ini -C superbench.enable=none
+```
+
+Collect system info distributedly on all nodes in `./host.ini` while running benchmarks:
+```bash title="SB CLI"
+sb run --get-info --host-file ./host.ini
+```
+
 ### `sb version`
 
 Print the current SuperBench CLI version.
diff --git a/docs/user-tutorial/system-config.md b/docs/user-tutorial/system-config.md
index dbde728d3..2a749ba52 100644
--- a/docs/user-tutorial/system-config.md
+++ b/docs/user-tutorial/system-config.md
@@ -4,6 +4,8 @@ id: system-config
 
 # System Config Info
 
+This tool is to collect the system information automatically on the tested GPU nodes including the following hardware categories:
+
 - [System](#system)
 - [Memory](#memory)
 - [CPU](#cpu)
@@ -12,7 +14,33 @@ id: system-config
 - [Accelerator](#accelerator)
 - [PCIe](#pcie)
 
-## Parameter amd Details
+## Usage
+
+### Usage on local machine
+
+1. [Install SuperBench](../getting-started/installation.mdx) on the local machine using root privilege.
+
+2. Collect the system info by running the `sb node info --output-dir ${output-dir}` command with root privilege.
+
+3. After the command finishes, you can find the system info JSON file `sys_info.json` of the local node under `${output-dir}`.
+
+### Usage on multiple remote machines
+
+1. [Install SuperBench](../getting-started/installation.mdx) on the local machine.
+
+2. [Deploy SuperBench](../getting-started/run-superbench.md#deploy) onto the remote machines.
+
+3. Prepare the host file of the tested GPU nodes using [Ansible Inventory](../getting-started/configuration.md#ansible-inventory) on the local machine.
+
+4. Once SuperBench is installed and the host file is ready, you can collect the system info automatically using the `sb run --get-info` command. The detailed command can be found in [SuperBench CLI](../cli.md).
+
+  ```
+  sb run --get-info -f host.ini --output-dir ${output-dir} -C superbench.enable=none
+  ```
+
+5. After the command finishes, you can find the system info JSON file `sys_info.json` of each node under `${output-dir}/nodes/${node_name}`.
+
+## Parameter and Details
 
 ### System
 
diff --git a/superbench/cli/_commands.py b/superbench/cli/_commands.py
index f37bc0f33..2122034a3 100644
--- a/superbench/cli/_commands.py
+++ b/superbench/cli/_commands.py
@@ -67,6 +67,9 @@ def load_arguments(self, command):
                 nargs='+',
                 help='Extra arguments to override config_file.'
             )
+            ac.argument(
+                'get_info', options_list=('--get-info', '-g'), action='store_true', help='Collect node system info.'
+            )
 
         with ArgumentsContext(self, 'benchmark') as ac:
             ac.argument('name', options_list=('--name', '-n'), type=str, help='Benchmark name or regular expression.')
diff --git a/superbench/cli/_handler.py b/superbench/cli/_handler.py
index 3c2d1cbaa..41c9f3741 100644
--- a/superbench/cli/_handler.py
+++ b/superbench/cli/_handler.py
@@ -275,7 +275,8 @@ def run_command_handler(
     output_dir=None,
     private_key=None,
     config_file=None,
-    config_override=None
+    config_override=None,
+    get_info=False,
 ):
     """Run the SuperBench benchmarks distributedly.
 
@@ -295,6 +296,7 @@ def run_command_handler(
         config_file (str, optional): Path to SuperBench config file. Defaults to None.
         config_override (str, optional): Extra arguments to override config_file,
             following [Hydra syntax](https://hydra.cc/docs/advanced/override_grammar/basic). Defaults to None.
+        get_info (bool, optional): Collect node system info. Defaults to False.
 
     Raises:
         CLIError: If input arguments are invalid.
@@ -316,6 +318,10 @@ def run_command_handler(
     )
 
     runner = SuperBenchRunner(sb_config, docker_config, ansible_config, sb_output_dir)
+
     runner.run()
+    if get_info:
+        runner.run_sys_info()
+
     if runner.get_failure_count() != 0:
         sys.exit(runner.get_failure_count())
diff --git a/superbench/cli/_help.py b/superbench/cli/_help.py
index 2c7f507b2..fb7f87973 100644
--- a/superbench/cli/_help.py
+++ b/superbench/cli/_help.py
@@ -63,6 +63,10 @@
           text: >
             {cli_name} run --no-docker --host-list localhost
             --config-override superbench.enable=kernel-launch superbench.env.SB_MICRO_PATH=/path/to/superbenchmark
+        - name: Collect system info on all nodes in ./host.ini without running benchmarks
+          text: {cli_name} run --get-info --host-file ./host.ini -C superbench.enable=none
+        - name: Collect system info on all nodes in ./host.ini while running benchmarks
+          text: {cli_name} run --get-info --host-file ./host.ini
 """.format(cli_name=CLI_NAME)
 
 helps['benchmark'] = """
diff --git a/superbench/cli/_node_handler.py b/superbench/cli/_node_handler.py
index 4a57b5b20..d59ed8b85 100644
--- a/superbench/cli/_node_handler.py
+++ b/superbench/cli/_node_handler.py
@@ -3,17 +3,28 @@
 
 """SuperBench CLI node subgroup command handler."""
 
+from pathlib import Path
+import json
+
 from superbench.tools import SystemInfo
+from superbench.common.utils import create_sb_output_dir
 
 
-def info_command_handler():
+def info_command_handler(output_dir=None):
     """Get node hardware info.
 
+    Args:
+        output_dir (str): Output directory.
+
     Returns:
         dict: node info.
     """
     try:
         info = SystemInfo().get_all()
+        output_dir = create_sb_output_dir(output_dir)
+        output_dir_path = Path(output_dir)
+        with open(output_dir_path / 'sys_info.json', 'w') as f:
+            json.dump(info, f)
     except Exception as ex:
         raise RuntimeError('Failed to get node info.') from ex
     return info
diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index d91020bfb..bd8cc9c83 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -199,6 +199,24 @@ def deploy(self):    # pragma: no cover
             )
         self._ansible_client.run(self._ansible_client.get_playbook_config('deploy.yaml', extravars=extravars))
 
+    def run_sys_info(self):
+        """Collect system info on all nodes by running `sb node info`, then fetch the results."""
+        self.check_env()
+
+        logger.info('Runner is going to get node system info.')
+
+        fcmd = (
+            "bash -c 'cd $SB_WORKSPACE && {command}'"
+            if self._docker_config.skip else "docker exec sb-workspace bash -c '{command}'"
+        )
+        sys_info_cmd = 'sb node info --output-dir {output_dir}'.format(output_dir=self._sb_output_dir)
+        ansible_runner_config = self._ansible_client.get_shell_config(fcmd.format(command=sys_info_cmd))
+        ansible_rc = self._ansible_client.run(ansible_runner_config, sudo=(not self._docker_config.skip))
+
+        if ansible_rc != 0:
+            self.cleanup()
+        self.fetch_results()
+
     def check_env(self):    # pragma: no cover
         """Check SuperBench environment."""
         logger.info('Checking SuperBench environment.')

From af4d18dedf8c158d3f96c034cf82464fb63c90d8 Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Thu, 29 Jun 2023 07:03:40 +0000
Subject: [PATCH 13/33] Benchmarks: Add benchmark - Add source code of
 DirectxGPUMemBw microbenchmark (#487)

**Description**
Add source code of DirectxGPUMemBw microbenchmark.

---------

Co-authored-by: v-junlinlv <v-junlinlv@microsoft.com>
---
 .../BenchmarkOptions.h                        |  83 +++++
 .../directx_mem_bw_performance/GPUMemRwBw.cpp | 317 ++++++++++++++++++
 .../directx_mem_bw_performance/GPUMemRwBw.h   | 174 ++++++++++
 .../GPUMemRwBw.vcxproj                        | 105 ++++++
 .../directx_mem_bw_performance/Main.cpp       |  24 ++
 .../directx_mem_bw_performance/ReadWrite.hlsl |  62 ++++
 .../micro_benchmarks/directx_utils/Options.h  |  51 +++
 7 files changed, 816 insertions(+)
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.cpp
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.h
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/Main.cpp
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/ReadWrite.hlsl

diff --git a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h
new file mode 100644
index 000000000..7893fe8af
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h
@@ -0,0 +1,83 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include "../directx_utils/Options.h"
+#include "GPUMemRwBw.h"
+
+enum Memtype {
+    Read,
+    Write,
+    ReadWrite,
+};
+const std::string MemtypeString[] = {"Read", "Write", "ReadWrite"};
+
+class BenchmarkOptions : public Options {
+  public:
+    // Number of warm up rounds.
+    int num_warm_up = 0;
+    // Number of loop rounds of dispatch to measure the performance.
+    int num_loop = 0;
+    // Size of data for GPU mem access.
+    unsigned long long size = 0;
+    // Lower bound of the data size range (min_size to max_size) for GPU mem access.
+    unsigned long long min_size = 0;
+    // Upper bound of the data size range (min_size to max_size) for GPU mem access.
+    unsigned long long max_size = 0;
+    // Whether check data correctness.
+    bool check_data = false;
+    // Memory operation type.
+    Memtype mem_type = Memtype::Write;
+    // Number of threads to launch.
+    UInt3 num_threads;
+
+    /**
+     * @brief Construct a new BenchmarkOptions object.
+     */
+    BenchmarkOptions(int argc, char *argv[]) : Options(argc, argv) {}
+
+    /**
+     * @brief Get the option usage.
+     */
+    void get_option_usage() override {
+        std::cout << "Usage: " << std::endl;
+        std::cout << "  --num_warm_up <num_warm_up> : Number of warm up rounds." << std::endl;
+        std::cout << "  --num_loop <num_loop> : Number of loop times to measure the performance." << std::endl;
+        std::cout << "  --size <size> : Size of data for GPU mem access." << std::endl;
+        std::cout << "  --minbytes <minbytes> : Lower data size bound to test." << std::endl;
+        std::cout << "  --maxbytes <maxbytes> : Upper data size bound to test." << std::endl;
+        std::cout << "  --check_data : Whether check data correctness." << std::endl;
+        std::cout << "  --read : Memory operation type is read." << std::endl;
+        std::cout << "  --write : Memory operation type is write." << std::endl;
+        std::cout << "  --readwrite : Memory operation type is readwrite." << std::endl;
+        std::cout << "  --numthreads <x>,<y>,<z> : Number of threads in 3 dimensions to launch." << std::endl;
+        std::cout << "  --help : Print help message." << std::endl;
+    }
+
+    /**
+     * @brief Parse the arguments; "--check" is still accepted as an alias of the documented "--check_data".
+     */
+    virtual void parse_arguments() override {
+        num_warm_up = get_cmd_line_argument_int("--num_warm_up", 0);
+        num_loop = get_cmd_line_argument_int("--num_loop", 1);
+        size = get_cmd_line_argument_ulonglong("--size", -1);
+        min_size = get_cmd_line_argument_ulonglong("--minbytes", 4 * 1024);
+        max_size = get_cmd_line_argument_ulonglong("--maxbytes", 1ULL * 1024 * 1024 * 1024);
+        check_data = get_cmd_line_argument_bool("--check_data") || get_cmd_line_argument_bool("--check");
+        if (get_cmd_line_argument_bool("--read")) {
+            mem_type = Memtype::Read;
+        }
+        if (get_cmd_line_argument_bool("--write")) {
+            mem_type = Memtype::Write;
+        }
+        if (get_cmd_line_argument_bool("--readwrite")) {
+            mem_type = Memtype::ReadWrite;
+        }
+        num_threads = get_cmd_line_argument_uint3("--numthreads", {256, 1, 1});
+    }
+};
diff --git a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.cpp b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.cpp
new file mode 100644
index 000000000..75a7f7141
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.cpp
@@ -0,0 +1,317 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include <fstream>
+#include <iostream>
+#include <tchar.h>
+#include <vector>
+
+#include "GPUMemRwBw.h"
+
+/**
+ * @brief Start benchmark: set up the GPU objects, run the measurement and print the bandwidth.
+ */
+void GPUMemRwBw::Run() {
+    // Create GPU pipeline and device objects.
+    CreatePipeline();
+    // Prepare data and buffers.
+    PrepareDataAndBuffer(this->m_num_elements);
+    // Load shaders and root signatures.
+    LoadAssets();
+    // Run the timed dispatches, then convert bytes moved per ms into GB/s (bytes / ms / 1e6).
+    double time_ms = MemReadWriteBench(this->m_num_elements, opts->num_loop, opts->num_warm_up);
+    double bw = this->m_num_elements * sizeof(float) * opts->num_loop / time_ms / 1e6;
+    // Output benchmark result.
+    std::string mode = MemtypeString[static_cast<int>(opts->mem_type)];
+    cout << "GPUMemBw: " << mode << " " << opts->size << " " << bw << " GB/s" << endl;
+}
+
+/**
+ * @brief Allocate resources on both CPU side and GPU side and construct an array of buffers with given length.
+ * @param numElement the length of data array.
+ * Also fills the shader parameter buffer and submits the recorded commands via ExecuteWaitForCommandQueue().
+ */
+void GPUMemRwBw::PrepareDataAndBuffer(SIZE_T numElement) {
+    // Prepare CPU side data.
+    std::vector<float> dataA(numElement);
+    for (SIZE_T i = 0; i < numElement; i++) {
+        dataA[i] = i % 256;
+    }
+    // Upload the data into a GPU input buffer (only for the Write and ReadWrite modes).
+    UINT64 byteSize = dataA.size() * sizeof(float);
+    if (opts->mem_type == Memtype::Write || opts->mem_type == Memtype::ReadWrite) {
+        m_inputBuffer =
+            CreateDefaultBuffer(m_device.Get(), m_commandList.Get(), dataA.data(), byteSize, m_uploadBuffer);
+    }
+    // Allocate the output buffer on the default heap with unordered access enabled.
+    ThrowIfFailed(m_device->CreateCommittedResource(
+        get_rvalue_ptr(CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT)), D3D12_HEAP_FLAG_NONE,
+        get_rvalue_ptr(CD3DX12_RESOURCE_DESC::Buffer(byteSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS)),
+        D3D12_RESOURCE_STATE_UNORDERED_ACCESS, nullptr, IID_PPV_ARGS(&m_outputBuffer)));
+    // Allocate readback buffer if needed.
+    if (opts->check_data && opts->mem_type != Memtype::Read) {
+        // Allocate readback buffer to check result correctness
+        ThrowIfFailed(m_device->CreateCommittedResource(
+            get_rvalue_ptr(CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_READBACK)), D3D12_HEAP_FLAG_NONE,
+            get_rvalue_ptr(CD3DX12_RESOURCE_DESC::Buffer(byteSize)), D3D12_RESOURCE_STATE_COPY_DEST, nullptr,
+            IID_PPV_ARGS(&m_readbackBuffer)));
+    }
+    // Prepare the parameter buffer of shader.
+    UINT8 *pCBDataBegin;
+    CD3DX12_HEAP_PROPERTIES heapProperties(D3D12_HEAP_TYPE_UPLOAD);
+    CD3DX12_RESOURCE_DESC bufferDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeof(ParameterBuffer));
+    ThrowIfFailed(m_device->CreateCommittedResource(&heapProperties, D3D12_HEAP_FLAG_NONE, &bufferDesc,
+                                                    D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
+                                                    IID_PPV_ARGS(&m_constantBuffer)));
+    // Fill the constant buffer to pass parameters to GPU.
+    ParameterBuffer param;
+    // Total number of threads; widen first so the product cannot wrap if the UInt3 fields are 32-bit.
+    SIZE_T totalThreadNum = static_cast<SIZE_T>(m_num_dispatch.x) * m_num_dispatch.y * m_num_dispatch.z *
+                            m_num_thread.x * m_num_thread.y * m_num_thread.z;
+    param.numLoop = numElement / totalThreadNum;
+    param.numThread = m_num_thread;
+    param.numDispatch = m_num_dispatch;
+    // Upload constant buffer.
+    ThrowIfFailed(m_constantBuffer->Map(0, nullptr, reinterpret_cast<void **>(&pCBDataBegin)));
+    memcpy(pCBDataBegin, &param, sizeof(param));
+    m_constantBuffer->Unmap(0, nullptr);
+    // Commit resource allocation command list.
+    ExecuteWaitForCommandQueue();
+}
+
+/**
+ * @brief Check result correctness.
+ * @param numElement the length of data array.
+ * @return true if result is correct.
+ */
+bool GPUMemRwBw::CheckData(SIZE_T numElement) {
+    // Readback result to check correctness.
+    m_commandList->ResourceBarrier(
+        1, get_rvalue_ptr(CD3DX12_RESOURCE_BARRIER::Transition(m_outputBuffer.Get(), D3D12_RESOURCE_STATE_COMMON,
+                                                               D3D12_RESOURCE_STATE_COPY_SOURCE)));
+    m_commandList->CopyResource(m_readbackBuffer.Get(), m_outputBuffer.Get());
+    m_commandList->ResourceBarrier(
+        1, get_rvalue_ptr(CD3DX12_RESOURCE_BARRIER::Transition(m_outputBuffer.Get(), D3D12_RESOURCE_STATE_COPY_SOURCE,
+                                                               D3D12_RESOURCE_STATE_COMMON)));
+    // Execute copy back and sync.
+    ExecuteWaitForCommandQueue();
+    // Access from CPU and find the first element that breaks the expected i % 256 pattern.
+    float *mappedData = nullptr;
+    ThrowIfFailed(m_readbackBuffer->Map(0, nullptr, reinterpret_cast<void **>(&mappedData)));
+    SIZE_T i = 0;
+    while (i < numElement && (int)mappedData[i] == (int)(i % 256))
+        ++i;
+    if (i < numElement) {
+        cout << "Error: check data failed - index " << i << " should be " << i % 256 << " but got "
+             << (int)mappedData[i] << endl;
+    }
+    m_readbackBuffer->Unmap(0, nullptr);
+    return i == numElement;
+}
+
+/**
+ * @brief Memory read write benchmark: time `loops` dispatches after `numWarmUp` untimed warm-up dispatches.
+ * @param numElem the length of data array.
+ * @return double the time elapsed in ms.
+ */
+double GPUMemRwBw::MemReadWriteBench(SIZE_T numElem, int loops, int numWarmUp) {
+    // Start test.
+    m_gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute);
+    for (int i = 0; i < loops + numWarmUp; i++) {
+        if (i == numWarmUp) {
+            // Start timestamp once the warm-up dispatches have been recorded.
+            m_gpuTimer.start(m_commandList.Get(), 0);
+        }
+        UInt3 dispatch = m_num_dispatch;
+        m_commandList->Dispatch(dispatch.x, dispatch.y, dispatch.z);
+    }
+    // Stop timestamp.
+    m_gpuTimer.stop(m_commandList.Get(), 0);
+    m_gpuTimer.resolveQueryToCPU(m_commandList.Get(), 0);
+
+    // Close, execute (and optionally reset) the command list, and also to use a fence to wait for the command queue.
+    ExecuteWaitForCommandQueue();
+
+    // Get time in ms.
+    double timeInMs = m_gpuTimer.getElapsedMsByTimestampPair(0);
+
+    if (opts->check_data && opts->mem_type != Memtype::Read) {
+        CheckData(numElem);
+    }
+    return timeInMs;
+}
+
+/**
+ * @brief Create pipeline including
+ *        the device object, a direct command queue, command allocator and command list,
+ *        and the synchronization objects (fence and event handle).
+ */
+void GPUMemRwBw::CreatePipeline() {
+    UINT dxgiFactoryFlags = 0;
+#if _DEBUG
+    // Enable the debug layer (requires the Graphics Tools "optional feature").
+    // NOTE: Enabling the debug layer after device creation will invalidate the active device.
+    {
+        ComPtr<ID3D12Debug> debugController;
+        if (SUCCEEDED(D3D12GetDebugInterface(IID_PPV_ARGS(&debugController)))) {
+            debugController->EnableDebugLayer();
+            // Enable additional debug layers.
+            dxgiFactoryFlags |= DXGI_CREATE_FACTORY_DEBUG;
+        }
+    }
+#endif
+    ComPtr<IDXGIFactory4> factory;
+    ThrowIfFailed(CreateDXGIFactory2(dxgiFactoryFlags, IID_PPV_ARGS(&factory)));
+    ComPtr<IDXGIAdapter1> hardwareAdapter;
+    GetHardwareAdapter(factory.Get(), &hardwareAdapter);
+    ThrowIfFailed(D3D12CreateDevice(hardwareAdapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&m_device)));
+    D3D12_COMMAND_QUEUE_DESC cqd3 = {};
+    cqd3.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
+    ThrowIfFailed(m_device->CreateCommandQueue(&cqd3, IID_PPV_ARGS(&m_commandQueue)));
+    ThrowIfFailed(m_device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&m_commandAllocator)));
+    // Create the command list (direct type, backed by m_commandAllocator).
+    ThrowIfFailed(m_device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, m_commandAllocator.Get(), nullptr,
+                                              IID_PPV_ARGS(&m_commandList)));
+    // Create synchronization objects.
+    ThrowIfFailed(m_device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&m_fence)));
+    m_fenceValue = 1;
+    // Create an event handle to use for GPU synchronization.
+    m_eventHandle = CreateEvent(0, false, false, 0);
+}
+
+/**
+ * @brief Build the root signature, compile the selected shader, create the compute PSO and bind the buffers.
+ */
+void GPUMemRwBw::LoadAssets() {
+    // Root signature with three root descriptors, one per shader register used by ReadWrite.hlsl.
+    const int nParameter = 3;
+    CD3DX12_ROOT_PARAMETER slotRootParameter[nParameter];
+    // Slot 0: SRV (t0), slot 1: CBV (b0), slot 2: UAV (u0).
+    slotRootParameter[0].InitAsShaderResourceView(0);
+    slotRootParameter[1].InitAsConstantBufferView(0);
+    slotRootParameter[2].InitAsUnorderedAccessView(0);
+    // Serialize and create the root signature (a root signature is an array of root parameters).
+    CD3DX12_ROOT_SIGNATURE_DESC rootSigDesc(nParameter, slotRootParameter, 0, nullptr, D3D12_ROOT_SIGNATURE_FLAG_NONE);
+    ComPtr<ID3DBlob> serializedRootSig = nullptr;
+    ComPtr<ID3DBlob> errorBlob = nullptr;
+    HRESULT hr = D3D12SerializeRootSignature(&rootSigDesc, D3D_ROOT_SIGNATURE_VERSION_1,
+                                             serializedRootSig.GetAddressOf(), errorBlob.GetAddressOf());
+    if (hr != S_OK || errorBlob != nullptr) { // The error blob may be null even though the call failed.
+        const char *detail = errorBlob != nullptr ? (const char *)errorBlob->GetBufferPointer() : "(no details)";
+        std::cout << "Error: " << detail << std::endl;
+        throw runtime_error("Error: D3D12SerializeRootSignature failed.");
+    }
+    ThrowIfFailed(m_device->CreateRootSignature(0, serializedRootSig->GetBufferPointer(),
+                                                serializedRootSig->GetBufferSize(),
+                                                IID_PPV_ARGS(m_rootSignature.GetAddressOf())));
+    // Define the number of threads per thread group as the X/Y/Z macros consumed by [numthreads(X, Y, Z)].
+    // The std::string objects must outlive the LPCSTR pointers taken from them, hence the named locals.
+    std::string x_str = std::to_string(m_num_thread.x);
+    LPCSTR x_val = x_str.c_str();
+    std::string y_str = std::to_string(m_num_thread.y);
+    LPCSTR y_val = y_str.c_str();
+    std::string z_str = std::to_string(m_num_thread.z);
+    LPCSTR z_val = z_str.c_str();
+    D3D_SHADER_MACRO defines[] = {
+        {"X", x_val},
+        {"Y", y_val},
+        {"Z", z_val},
+        {nullptr, nullptr} // The last entry must be nullptr to indicate the end of the array
+    };
+    // Compile the entry point of ReadWrite.hlsl that matches the memory type requested by the user.
+    switch (opts->mem_type) {
+    case Memtype::Read:
+        m_shader = CompileShader(L"ReadWrite.hlsl", defines, "Read", "cs_5_0");
+        break;
+    case Memtype::Write:
+        m_shader = CompileShader(L"ReadWrite.hlsl", defines, "Write", "cs_5_0");
+        break;
+    case Memtype::ReadWrite:
+        m_shader = CompileShader(L"ReadWrite.hlsl", defines, "ReadWrite", "cs_5_0");
+        break;
+    default:
+        std::cout << "Error: Invalid memory type." << std::endl;
+        exit(1);
+    }
+    // Describe and create the compute pipeline state object (PSO).
+    D3D12_COMPUTE_PIPELINE_STATE_DESC computePsoDesc = {};
+    computePsoDesc.pRootSignature = m_rootSignature.Get();
+    computePsoDesc.CS = {reinterpret_cast<BYTE *>(m_shader->GetBufferPointer()), m_shader->GetBufferSize()};
+    computePsoDesc.Flags = D3D12_PIPELINE_STATE_FLAG_NONE;
+    ThrowIfFailed(m_device->CreateComputePipelineState(&computePsoDesc, IID_PPV_ARGS(&m_PSO)));
+    // Submit what has been recorded so far and reset the command list before binding the benchmark state.
+    ExecuteWaitForCommandQueue();
+
+    // Bind the PSO, the root signature and the buffers to the freshly reset command list.
+    m_commandList->SetPipelineState(m_PSO.Get());
+    m_commandList->SetComputeRootSignature(m_rootSignature.Get());
+    if (opts->mem_type == Memtype::Write || opts->mem_type == Memtype::ReadWrite) {
+        m_commandList->SetComputeRootShaderResourceView(0, m_inputBuffer->GetGPUVirtualAddress());
+    }
+    m_commandList->SetComputeRootConstantBufferView(1, m_constantBuffer->GetGPUVirtualAddress());
+    m_commandList->SetComputeRootUnorderedAccessView(2, m_outputBuffer->GetGPUVirtualAddress());
+}
+
+/**
+ * @brief Create a default-heap buffer and record the commands that fill it through an upload-heap buffer.
+ * @param device the GPU device object.
+ * @param cmdList the GPU command list object the barriers and the copy are recorded on (not executed here).
+ * @param initData the data that need to upload.
+ * @param byteSize the size in bytes of the data, and of both buffers.
+ * @param uploadBuffer receives the upload-heap buffer; presumably it must outlive the recorded copy - confirm.
+ * @return the default-heap buffer, left in the GENERIC_READ state by the recorded commands.
+ */
+Microsoft::WRL::ComPtr<ID3D12Resource>
+GPUMemRwBw::CreateDefaultBuffer(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, const void *initData,
+                                UINT64 byteSize, Microsoft::WRL::ComPtr<ID3D12Resource> &uploadBuffer) {
+    ComPtr<ID3D12Resource> defaultBuffer;
+    // Create the target buffer on the default heap, initially in the COMMON state.
+    CD3DX12_HEAP_PROPERTIES DefaultHeap(D3D12_HEAP_TYPE_DEFAULT);
+    CD3DX12_RESOURCE_DESC defaultResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(byteSize);
+    ThrowIfFailed(device->CreateCommittedResource(&DefaultHeap, D3D12_HEAP_FLAG_NONE, &defaultResourceDesc,
+                                                  D3D12_RESOURCE_STATE_COMMON, nullptr,
+                                                  IID_PPV_ARGS(defaultBuffer.GetAddressOf())));
+    // Create the intermediate buffer of the same size on the upload heap.
+    CD3DX12_HEAP_PROPERTIES UploadHeap(D3D12_HEAP_TYPE_UPLOAD);
+    CD3DX12_RESOURCE_DESC UploadResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(byteSize);
+    ThrowIfFailed(device->CreateCommittedResource(&UploadHeap, D3D12_HEAP_FLAG_NONE, &UploadResourceDesc,
+                                                  D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
+                                                  IID_PPV_ARGS(uploadBuffer.GetAddressOf())));
+    // Describe the source data: a single row of byteSize bytes.
+    D3D12_SUBRESOURCE_DATA subResourceData = {};
+    subResourceData.pData = initData;
+    subResourceData.RowPitch = byteSize;
+    subResourceData.SlicePitch = subResourceData.RowPitch;
+    // Record: COMMON -> COPY_DEST, the copy through the upload buffer, then COPY_DEST -> GENERIC_READ.
+    CD3DX12_RESOURCE_BARRIER WriteBarrier = CD3DX12_RESOURCE_BARRIER::Transition(
+        defaultBuffer.Get(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_COPY_DEST);
+    cmdList->ResourceBarrier(1, &WriteBarrier);
+    UpdateSubresources<1>(cmdList, defaultBuffer.Get(), uploadBuffer.Get(), 0, 0, 1, &subResourceData);
+    CD3DX12_RESOURCE_BARRIER ReadBarrier = CD3DX12_RESOURCE_BARRIER::Transition(
+        defaultBuffer.Get(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_GENERIC_READ);
+    cmdList->ResourceBarrier(1, &ReadBarrier);
+    return defaultBuffer;
+}
+
+/**
+ * @brief Execute the recorded commands, wait up to dwMilliseconds for the GPU to finish, then reset the command list.
+ */
+void GPUMemRwBw::ExecuteWaitForCommandQueue(DWORD dwMilliseconds) {
+    // Close and submit the command list.
+    ThrowIfFailed(m_commandList->Close());
+    ID3D12CommandList *listsToExecute[] = {m_commandList.Get()};
+    m_commandQueue->ExecuteCommandLists(ARRAYSIZE(listsToExecute), listsToExecute);
+    // Signal the fence with the current value, then advance the value for the next submission.
+    const UINT64 fenceL = m_fenceValue;
+    ThrowIfFailed(m_commandQueue->Signal(m_fence.Get(), fenceL));
+    m_fenceValue++;
+    if (m_fence->GetCompletedValue() < fenceL) { // GPU has not reached the fence yet: block on the event.
+        ThrowIfFailed(m_fence->SetEventOnCompletion(fenceL, m_eventHandle));
+        if (WaitForSingleObject(m_eventHandle, dwMilliseconds) != WAIT_OBJECT_0) {
+            throw runtime_error("Error: Waiting for the command queue timed out or failed.");
+        }
+    }
+    // The submitted work has completed, so the allocator and the command list can be reset for new recording.
+    ThrowIfFailed(m_commandAllocator->Reset());
+    ThrowIfFailed(m_commandList->Reset(m_commandAllocator.Get(), nullptr));
+}
diff --git a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.h b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.h
new file mode 100644
index 000000000..59ca86db8
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.h
@@ -0,0 +1,174 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#pragma once
+
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers.
+#endif
+
+#include <algorithm>
+#include <chrono>
+#include <random>
+#include <string>
+#include <vector>
+#include <wrl.h>
+
+#include <D3Dcompiler.h>
+#include <DirectXMath.h>
+#include <d3d12.h>
+#include <d3d12shader.h>
+#include <dxgi1_6.h>
+
+#include "../directx_third_party/DXSampleHelper.h"
+#include "../directx_third_party/d3dx12.h"
+#include "../directx_utils/D3D12Timer.h"
+#include "BenchmarkOptions.h"
+
+// linker
+#pragma comment(lib, "dxguid.lib")
+#pragma comment(lib, "dxgi.lib")
+#pragma comment(lib, "d3d12.lib")
+#pragma comment(lib, "d3dcompiler.lib")
+
+#if defined(_DEBUG)
+#include <dxgidebug.h>
+#endif
+
+using namespace DirectX;
+// Note that while ComPtr is used to manage the lifetime of resources on the CPU,
+// it has no understanding of the lifetime of resources on the GPU. Apps must account
+// for the GPU lifetime of resources to avoid destroying objects that may still be
+// referenced by the GPU.
+// An example of this can be found in the class method: OnDestroy().
+using Microsoft::WRL::ComPtr;
+using namespace std;
+
+struct ParameterBuffer { // Same fields, in the same order, as the ParamBuffer cbuffer in ReadWrite.hlsl.
+    int numLoop;       // Counterpart of the shader's numLoop.
+    UInt3 numThread;   // Counterpart of the shader's numThreads.
+    UInt3 numDispatch; // Counterpart of the shader's numDispatch.
+};
+
+template <typename T> T *get_rvalue_ptr(T &&v) { return &v; } // Address of a temporary; valid only while it lives.
+
+class GPUMemRwBw {
+  public:
+    /**
+     * @brief Constructor, derive the element count and the dispatch layout from the options.
+     * @param opts Options for construct; throws std::invalid_argument if the num_threads product is zero.
+     */
+    GPUMemRwBw(BenchmarkOptions *opts) : opts(opts) {
+        m_num_thread = opts->num_threads;
+        const uint32_t threadsPerGroup = m_num_thread.x * m_num_thread.y * m_num_thread.z;
+        if (threadsPerGroup == 0) // Would otherwise divide by zero below.
+            throw std::invalid_argument("Error: Invalid argument - the product of num_threads must be non-zero.");
+        m_num_elements = opts->size / sizeof(float);
+        m_num_dispatch = {m_num_elements / threadsPerGroup, 1, 1};
+    }
+
+    /**
+     * @brief Destructor; the ComPtr members release their references automatically.
+     */
+    ~GPUMemRwBw() {}
+
+    /**
+     * @brief Start and run the benchmark.
+     */
+    void Run();
+
+    /**
+     * @brief Memory read write benchmark.
+     * @param numElem the length of data array.
+     * @param loops the number of dispatch times for measuring the performance.
+     * @param numWarmUp the number of warm up dispatch times.
+     * @return double the time elapsed in ms.
+     */
+    double MemReadWriteBench(SIZE_T numElem, int loops, int numWarmUp);
+
+    /**
+     * @brief Create pipeline including
+     *        create device object, command list, command queue
+     *        and synchronization objects.
+     */
+    void CreatePipeline();
+
+    /**
+     * @brief Setup GPU pipeline resource including creating root signature, pipeline state and compile shader.
+     */
+    void LoadAssets();
+
+    /**
+     * @brief Allocate resource on both CPU side and GPU side and construct an array of buffers with given length.
+     * @param numElement the length of data array.
+     */
+    void PrepareDataAndBuffer(SIZE_T numElement);
+
+    /**
+     * @brief Create a default buffer and upload data with the upload buffer.
+     * @param device the GPU device object.
+     * @param cmdList the GPU command list object.
+     * @param initData the data that need to upload.
+     * @param byteSize the size of data that need to upload.
+     * @param uploadBuffer the upload buffer that is used to upload the data.
+     * @return the default buffer object.
+     */
+    Microsoft::WRL::ComPtr<ID3D12Resource> CreateDefaultBuffer(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList,
+                                                               const void *initData, UINT64 byteSize,
+                                                               Microsoft::WRL::ComPtr<ID3D12Resource> &uploadBuffer);
+
+    /**
+     * @brief Execute the commands and wait up to dwMilliseconds (30 s by default) until they completed.
+     */
+    void ExecuteWaitForCommandQueue(DWORD dwMilliseconds = 30000);
+
+    /**
+     * @brief Check result correctness.
+     * @param numElement the length of data array.
+     * @return true if result is correct.
+     */
+    bool CheckData(SIZE_T numElement);
+
+  private:
+    // Dispatch layout of command.
+    UInt3 m_num_dispatch;
+    // Number of elements in data buffer.
+    uint32_t m_num_elements = 0;
+    // Number of threads each group.
+    UInt3 m_num_thread;
+
+    // Pipeline objects.
+    ComPtr<ID3D12Device> m_device = nullptr;
+    ComPtr<ID3D12CommandAllocator> m_commandAllocator = nullptr;
+    ComPtr<ID3D12CommandQueue> m_commandQueue = nullptr;
+    ComPtr<ID3D12GraphicsCommandList> m_commandList = nullptr;
+
+    // Upload buffer to upload data from CPU to GPU.
+    ComPtr<ID3D12Resource> m_uploadBuffer = nullptr;
+    // Input buffer to pass data into GPU.
+    ComPtr<ID3D12Resource> m_inputBuffer = nullptr;
+    // Readback buffer to copy data from GPU to CPU for data check.
+    ComPtr<ID3D12Resource> m_readbackBuffer = nullptr;
+    // Output buffer.
+    ComPtr<ID3D12Resource> m_outputBuffer = nullptr;
+    // Constant buffer.
+    ComPtr<ID3D12Resource> m_constantBuffer = nullptr;
+
+    // Root signature of GPU pipeline.
+    ComPtr<ID3D12RootSignature> m_rootSignature = nullptr;
+    // Pipeline object to execute.
+    ComPtr<ID3D12PipelineState> m_PSO = nullptr;
+    // Shader objects that loaded.
+    ComPtr<ID3DBlob> m_shader = nullptr;
+
+    // Synchronization objects.
+    ComPtr<ID3D12Fence1> m_fence = nullptr;
+    HANDLE m_eventHandle = nullptr; // NOTE(review): raw handle, not closed by the destructor - confirm ownership.
+    UINT64 m_fenceValue = 0;
+
+    // GPU timer.
+    D3D12::D3D12Timer m_gpuTimer;
+
+    // User options.
+    BenchmarkOptions *opts;
+};
diff --git a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj
new file mode 100644
index 000000000..80ab02e37
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj
@@ -0,0 +1,105 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>16.0</VCProjectVersion>
+    <Keyword>Win32Proj</Keyword>
+    <ProjectGuid>{7880ced5-0e93-4003-9f9b-2ed29bc4bd0f}</ProjectGuid>
+    <RootNamespace>GPUMemRwBw</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="../directx_utils/D3D12Timer.cpp"/>
+    <ClCompile Include="Main.cpp"/>
+    <ClCompile Include="GPUMemRwBw.cpp"/>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="../directx_utils/D3D12Timer.h"/>
+    <ClInclude Include="../directx_utils/Options.h"/>
+    <ClInclude Include="../directx_third_party/d3dx12.h"/>
+    <ClInclude Include="../directx_third_party/DXSampleHelper.h"/>
+    <ClInclude Include="GPUMemRwBw.h"/>
+    <ClInclude Include="BenchmarkOptions.h"/>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="ReadWrite.hlsl">
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Compute</ShaderType>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">4.0</ShaderModel>
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Compute</ShaderType>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">4.0</ShaderModel>
+      <FileType>Document</FileType>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">copy %(Identity) "$(OutDir)" &gt; NUL</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(OutDir)\%(Identity)</Outputs>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy %(Identity) "$(OutDir)" &gt; NUL</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(OutDir)\%(Identity)</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/Main.cpp b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/Main.cpp
new file mode 100644
index 000000000..7901224e7
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/Main.cpp
@@ -0,0 +1,24 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include <iostream>
+#include <sstream>
+
+#include "GPUMemRwBw.h"
+
+int main(int argc, char *argv[]) {
+    BenchmarkOptions option(argc, argv);
+    option.init();
+    if (option.size != -1) {
+        // Run only one size
+        GPUMemRwBw benchmark(&option);
+        benchmark.Run();
+    } else {
+        // Run all sizes, doubling each step; stop at 0 (min_size of 0 or wrap-around) instead of looping forever.
+        for (SIZE_T usize = option.min_size; usize != 0 && usize <= option.max_size; usize += usize) {
+            option.size = usize;
+            GPUMemRwBw benchmark(&option);
+            benchmark.Run();
+        }
+    }
+}
diff --git a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/ReadWrite.hlsl b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/ReadWrite.hlsl
new file mode 100644
index 000000000..f27ca2ebe
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/ReadWrite.hlsl
@@ -0,0 +1,62 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+StructuredBuffer<float> gInputA : register(t0);
+RWStructuredBuffer<float> gOutput : register(u0);
+
+cbuffer ParamBuffer : register(b0) {
+	int numLoop;       // Number of consecutive elements each thread processes.
+	uint3 numThreads;  // Threads per group; x and y are used to linearize the thread id.
+	uint3 numDispatch; // Thread groups per dimension; x and y are used to linearize the thread id.
+};
+
+[numthreads(X, Y, Z)] // X, Y, Z are macros supplied when the shader is compiled.
+void Read(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID, uint3 dispatchId : SV_DispatchThreadID)
+{
+	uint idStart = dispatchId.x +
+		dispatchId.y * numDispatch.x * numThreads.x +
+		dispatchId.z * numDispatch.x * numThreads.x * numDispatch.y * numThreads.y;
+	// idStart is the linear id of this thread; each thread owns numLoop consecutive elements.
+	uint start = idStart * numLoop;
+	uint end = start + numLoop;
+	for (uint i = start; i < end; i++)
+	{
+		float c = gOutput[i]; // The read being measured; note it reads gOutput, not gInputA.
+		if (c == -1)
+		{
+			// This branch should never be taken since gOutput is initialized to zero.
+			// It only exists to keep the compiler from optimizing the read away.
+			gOutput[i] = 0;
+		}
+	}
+}
+
+[numthreads(X, Y, Z)] // X, Y, Z are macros supplied when the shader is compiled.
+void Write(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID, uint3 dispatchId : SV_DispatchThreadID)
+{
+	uint idStart = dispatchId.x +
+		dispatchId.y * numDispatch.x * numThreads.x +
+		dispatchId.z * numDispatch.x * numThreads.x * numDispatch.y * numThreads.y;
+	// idStart is the linear id of this thread; each thread owns numLoop consecutive elements.
+	uint start = idStart * numLoop;
+	uint end = start + numLoop;
+	for (uint i = start; i < end; i++)
+	{
+		gOutput[i] =  i % 256; // The write being measured; the value is the element index modulo 256.
+	}
+}
+
+[numthreads(X, Y, Z)] // X, Y, Z are macros supplied when the shader is compiled.
+void ReadWrite(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID, uint3 dispatchId : SV_DispatchThreadID)
+{
+	uint idStart = dispatchId.x +
+		dispatchId.y * numDispatch.x * numThreads.x +
+		dispatchId.z * numDispatch.x * numThreads.x * numDispatch.y * numThreads.y;
+	// idStart is the linear id of this thread; each thread owns numLoop consecutive elements.
+	uint start = idStart * numLoop;
+	uint end = start + numLoop;
+	for (uint i = start; i < end; i++)
+	{
+		gOutput[i] = gInputA[i]; // One read from the input buffer and one write to the output buffer per element.
+	}
+}
diff --git a/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h b/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h
index ce384272a..848688351 100644
--- a/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h
+++ b/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h
@@ -7,6 +7,12 @@
 #include <sstream>
 #include <string>
 
+struct UInt3 { // Three unsigned components, parsed from a "x,y,z" command line value.
+    unsigned int x;
+    unsigned int y;
+    unsigned int z;
+};
+
 class Options {
   protected:
     char **begin;
@@ -43,6 +49,51 @@ class Options {
         return defaults;
     }
 
+    /**
+     * @brief Split a comma-separated string and convert every token to unsigned int.
+     * @param str the comma-separated string, e.g. "256,1,1".
+     * @return std::vector<unsigned int> the converted values, in the order they appear in 'str'.
+     * @throws std::invalid_argument if a token cannot be converted; the message names the token.
+     */
+    std::vector<unsigned int> splitAndConvertToInt(const std::string &str) {
+        std::vector<unsigned int> result;
+        std::stringstream ss(str);
+        std::string token;
+
+        while (std::getline(ss, token, ',')) {
+            try {
+                result.push_back(std::stoul(token)); // Other std::stoul exceptions are not caught here.
+            } catch (std::invalid_argument &e) {
+                throw std::invalid_argument("Invalid argument: " + token + e.what());
+            }
+        }
+        return result;
+    }
+
+    /**
+     * @brief Get the UInt3 type value of cmd line argument, given as three comma-separated unsigned ints.
+     * @param option the cmd line argument.
+     * @param defaults the default value, returned when 'option' is not present.
+     * @return UInt3 the value of cmd line argument 'option'; the process exits with 1 if it is malformed.
+     */
+    UInt3 get_cmd_line_argument_uint3(const std::string &option, const UInt3 &defaults) {
+        if (char *value = get_cmd_option(option)) {
+            try {
+                std::vector<unsigned int> values = splitAndConvertToInt(value);
+                if (values.size() != 3) { // Exactly three components are required.
+                    std::cout << "Error: Invalid argument - " << option << " should be unsigned int3" << '\n';
+                    exit(1);
+                }
+                return {values[0], values[1], values[2]};
+
+            } catch (const std::exception &e) { // Conversion failures raised by splitAndConvertToInt.
+                std::cout << "Error: Invalid argument - " << option << " should be unsigned int3" << e.what() << '\n';
+                exit(1);
+            }
+        }
+        return defaults;
+    }
+
     /**
      * @brief Get the string type value of cmd line argument.
      * @param  option the cmd line argument.

From f25991370770c9f55ea8cf445e01301db61679d6 Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Thu, 29 Jun 2023 11:38:01 +0000
Subject: [PATCH 14/33] Benchmarks: Add benchmark - Add source code of
 DirectxGPUCopy microbenchmark (#486)

**Description**
Add source code of DirectxGPUCopy microbenchmark.
---
 .../BenchmarkOptions.h                        |   3 +-
 .../BenchmarkOptions.h                        |  69 +++++
 .../GPUCopyBw.cpp                             | 241 ++++++++++++++++++
 .../directx_gpu_copy_performance/GPUCopyBw.h  | 146 +++++++++++
 .../GPUCopyBw.vcxproj                         |  90 +++++++
 .../directx_gpu_copy_performance/Main.cpp     |  23 ++
 .../micro_benchmarks/directx_utils/Options.h  |  26 +-
 7 files changed, 592 insertions(+), 6 deletions(-)
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/BenchmarkOptions.h
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.cpp
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.h
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.vcxproj
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/Main.cpp

diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h
index c5207bb4f..8ba9fb913 100644
--- a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h
+++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h
@@ -37,9 +37,8 @@ class BenchmarkOptions : public Options {
      * @brief Parse the arguments.
      */
     virtual void parse_arguments() {
-
         num_loops = get_cmd_line_argument_int("--num_loops", 10);
-        num_warm_up = get_cmd_line_argument_int("--num_loops", 0);
+        num_warm_up = get_cmd_line_argument_int("--num_warm_up", 0);
         m = get_cmd_line_argument_int("--m", 16 * 256);
         n = get_cmd_line_argument_int("--n", 16 * 256);
         k = get_cmd_line_argument_int("--k", 16 * 256);
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/BenchmarkOptions.h b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/BenchmarkOptions.h
new file mode 100644
index 000000000..aa0493cd4
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/BenchmarkOptions.h
@@ -0,0 +1,69 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include "../directx_utils/Options.h"
+
+class BenchmarkOptions : public Options {
+
+  public:
+    // Number of bytes to copy in a single-size run.
+    unsigned long long size;
+    // Lower bound of the size range used for a multi-size run.
+    unsigned long long min_size = 0;
+    // Upper bound of the size range used for a multi-size run.
+    unsigned long long max_size = 0;
+    // How many warm up copies are issued.
+    int num_warm_up = 0;
+    // How many copies are issued for the measurement.
+    int num_loops = 0;
+    // Set when host-to-device copy is requested.
+    bool htod_enabled = false;
+    // Set when device-to-host copy is requested.
+    bool dtoh_enabled = false;
+    // Set when the copied data should be verified.
+    bool check_data = false;
+
+    /**
+     * @brief Build the options from the raw command line; parsing happens in parse_arguments().
+     */
+    BenchmarkOptions(int argc, char *argv[]) : Options(argc, argv) {}
+
+    /**
+     * @brief Read every option from the command line; exit with -1 when no copy direction is given.
+     */
+    virtual void parse_arguments() override {
+        size = get_cmd_line_argument_int("--size", -1);
+        num_warm_up = get_cmd_line_argument_int("--warm_up", 20);
+        num_loops = get_cmd_line_argument_int("--num_loops", 100000);
+        min_size = get_cmd_line_argument_int("--minbytes", 64);
+        max_size = get_cmd_line_argument_ulonglong("--maxbytes", 8 * 1024 * 1024);
+        htod_enabled = get_cmd_line_argument_bool("--htod");
+        dtoh_enabled = get_cmd_line_argument_bool("--dtoh");
+        check_data = get_cmd_line_argument_bool("--check");
+        if (!(htod_enabled || dtoh_enabled)) {
+            std::cerr << "Error: Please specify copy mode!" << std::endl;
+            exit(-1);
+        }
+    }
+
+    /**
+     * @brief Print the list of supported options to stdout.
+     */
+    void get_option_usage() override {
+        std::cout << "Usage: " << std::endl
+                  << "  --size <int>            Size of data for GPU copy." << std::endl
+                  << "  --warm_up <int>         Number of warm up copy times to run." << std::endl
+                  << "  --num_loops <int>       Number of copy times to run." << std::endl
+                  << "  --minbytes <int>        Run size from min_size to max_size for GPU copy." << std::endl
+                  << "  --maxbytes <int>        Run size from min_size to max_size for GPU copy." << std::endl
+                  << "  --htod <bool>           Host-to-device copy mode." << std::endl
+                  << "  --dtoh <bool>           Device-to-host copy mode." << std::endl
+                  << "  --check <bool>          Whether check data after copy." << std::endl;
+    }
+};
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.cpp b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.cpp
new file mode 100644
index 000000000..c95c79f3f
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.cpp
@@ -0,0 +1,241 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include <iostream>
+#include <tchar.h>
+#include <vector>
+
+#include "GPUCopyBw.h"
+
+/**
+ * @brief Run the benchmark for opts->size bytes and print "<mode>: <size>B <bandwidth> GB/s".
+ */
+void GPUCopyBw::Run() {
+    CreatePipeline();
+    double time_ms = CopyResourceBench(opts->size, opts->num_loops, opts->num_warm_up);
+    // Multiply in double: size * num_loops in integer arithmetic overflows (e.g. 8 MiB x 100000 loops).
+    double bw = static_cast<double>(opts->size) * opts->num_loops / time_ms / 1e6;
+    cout << (opts->dtoh_enabled ? "dtoh" : "htod") << ": " << opts->size << "B " << bw << " GB/s" << endl;
+}
+
+/**
+ * @brief Allocate GPU resources: the default, upload and (in dtoh mode only) readback buffers.
+ * @param uSize the size in bytes of each buffer.
+ */
+void GPUCopyBw::InitializeBuffer(SIZE_T uSize) {
+    m_defaultBufferDesc = CD3DX12_RESOURCE_DESC::Buffer(uSize);
+
+    // The output buffer (created below) is on a default heap, so only the GPU can access it.
+    auto defaultHeapProperties = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT);
+    ThrowIfFailed(m_device->CreateCommittedResource(&defaultHeapProperties, D3D12_HEAP_FLAG_NONE, &m_defaultBufferDesc,
+                                                    D3D12_RESOURCE_STATE_COPY_DEST, nullptr,
+                                                    IID_PPV_ARGS(&m_defaultBuffer)));
+
+    // Create upload buffer to upload data to GPU; it reuses the same buffer description (same size).
+    auto uploadHeapProperties = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_UPLOAD);
+    ThrowIfFailed(m_device->CreateCommittedResource(&uploadHeapProperties, D3D12_HEAP_FLAG_NONE, &m_defaultBufferDesc,
+                                                    D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
+                                                    IID_PPV_ARGS(&m_uploadBuffer)));
+
+    // Create read back buffer if dtoh mode; it is the destination of the device-to-host copies.
+    if (opts->dtoh_enabled) {
+        auto readbackHeapProperties = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_READBACK);
+        ThrowIfFailed(m_device->CreateCommittedResource(&readbackHeapProperties, D3D12_HEAP_FLAG_NONE,
+                                                        &m_defaultBufferDesc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr,
+                                                        IID_PPV_ARGS(&m_readbackBuffer)));
+    }
+}
+
+/**
+ * @brief Allocate the CPU side source data and fill it with the repeating byte pattern 0..255.
+ * @param byteSize the size of data to be uploaded.
+ */
+void GPUCopyBw::PrepareData(SIZE_T byteSize) {
+    m_pDataBegin = std::make_unique<uint8_t[]>(byteSize);
+    constexpr int uint8_mod = 256;
+    for (SIZE_T j = 0; j < byteSize; j++) { // SIZE_T index: no signed/unsigned compare, no int overflow.
+        m_pDataBegin[j] = static_cast<uint8_t>(j % uint8_mod);
+    }
+}
+
+/**
+ * @brief Check result correctness against the readback buffer (only performed in dtoh mode).
+ * @param byteSize the size of data to be checked.
+ * @param pData the byte array holding the expected content.
+ * @return true if every byte matches, or if dtoh mode is disabled and nothing was checked.
+ */
+bool GPUCopyBw::CheckData(SIZE_T byteSize, const uint8_t *pData) {
+    bool matched = true;
+    if (opts->dtoh_enabled) {
+        D3D12_RANGE readbackBufferRange{0, byteSize};
+        uint8_t *pReadbackBufferData{};
+        // Read back data from GPU.
+        ThrowIfFailed(m_readbackBuffer->Map(0, &readbackBufferRange, reinterpret_cast<void **>(&pReadbackBufferData)));
+        // Stop at the first mismatch, but always fall through to Unmap so the buffer is not left mapped.
+        for (SIZE_T i = 0; i < byteSize && matched; i++) {
+            matched = pData[i] == pReadbackBufferData[i];
+        }
+        // Empty written range: the CPU did not modify the readback buffer.
+        D3D12_RANGE emptyRange{0, 0};
+        m_readbackBuffer->Unmap(0, &emptyRange);
+    }
+    return matched;
+}
+
+/**
+ * @brief GPU copy benchmark.
+ * @param size the size of data to copy.
+ * @param loops the number of copy times to measure the performance.
+ * @param warm_up the number of untimed copies recorded before the start timestamp.
+ * @return double the time elapsed in ms.
+ */
+double GPUCopyBw::CopyResourceBench(SIZE_T size, int loops, int warm_up) {
+    // Prepare CPU side data buffer.
+    PrepareData(size);
+    // Prepare GPU resources and buffers.
+    InitializeBuffer(size);
+    // Set data into source buffer.
+    PrepareSourceBufferData(m_pDataBegin.get(), size);
+
+    // Run the copy command.
+    gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::copy);
+    for (int i = 0; i < loops + warm_up; i++) {
+        if (i == warm_up) {
+            // Start timestamp.
+            this->gpuTimer.start(m_commandList.Get(), 0);
+        }
+        if (opts->htod_enabled) {
+            CopyResourceFromUploadToDefault();
+        } else if (opts->dtoh_enabled) {
+            CopyResourceFromDefaultToReadback();
+        }
+    }
+    // Stop timestamp.
+    this->gpuTimer.stop(m_commandList.Get(), 0);
+    this->gpuTimer.resolveQueryToCPU(m_commandList.Get(), 0);
+
+    // Close, execute (and optionally reset) the command list, and also to use a fence to wait for the command queue.
+    this->ExecuteWaitForCopyQueue();
+
+    // Check if result is correctly copied; a mismatch is only reported, the elapsed time is still returned.
+    if (opts->check_data) {
+        bool correctness = CheckData(size, m_pDataBegin.get());
+        if (!correctness) {
+            std::cout << "Error: Result is not correct!" << std::endl;
+        }
+    }
+
+    return this->gpuTimer.getElapsedMsByTimestampPair(0);
+}
+
+/**
+ * @brief Record a copy from the upload buffer (CPU side) to the default buffer (GPU side) on the command list.
+ */
+void GPUCopyBw::CopyResourceFromUploadToDefault() {
+    m_commandList->CopyResource(m_defaultBuffer.Get(), m_uploadBuffer.Get());
+}
+
+/**
+ * @brief Record a GPU-to-GPU copy into m_defaultBuffer from m_defaultDescBuffer (not created by InitializeBuffer).
+ */
+void GPUCopyBw::CopyResourceFromDefaultToDefault() {
+    m_commandList->CopyResource(m_defaultBuffer.Get(), m_defaultDescBuffer.Get()); // NOTE(review): dst/src swapped?
+}
+
+/**
+ * @brief Record a copy from the default buffer (GPU side) to the readback buffer (CPU side) on the command list.
+ */
+void GPUCopyBw::CopyResourceFromDefaultToReadback() {
+    m_commandList->CopyResource(m_readbackBuffer.Get(), m_defaultBuffer.Get());
+}
+
+/**
+ * @brief Execute the recorded commands, wait until they complete, then reset the command list for reuse.
+ * @param dwMilliseconds maximum time to wait for the GPU; ThrowIfFailed is given a failure code on timeout.
+ */
+void GPUCopyBw::ExecuteWaitForCopyQueue(DWORD dwMilliseconds) {
+    ThrowIfFailed(m_commandList->Close());
+    ID3D12CommandList *listsToExecute[] = {m_commandList.Get()};
+    m_commandQueue->ExecuteCommandLists(ARRAYSIZE(listsToExecute), listsToExecute);
+    // Signal and increment the fence value.
+    const UINT64 fenceL = m_copyFenceValue;
+    ThrowIfFailed(m_commandQueue->Signal(m_copyFence.Get(), fenceL));
+    m_copyFenceValue++;
+    // Wait until command queue is done; a timed-out or failed wait must not fall through to the reset below.
+    if (m_copyFence->GetCompletedValue() < fenceL) {
+        ThrowIfFailed(m_copyFence->SetEventOnCompletion(fenceL, m_copyEventHandle));
+        if (WaitForSingleObject(m_copyEventHandle, dwMilliseconds) != WAIT_OBJECT_0)
+            ThrowIfFailed(HRESULT_FROM_WIN32(WAIT_TIMEOUT));
+    }
+    // Reset the command allocator and command list.
+    ThrowIfFailed(m_commandAllocator->Reset());
+    ThrowIfFailed(m_commandList->Reset(m_commandAllocator.Get(), nullptr));
+}
+
+/**
+ * @brief Fill the upload buffer with pData and, in dtoh mode, stage it into the default buffer as copy source.
+ * @param pData the data that should be uploaded.
+ * @param byteSize the size of data.
+ */
+void GPUCopyBw::PrepareSourceBufferData(const void *pData, SIZE_T byteSize) {
+    // Upload data from CPU to upload buffer.
+    void *p;
+    ThrowIfFailed(m_uploadBuffer->Map(0, nullptr, &p));
+    memcpy(p, pData, byteSize);
+    m_uploadBuffer->Unmap(0, nullptr);
+
+    if (opts->dtoh_enabled) {
+        // The default buffer is the dtoh source: copy into it, then transition COPY_DEST -> COPY_SOURCE.
+        CopyResourceFromUploadToDefault();
+        D3D12_RESOURCE_BARRIER outputBufferResourceBarrier{CD3DX12_RESOURCE_BARRIER::Transition(
+            m_defaultBuffer.Get(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COPY_SOURCE)};
+        m_commandList->ResourceBarrier(1, &outputBufferResourceBarrier);
+        ExecuteWaitForCopyQueue();
+    }
+}
+
+/**
+ * @brief Create the pipeline objects:
+ *        the device, a copy-type command queue, allocator and command list,
+ *        and the synchronization objects (fence and wait event).
+ */
+void GPUCopyBw::CreatePipeline() {
+    UINT dxgiFactoryFlags = 0;
+
+#if _DEBUG
+    // Enable the debug layer (requires the Graphics Tools "optional feature").
+    // NOTE: Enabling the debug layer after device creation will invalidate the active device.
+    {
+        ComPtr<ID3D12Debug> debugController;
+        if (SUCCEEDED(D3D12GetDebugInterface(IID_PPV_ARGS(&debugController)))) {
+            debugController->EnableDebugLayer();
+
+            // Enable additional debug layers.
+            dxgiFactoryFlags |= DXGI_CREATE_FACTORY_DEBUG;
+        }
+    }
+#endif
+
+    ComPtr<IDXGIFactory4> factory;
+    ThrowIfFailed(CreateDXGIFactory2(dxgiFactoryFlags, IID_PPV_ARGS(&factory)));
+
+    ComPtr<IDXGIAdapter1> hardwareAdapter;
+    GetHardwareAdapter(factory.Get(), &hardwareAdapter);
+
+    ThrowIfFailed(D3D12CreateDevice(hardwareAdapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&m_device)));
+
+    D3D12_COMMAND_QUEUE_DESC cqd3 = {};
+    cqd3.Type = D3D12_COMMAND_LIST_TYPE_COPY;
+    ThrowIfFailed(m_device->CreateCommandQueue(&cqd3, IID_PPV_ARGS(&m_commandQueue)));
+
+    ThrowIfFailed(m_device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COPY, IID_PPV_ARGS(&m_commandAllocator)));
+
+    // Create the command list (same copy type as the queue and allocator above).
+    ThrowIfFailed(m_device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_COPY, m_commandAllocator.Get(), nullptr,
+                                              IID_PPV_ARGS(&m_commandList)));
+
+    ThrowIfFailed(m_device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&m_copyFence)));
+    m_copyFenceValue = 1;
+    // Create an auto-reset, initially non-signaled event handle used to wait on the copy fence.
+    m_copyEventHandle = CreateEvent(0, false, false, 0);
+}
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.h b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.h
new file mode 100644
index 000000000..945aa2092
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.h
@@ -0,0 +1,146 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#pragma once
+
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers.
+#endif
+
+#include <D3Dcompiler.h>
+#include <DirectXMath.h>
+#include <d3d12.h>
+#include <d3d12shader.h>
+#include <dxgi1_6.h>
+#include <shellapi.h>
+#include <string>
+#include <wrl.h>
+
+// linker
+#pragma comment(lib, "dxguid.lib")
+#pragma comment(lib, "dxgi.lib")
+#pragma comment(lib, "d3d12.lib")
+#pragma comment(lib, "d3dcompiler.lib")
+
+#if defined(_DEBUG)
+#include <dxgidebug.h>
+#endif
+
+#include "../directx_third_party/DXSampleHelper.h"
+#include "../directx_third_party/d3dx12.h"
+#include "../directx_utils/D3D12Timer.h"
+#include "BenchmarkOptions.h"
+
+using namespace DirectX;
+// Note that while ComPtr is used to manage the lifetime of resources on the CPU,
+// it has no understanding of the lifetime of resources on the GPU. Apps must account
+// for the GPU lifetime of resources to avoid destroying objects that may still be
+// referenced by the GPU.
+// An example of this can be found in the class method: OnDestroy().
+using Microsoft::WRL::ComPtr;
+using namespace std;
+
+class GPUCopyBw {
+  public:
+    GPUCopyBw(BenchmarkOptions *opts) : opts(opts) {}
+    ~GPUCopyBw() { if (m_copyEventHandle) CloseHandle(m_copyEventHandle); } // ComPtr releases the fence itself.
+
+    /**
+     * @brief Run the benchmark.
+     */
+    void Run();
+
+    /**
+     * @brief GPU copy benchmark.
+     * @param size the size of data to copy.
+     * @param loops the number of copy times to measure the performance.
+     * @return double the time elapsed in ms.
+     */
+    double CopyResourceBench(SIZE_T size, int loops, int warm_up);
+
+    /**
+     * @brief Create pipeline including
+     *		  create device object, command list, command queue
+     *		  and synchronization objects.
+     */
+    void CreatePipeline();
+
+    /**
+     * @brief Allocate data on CPU side to prepare upload.
+     * @param byteSize the size of data to be uploaded.
+     */
+    void PrepareData(SIZE_T byteSize);
+
+    /**
+     * @brief Allocate GPU resources: the default, upload and (in dtoh mode only) readback buffers.
+     * @param uSize the size in bytes of each buffer.
+     */
+    void InitializeBuffer(SIZE_T uSize);
+
+    /**
+     * @brief Prepare data of the source buffer of benchmark.
+     * @param pData the data that should upload.
+     * @param byteSize the size of data.
+     */
+    void PrepareSourceBufferData(const void *pData, SIZE_T byteSize);
+
+    /**
+     * @brief Copy data from CPU side to GPU side.
+     */
+    void CopyResourceFromUploadToDefault();
+
+    /**
+     * @brief Copy data from GPU side to CPU side.
+     */
+    void CopyResourceFromDefaultToReadback();
+
+    /**
+     * @brief Copy data from GPU side to GPU side.
+     */
+    void CopyResourceFromDefaultToDefault();
+
+    /**
+     * @brief Execute the commands and wait until command completed.
+     */
+    void ExecuteWaitForCopyQueue(DWORD dwMilliseconds = 60000);
+
+    /**
+     * @brief Check result correctness.
+     * @param byteSize the size of data to be checked.
+     * @param pData the byte array that expect to be.
+     * @return true result is correct.
+     */
+    bool CheckData(SIZE_T byteSize, const uint8_t *pData);
+
+  private:
+    // Pipeline objects.
+    ComPtr<ID3D12Device> m_device = nullptr;
+    ComPtr<ID3D12CommandAllocator> m_commandAllocator = nullptr;
+    ComPtr<ID3D12CommandQueue> m_commandQueue = nullptr;
+    ComPtr<ID3D12GraphicsCommandList> m_commandList = nullptr;
+
+    // App resources.
+    // Pointer to the CPU side data.
+    std::unique_ptr<uint8_t[]> m_pDataBegin = nullptr;
+    // GPU side buffer.
+    ComPtr<ID3D12Resource> m_defaultBuffer = nullptr;
+    // GPU side buffer as destination if in dtod mode.
+    ComPtr<ID3D12Resource> m_defaultDescBuffer = nullptr;
+    // Upload buffer to upload data from CPU to GPU.
+    ComPtr<ID3D12Resource> m_uploadBuffer = nullptr;
+    // Read back buffer to check data correctness.
+    ComPtr<ID3D12Resource> m_readbackBuffer = nullptr;
+    // Default buffer descriptor.
+    D3D12_RESOURCE_DESC m_defaultBufferDesc;
+
+    // Synchronization objects.
+    ComPtr<ID3D12Fence1> m_copyFence = nullptr;
+    HANDLE m_copyEventHandle = nullptr;
+    UINT64 m_copyFenceValue = 0;
+
+    // GPU timer.
+    D3D12::D3D12Timer gpuTimer;
+
+    // Options.
+    BenchmarkOptions *opts;
+};
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.vcxproj b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.vcxproj
new file mode 100644
index 000000000..3be231342
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.vcxproj
@@ -0,0 +1,90 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>16.0</VCProjectVersion>
+    <Keyword>Win32Proj</Keyword>
+    <ProjectGuid>{f561fb23-0ec2-492f-9c8d-9555a0f6a4f6}</ProjectGuid>
+    <RootNamespace>GPUCopyBw</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="..\directx_utils\D3D12Timer.cpp" />
+    <ClCompile Include="GPUCopyBw.cpp" />
+    <ClCompile Include="Main.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\directx_third_party\d3dx12.h" />
+    <ClInclude Include="..\directx_third_party\DXSampleHelper.h" />
+    <ClInclude Include="..\directx_utils\D3D12Timer.h" />
+    <ClInclude Include="..\directx_utils\Options.h" />
+    <ClInclude Include="GPUCopyBw.h" />
+    <ClInclude Include="BenchmarkOptions.h" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/Main.cpp b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/Main.cpp
new file mode 100644
index 000000000..ac12597c5
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/Main.cpp
@@ -0,0 +1,33 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include <iostream>
+#include <sstream>
+
+#include "GPUCopyBw.h"
+
+/**
+ * @brief Entry point: run one copy size (--size) or sweep sizes from --minbytes to --maxbytes by doubling.
+ */
+int main(int argc, char *argv[]) {
+    BenchmarkOptions option(argc, argv);
+    option.init();
+    if (option.size != -1) {
+        // Run only one size
+        GPUCopyBw benchmark(&option);
+        benchmark.Run();
+    } else {
+        // Run all sizes. The step is usize += usize, so a non-positive start would never make progress.
+        if (option.min_size <= 0) {
+            std::cerr << "Error: --minbytes should be positive!" << std::endl;
+            return -1;
+        }
+        for (SIZE_T usize = option.min_size; usize <= option.max_size; usize += usize) {
+            // Run() reads the copy size from the options, so publish the current sweep size there first.
+            option.size = static_cast<decltype(option.size)>(usize);
+            GPUCopyBw benchmark(&option);
+            benchmark.Run();
+        }
+    }
+    return 0;
+}
diff --git a/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h b/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h
index 848688351..edb34bcee 100644
--- a/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h
+++ b/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h
@@ -6,6 +6,7 @@
 #include <iostream>
 #include <sstream>
 #include <string>
+#include <vector>
 
 struct UInt3 {
     unsigned int x;
@@ -55,6 +56,23 @@ class Options {
      * @param defaults the default value.
      * @return unsigned long long the unsigned long long type value of cmd line argument 'option'.
      */
+    unsigned long long get_cmd_line_argument_ulonglong(const std::string &option, unsigned long long defaults) {
+        if (char *value = get_cmd_option(option)) {
+            try {
+                return std::stoull(value);
+            } catch (const std::exception &e) { // Invalid or out-of-range value: report it, then use the default.
+                std::cout << "Error: Invalid argument - " << option << " should be unsigned long long: " << e.what()
+                          << '\n';
+            }
+        }
+        return defaults;
+    }
+
+    /**
+     * @brief Split the string by ',' and convert to unsigned int.
+     * @param str the string to be split.
+     * @return std::vector<unsigned int> the vector of unsigned int.
+     */
     std::vector<unsigned int> splitAndConvertToInt(const std::string &str) {
         std::vector<unsigned int> result;
         std::stringstream ss(str);
@@ -71,10 +89,10 @@ class Options {
     }
 
     /**
-     * @brief Get the unsigned int type value of cmd line argument.
+     * @brief Get the unsigned int 3 type value of cmd line argument.
      * @param option the cmd line argument.
      * @param defaults the default value.
-     * @return unsigned int the unsigned int type value of cmd line argument 'option'.
+     * @return unsigned int the unsigned int 3 type value of cmd line argument 'option'.
      */
     UInt3 get_cmd_line_argument_uint3(const std::string &option, const UInt3 &defaults) {
         if (char *value = get_cmd_option(option)) {
@@ -128,12 +146,12 @@ class Options {
     /**
      * @brief Get the option usage.
      */
-    virtual void get_option_usage(){};
+    virtual void get_option_usage() = 0;
 
     /**
      * @brief Parse the arguments.
      */
-    virtual void parse_arguments(){};
+    virtual void parse_arguments() = 0;
 
   public:
     /**

From 7184bdd1ede7037007b9bbf54d2103952191dc57 Mon Sep 17 00:00:00 2001
From: Yifan Xiong <yifan.xiong@microsoft.com>
Date: Fri, 30 Jun 2023 11:22:46 +0800
Subject: [PATCH 15/33] Benchmarks - Update result parsing in tensorrt
 inference (#541)

* Update result parsing for newer tensorrt versions
* Update arguments when load torchvision models
---
 setup.py                                      |  1 +
 .../micro_benchmarks/_export_torch_to_onnx.py |  5 +-
 .../tensorrt_inference_performance.py         | 11 +++
 .../test_tensorrt_inference_performance.py    | 16 +++-
 ...inference.log => tensorrt_inference.1.log} |  0
 tests/data/tensorrt_inference.2.log           | 80 +++++++++++++++++++
 6 files changed, 108 insertions(+), 5 deletions(-)
 rename tests/data/{tensorrt_inference.log => tensorrt_inference.1.log} (100%)
 create mode 100644 tests/data/tensorrt_inference.2.log

diff --git a/setup.py b/setup.py
index af65fc690..23c796833 100644
--- a/setup.py
+++ b/setup.py
@@ -166,6 +166,7 @@ def run(self):
         'numpy>=1.19.2',
         'omegaconf==2.0.6',
         'openpyxl>=3.0.7',
+        'packaging>=21.0',
         'pandas>=1.1.5',
         'pssh @ git+https://github.com/lilydjwg/pssh.git@v2.3.4',
         'pyyaml>=5.3',
diff --git a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
index cd7c8b134..1e37b793d 100644
--- a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
+++ b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py
@@ -5,6 +5,7 @@
 
 from pathlib import Path
 
+from packaging import version
 import torch.hub
 import torch.onnx
 import torchvision.models
@@ -129,7 +130,9 @@ def export_torchvision_model(self, model_name, batch_size=1):
         if not self.check_torchvision_model(model_name):
             return ''
         file_name = str(self._onnx_model_path / (model_name + '.onnx'))
-        model = getattr(torchvision.models, model_name)(pretrained=False).eval().cuda()
+        # the parameter 'pretrained' is deprecated since 0.13 in torchvision
+        args = {'pretrained': False} if version.parse(torchvision.__version__) < version.parse('0.13') else {}
+        model = getattr(torchvision.models, model_name)(**args).eval().cuda()
         dummy_input = torch.randn((batch_size, 3, 224, 224), device='cuda')
         torch.onnx.export(
             model,
diff --git a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
index a7a4aa17b..306aa2de8 100644
--- a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py
@@ -145,6 +145,17 @@ def _process_raw_result(self, cmd_idx, raw_output):
                         self._result.add_result(f'{model}_host_time_{tag}', float(lats[0]))
                         self._result.add_result(f'{model}_end_to_end_time_{tag}', float(lats[1]))
                     success = True
+                if '[I] Latency:' in line or '[I] GPU Compute Time:' in line:
+                    tm = 'gpu' if '[I] GPU Compute Time:' in line else 'host'
+                    self._result.add_result(
+                        f'{model}_{tm}_time_mean',
+                        float(re.findall(r'mean = (\d+\.\d+) ms', line)[0]),
+                    )
+                    self._result.add_result(
+                        f'{model}_{tm}_time_99',
+                        float(re.findall(r'\(99\%\) = (\d+\.\d+) ms', line)[0]),
+                    )
+                    success = True
         except BaseException as e:
             self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
             logger.error(
diff --git a/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py b/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py
index 43277b7a3..301a4a08d 100644
--- a/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py
@@ -116,16 +116,17 @@ def test_tensorrt_inference_params(self):
                     len(test_case.get('pytorch_models', benchmark._pytorch_models)), len(benchmark._commands)
                 )
 
-    @decorator.load_data('tests/data/tensorrt_inference.log')
-    def test_tensorrt_inference_result_parsing(self, test_raw_log):
+    @decorator.load_data('tests/data/tensorrt_inference.1.log')
+    @decorator.load_data('tests/data/tensorrt_inference.2.log')
+    def test_tensorrt_inference_result_parsing(self, test_raw_log_1, test_raw_log_2):
         """Test tensorrt-inference benchmark result parsing."""
         (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
         benchmark = benchmark_cls(self.benchmark_name, parameters='')
         benchmark._args = SimpleNamespace(pytorch_models=['model_0', 'model_1'], log_raw_data=False)
         benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1)
 
-        # Positive case - valid raw output
-        self.assertTrue(benchmark._process_raw_result(0, test_raw_log))
+        # Positive case 1 - valid raw output
+        self.assertTrue(benchmark._process_raw_result(0, test_raw_log_1))
         self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)
 
         self.assertEqual(6 + benchmark.default_metric_count, len(benchmark.result))
@@ -134,5 +135,12 @@ def test_tensorrt_inference_result_parsing(self, test_raw_log):
             self.assertEqual(0.6, benchmark.result[f'model_0_host_time_{tag}'][0])
             self.assertEqual(1.0, benchmark.result[f'model_0_end_to_end_time_{tag}'][0])
 
+        # Positive case 2 - valid raw output
+        self.assertTrue(benchmark._process_raw_result(0, test_raw_log_2))
+        self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)
+        for tag in ['mean', '99']:
+            self.assertEqual(1.5, benchmark.result[f'model_0_gpu_time_{tag}'][1])
+            self.assertEqual(2.0, benchmark.result[f'model_0_host_time_{tag}'][1])
+
         # Negative case - invalid raw output
         self.assertFalse(benchmark._process_raw_result(1, 'Invalid raw output'))
diff --git a/tests/data/tensorrt_inference.log b/tests/data/tensorrt_inference.1.log
similarity index 100%
rename from tests/data/tensorrt_inference.log
rename to tests/data/tensorrt_inference.1.log
diff --git a/tests/data/tensorrt_inference.2.log b/tests/data/tensorrt_inference.2.log
new file mode 100644
index 000000000..b07529e94
--- /dev/null
+++ b/tests/data/tensorrt_inference.2.log
@@ -0,0 +1,80 @@
+[06/29/2023-08:24:55] [I] === Model Options ===
+[06/29/2023-08:24:55] [I] Format: ONNX
+[06/29/2023-08:24:55] [I] Model: /root/.cache/torch/hub/onnx/resnet50.onnx
+[06/29/2023-08:24:55] [I] Output:
+[06/29/2023-08:24:55] [I] === Build Options ===
+[06/29/2023-08:24:55] [I] Max batch: explicit batch
+[06/29/2023-08:24:55] [I] Memory Pools: workspace: 8192 MiB, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
+[06/29/2023-08:24:55] [I] minTiming: 1
+[06/29/2023-08:24:55] [I] avgTiming: 8
+[06/29/2023-08:24:55] [I] Precision: FP32+FP16
+[06/29/2023-08:24:55] [I] LayerPrecisions:
+[06/29/2023-08:24:55] [I] Calibration:
+[06/29/2023-08:24:55] [I] Refit: Disabled
+[06/29/2023-08:24:55] [I] Sparsity: Disabled
+[06/29/2023-08:24:55] [I] Safe mode: Disabled
+[06/29/2023-08:24:55] [I] DirectIO mode: Disabled
+[06/29/2023-08:24:55] [I] Restricted mode: Disabled
+[06/29/2023-08:24:55] [I] Build only: Disabled
+[06/29/2023-08:24:55] [I] Save engine:
+[06/29/2023-08:24:55] [I] Load engine:
+[06/29/2023-08:24:55] [I] Profiling verbosity: 0
+[06/29/2023-08:24:55] [I] Tactic sources: Using default tactic sources
+[06/29/2023-08:24:55] [I] timingCacheMode: local
+[06/29/2023-08:24:55] [I] timingCacheFile:
+[06/29/2023-08:24:55] [I] Heuristic: Disabled
+[06/29/2023-08:24:55] [I] Preview Features: Use default preview flags.
+[06/29/2023-08:24:55] [I] Input(s)s format: fp32:CHW
+[06/29/2023-08:24:55] [I] Output(s)s format: fp32:CHW
+[06/29/2023-08:24:55] [I] Input build shape: input=32x3x224x224+32x3x224x224+32x3x224x224
+[06/29/2023-08:24:55] [I] Input calibration shapes: model
+[06/29/2023-08:24:55] [I] === System Options ===
+[06/29/2023-08:24:55] [I] Device: 0
+[06/29/2023-08:24:55] [I] DLACore:
+[06/29/2023-08:24:55] [I] Plugins:
+[06/29/2023-08:24:55] [I] === Inference Options ===
+[06/29/2023-08:24:55] [I] Batch: Explicit
+[06/29/2023-08:24:55] [I] Input inference shape: input=32x3x224x224
+[06/29/2023-08:24:55] [I] Iterations: 2048
+[06/29/2023-08:24:55] [I] Duration: 3s (+ 200ms warm up)
+[06/29/2023-08:24:55] [I] Sleep time: 0ms
+[06/29/2023-08:24:55] [I] Idle time: 0ms
+[06/29/2023-08:24:55] [I] Streams: 1
+[06/29/2023-08:24:55] [I] ExposeDMA: Disabled
+[06/29/2023-08:24:55] [I] Data transfers: Enabled
+[06/29/2023-08:24:55] [I] Spin-wait: Disabled
+[06/29/2023-08:24:55] [I] Multithreading: Disabled
+[06/29/2023-08:24:55] [I] CUDA Graph: Disabled
+[06/29/2023-08:24:55] [I] Separate profiling: Disabled
+[06/29/2023-08:24:55] [I] Time Deserialize: Disabled
+[06/29/2023-08:24:55] [I] Time Refit: Disabled
+[06/29/2023-08:24:55] [I] NVTX verbosity: 0
+[06/29/2023-08:24:55] [I] Persistent Cache Ratio: 0
+[06/29/2023-08:24:55] [I] Inputs:
+[06/29/2023-08:24:55] [I] === Reporting Options ===
+[06/29/2023-08:24:55] [I] Verbose: Disabled
+[06/29/2023-08:24:55] [I] Averages: 10 inferences
+[06/29/2023-08:24:55] [I] Percentiles: 99
+[06/29/2023-08:24:55] [I] Dump refittable layers:Disabled
+[06/29/2023-08:24:55] [I] Dump output: Disabled
+[06/29/2023-08:24:55] [I] Profile: Disabled
+[06/29/2023-08:24:55] [I] Export timing to JSON file:
+[06/29/2023-08:24:55] [I] Export output to JSON file:
+[06/29/2023-08:24:55] [I] Export profile to JSON file:
+[06/29/2023-08:25:38] [I]
+[06/29/2023-08:25:38] [I] === Trace details ===
+[06/29/2023-08:25:38] [I] Trace averages of 10 runs:
+[06/29/2023-08:25:38] [I] Average on 10 runs - GPU latency: 1.5 ms - Host latency: 2.0 ms (enqueue 0.4 ms)
+[06/29/2023-08:25:38] [I] Average on 10 runs - GPU latency: 1.5 ms - Host latency: 2.0 ms (enqueue 0.4 ms)
+[06/29/2023-08:25:38] [I]
+[06/29/2023-08:25:38] [I] === Performance summary ===
+[06/29/2023-08:25:38] [I] Throughput: 1000.00 qps
+[06/29/2023-08:25:38] [I] Latency: min = 1.9 ms, max = 2.1 ms, mean = 2.0 ms, median = 2.0 ms, percentile(99%) = 2.0 ms
+[06/29/2023-08:25:38] [I] Enqueue Time: min = 0.3 ms, max = 0.3 ms, mean = 0.3 ms, median = 0.3 ms, percentile(99%) = 0.3 ms
+[06/29/2023-08:25:38] [I] H2D Latency: min = 0.3 ms, max = 0.3 ms, mean = 0.3 ms, median = 0.3 ms, percentile(99%) = 0.3 ms
+[06/29/2023-08:25:38] [I] GPU Compute Time: min = 1.4 ms, max = 1.6 ms, mean = 1.5 ms, median = 1.5 ms, percentile(99%) = 1.5 ms
+[06/29/2023-08:25:38] [I] D2H Latency: min = 0.03 ms, max = 0.03 ms, mean = 0.03 ms, median = 0.03 ms, percentile(99%) = 0.03 ms
+[06/29/2023-08:25:38] [I] Total Host Walltime: 3.0 s
+[06/29/2023-08:25:38] [I] Total GPU Compute Time: 2.9 s
+[06/29/2023-08:25:38] [I] Explanations of the performance metrics are printed in the verbose logs.
+[06/29/2023-08:25:38] [I]

From c7d0beaf9eded6ae681127194131c7309dd58c9a Mon Sep 17 00:00:00 2001
From: Lei Qu <59161330+quge009@users.noreply.github.com>
Date: Fri, 30 Jun 2023 19:17:41 +0800
Subject: [PATCH 16/33] Doc - Update outdate references in micro-benchmarks.md
 (#544)

Modify link for Nvidia bandwidth test tool

**Description**
previous link is 404

**Minor Revision**
update the link value to
https://github.com/NVIDIA/cuda-samples/tree/master/Samples/1_Utilities/bandwidthTest
---
 docs/user-tutorial/benchmarks/micro-benchmarks.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
index b2e43db3f..95e087235 100644
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -229,7 +229,7 @@ performed by [Intel MLC Tool](https://www.intel.com/content/www/us/en/developer/
 #### Introduction
 
 Measure the memory copy bandwidth across PCI-e and memory copy bandwidth between GPUs,
-performed by [NVIDIA](https://github.com/NVIDIA/cuda-samples/tree/master/Samples/bandwidthTest)
+performed by [NVIDIA](https://github.com/NVIDIA/cuda-samples/tree/master/Samples/1_Utilities/bandwidthTest)
 or [AMD](https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/1_Utils/hipBusBandwidth) bandwidth test tool.
 
 #### Metrics

From 97f7b1df8688eac14b524c2be51340d4b48809fe Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Fri, 30 Jun 2023 12:58:41 +0000
Subject: [PATCH 17/33] Benchmarks: microbenchmark - add auto selecting
 algorithm support for cudnn functions (#540)

**Description**
add auto selecting algorithm support for cudnn functions.

**Major Revision**
- add auto selecting algorithm support for cudnn functions in source
code
- add '--enable_auto_algo' option in benchmark
- add related test
---
 .../micro_benchmarks/cudnn_function.py        |  9 +++++++
 .../convolution_backward_data.h               | 12 +++++++++
 .../convolution_backward_filter.h             | 11 ++++++++
 .../cudnn_function/convolution_forward.h      | 11 ++++++++
 .../cudnn_function/cudnn_config.h             |  3 +++
 .../cudnn_function/cudnn_function.h           |  7 +++++
 .../cudnn_function/cudnn_function_helper.h    | 20 ++++++++++++--
 .../micro_benchmarks/test_cudnn_function.py   | 26 +++++++++++++++++--
 8 files changed, 95 insertions(+), 4 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/cudnn_function.py b/superbench/benchmarks/micro_benchmarks/cudnn_function.py
index 82384ae8b..3bc601742 100644
--- a/superbench/benchmarks/micro_benchmarks/cudnn_function.py
+++ b/superbench/benchmarks/micro_benchmarks/cudnn_function.py
@@ -357,6 +357,13 @@ def add_parser_arguments(self):
             required=False,
             help='The custom json string defining the params in a cudnn function.',
         )
+        self._parser.add_argument(
+            '--enable_auto_algo',
+            action='store_true',
+            default=False,
+            required=False,
+            help='Whether to use auto algorithm selection.'
+        )
 
     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.
@@ -373,6 +380,8 @@ def _preprocess(self):
         command += (' --warm_up ' + str(self._args.num_warmup))
         command += (' --num_in_step ' + str(self._args.num_in_step))
         command += (' --random_seed ' + str(self._args.random_seed))
+        if self._args.enable_auto_algo:
+            command += (' --enable_auto_algo')
 
         try:
             if not self._args.config_json_str:
diff --git a/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_backward_data.h b/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_backward_data.h
index 7c40b4a22..1a7c207a8 100644
--- a/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_backward_data.h
+++ b/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_backward_data.h
@@ -32,6 +32,18 @@ template <typename T1, typename T2> class ConvolutionBackwardDataFunction : publ
             this->h_desc_.desc(), this->bwd_data_algo_, &this->fwd_workspace_size_));
     }
 
+    /**
+     * @brief Find the best algorithm for cudnn convolution functions
+     */
+    virtual void find_best_algo() {
+        int algo_count;
+        cudnnConvolutionBwdDataAlgoPerf_t perf_results;
+        CHECK_CUDNN_ERROR(cudnnFindConvolutionBackwardDataAlgorithm(
+            this->cudnn_handle, this->w_desc_.desc(), this->x_desc_.desc(), this->conv_desc_.desc(),
+            this->h_desc_.desc(), 1, &algo_count, &perf_results));
+        this->algo_ = perf_results.algo;
+    }
+
   public:
     /**
      * @brief Construct a new Convolution Backward Data Function object
diff --git a/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_backward_filter.h b/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_backward_filter.h
index 10af651ad..6873ea4c9 100644
--- a/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_backward_filter.h
+++ b/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_backward_filter.h
@@ -31,6 +31,17 @@ template <typename T1, typename T2> class ConvolutionBackwardFilterFunction : pu
             this->cudnn_handle, this->x_desc_.desc(), this->h_desc_.desc(), this->conv_desc_.desc(),
             this->w_desc_.desc(), this->bwd_filter_algo_, &this->fwd_workspace_size_));
     }
+    /**
+     * @brief Find the best algorithm for cudnn convolution functions
+     */
+    virtual void find_best_algo() {
+        int algo_count;
+        cudnnConvolutionBwdFilterAlgoPerf_t perf_results;
+        CHECK_CUDNN_ERROR(cudnnFindConvolutionBackwardFilterAlgorithm(
+            this->cudnn_handle, this->x_desc_.desc(), this->h_desc_.desc(), this->conv_desc_.desc(),
+            this->w_desc_.desc(), 1, &algo_count, &perf_results));
+        this->algo_ = perf_results.algo;
+    }
 
   public:
     /**
diff --git a/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_forward.h b/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_forward.h
index daca82337..7b1f1764b 100644
--- a/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_forward.h
+++ b/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_forward.h
@@ -31,6 +31,17 @@ template <typename T1, typename T2> class ConvolutionForwardFunction : public Cu
             this->cudnn_handle, this->x_desc_.desc(), this->w_desc_.desc(), this->conv_desc_.desc(),
             this->h_desc_.desc(), this->fwd_algo_, &this->fwd_workspace_size_));
     }
+    /**
+     * @brief Find the best algorithm for cudnn convolution functions
+     */
+    virtual void find_best_algo() {
+        int algo_count;
+        cudnnConvolutionFwdAlgoPerf_t perf_results;
+        CHECK_CUDNN_ERROR(cudnnFindConvolutionForwardAlgorithm(this->cudnn_handle, this->x_desc_.desc(),
+                                                               this->w_desc_.desc(), this->conv_desc_.desc(),
+                                                               this->h_desc_.desc(), 1, &algo_count, &perf_results));
+        this->algo_ = perf_results.algo;
+    }
 
   public:
     /**
diff --git a/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_config.h b/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_config.h
index 37b259fd1..913ec7fc8 100644
--- a/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_config.h
+++ b/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_config.h
@@ -58,6 +58,7 @@ class CudnnConfig {
     cudnnDataType_t input_type_;  ///< selects the data type in which the computation will be done
     cudnnDataType_t conv_type_;   ///< selects the data type in which the convolution will be done
     std::string function_str_;    ///< the str representing the cudnn function with params
+    bool auto_algo_;              ///< whether to use auto algo selection
 
   public:
     void set_num_test(int num_test) { this->num_test = num_test; }
@@ -80,6 +81,7 @@ class CudnnConfig {
     void set_input_type(const cudnnDataType_t &input_type) { input_type_ = input_type; }
     void set_conv_type(const cudnnDataType_t &conv_type) { input_type_ = conv_type; }
     void set_function(const std::string &str) { function_str_ = str; }
+    void set_auto_algo(bool auto_algo) { auto_algo_ = auto_algo; }
 
     std::vector<int> &get_input_dims() { return input_dims_; }
     std::vector<int> &get_input_stride() { return input_stride_; }
@@ -98,6 +100,7 @@ class CudnnConfig {
     std::string &get_name() { return name; }
     cudnn_function_name_enum get_e_name() { return e_name; }
     std::string &get_function_str() { return function_str_; }
+    bool get_auto_algo() { return auto_algo_; }
     /**
      * @brief Convert name string to enum name
      * @return cudnn_function_name_enum
diff --git a/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_function.h b/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_function.h
index 26b5601ed..f23649f4a 100644
--- a/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_function.h
+++ b/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_function.h
@@ -45,6 +45,10 @@ template <typename T1, typename T2> class CudnnFunction : public CudnnConfig {
      * @brief launch the kernel/function
      */
     virtual void kernel_entry() {}
+    /**
+     * @brief Find the best algorithm for cudnn convolution functions
+     */
+    virtual void find_best_algo() {}
 
   public:
     /**
@@ -87,6 +91,9 @@ template <typename T1, typename T2> void CudnnFunction<T1, T2>::prepare_for_func
     // Set Convolution MathType
     cudnnMathType_t algo = get_use_tensor_op() ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH;
     CHECK_CUDNN_ERROR(cudnnSetConvolutionMathType(conv_desc_.desc(), algo));
+    if (this->auto_algo_) {
+        find_best_algo();
+    }
     // Set convolution algorithm and workspace size
     this->get_workspace_size();
     zeros<float>(&fwd_workspace_, std::vector<int>{static_cast<int>(this->fwd_workspace_size_ / sizeof(float)), 1});
diff --git a/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_function_helper.h b/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_function_helper.h
index 2ee150599..d1f93a8bd 100644
--- a/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_function_helper.h
+++ b/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_function_helper.h
@@ -67,12 +67,24 @@ class Options {
         return "";
     }
 
+    /** @brief Get the bool type value of cmd line argument
+     * @param  option           the cmd line argument
+     * @return bool             the bool type value of cmd line argument 'option'
+     */
+    bool get_cmd_line_argument_bool(const std::string &option) {
+        if (std::find(begin, end, option) != end) {
+            return true;
+        }
+        return false;
+    }
+
   public:
     int num_test;
     int warm_up;
     int num_in_step;
     int random_seed;
     std::string para_info_json;
+    bool auto_algo;
 
     /**
      * @brief Construct a new Command Line object
@@ -91,6 +103,7 @@ class Options {
         random_seed = get_cmd_line_argument_int("--random_seed");
         random_seed = (random_seed == 0 ? time(NULL) : random_seed);
         para_info_json = get_cmd_line_argument_string("--config_json");
+        auto_algo = get_cmd_line_argument_bool("--enable_auto_algo");
         para_info_json =
             para_info_json == ""
                 ? R"({"algo":0,"arrayLength":2,"convType":0,"dilationA":[1,1],"filterStrideA":[1,1],"filterDims":[32,128,3,3],"inputDims":[32,128,14,14],"inputStride":[25088,196,14,1],"inputType":0,"mode":1, "name":"cudnnConvolutionBackwardFilter","outputDims":[32,32,14,14],"outputStride":[6272,196,14,1],"padA":[1,1],"tensorOp":false})"
@@ -126,8 +139,10 @@ void from_json(const json &j, cudnn_test::CudnnConfig &fn) {
     fn.set_input_stride(input_stride);
     auto output_stride = j.at("outputStride").get<std::vector<int>>();
     fn.set_output_stride(output_stride);
-    auto algo = j.at("algo").get<int>();
-    fn.set_algo(algo);
+    if (j.contains("algo")) {
+        auto algo = j.at("algo").get<int>();
+        fn.set_algo(algo);
+    }
     auto padA = j.at("padA").get<std::vector<int>>();
     fn.set_padA(padA);
     auto filter_strideA = j.at("filterStrideA").get<std::vector<int>>();
@@ -178,6 +193,7 @@ void run_benchmark(Options &options) {
         function.set_warm_up(options.warm_up);
         function.set_num_in_step(options.num_in_step);
         function.set_random_seed(options.random_seed);
+        function.set_auto_algo(options.auto_algo);
         if (function.get_input_type() == CUDNN_DATA_FLOAT && function.get_conv_type() == CUDNN_DATA_FLOAT) {
             auto p_function = get_cudnn_function_pointer<float, float>(function);
             p_function->benchmark();
diff --git a/tests/benchmarks/micro_benchmarks/test_cudnn_function.py b/tests/benchmarks/micro_benchmarks/test_cudnn_function.py
index 590e4e519..d6ca117c2 100644
--- a/tests/benchmarks/micro_benchmarks/test_cudnn_function.py
+++ b/tests/benchmarks/micro_benchmarks/test_cudnn_function.py
@@ -85,8 +85,7 @@ def test_cudnn_functions():
         if metric != 'return_code':
             assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
 
-
-# Test for custom list configuration
+    # Test for custom list configuration
     custom_config_str2 = '{"algo":1,"arrayLength":2,"convType":0,"dilationA":[1,1],"filterStrideA":[1,1],' \
         + '"filterDims":[32,128,3,3],"inputDims":[32,32,14,14],"inputStride":[6272, 196, 14, 1],"inputType":2,'\
         + '"mode":1,"name":"cudnnConvolutionBackwardData","outputDims":[32, 128, 14, 14],'\
@@ -126,3 +125,26 @@ def test_cudnn_functions():
         assert (isinstance(benchmark.result[metric][0], numbers.Number))
         if metric != 'return_code':
             assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
+
+    # Test for auto_algo parameter
+    context = BenchmarkRegistry.create_benchmark_context(
+        'cudnn-function',
+        platform=Platform.CUDA,
+        parameters='--num_warmup 10 --num_steps 10 --num_in_step 100 --enable_auto_algo'
+    )
+
+    assert (BenchmarkRegistry.is_benchmark_context_valid(context))
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+
+    # Check basic information.
+    assert (benchmark)
+    assert (benchmark._args.enable_auto_algo is True)
+
+    assert (benchmark.return_code == ReturnCode.SUCCESS)
+
+    assert (18 + benchmark.default_metric_count == len(benchmark.result))
+    for metric in list(benchmark.result.keys()):
+        assert (len(benchmark.result[metric]) == 1)
+        assert (isinstance(benchmark.result[metric][0], numbers.Number))
+        if metric != 'return_code':
+            assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)

From 865472177f47f64895dc02c6fc5e0084a056c665 Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Mon, 3 Jul 2023 22:43:21 +0800
Subject: [PATCH 18/33] Benchmarks: Build Pipeline - add AMF in third party and
 build AMF encoding latency test (#543)

**Description**
add AMF in third party and build AMF encoding latency test.
---
 dockerfile/directx12.dockerfile |  3 +++
 third_party/Makefile            | 13 ++++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/dockerfile/directx12.dockerfile b/dockerfile/directx12.dockerfile
index 1a958d69a..344141266 100644
--- a/dockerfile/directx12.dockerfile
+++ b/dockerfile/directx12.dockerfile
@@ -59,6 +59,9 @@ RUN python -m pip install setuptools==65.0.0 && \
     python -m pip install --no-cache-dir .[amdworker] && \
     make directxbuild
 
+ADD third_party third_party
+RUN make -C third_party directx_amd
+
 # Run the entrypoint script for enabling vendor-specific graphics APIs
 RUN powershell -Command "Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope LocalMachine -Force"
 CMD [ "python", "dockerfile/directx/enable-graphics-apis.py" ]
diff --git a/third_party/Makefile b/third_party/Makefile
index f131ee3cb..b0c01d453 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -11,7 +11,7 @@ HPCX_HOME ?= /opt/hpcx
 CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2)
 ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
 
-.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl
+.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd
 
 # Build all targets.
 all: cuda rocm
@@ -19,6 +19,7 @@ cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcne
 rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest
 cpu: common cpu_perftest
 common: cpu_hpl cpu_stream fio
+directx_amd: directx_amf_encoding_latency
 
 # Create $(SB_MICRO_PATH)/bin and $(SB_MICRO_PATH)/lib, no error if existing, make parent directories as needed.
 sb_micro_path:
@@ -148,3 +149,13 @@ ifneq (,$(wildcard stream-tests/Makefile))
 	make all
 	cp -v ./stream-tests/stream*.exe $(SB_MICRO_PATH)/bin/
 endif
+
+# Build AMD Encoder Latency Test
+directx_amf_encoding_latency:
+	@if not exist "AMF" (git clone -b v1.4.29 https://github.com/GPUOpen-LibrariesAndSDKs/AMF.git)
+	@if exist "AMF\amf\public\samples\CPPSamples_vs2019.sln" ( \
+		curl -L -o vs_buildtools.exe https://aka.ms/vs/16/release/vs_buildtools.exe && echo "Downloaded vs_buildtools.exe" && \
+		start /wait vs_buildtools.exe --quiet --wait --norestart --nocache --installPath C:/temp/BuildTools --add Microsoft.VisualStudio.Workload.VCTools --add Microsoft.VisualStudio.Component.VC.ATLMFC --includeRecommended  && echo "Installed VS Build Tools" && \
+		del vs_buildtools.exe && echo "Deleted vs_buildtools.exe" && \
+		"C:\temp\BuildTools\MSBuild\Current\Bin\MSBuild.exe" "AMF\amf\public\samples\CPPSamples_vs2019.sln" /t:EncoderLatency /p:Configuration=Release /p:OutDir="%SB_MICRO_PATH%\bin" \
+	)

From 3704a432b90277612da9b1553cda00725e60b03b Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Wed, 5 Jul 2023 11:33:40 +0800
Subject: [PATCH 19/33] CI/CD - Support DirectX test pipeline (#545)

**Description**
Support DirectX test pipeline.
---
 .github/workflows/build-win.yml     | 24 ++++++++++++++++++++++++
 superbench/benchmarks/build.bat     |  2 +-
 tests/common/test_directx_device.py | 15 +++++++++++++++
 tests/helper/__init__.py            |  4 ++++
 tests/helper/decorator.py           |  1 +
 tests/runner/test_ansible.py        |  4 +++-
 6 files changed, 48 insertions(+), 2 deletions(-)
 create mode 100644 tests/common/test_directx_device.py
 create mode 100644 tests/helper/__init__.py

diff --git a/.github/workflows/build-win.yml b/.github/workflows/build-win.yml
index 7226af8c7..24ed3d12a 100644
--- a/.github/workflows/build-win.yml
+++ b/.github/workflows/build-win.yml
@@ -44,3 +44,27 @@ jobs:
         TAG: superbench/main:win2004
         USER: ${{ secrets.DOCKERHUB_USERNAME }}
         PASS: ${{ secrets.DOCKERHUB_TOKEN }}
+  directx-unit-test:
+    name: DirectX unit test
+    needs: docker
+    runs-on: [self-hosted, windows, x64, win2004]
+    steps:
+    - name: Add bash to PATH
+      shell: pwsh
+      run: |
+        echo "$env:PATH;C:\Program Files\Git\bin" | Out-File -FilePath $env:GITHUB_PATH -Append -Encoding utf8
+    - name: Bash to get codecov env
+      run: |
+        ci_env=`bash <(curl -s https://codecov.io/env)`
+        echo "ci_env=$ci_env" >> $GITHUB_ENV
+      shell: bash
+    - name: Run unit tests inside docker
+      run: |
+        $command="curl -s -L https://uploader.codecov.io/latest/windows/codecov.exe -o codecov.exe && python -m pip install .[test] && python -m pytest -v --cov=superbench --cov-report=xml --cov-report=term-missing tests/ -k test_directx && codecov -t ${CODECOV_TOKEN} -cF directx-unit-test"
+        docker run --rm `
+        --isolation process `
+        --device class/5B45201D-F2F2-4F3B-85BB-30FF1F953599 `
+        -e CI=true $ci_env -e SB_TEST_CUDA="0" -e SB_TEST_ROCM="0" -e SB_TEST_PYTORCH="0" -e SB_TEST_DIRECTX="1" -e CODECOV_TOKEN superbench/main:win2004 cmd /c $command
+      shell: pwsh
+      env:
+        CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
diff --git a/superbench/benchmarks/build.bat b/superbench/benchmarks/build.bat
index 8639e1771..49c785e18 100644
--- a/superbench/benchmarks/build.bat
+++ b/superbench/benchmarks/build.bat
@@ -12,7 +12,7 @@ for /r %%F in (*.vcxproj) do (
     REM Download dependencies
     "!MSBUILD!" "!PROJ_PATH!" -t:restore -p:RestorePackagesConfig=true
     REM Build project
-    "!MSBUILD!" "!PROJ_PATH!" /p:Configuration=Release /p:AdditionalLibraryDirectories="%WindowsSDKDir%\Lib" /p:AdditionalIncludeDirectories="%WindowsSDKDir%\Include" /p:OutDir="%SB_MICRO_PATH%\bin"
+    "!MSBUILD!" "!PROJ_PATH!" /p:Configuration=Release /p:Platform=x64 /p:AdditionalLibraryDirectories="%WindowsSDKDir%\Lib" /p:AdditionalIncludeDirectories="%WindowsSDKDir%\Include" /p:OutDir="%SB_MICRO_PATH%\bin"
 )
 
 endlocal
diff --git a/tests/common/test_directx_device.py b/tests/common/test_directx_device.py
new file mode 100644
index 000000000..b38b495f4
--- /dev/null
+++ b/tests/common/test_directx_device.py
@@ -0,0 +1,15 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for directx gpu device module."""
+
+from superbench.common.devices.gpu import GPU
+from tests.helper import decorator
+
+
+@decorator.directx_test
+def test_directx_gpu():
+    """Test DirectX GPU device."""
+    gpu = GPU()
+    gpu.get_vendor()
+    assert (gpu.vendor == 'nvidia-graphics' or gpu.vendor == 'amd-graphics')
diff --git a/tests/helper/__init__.py b/tests/helper/__init__.py
new file mode 100644
index 000000000..e367e58b2
--- /dev/null
+++ b/tests/helper/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Helper module for tests."""
diff --git a/tests/helper/decorator.py b/tests/helper/decorator.py
index bda2bc5ac..ff08469ac 100644
--- a/tests/helper/decorator.py
+++ b/tests/helper/decorator.py
@@ -12,6 +12,7 @@
 rocm_test = unittest.skipIf(os.environ.get('SB_TEST_ROCM', '0') == '0', 'Skip ROCm tests.')
 
 pytorch_test = unittest.skipIf(os.environ.get('SB_TEST_PYTORCH', '1') == '0', 'Skip PyTorch tests.')
+directx_test = unittest.skipIf(os.environ.get('SB_TEST_DIRECTX', '0') == '0', 'Skip DirectX tests.')
 
 
 def load_data(filepath):
diff --git a/tests/runner/test_ansible.py b/tests/runner/test_ansible.py
index 924e1ce2d..550762c43 100644
--- a/tests/runner/test_ansible.py
+++ b/tests/runner/test_ansible.py
@@ -10,7 +10,9 @@
 
 from omegaconf import OmegaConf
 
-from superbench.runner.ansible import AnsibleClient
+from superbench.common.utils import LazyImport
+
+AnsibleClient = LazyImport('superbench.runner.ansible', 'AnsibleClient')
 
 
 class AnsibleClientTestCase(unittest.TestCase):

From f1d608aef77378560f6fb8e795960b4a79059db0 Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Wed, 5 Jul 2023 16:56:21 +0800
Subject: [PATCH 20/33] Benchmarks: micro benchmarks - add python code for
 DirectXGPUCoreFlops  (#542)

**Description**
add python code for DirectX core flops and init DirectX test pipeline.

**Major Revision**
- add python code for DirectX core flops
- init DirectX test pipeline


**Minor Revision**
- add test for DirectX core flops
---
 .codecov.yml                                  |   2 +
 .github/workflows/build-win.yml               |  11 +-
 .../benchmarks/micro_benchmarks/__init__.py   |   2 +
 .../directx_gemm_flops_performance.py         | 145 ++++++++++++++++++
 .../BenchmarkOptions.h                        |   4 +-
 .../GPUCore.cpp                               |   4 +-
 .../GPUCore.vcxproj                           |   2 +
 .../test_directx_gemm_flops_performance.py    |  47 ++++++
 8 files changed, 206 insertions(+), 11 deletions(-)
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance.py
 create mode 100644 tests/benchmarks/micro_benchmarks/test_directx_gemm_flops_performance.py

diff --git a/.codecov.yml b/.codecov.yml
index 3f36d5612..81d50f8bc 100644
--- a/.codecov.yml
+++ b/.codecov.yml
@@ -17,6 +17,7 @@ coverage:
           - cpu-python3.6-unit-test
           - cpu-python3.7-unit-test
           - cuda-unit-test
+          - directx-unit-test
     patch:
       default:
         target: 80%
@@ -25,3 +26,4 @@ coverage:
           - cpu-python3.6-unit-test
           - cpu-python3.7-unit-test
           - cuda-unit-test
+          - directx-unit-test
diff --git a/.github/workflows/build-win.yml b/.github/workflows/build-win.yml
index 24ed3d12a..6283544b7 100644
--- a/.github/workflows/build-win.yml
+++ b/.github/workflows/build-win.yml
@@ -1,4 +1,4 @@
-name: Build on Windows
+name: Build on Windows and run directx unit test
 
 on:
   push:
@@ -19,6 +19,10 @@ jobs:
       uses: actions/checkout@v2
       with:
         submodules: true
+    - name: Clearnup docker data
+      run: |
+        docker system prune -a -f
+        docker volume prune -a -f
     - name: Build Docker image
       working-directory: .
       shell: pwsh
@@ -44,11 +48,6 @@ jobs:
         TAG: superbench/main:win2004
         USER: ${{ secrets.DOCKERHUB_USERNAME }}
         PASS: ${{ secrets.DOCKERHUB_TOKEN }}
-  directx-unit-test:
-    name: DirectX unit test
-    needs: docker
-    runs-on: [self-hosted, windows, x64, win2004]
-    steps:
     - name: Add bash to PATH
       shell: pwsh
       run: |
diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py
index c1cb3a1b9..57304bc43 100644
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -31,6 +31,7 @@
 from superbench.benchmarks.micro_benchmarks.sharding_matmul import ShardingMatmul
 from superbench.benchmarks.micro_benchmarks.tcp_connectivity import TCPConnectivityBenchmark
 from superbench.benchmarks.micro_benchmarks.tensorrt_inference_performance import TensorRTInferenceBenchmark
+from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops
 
 __all__ = [
     'ComputationCommunicationOverlap',
@@ -61,4 +62,5 @@
     'ShardingMatmul',
     'TCPConnectivityBenchmark',
     'TensorRTInferenceBenchmark',
+    'DirectXGPUCoreFlops',
 ]
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance.py b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance.py
new file mode 100644
index 000000000..862367543
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance.py
@@ -0,0 +1,145 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module of the DirectXGPUCoreFlops performance benchmarks."""
+
+import os
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+
+
+class DirectXGPUCoreFlops(MicroBenchmarkWithInvoke):
+    """The DirectXGPUCoreFlops benchmark class."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+        self._bin_name = 'DirectXGPUCoreFlops.exe'
+        self._support_precisions = ['fp16', 'fp32']
+        self._precision_need_to_run = list()
+
+    def add_parser_arguments(self):
+        """Add the specified arguments."""
+        super().add_parser_arguments()
+        self._parser.add_argument(
+            '--num_loops',
+            type=int,
+            default=10,
+            required=False,
+            help='The number of benchmark runs.',
+        )
+        self._parser.add_argument(
+            '--num_warm_up',
+            type=int,
+            default=2,
+            required=False,
+            help='The number of warm up runs.',
+        )
+        self._parser.add_argument(
+            '--n',
+            type=int,
+            default=16 * 256,
+            required=False,
+            help='The N dim of matmul (N, K) * (K, M).',
+        )
+        self._parser.add_argument(
+            '--k',
+            type=int,
+            default=16 * 256,
+            required=False,
+            help='The K dim of matmul (N, K) * (K, M).',
+        )
+        self._parser.add_argument(
+            '--m',
+            type=int,
+            default=16 * 256,
+            required=False,
+            help='The M dim of matmul (N, K) * (K, M).',
+        )
+        self._parser.add_argument(
+            '--precision',
+            type=str,
+            nargs='+',
+            default=list(),
+            help='Precision for benchmarking. E.g. {}.'.format(' '.join(self._support_precisions)),
+        )
+
+    def _preprocess(self):
+        """Preprocess/preparation operations before the benchmarking.
+
+        Return:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        if len(self._args.precision) == 0:
+            self._precision_need_to_run = self._support_precisions
+        else:
+            self._args.precision = [p.lower() for p in self._args.precision]
+            for p in self._args.precision:
+                if p not in self._support_precisions:
+                    logger.warning(
+                        'Unsupported precision - benchmark: {}, precision: {}, expected: {}.'.format(
+                            self._name, p, self._support_precisions
+                        )
+                    )
+                else:
+                    self._precision_need_to_run.append(p)
+
+        if len(self._precision_need_to_run) == 0:
+            self._result.set_return_code(ReturnCode.NO_SUPPORTED_PRECISION)
+            return False
+
+        for p in self._precision_need_to_run:
+            command = os.path.join(self._args.bin_dir, self._bin_name)
+            command += (' --num_loops ' + str(self._args.num_loops))
+            command += (' --num_warm_up ' + str(self._args.num_warm_up))
+            command += (' --n ' + str(self._args.n))
+            command += (' --k ' + str(self._args.k))
+            command += (' --m ' + str(self._args.m))
+            command += (' --' + p)
+            self._commands.append(command)
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to process raw results and save the summarized results.
+
+          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        precision = self._precision_need_to_run[cmd_idx]
+        self._result.add_raw_data('raw_output_' + precision, raw_output, self._args.log_raw_data)
+        flops = list()
+        try:
+            # The first whitespace-separated token of every line mentioning 'TFLOPs' is parsed as the value.
+            for line in raw_output.splitlines():
+                if 'TFLOPs' in line:
+                    flops.append(float(line.split()[0]))
+        except ValueError:
+            # A single malformed value invalidates the whole output.
+            flops = list()
+
+        if len(flops) == 0:
+            logger.error(
+                'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
+                    self._curr_run_index, self._name, raw_output
+                )
+            )
+            return False
+        self._result.add_result(precision + '_flops', max(flops))
+        return True
+
+
+BenchmarkRegistry.register_benchmark('directx-gpu-core-flops', DirectXGPUCoreFlops, platform=Platform.DIRECTX)
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h
index 8ba9fb913..0a244e5d8 100644
--- a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h
+++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h
@@ -42,10 +42,10 @@ class BenchmarkOptions : public Options {
         m = get_cmd_line_argument_int("--m", 16 * 256);
         n = get_cmd_line_argument_int("--n", 16 * 256);
         k = get_cmd_line_argument_int("--k", 16 * 256);
-        if (get_cmd_line_argument_bool("--f16")) {
+        if (get_cmd_line_argument_bool("--fp16")) {
             mode_precision = Option::F16;
         }
-        if (get_cmd_line_argument_bool("--f32")) {
+        if (get_cmd_line_argument_bool("--fp32")) {
             mode_precision = Option::F32;
         }
     }
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp
index 206c49f90..d41316a01 100644
--- a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp
+++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp
@@ -25,7 +25,7 @@ void GPUCore::Run() {
 
     int loops = opts->num_loops;
     std::cout << "GPUCoreFLOPs" << std::endl;
-
+    gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute);
     switch (opts->mode_precision) {
     case Option::F32: {
         // Prepare input and output data and buffers.
@@ -37,7 +37,6 @@ void GPUCore::Run() {
             ExecuteComputeOp();
         }
         for (int i = 0; i < loops; ++i) {
-            gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute);
             // Do FLOPs job.
             double timeInMs = ExecuteComputeOp();
             auto flops = (int64_t(m) * n * k + m * n) * 2 * 1e-9 / timeInMs;
@@ -55,7 +54,6 @@ void GPUCore::Run() {
             ExecuteComputeOp();
         }
         for (int i = 0; i < loops; ++i) {
-            gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute);
             // Do FLOPs job.
             double timeInMs = ExecuteComputeOp();
             auto flops = (int64_t(m) * n * k + m * n) * 2 * 1e-9 / timeInMs;
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj
index 109d39305..f70749b48 100644
--- a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj
+++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj
@@ -20,12 +20,14 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <TargetName>DirectXGPUCoreFlops</TargetName>
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
     <PlatformToolset>v143</PlatformToolset>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <TargetName>DirectXGPUCoreFlops</TargetName>
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
     <PlatformToolset>v143</PlatformToolset>
diff --git a/tests/benchmarks/micro_benchmarks/test_directx_gemm_flops_performance.py b/tests/benchmarks/micro_benchmarks/test_directx_gemm_flops_performance.py
new file mode 100644
index 000000000..7571df752
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_directx_gemm_flops_performance.py
@@ -0,0 +1,47 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for DirectXGPUCoreFlops benchmark."""
+
+import numbers
+
+from tests.helper import decorator
+from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
+
+
+@decorator.directx_test
+def test_directx_gpucoreflops():
+    """Test that the directx-gpu-core-flops benchmark runs and reports an fp32_flops metric."""
+    # Create the benchmark context with explicit loop count, matrix sizes and precision.
+    context = BenchmarkRegistry.create_benchmark_context(
+        'directx-gpu-core-flops',
+        platform=Platform.DIRECTX,
+        parameters=r'--num_loops 10 --n 16384 --k 16384 --m 16384 --precision fp32'
+    )
+
+    assert (BenchmarkRegistry.is_benchmark_context_valid(context))
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+
+    # Check basic information.
+    assert (benchmark)
+    assert (benchmark.name == 'directx-gpu-core-flops')
+    assert (benchmark.type == BenchmarkType.MICRO)
+
+    # Check parameters specified in BenchmarkContext.
+    assert (benchmark._args.num_loops == 10)
+    assert (benchmark._args.n == 16384)
+    assert (benchmark._args.k == 16384)
+    assert (benchmark._args.m == 16384)
+    assert (sorted(benchmark._args.precision) == ['fp32'])
+
+    # Check the return code and that exactly one raw output string was recorded for fp32.
+    assert (benchmark.run_count == 1)
+    assert (benchmark.return_code == ReturnCode.SUCCESS)
+    assert ('raw_output_fp32' in benchmark.raw_data)
+    assert (len(benchmark.raw_data['raw_output_fp32']) == 1)
+    assert (isinstance(benchmark.raw_data['raw_output_fp32'][0], str))
+
+    assert ('fp32_flops' in benchmark.result)
+    assert (len(benchmark.result['fp32_flops']) == 1)
+    assert (isinstance(benchmark.result['fp32_flops'][0], numbers.Number))

From af4cfd5bbfe989b212d5311656be0cbe7cd5ae35 Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Wed, 5 Jul 2023 22:07:13 +0800
Subject: [PATCH 21/33] Benchmarks: micro benchmarks - add python code for
 DirectXGPUMemBw (#547)

**Description**
add python code for DirectXGPUMemBw.
---
 .github/workflows/build-win.yml               |   1 +
 .../benchmarks/micro_benchmarks/__init__.py   |   2 +
 .../directx_mem_bw_performance.py             | 149 ++++++++++++++++++
 .../BenchmarkOptions.h                        |   2 +-
 .../GPUMemRwBw.vcxproj                        |   2 +
 .../benchmarks/micro_benchmarks/micro_base.py |   2 +-
 superbench/common/utils/process.py            |  17 +-
 .../test_directx_mem_bw_performance.py        |  52 ++++++
 8 files changed, 222 insertions(+), 5 deletions(-)
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance.py
 create mode 100644 tests/benchmarks/micro_benchmarks/test_directx_mem_bw_performance.py

diff --git a/.github/workflows/build-win.yml b/.github/workflows/build-win.yml
index 6283544b7..d1b9a1c8d 100644
--- a/.github/workflows/build-win.yml
+++ b/.github/workflows/build-win.yml
@@ -23,6 +23,7 @@ jobs:
       run: |
         docker system prune -a -f
         docker volume prune -a -f
+      shell: pwsh
     - name: Build Docker image
       working-directory: .
       shell: pwsh
diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py
index 57304bc43..9fe14336c 100644
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -31,6 +31,7 @@
 from superbench.benchmarks.micro_benchmarks.sharding_matmul import ShardingMatmul
 from superbench.benchmarks.micro_benchmarks.tcp_connectivity import TCPConnectivityBenchmark
 from superbench.benchmarks.micro_benchmarks.tensorrt_inference_performance import TensorRTInferenceBenchmark
+from superbench.benchmarks.micro_benchmarks.directx_mem_bw_performance import DirectXGPUMemBw
 from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops
 
 __all__ = [
@@ -62,5 +63,6 @@
     'ShardingMatmul',
     'TCPConnectivityBenchmark',
     'TensorRTInferenceBenchmark',
+    'DirectXGPUMemBw',
     'DirectXGPUCoreFlops',
 ]
diff --git a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance.py b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance.py
new file mode 100644
index 000000000..ff9d9d239
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance.py
@@ -0,0 +1,149 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module of the DirectXGPUMemBw performance benchmarks."""
+
+import os
+
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+
+
+class DirectXGPUMemBw(MicroBenchmarkWithInvoke):
+    """The DirectXGPUMemBw benchmark class, which runs one command per requested memory operation mode."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+        self._bin_name = 'DirectXGPUMemRwBw.exe'
+        self._modes = ['read', 'write', 'readwrite']
+
+    def add_parser_arguments(self):
+        """Add the specified arguments."""
+        super().add_parser_arguments()
+        self._parser.add_argument(
+            '--num_warm_up',
+            type=int,
+            default=0,
+            required=False,
+            help='Number of warm up rounds.',
+        )
+        self._parser.add_argument(
+            '--num_loop',
+            type=int,
+            default=100,
+            required=False,
+            help='Number of loop times to measure the performance.',
+        )
+        self._parser.add_argument(
+            '--size',
+            type=int,
+            default=None,
+            required=False,
+            help='Size of data for GPU copy.',
+        )
+        self._parser.add_argument(
+            '--minbytes',
+            type=int,
+            default=4096,
+            required=False,
+            help='Lower data size bound to test.',
+        )
+        self._parser.add_argument(
+            '--maxbytes',
+            type=int,
+            default=1024 * 1024 * 1024,
+            required=False,
+            help='Upper data size bound to test.',
+        )
+        self._parser.add_argument(
+            '--check_data',
+            action='store_true',
+            required=False,
+            help='Whether check data correctness.',
+        )
+        self._parser.add_argument(
+            '--mode',
+            type=str,
+            nargs='+',
+            default=list(),
+            help='Memory operation mode. E.g. {}.'.format(' '.join(self._modes)),
+        )
+
+    def _preprocess(self):
+        """Validate the requested modes and build one command per valid mode; return True on success."""
+        if not super()._preprocess():
+            return False
+
+        self._args.mode = [m.lower() for m in self._args.mode]
+        for mode in list(self._args.mode):  # iterate over a copy: removing while iterating skips items
+            if mode not in self._modes:
+                logger.warning(
+                    'Unsupported mode - benchmark: {}, mode: {}, expected: {}.'.format(self._name, mode, self._modes)
+                )
+                self._args.mode.remove(mode)
+
+        if len(self._args.mode) == 0:
+            logger.error('No valid operation modes are provided.')
+            return False
+
+        for mode in self._args.mode:
+            command = os.path.join(self._args.bin_dir, self._bin_name)
+            command += (' --num_warm_up ' + str(self._args.num_warm_up))
+            command += (' --num_loop ' + str(self._args.num_loop))
+            if self._args.size is not None:  # an explicit size replaces the minbytes/maxbytes range
+                command += (' --size ' + str(self._args.size))
+            else:
+                command += (' --minbytes ' + str(self._args.minbytes))
+                command += (' --maxbytes ' + str(self._args.maxbytes))
+            if self._args.check_data:
+                command += (' --check_data')
+            command += (' --' + mode)
+            self._commands.append(command)
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Parse the 'GPUMemBw:' lines of the raw output and save one bandwidth result per data size.
+
+        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        mode = self._args.mode[cmd_idx]
+        self._result.add_raw_data('raw_output_' + mode, raw_output, self._args.log_raw_data)
+
+        valid = True
+
+        content = raw_output.splitlines()
+        try:
+            for line in content:
+                if 'GPUMemBw:' in line:
+                    fields = line.split()
+                    size, bw = int(fields[-3]), float(fields[-2])
+                    self._result.add_result(f'{mode}_{size}_bw', bw)
+                if 'error' in line.lower():
+                    valid = False
+        except Exception:
+            valid = False
+        # Checked after the try block (not in 'finally') so KeyboardInterrupt/SystemExit are not swallowed.
+        if not valid:
+            logger.error(
+                'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
+                    self._curr_run_index, self._name, raw_output
+                )
+            )
+            return False
+        return True
+
+
+BenchmarkRegistry.register_benchmark('directx-gpu-mem-bw', DirectXGPUMemBw, platform=Platform.DIRECTX)
diff --git a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h
index 7893fe8af..c9d7507a3 100644
--- a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h
+++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h
@@ -68,7 +68,7 @@ class BenchmarkOptions : public Options {
         min_size = get_cmd_line_argument_int("--minbytes", 4 * 1024);
         max_size =
             get_cmd_line_argument_ulonglong("--maxbytes", static_cast<unsigned long long>(1LL * 1024 * 1024 * 1024));
-        check_data = get_cmd_line_argument_bool("--check");
+        check_data = get_cmd_line_argument_bool("--check_data");
         if (get_cmd_line_argument_bool("--read")) {
             mem_type = Memtype::Read;
         }
diff --git a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj
index 80ab02e37..b575f8040 100644
--- a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj
+++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj
@@ -19,12 +19,14 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <TargetName>DirectXGPUMemRwBw</TargetName>
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
     <PlatformToolset>v143</PlatformToolset>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <TargetName>DirectXGPUMemRwBw</TargetName>
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
     <PlatformToolset>v143</PlatformToolset>
diff --git a/superbench/benchmarks/micro_benchmarks/micro_base.py b/superbench/benchmarks/micro_benchmarks/micro_base.py
index 7a2d36029..e1e854058 100644
--- a/superbench/benchmarks/micro_benchmarks/micro_base.py
+++ b/superbench/benchmarks/micro_benchmarks/micro_base.py
@@ -180,7 +180,7 @@ def _benchmark(self):
                 )
             )
 
-            output = run_command(self._commands[cmd_idx], flush_output=self._args.log_flushing)
+            output = run_command(self._commands[cmd_idx], flush_output=self._args.log_flushing, cwd=self._args.bin_dir)
             if output.returncode != 0:
                 self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
                 logger.error(
diff --git a/superbench/common/utils/process.py b/superbench/common/utils/process.py
index 334bf7665..75767ead8 100644
--- a/superbench/common/utils/process.py
+++ b/superbench/common/utils/process.py
@@ -10,13 +10,14 @@
 from superbench.common.utils import stdout_logger
 
 
-def run_command(command, quiet=False, flush_output=False):
+def run_command(command, quiet=False, flush_output=False, cwd=None):
     """Run command in string format, return the result with stdout and stderr.
 
     Args:
         command (str): command to run.
         quiet (bool): no stdout display of the command if quiet is True.
         flush_output (bool): enable real-time output flush or not when running the command.
+        cwd (str): working directory to run the command.
 
     Return:
         result (subprocess.CompletedProcess): The return value from subprocess.run().
@@ -26,7 +27,11 @@ def run_command(command, quiet=False, flush_output=False):
         try:
             args = shlex.split(command)
             process = subprocess.Popen(
-                args, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True
+                args,
+                cwd=os.getcwd() if cwd is None else cwd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                universal_newlines=True
             )
             output = ''
             for line in process.stdout:
@@ -43,7 +48,13 @@ def run_command(command, quiet=False, flush_output=False):
             return subprocess.CompletedProcess(args=args, returncode=-1, stdout=str(e))
     else:
         result = subprocess.run(
-            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True
+            command,
+            cwd=os.getcwd() if cwd is None else cwd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            shell=True,
+            check=False,
+            universal_newlines=True
         )
         if not quiet:
             stdout_logger.log(result.stdout)
diff --git a/tests/benchmarks/micro_benchmarks/test_directx_mem_bw_performance.py b/tests/benchmarks/micro_benchmarks/test_directx_mem_bw_performance.py
new file mode 100644
index 000000000..baeed54a4
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_directx_mem_bw_performance.py
@@ -0,0 +1,52 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for DirectXGPUMemBw benchmark."""
+
+import numbers
+
+from tests.helper import decorator
+from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
+
+
+@decorator.directx_test
+def test_directx_gpu_mem_bw():
+    """Test that the directx-gpu-mem-bw benchmark runs and reports read and write bandwidth metrics."""
+    # Run the read and write modes with a single data size of 1073741824 bytes (1 GiB).
+    context = BenchmarkRegistry.create_benchmark_context(
+        'directx-gpu-mem-bw',
+        platform=Platform.DIRECTX,
+        parameters=r'--num_warm_up 0 --num_loop 100 --size 1073741824 --mode read write'
+    )
+
+    assert (BenchmarkRegistry.is_benchmark_context_valid(context))
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+
+    # Check basic information.
+    assert (benchmark)
+    assert (benchmark.name == 'directx-gpu-mem-bw')
+    assert (benchmark.type == BenchmarkType.MICRO)
+
+    # Check parameters specified in BenchmarkContext.
+    assert (benchmark._args.num_warm_up == 0)
+    assert (benchmark._args.num_loop == 100)
+    assert (benchmark._args.size == 1073741824)
+    assert (sorted(benchmark._args.mode) == ['read', 'write'])
+
+    # Check the return code and that one raw output string was recorded per mode.
+    assert (benchmark.run_count == 1)
+    assert (benchmark.return_code == ReturnCode.SUCCESS)
+    assert ('raw_output_read' in benchmark.raw_data)
+    assert ('raw_output_write' in benchmark.raw_data)
+    assert (len(benchmark.raw_data['raw_output_read']) == 1)
+    assert (len(benchmark.raw_data['raw_output_write']) == 1)
+    assert (isinstance(benchmark.raw_data['raw_output_read'][0], str))
+    assert (isinstance(benchmark.raw_data['raw_output_write'][0], str))
+
+    assert ('read_1073741824_bw' in benchmark.result)
+    assert ('write_1073741824_bw' in benchmark.result)
+    assert (len(benchmark.result['read_1073741824_bw']) == 1)
+    assert (len(benchmark.result['write_1073741824_bw']) == 1)
+    assert (isinstance(benchmark.result['read_1073741824_bw'][0], numbers.Number))
+    assert (isinstance(benchmark.result['write_1073741824_bw'][0], numbers.Number))

From c8c079c2af0a87d5e3de56e05188c2d9349898d3 Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Thu, 6 Jul 2023 00:15:32 +0800
Subject: [PATCH 22/33] Benchmarks: micro benchmarks - add python code for
 DirectXGPUCopy (#546)

**Description**
add python code for DirectXGPUCopy.
---
 .../benchmarks/micro_benchmarks/__init__.py   |   2 +
 .../directx_gpu_copy_performance.py           | 132 ++++++++++++++++++
 .../GPUCopyBw.vcxproj                         |   2 +
 .../directx_gpu_copy_performance/Main.cpp     |   1 +
 .../test_directx_gpu_copy_performance.py      |  49 +++++++
 5 files changed, 186 insertions(+)
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance.py
 create mode 100644 tests/benchmarks/micro_benchmarks/test_directx_gpu_copy_performance.py

diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py
index 9fe14336c..47094aa3f 100644
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -31,6 +31,7 @@
 from superbench.benchmarks.micro_benchmarks.sharding_matmul import ShardingMatmul
 from superbench.benchmarks.micro_benchmarks.tcp_connectivity import TCPConnectivityBenchmark
 from superbench.benchmarks.micro_benchmarks.tensorrt_inference_performance import TensorRTInferenceBenchmark
+from superbench.benchmarks.micro_benchmarks.directx_gpu_copy_performance import DirectXGPUCopyBw
 from superbench.benchmarks.micro_benchmarks.directx_mem_bw_performance import DirectXGPUMemBw
 from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops
 
@@ -63,6 +64,7 @@
     'ShardingMatmul',
     'TCPConnectivityBenchmark',
     'TensorRTInferenceBenchmark',
+    'DirectXGPUCopyBw',
     'DirectXGPUMemBw',
     'DirectXGPUCoreFlops',
 ]
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance.py b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance.py
new file mode 100644
index 000000000..b114bed68
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance.py
@@ -0,0 +1,132 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module of the DirectXGPUCopyBw performance benchmarks."""
+
+import os
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.benchmarks.micro_benchmarks import MemBwBenchmark
+
+
+class DirectXGPUCopyBw(MemBwBenchmark):
+    """The DirectXGPUCopyBw benchmark class, which runs one command per requested memory type."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+        self._mem_types = ['htod', 'dtoh']
+        self._bin_name = 'DirectXGPUCopyBw.exe'
+
+    def add_parser_arguments(self):
+        """Add the specified arguments."""
+        super().add_parser_arguments()
+
+        self._parser.add_argument(
+            '--size',
+            type=int,
+            required=False,
+            default=None,
+            help='Size of data for GPU copy.',
+        )
+        self._parser.add_argument(
+            '--warm_up',
+            type=int,
+            required=False,
+            default=20,
+            help='Number of warm up copy times to run.',
+        )
+        self._parser.add_argument(
+            '--num_loops',
+            type=int,
+            required=False,
+            default=1000,
+            help='Number of copy times to run.',
+        )
+        self._parser.add_argument(
+            '--minbytes',
+            type=int,
+            required=False,
+            default=64,
+            help='Lower bound of the data size range for GPU copy, used when --size is not set.',
+        )
+        self._parser.add_argument(
+            '--maxbytes',
+            type=int,
+            required=False,
+            default=8 * 1024 * 1024,
+            help='Upper bound of the data size range for GPU copy, used when --size is not set.',
+        )
+        self._parser.add_argument(
+            '--check',
+            action='store_true',
+            help='Whether check data after copy.',
+        )
+
+    def _preprocess(self):
+        """Build one command line per requested memory type.
+
+        Return:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        for mem_type in self._args.mem_type:
+            # Prepare the command line.
+            command = os.path.join(self._args.bin_dir, self._bin_name)
+            command += f' --{mem_type}'
+            command += ' --warm_up ' + str(self._args.warm_up)
+            command += ' --num_loops ' + str(self._args.num_loops)
+            if self._args.size is not None:  # an explicit size replaces the minbytes/maxbytes range
+                command += ' --size ' + str(self._args.size)
+            else:
+                command += ' --minbytes ' + str(self._args.minbytes)
+                command += ' --maxbytes ' + str(self._args.maxbytes)
+            if self._args.check:
+                command += ' --check'
+            self._commands.append(command)
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Parse the lines containing 'GB' in the raw output and save one result per type and size.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        self._result.add_raw_data('raw_output', raw_output, self._args.log_raw_data)
+
+        try:
+            lines = raw_output.splitlines()
+            for line in lines:
+                if 'GB' in line:
+                    fields = line.split()
+                    # Named 'copy_type' rather than 'type' to avoid shadowing the builtin.
+                    copy_type, size, bw = fields[0].strip(':'), int(fields[1].strip('B')), float(fields[2])
+                    self._result.add_result(f'{copy_type}_{size}_bw', bw)
+                if 'error' in line.lower():
+                    logger.error(
+                        'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
+                            self._curr_run_index, self._name, raw_output
+                        )
+                    )
+                    return False
+            return True
+        except Exception as e:
+            logger.error(
+                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, exception: {}.'.format(
+                    self._curr_run_index, self._name, raw_output, str(e)
+                )
+            )
+            return False
+
+
+BenchmarkRegistry.register_benchmark('directx-gpu-copy-bw', DirectXGPUCopyBw, platform=Platform.DIRECTX)
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.vcxproj b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.vcxproj
index 3be231342..cd3b45f61 100644
--- a/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.vcxproj
+++ b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.vcxproj
@@ -19,12 +19,14 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <TargetName>DirectXGPUCopyBw</TargetName>
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
     <PlatformToolset>v143</PlatformToolset>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <TargetName>DirectXGPUCopyBw</TargetName>
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
     <PlatformToolset>v143</PlatformToolset>
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/Main.cpp b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/Main.cpp
index ac12597c5..fc47e2f2e 100644
--- a/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/Main.cpp
+++ b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/Main.cpp
@@ -16,6 +16,7 @@ int main(int argc, char *argv[]) {
     } else {
         // Run all sizes
         for (SIZE_T usize = option.min_size; usize <= option.max_size; usize += usize) {
+            option.size = usize;
             GPUCopyBw benchmark(&option);
             benchmark.Run();
         }
diff --git a/tests/benchmarks/micro_benchmarks/test_directx_gpu_copy_performance.py b/tests/benchmarks/micro_benchmarks/test_directx_gpu_copy_performance.py
new file mode 100644
index 000000000..49bf73f2b
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_directx_gpu_copy_performance.py
@@ -0,0 +1,49 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for DirectXGPUCopyBw benchmark."""
+
+import numbers
+
+from tests.helper import decorator
+from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
+
+
+@decorator.directx_test
+def test_directx_gpu_copy_bw():
+    """Test that the directx-gpu-copy-bw benchmark runs and reports a bandwidth metric per type and size."""
+    # Run the htod and dtoh memory types over the 64 to 8388608 byte size range.
+    context = BenchmarkRegistry.create_benchmark_context(
+        'directx-gpu-copy-bw',
+        platform=Platform.DIRECTX,
+        parameters=r'--warm_up 20 --num_loops 1000 --minbytes 64 --maxbytes 8388608 --mem_type htod dtoh'
+    )
+
+    assert (BenchmarkRegistry.is_benchmark_context_valid(context))
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+
+    # Check basic information.
+    assert (benchmark)
+    assert (benchmark.name == 'directx-gpu-copy-bw')
+    assert (benchmark.type == BenchmarkType.MICRO)
+
+    # Check parameters specified in BenchmarkContext.
+    assert (benchmark._args.warm_up == 20)
+    assert (benchmark._args.num_loops == 1000)
+    assert (benchmark._args.minbytes == 64)
+    assert (benchmark._args.maxbytes == 8388608)
+    assert (sorted(benchmark._args.mem_type) == ['dtoh', 'htod'])
+
+    # Check the return code, the raw output, and one metric per memory type for every doubled size.
+    assert (benchmark.run_count == 1)
+    assert (benchmark.return_code == ReturnCode.SUCCESS)
+    assert ('raw_output' in benchmark.raw_data)
+    assert (isinstance(benchmark.raw_data['raw_output'][0], str))
+    size = 64
+    while size <= 8388608:
+        for mem_type in ['htod', 'dtoh']:
+            assert (f'{mem_type}_{size}_bw' in benchmark.result)
+            assert (len(benchmark.result[f'{mem_type}_{size}_bw']) == 1)
+            assert (isinstance(benchmark.result[f'{mem_type}_{size}_bw'][0], numbers.Number))
+        size *= 2

From e8ac0b1e28a93903d1f03752803cd5c9e059b1f1 Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Thu, 6 Jul 2023 15:31:28 +0800
Subject: [PATCH 23/33] Benchmarks: micro benchmarks - add python code for
 DirectXGPUEncodingLatency (#548)

**Description**
add python code for DirectXGPUEncodingLatency.
---
 dockerfile/directx12.dockerfile               |   4 +-
 .../benchmarks/micro_benchmarks/__init__.py   |   2 +
 .../directx_gpu_encoding_latency.py           | 157 ++++++++++++++++++
 .../test_directx_gpu_encoding_latency.py      |  56 +++++++
 4 files changed, 217 insertions(+), 2 deletions(-)
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gpu_encoding_latency.py
 create mode 100644 tests/benchmarks/micro_benchmarks/test_directx_gpu_encoding_latency.py

diff --git a/dockerfile/directx12.dockerfile b/dockerfile/directx12.dockerfile
index 344141266..cd5ab9ad3 100644
--- a/dockerfile/directx12.dockerfile
+++ b/dockerfile/directx12.dockerfile
@@ -64,5 +64,5 @@ RUN make -C third_party directx_amd
 
 # Run the entrypoint script for enabling vendor-specific graphics APIs
 RUN powershell -Command "Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope LocalMachine -Force"
-CMD [ "python", "dockerfile/directx/enable-graphics-apis.py" ]
-ENTRYPOINT [ "cmd.exe" ]
+ENTRYPOINT [ "python", "dockerfile/directx/enable-graphics-apis.py" ]
+CMD [ "cmd.exe" ]
diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py
index 47094aa3f..6f3f29953 100644
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -31,6 +31,7 @@
 from superbench.benchmarks.micro_benchmarks.sharding_matmul import ShardingMatmul
 from superbench.benchmarks.micro_benchmarks.tcp_connectivity import TCPConnectivityBenchmark
 from superbench.benchmarks.micro_benchmarks.tensorrt_inference_performance import TensorRTInferenceBenchmark
+from superbench.benchmarks.micro_benchmarks.directx_gpu_encoding_latency import DirectXGPUEncodingLatency
 from superbench.benchmarks.micro_benchmarks.directx_gpu_copy_performance import DirectXGPUCopyBw
 from superbench.benchmarks.micro_benchmarks.directx_mem_bw_performance import DirectXGPUMemBw
 from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops
@@ -64,6 +65,7 @@
     'ShardingMatmul',
     'TCPConnectivityBenchmark',
     'TensorRTInferenceBenchmark',
+    'DirectXGPUEncodingLatency',
     'DirectXGPUCopyBw',
     'DirectXGPUMemBw',
     'DirectXGPUCoreFlops',
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gpu_encoding_latency.py b/superbench/benchmarks/micro_benchmarks/directx_gpu_encoding_latency.py
new file mode 100644
index 000000000..70d6c75ad
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_gpu_encoding_latency.py
@@ -0,0 +1,157 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module of the DirectXGPUEncodingLatency benchmarks."""
+
+import os
+
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+
+
+def create_nv12_file(file_name, num_frames, width, height):
+    """Create a NV12 file with the specified name, number of frames, width, and height."""
+    import numpy as np
+    # Generate a Y plane of width x height with values from 0-255
+    y_plane = np.random.randint(0, 256, (height, width), dtype=np.uint8)
+    # Generate a UV plane of width x height/2 with values from 0-255
+    uv_plane = np.random.randint(0, 256, (height // 2, width), dtype=np.uint8)
+    # Create the file; every frame reuses the same random Y and UV planes generated above
+    with open(f'{file_name}', 'wb') as f:
+        for _ in range(num_frames):
+            # Write the Y plane and UV plane to the file
+            f.write(y_plane.tobytes())
+            f.write(uv_plane.tobytes())
+
+
+class DirectXGPUEncodingLatency(MicroBenchmarkWithInvoke):
+    """The DirectXGPUEncodingLatency benchmark class."""
+    def __init__(self, name, parameters=''):
+        """Constructor."""
+        super().__init__(name, parameters)
+        self._bin_name = 'EncoderLatency.exe'
+        self._test_file = 'test_directx_gpu_encoding_latency.nv12'
+
+    def add_parser_arguments(self):
+        """Add the specified arguments."""
+        super().add_parser_arguments()
+        self._parser.add_argument(
+            '--algo',
+            type=str,
+            choices=['ASAP', 'OneInOne'],
+            default='ASAP',
+            required=False,
+            help='The algorithm to use for encoding'
+        )
+        self._parser.add_argument(
+            '--codec',
+            type=str,
+            choices=['AVC', 'H264', 'HEVC', 'H265', 'AV1'],
+            default='H265',
+            required=False,
+            help='The codec to use for encoding'
+        )
+        self._parser.add_argument(
+            '--format',
+            type=str,
+            choices=['RGBA_F16', 'R10G10B10A2', 'NV12', 'P010'],
+            default='NV12',
+            required=False,
+            help='The format to use for encoding'
+        )
+        self._parser.add_argument(
+            '--frames', type=int, default=500, required=False, help='The number of frames to encode'
+        )
+        self._parser.add_argument(
+            '--height', type=int, default=720, required=False, help='The height of the input video'
+        )
+        self._parser.add_argument(
+            '--width', type=int, default=1080, required=False, help='The width of the input video'
+        )
+        self._parser.add_argument('--input_file', type=str, default=None, required=False, help='The input video file')
+        self._parser.add_argument('--output_file', type=str, default=None, required=False, help='The output video file')
+        self._parser.add_argument(
+            '--output_height', type=int, default=720, required=False, help='The height of the output video'
+        )
+        self._parser.add_argument(
+            '--output_width', type=int, default=1080, required=False, help='The width of the output video'
+        )
+        self._parser.add_argument(
+            '--vcn', type=int, choices=[0, 1], default=0, required=False, help='The VCN instance to use for encoding'
+        )
+
+    def _preprocess(self):
+        """Preprocess/preparation operations before the benchmarking.
+
+        Return:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        command = os.path.join(self._args.bin_dir, self._bin_name)
+        command += f' -ALGORITHM {self._args.algo}'
+        command += f' -CODEC {self._args.codec}'
+        command += f' -FORMAT {self._args.format}'
+        command += f' -FRAMES {self._args.frames}'
+        command += f' -HEIGHT {self._args.height}'
+        command += f' -WIDTH {self._args.width}'
+        if self._args.input_file is not None:
+            command += f' -INPUT {self._args.input_file}'
+        else:
+            if not os.path.exists(f'{self._test_file}'):
+                create_nv12_file(self._test_file, self._args.frames, self._args.width, self._args.height)
+            command += f' -INPUT {self._test_file}'
+        if self._args.output_file is not None:
+            command += f' -OUTPUT {self._args.output_file}'
+        command += f' -OUTPUT_HEIGHT {self._args.output_height}'
+        command += f' -OUTPUT_WIDTH {self._args.output_width}'
+        command += f' -VCNINSTANCE {self._args.vcn}'
+        self._commands.append(command)
+
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
+
+          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        self._result.add_raw_data('raw_output', raw_output, self._args.log_raw_data)
+
+        content = raw_output.splitlines()
+        metrics = {}
+
+        try:
+            for line in content:
+                if 'Total' in line:
+                    metrics['fps'] = float(line.split('=')[3].strip().strip('frames').split()[0])
+                if 'Latency' in line and 'min' in line.lower():
+                    metrics['min_lat'] = float(line.split('=')[1].split(',')[1].strip('ms').strip())
+                    metrics['max_lat'] = float(line.split('=')[1].split(',')[2].strip('ms').strip())
+                if 'Latency' in line and 'average' in line.lower():
+                    metrics['avg_lat'] = float(line.split('=')[1].strip('ms').strip())
+        except Exception as e:
+            logger.error(
+                'The result format is invalid - benchmark: {}, raw output: {}, error: {}'.format(
+                    self._name, raw_output, str(e)
+                )
+            )
+            return False
+
+        for metric, value in metrics.items():
+            self._result.add_result(metric, value)
+
+        return True
+
+
+BenchmarkRegistry.register_benchmark(
+    'directx-gpu-encoding-latency', DirectXGPUEncodingLatency, platform=Platform.DIRECTX
+)
diff --git a/tests/benchmarks/micro_benchmarks/test_directx_gpu_encoding_latency.py b/tests/benchmarks/micro_benchmarks/test_directx_gpu_encoding_latency.py
new file mode 100644
index 000000000..c9b5c7121
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_directx_gpu_encoding_latency.py
@@ -0,0 +1,56 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for DirectXGPUEncodingLatency benchmark."""
+
+import numbers
+
+from tests.helper import decorator
+from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
+
+
+@decorator.directx_test
+def test_directx_gpuencodinglatency():
+    """Test DirectXGPUEncodingLatency benchmark."""
+    context = BenchmarkRegistry.create_benchmark_context(
+        'directx-gpu-encoding-latency',
+        platform=Platform.DIRECTX,
+        parameters=r'--algo ASAP --codec H265 --format NV12 --frames 500' +
+        r' --height 720 --width 1080 --output_height 720 --output_width 1080 --vcn 0'
+    )
+
+    assert (BenchmarkRegistry.is_benchmark_context_valid(context))
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+
+    # Check basic information.
+    assert (benchmark)
+    assert (benchmark.name == 'directx-gpu-encoding-latency')
+    assert (benchmark.type == BenchmarkType.MICRO)
+
+    # Check parameters specified in BenchmarkContext.
+    assert (benchmark._args.algo == 'ASAP')
+    assert (benchmark._args.codec == 'H265')
+    assert (benchmark._args.format == 'NV12')
+    assert (benchmark._args.frames == 500)
+    assert (benchmark._args.height == 720)
+    assert (benchmark._args.width == 1080)
+    assert (benchmark._args.output_height == 720)
+    assert (benchmark._args.output_width == 1080)
+    assert (benchmark._args.vcn == 0)
+
+    # Check results and metrics.
+    assert (benchmark._args.run_count == 1)
+    assert (benchmark.return_code == ReturnCode.SUCCESS)
+    assert ('raw_output' in benchmark.raw_data)
+    assert (len(benchmark.raw_data['raw_output']) == 1)
+    assert (isinstance(benchmark.raw_data['raw_output'][0], str))
+
+    assert ('fps' in benchmark.result)
+    assert ('min_lat' in benchmark.result)
+    assert ('max_lat' in benchmark.result)
+    assert ('avg_lat' in benchmark.result)
+    assert (isinstance(benchmark.result['fps'][0], numbers.Number))
+    assert (isinstance(benchmark.result['min_lat'][0], numbers.Number))
+    assert (isinstance(benchmark.result['max_lat'][0], numbers.Number))
+    assert (isinstance(benchmark.result['avg_lat'][0], numbers.Number))

From 466b477e9d3cd1c3c62a3ae28c88ad980b6c2a68 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 24 Jul 2023 17:07:35 +0800
Subject: [PATCH 24/33] Bump semver from 5.7.1 to 5.7.2 in /website (#550)

Bumps [semver](https://github.com/npm/node-semver) from 5.7.1 to 5.7.2.
- [Release notes](https://github.com/npm/node-semver/releases)
- [Changelog](https://github.com/npm/node-semver/blob/v5.7.2/CHANGELOG.md)
- [Commits](npm/node-semver@v5.7.1...v5.7.2)

---
updated-dependencies:
- dependency-name: semver
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 website/package-lock.json | 90 +++++++++++++++++++--------------------
 1 file changed, 45 insertions(+), 45 deletions(-)

diff --git a/website/package-lock.json b/website/package-lock.json
index 7526213de..80c139a56 100644
--- a/website/package-lock.json
+++ b/website/package-lock.json
@@ -176,9 +176,9 @@
       },
       "dependencies": {
         "semver": {
-          "version": "6.3.0",
-          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz",
-          "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw=="
+          "version": "6.3.1",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
+          "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA=="
         }
       }
     },
@@ -221,9 +221,9 @@
       },
       "dependencies": {
         "semver": {
-          "version": "6.3.0",
-          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz",
-          "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw=="
+          "version": "6.3.1",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
+          "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA=="
         }
       }
     },
@@ -265,9 +265,9 @@
       },
       "dependencies": {
         "semver": {
-          "version": "6.3.0",
-          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz",
-          "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw=="
+          "version": "6.3.1",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
+          "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA=="
         }
       }
     },
@@ -1044,9 +1044,9 @@
       },
       "dependencies": {
         "semver": {
-          "version": "6.3.0",
-          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz",
-          "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw=="
+          "version": "6.3.1",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
+          "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA=="
         }
       }
     },
@@ -1199,9 +1199,9 @@
       },
       "dependencies": {
         "semver": {
-          "version": "6.3.0",
-          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz",
-          "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw=="
+          "version": "6.3.1",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
+          "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA=="
         }
       }
     },
@@ -2441,9 +2441,9 @@
           }
         },
         "semver": {
-          "version": "5.7.1",
-          "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz",
-          "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ=="
+          "version": "5.7.2",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz",
+          "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g=="
         }
       }
     },
@@ -3233,9 +3233,9 @@
       },
       "dependencies": {
         "semver": {
-          "version": "6.3.0",
-          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz",
-          "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw=="
+          "version": "6.3.1",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
+          "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA=="
         }
       }
     },
@@ -4747,9 +4747,9 @@
           "integrity": "sha1-QRyttXTFoUDTpLGRDUDYDMn0C0A="
         },
         "semver": {
-          "version": "5.7.1",
-          "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz",
-          "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ=="
+          "version": "5.7.2",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz",
+          "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g=="
         },
         "shebang-command": {
           "version": "1.2.0",
@@ -5406,9 +5406,9 @@
           }
         },
         "semver": {
-          "version": "5.7.1",
-          "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz",
-          "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ=="
+          "version": "5.7.2",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz",
+          "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g=="
         },
         "tapable": {
           "version": "1.1.3",
@@ -7011,9 +7011,9 @@
       },
       "dependencies": {
         "semver": {
-          "version": "6.3.0",
-          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz",
-          "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw=="
+          "version": "6.3.1",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
+          "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA=="
         }
       }
     },
@@ -7649,9 +7649,9 @@
       },
       "dependencies": {
         "semver": {
-          "version": "6.3.0",
-          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz",
-          "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw=="
+          "version": "6.3.1",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
+          "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA=="
         }
       }
     },
@@ -8938,9 +8938,9 @@
           }
         },
         "semver": {
-          "version": "5.7.1",
-          "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz",
-          "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ=="
+          "version": "5.7.2",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz",
+          "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g=="
         }
       }
     },
@@ -9287,9 +9287,9 @@
       }
     },
     "semver": {
-      "version": "7.3.5",
-      "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.5.tgz",
-      "integrity": "sha512-PoeGJYh8HK4BTO/a9Tf6ZG3veo/A7ZVsYrSA6J8ny9nb3B1VrpkuN+z9OE5wfE5p6H4LchYZsegiQgbJD94ZFQ==",
+      "version": "7.5.4",
+      "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz",
+      "integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==",
       "requires": {
         "lru-cache": "^6.0.0"
       }
@@ -9303,9 +9303,9 @@
       },
       "dependencies": {
         "semver": {
-          "version": "6.3.0",
-          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz",
-          "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw=="
+          "version": "6.3.1",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
+          "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA=="
         }
       }
     },
@@ -11215,9 +11215,9 @@
           }
         },
         "semver": {
-          "version": "6.3.0",
-          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz",
-          "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw=="
+          "version": "6.3.1",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
+          "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA=="
         },
         "string_decoder": {
           "version": "1.1.1",

From e1df877bfe4d84b352dff0d84c86b98c36cf3ebc Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Thu, 27 Jul 2023 10:42:31 +0800
Subject: [PATCH 25/33] Release - SuperBench v0.9.0 (#558)

**Description**
Cherry-pick bug fixes from v0.9.0 to main.

**Major Revision**
- CI/CD: pipeline - clean more disk space to fix rocm building image
pipeline (#555)
- Benchmarks: bug fix - use absolute path for input file in
DirectXGPUEncodingLatency (#554)
- CI/CD - add push win docker image on release branch in pipeline (#552)
- Docs - Upgrade version and release note (#557)
---
 .github/workflows/build-image.yml             | 11 ++++++
 .github/workflows/build-win.yml               | 29 ++++++++++++--
 README.md                                     |  2 +-
 docs/getting-started/installation.mdx         |  2 +-
 docs/getting-started/run-superbench.md        |  2 +-
 docs/superbench-config.mdx                    |  2 +-
 docs/user-tutorial/container-images.mdx       | 14 +++++++
 docs/user-tutorial/data-diagnosis.md          |  2 +-
 docs/user-tutorial/result-summary.md          |  2 +-
 superbench/__init__.py                        |  2 +-
 .../directx_gpu_encoding_latency.py           |  4 +-
 superbench/config/amd_mi100_hpe.yaml          |  2 +-
 superbench/config/amd_mi100_z53.yaml          |  2 +-
 .../inference/standard_nc64as_t4_v3.yaml      |  2 +-
 .../inference/standard_nc96ads_a100_v4.yaml   |  2 +-
 .../inference/standard_nv18ads_a10_v5.yaml    |  2 +-
 superbench/config/azure_ndmv4.yaml            |  2 +-
 superbench/config/azure_ndv4.yaml             |  2 +-
 superbench/config/default.yaml                |  2 +-
 third_party/Makefile                          |  2 +-
 website/blog/2023-07-25-release-0-9.md        | 38 +++++++++++++++++++
 website/docusaurus.config.js                  |  2 +-
 website/package-lock.json                     |  2 +-
 website/package.json                          |  2 +-
 24 files changed, 109 insertions(+), 25 deletions(-)
 create mode 100644 website/blog/2023-07-25-release-0-9.md

diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml
index 824418a6f..6b796830a 100644
--- a/.github/workflows/build-image.yml
+++ b/.github/workflows/build-image.yml
@@ -64,6 +64,17 @@ jobs:
           sudo apt-get clean
           sudo docker rmi $(sudo docker images --format "{{.Repository}}:{{.Tag}}" --filter=reference="node" --filter=reference="buildpack-deps")
           df -h
+      - name: Free Up GitHub Actions Ubuntu Runner Disk Space 🔧
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # This might remove tools that are actually needed, if set to "true" but frees about 6 GB
+          tool-cache: false
+          # All of these default to true, but feel free to set to "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          swap-storage: true
       - name: Prepare metadata
         id: metadata
         run: |
diff --git a/.github/workflows/build-win.yml b/.github/workflows/build-win.yml
index d1b9a1c8d..252783421 100644
--- a/.github/workflows/build-win.yml
+++ b/.github/workflows/build-win.yml
@@ -12,7 +12,7 @@ on:
 
 jobs:
   docker:
-    name: Docker build win2004
+    name: Docker build win directx12
     runs-on: [self-hosted, windows, x64, win2004]
     steps:
     - name: Checkout
@@ -24,6 +24,25 @@ jobs:
         docker system prune -a -f
         docker volume prune -a -f
       shell: pwsh
+    - name: Set TAG variable based on the branch
+      run: |
+        if ($env:GITHUB_EVENT_NAME -match "release") {
+          $version = $env:GITHUB_REF.Substring($env:GITHUB_REF.LastIndexOf('/') + 1)
+          echo "TAG=superbench/superbench:$version-directx12" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+        } elseif ($env:GITHUB_REF -match "refs/heads/release/(.*)") {
+          $version = $Matches[1]
+          echo "TAG=superbench/release:$version-directx12" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+        } elseif ($env:GITHUB_BASEREF -match "release/(.*)"){
+          $version = $Matches[1]
+          echo "TAG=superbench/release:$version-directx12" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+        } else {
+          echo "TAG=superbench/main:directx12" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+        }
+      shell: pwsh
+      env:
+        GITHUB_REF: ${{ github.ref }}
+        GITHUB_BASEREF: ${{ github.base_ref }}
+        GITHUB_EVENT_NAME: ${{ github.event_name }}
     - name: Build Docker image
       working-directory: .
       shell: pwsh
@@ -37,7 +56,7 @@ jobs:
           --isolation=process `
           --tag $env:TAG .
       env:
-        TAG: superbench/main:win2004
+        TAG: ${{ env.TAG }}
     - name: Push Docker image
       if: ${{ github.event_name != 'pull_request' }}
       shell: pwsh
@@ -46,7 +65,7 @@ jobs:
         docker push $env:TAG
         docker logout
       env:
-        TAG: superbench/main:win2004
+        TAG: ${{ env.TAG }}
         USER: ${{ secrets.DOCKERHUB_USERNAME }}
         PASS: ${{ secrets.DOCKERHUB_TOKEN }}
     - name: Add bash to PATH
@@ -64,7 +83,9 @@ jobs:
         docker run --rm `
         --isolation process `
         --device class/5B45201D-F2F2-4F3B-85BB-30FF1F953599 `
-        -e CI=true $ci_env -e SB_TEST_CUDA="0" -e SB_TEST_ROCM="0" -e SB_TEST_PYTORCH="0" -e SB_TEST_DIRECTX="1" -e CODECOV_TOKEN superbench/main:win2004 cmd /c $command
+        -v C:/Windows/System32/DriverStore:C:/Windows/System32/DriverStore `
+        -e CI=true $ci_env -e SB_TEST_CUDA="0" -e SB_TEST_ROCM="0" -e SB_TEST_PYTORCH="0" -e SB_TEST_DIRECTX="1" -e CODECOV_TOKEN --entrypoint "cmd" $env:TAG "/c python dockerfile/directx/enable-graphics-apis.py && cmd /c $command"
       shell: pwsh
       env:
+        TAG: ${{ env.TAG }}
         CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
diff --git a/README.md b/README.md
index ffcd51960..cfcd4b6b3 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@
 
 __SuperBench__ is a validation and profiling tool for AI infrastructure.
 
-📢 [v0.8.0](https://github.com/microsoft/superbenchmark/releases/tag/v0.8.0) has been released!
+📢 [v0.9.0](https://github.com/microsoft/superbenchmark/releases/tag/v0.9.0) has been released!
 
 ## _Check [aka.ms/superbench](https://aka.ms/superbench) for more details._
 
diff --git a/docs/getting-started/installation.mdx b/docs/getting-started/installation.mdx
index 82c1fc9c3..8570306c9 100644
--- a/docs/getting-started/installation.mdx
+++ b/docs/getting-started/installation.mdx
@@ -61,7 +61,7 @@ You can clone the source from GitHub and build it.
 :::note Note
 You should checkout corresponding tag to use release version, for example,
 
-`git clone -b v0.8.0 https://github.com/microsoft/superbenchmark`
+`git clone -b v0.9.0 https://github.com/microsoft/superbenchmark`
 :::
 
 ```bash
diff --git a/docs/getting-started/run-superbench.md b/docs/getting-started/run-superbench.md
index 32a8c6d80..16c6d7a21 100644
--- a/docs/getting-started/run-superbench.md
+++ b/docs/getting-started/run-superbench.md
@@ -27,7 +27,7 @@ sb deploy -f remote.ini --host-password [password]
 :::note Note
 You should deploy corresponding Docker image to use release version, for example,
 
-`sb deploy -f local.ini -i superbench/superbench:v0.8.0-cuda12.1`
+`sb deploy -f local.ini -i superbench/superbench:v0.9.0-cuda12.1`
 
 You should note that version of git repo only determines version of sb CLI, and not the sb container. You should define the container version even if you specified a release version for the git clone.
 
diff --git a/docs/superbench-config.mdx b/docs/superbench-config.mdx
index 5720a8125..8893c46b9 100644
--- a/docs/superbench-config.mdx
+++ b/docs/superbench-config.mdx
@@ -70,7 +70,7 @@ superbench:
 <TabItem value='example'>
 
 ```yaml
-version: v0.8
+version: v0.9
 superbench:
   enable: benchmark_1
   monitor:
diff --git a/docs/user-tutorial/container-images.mdx b/docs/user-tutorial/container-images.mdx
index 27cf8da6f..5fd11502c 100644
--- a/docs/user-tutorial/container-images.mdx
+++ b/docs/user-tutorial/container-images.mdx
@@ -23,12 +23,15 @@ available tags are listed below for all stable versions.
   values={[
     {label: 'CUDA', value: 'cuda'},
     {label: 'ROCm', value: 'rocm'},
+    {label: 'DirectX', value: 'directx'},
   ]
 }>
 <TabItem value='cuda'>
 
 | Tag               | Description                        |
 |-------------------|------------------------------------|
+| v0.9.0-cuda12.1   | SuperBench v0.9.0 with CUDA 12.1   |
+| v0.9.0-cuda11.1.1 | SuperBench v0.9.0 with CUDA 11.1.1 |
 | v0.8.0-cuda12.1   | SuperBench v0.8.0 with CUDA 12.1   |
 | v0.8.0-cuda11.1.1 | SuperBench v0.8.0 with CUDA 11.1.1 |
 | v0.7.0-cuda11.8   | SuperBench v0.7.0 with CUDA 11.8   |
@@ -45,6 +48,10 @@ available tags are listed below for all stable versions.
 
 | Tag                           | Description                                      |
 |-------------------------------|--------------------------------------------------|
+| v0.9.0-rocm5.1.3              | SuperBench v0.9.0 with ROCm 5.1.3                |
+| v0.9.0-rocm5.1.1              | SuperBench v0.9.0 with ROCm 5.1.1                |
+| v0.9.0-rocm5.0.1              | SuperBench v0.9.0 with ROCm 5.0.1                |
+| v0.9.0-rocm5.0                | SuperBench v0.9.0 with ROCm 5.0                  |
 | v0.8.0-rocm5.1.3              | SuperBench v0.8.0 with ROCm 5.1.3                |
 | v0.8.0-rocm5.1.1              | SuperBench v0.8.0 with ROCm 5.1.1                |
 | v0.8.0-rocm5.0.1              | SuperBench v0.8.0 with ROCm 5.0.1                |
@@ -66,5 +73,12 @@ available tags are listed below for all stable versions.
 | v0.3.0-rocm4.2-pytorch1.7.0   | SuperBench v0.3.0 with ROCm 4.2, PyTorch 1.7.0   |
 | v0.3.0-rocm4.0-pytorch1.7.0   | SuperBench v0.3.0 with ROCm 4.0, PyTorch 1.7.0   |
 
+</TabItem>
+<TabItem value='directx'>
+
+| Tag                           | Description                                      |
+|-------------------------------|--------------------------------------------------|
+| v0.9.0-directx12              | SuperBench v0.9.0 with DirectX12, Windows10-2004 |
+
 </TabItem>
 </Tabs>
diff --git a/docs/user-tutorial/data-diagnosis.md b/docs/user-tutorial/data-diagnosis.md
index 94a2a025d..a0bd99640 100644
--- a/docs/user-tutorial/data-diagnosis.md
+++ b/docs/user-tutorial/data-diagnosis.md
@@ -65,7 +65,7 @@ superbench:
 example:
 ```yaml
 # SuperBench rules
-version: v0.8
+version: v0.9
 superbench:
   rules:
     failure-rule:
diff --git a/docs/user-tutorial/result-summary.md b/docs/user-tutorial/result-summary.md
index e53738ff8..7e393a188 100644
--- a/docs/user-tutorial/result-summary.md
+++ b/docs/user-tutorial/result-summary.md
@@ -58,7 +58,7 @@ superbench:
 
 ```yaml title="Example"
 # SuperBench rules
-version: v0.8
+version: v0.9
 superbench:
   rules:
     kernel_launch:
diff --git a/superbench/__init__.py b/superbench/__init__.py
index 5b85c9a9a..bc20aebf9 100644
--- a/superbench/__init__.py
+++ b/superbench/__init__.py
@@ -6,5 +6,5 @@
 Provide hardware and software benchmarks for AI systems.
 """
 
-__version__ = '0.8.0'
+__version__ = '0.9.0'
 __author__ = 'Microsoft'
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gpu_encoding_latency.py b/superbench/benchmarks/micro_benchmarks/directx_gpu_encoding_latency.py
index 70d6c75ad..ed17ea5bd 100644
--- a/superbench/benchmarks/micro_benchmarks/directx_gpu_encoding_latency.py
+++ b/superbench/benchmarks/micro_benchmarks/directx_gpu_encoding_latency.py
@@ -98,11 +98,11 @@ def _preprocess(self):
         command += f' -HEIGHT {self._args.height}'
         command += f' -WIDTH {self._args.width}'
         if self._args.input_file is not None:
-            command += f' -INPUT {self._args.input_file}'
+            command += f' -INPUT {os.path.abspath(self._args.input_file)}'
         else:
             if not os.path.exists(f'{self._test_file}'):
                 create_nv12_file(self._test_file, self._args.frames, self._args.width, self._args.height)
-            command += f' -INPUT {self._test_file}'
+            command += f' -INPUT {os.path.abspath(self._test_file)}'
         if self._args.output_file is not None:
             command += f' -OUTPUT {self._args.output_file}'
         command += f' -OUTPUT_HEIGHT {self._args.output_height}'
diff --git a/superbench/config/amd_mi100_hpe.yaml b/superbench/config/amd_mi100_hpe.yaml
index 150424c0f..718224531 100644
--- a/superbench/config/amd_mi100_hpe.yaml
+++ b/superbench/config/amd_mi100_hpe.yaml
@@ -3,7 +3,7 @@
 # Server:
 #   - Product: HPE Apollo 6500
 
-version: v0.8
+version: v0.9
 superbench:
   enable: null
   var:
diff --git a/superbench/config/amd_mi100_z53.yaml b/superbench/config/amd_mi100_z53.yaml
index 188c93547..8aa8fd85e 100644
--- a/superbench/config/amd_mi100_z53.yaml
+++ b/superbench/config/amd_mi100_z53.yaml
@@ -4,7 +4,7 @@
 #   - Product: G482-Z53
 #   - Link: https://www.gigabyte.cn/FileUpload/Global/MicroSite/553/G482-Z53.html
 
-version: v0.8
+version: v0.9
 superbench:
   enable: null
   var:
diff --git a/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml b/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml
index 62e0d6586..5ffa26311 100644
--- a/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml
+++ b/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml
@@ -1,4 +1,4 @@
-version: v0.8
+version: v0.9
 superbench:
   enable: null
   monitor:
diff --git a/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml b/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml
index 337affacf..5c78d866d 100644
--- a/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml
+++ b/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml
@@ -1,4 +1,4 @@
-version: v0.8
+version: v0.9
 superbench:
   enable: null
   monitor:
diff --git a/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml b/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml
index f95469cb0..75375cd79 100644
--- a/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml
+++ b/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml
@@ -1,4 +1,4 @@
-version: v0.8
+version: v0.9
 superbench:
   enable: null
   monitor:
diff --git a/superbench/config/azure_ndmv4.yaml b/superbench/config/azure_ndmv4.yaml
index e482d6ed0..8aabb65f7 100644
--- a/superbench/config/azure_ndmv4.yaml
+++ b/superbench/config/azure_ndmv4.yaml
@@ -3,7 +3,7 @@
 # Azure NDm A100 v4
 # reference: https://docs.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series
 
-version: v0.8
+version: v0.9
 superbench:
   enable: null
   monitor:
diff --git a/superbench/config/azure_ndv4.yaml b/superbench/config/azure_ndv4.yaml
index cb9a93ddc..274556842 100644
--- a/superbench/config/azure_ndv4.yaml
+++ b/superbench/config/azure_ndv4.yaml
@@ -1,5 +1,5 @@
 # SuperBench Config
-version: v0.8
+version: v0.9
 superbench:
   enable: null
   monitor:
diff --git a/superbench/config/default.yaml b/superbench/config/default.yaml
index 60d6be7b0..1a6af7dc5 100644
--- a/superbench/config/default.yaml
+++ b/superbench/config/default.yaml
@@ -1,5 +1,5 @@
 # SuperBench Config
-version: v0.8
+version: v0.9
 superbench:
   enable: null
   monitor:
diff --git a/third_party/Makefile b/third_party/Makefile
index b0c01d453..ec72ccae9 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -157,5 +157,5 @@ directx_amf_encoding_latency:
 		curl -L -o vs_buildtools.exe https://aka.ms/vs/16/release/vs_buildtools.exe && echo "Downloaded vs_buildtools.exe" && \
 		start /wait vs_buildtools.exe --quiet --wait --norestart --nocache --installPath C:/temp/BuildTools --add Microsoft.VisualStudio.Workload.VCTools --add Microsoft.VisualStudio.Component.VC.ATLMFC --includeRecommended  && echo "Installed VS Build Tools" && \
 		del vs_buildtools.exe && echo "Deleted vs_buildtools.exe" && \
-		"C:\temp\BuildTools\MSBuild\Current\Bin\MSBuild.exe" "AMF\amf\public\samples\CPPSamples_vs2019.sln" /t:EncoderLatency /p:Configuration=Release /p:OutDir="%SB_MICRO_PATH%\bin" \
+		"C:\temp\BuildTools\MSBuild\Current\Bin\MSBuild.exe" "AMF\amf\public\samples\CPPSamples_vs2019.sln" /t:EncoderLatency /p:Platform=x64 /p:Configuration=Release /p:OutDir="%SB_MICRO_PATH%\bin" \
 	)
diff --git a/website/blog/2023-07-25-release-0-9.md b/website/blog/2023-07-25-release-0-9.md
new file mode 100644
index 000000000..59e931103
--- /dev/null
+++ b/website/blog/2023-07-25-release-0-9.md
@@ -0,0 +1,38 @@
+---
+slug: release-sb-v0.9
+title: Releasing SuperBench v0.9
+author: Peng Cheng
+author_title: SuperBench Team
+author_url: https://github.com/cp5555
+author_image_url: https://github.com/cp5555.png
+tags: [superbench, announcement, release]
+---
+
+We are very happy to announce that **SuperBench 0.9.0 version** is officially released today!
+
+You can install and try SuperBench by following the [Getting Started Tutorial](https://microsoft.github.io/superbenchmark/docs/getting-started/installation).
+
+## SuperBench 0.9.0 Release Notes
+
+### SuperBench Improvement
+- Support Ctrl+C and interrupt to stop all SuperBench testing.
+- Support Windows Docker for VDI/Gaming GPU.
+- Support DirectX platform for Nvidia and AMD GPU.
+- Add System Config Info feature in SB runner to support distributed collection.
+- Support DirectX test pipeline.
+
+### Micro-benchmark Improvement
+- Add DirectXGPUCopyBw Benchmark to measure HtoD/DtoH bandwidth by DirectX.
+- Add DirectXGPUCoreFLops Benchmark to measure peak FLOPS by DirectX.
+- Add DirectXGPUMemBw Benchmark to measure GPU memory bandwidth by DirectX.
+- Add DirectXVCNEncodingLatency Benchmark to measure the VCN hardware encoding latency on AMD graphic GPUs.
+- Support best algorithm selection in cudnn-function microbenchmark.
+- Revise step time collection in distributed inference benchmark.
+
+### Model Benchmark Improvement
+- Fix early stop logic due to num_steps in model benchmarks.
+- Support TensorRT models on Nvidia H100.
+
+### Documentation
+- Improve documentation for System Config Info.
+- Update outdated references.
diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js
index cc583913d..c1d83edfa 100644
--- a/website/docusaurus.config.js
+++ b/website/docusaurus.config.js
@@ -101,7 +101,7 @@ module.exports = {
     announcementBar: {
       id: 'supportus',
       content:
-        '📢 <a href="https://microsoft.github.io/superbenchmark/blog/release-sb-v0.8">v0.8.0</a> has been released! ' +
+        '📢 <a href="https://microsoft.github.io/superbenchmark/blog/release-sb-v0.9">v0.9.0</a> has been released! ' +
         '⭐️ If you like SuperBench, give it a star on <a target="_blank" rel="noopener noreferrer" href="https://github.com/microsoft/superbenchmark">GitHub</a>! ⭐️',
     },
     algolia: {
diff --git a/website/package-lock.json b/website/package-lock.json
index 80c139a56..a2e3b219d 100644
--- a/website/package-lock.json
+++ b/website/package-lock.json
@@ -1,6 +1,6 @@
 {
   "name": "superbench-website",
-  "version": "0.8.0",
+  "version": "0.9.0",
   "lockfileVersion": 1,
   "requires": true,
   "dependencies": {
diff --git a/website/package.json b/website/package.json
index c761f26d8..38ca1f75a 100644
--- a/website/package.json
+++ b/website/package.json
@@ -1,6 +1,6 @@
 {
   "name": "superbench-website",
-  "version": "0.8.0",
+  "version": "0.9.0",
   "private": true,
   "scripts": {
     "docusaurus": "docusaurus",

From 67f2aa7237cefb0cf5b3032c8510a6b432407329 Mon Sep 17 00:00:00 2001
From: pnunna93 <104791500+pnunna93@users.noreply.github.com>
Date: Tue, 8 Aug 2023 00:03:32 -0500
Subject: [PATCH 26/33] Benchmarks: model benchmarks - change
 torch.distributed.launch to torchrun (#556)

This PR has following changes
- torch.distributed.launch changed to torchrun. torch.distributed.launch
is deprecated in latest Pytorch and is recommended to move to torchrun -
https://pytorch.org/docs/stable/elastic/run.html
- Changes to AMD GPU detection logic. The AMD GPU detection logic throws
warning when containers have only renderD in /dev/dri, this change would
resolve those warnings

---------

Co-authored-by: Yuting Jiang <yutingjiang@microsoft.com>
---
 superbench/common/devices/gpu.py | 2 +-
 superbench/runner/runner.py      | 4 ++--
 tests/runner/test_runner.py      | 8 ++++----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/superbench/common/devices/gpu.py b/superbench/common/devices/gpu.py
index e12889e10..3398d707f 100644
--- a/superbench/common/devices/gpu.py
+++ b/superbench/common/devices/gpu.py
@@ -26,7 +26,7 @@ def get_vendor(self):
                 logger.warning('Cannot find NVIDIA GPU device.')
             return 'nvidia'
         if Path('/dev/kfd').is_char_device() and Path('/dev/dri').is_dir():
-            if not list(Path('/dev/dri').glob('card*')):
+            if not list(Path('/dev/dri').glob('renderD*')):
                 logger.warning('Cannot find AMD GPU device.')
             return 'amd'
         if list(Path(r'C:\Windows\System32').glob('*DriverStore/FileRepository/nv*.inf_amd64_*/nvapi64.dll')):
diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index bd8cc9c83..7e29f4dfe 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -144,8 +144,8 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
             torch_dist_params = '' if mode.node_num == 1 else \
                 '--nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
             mode_command = (
-                f'python3 -m torch.distributed.launch'
-                f' --use_env --no_python --nproc_per_node={mode.proc_num} {torch_dist_params}{exec_command}'
+                f'torchrun'
+                f' --no_python --nproc_per_node={mode.proc_num} {torch_dist_params}{exec_command}'
                 f' superbench.benchmarks.{benchmark_name}.parameters.distributed_impl=ddp'
                 f' superbench.benchmarks.{benchmark_name}.parameters.distributed_backend=nccl'
             )
diff --git a/tests/runner/test_runner.py b/tests/runner/test_runner.py
index 304a6ba22..250942267 100644
--- a/tests/runner/test_runner.py
+++ b/tests/runner/test_runner.py
@@ -105,8 +105,8 @@ def test_get_mode_command(self):
                     'node_num': 'all',
                 },
                 'expected_command': (
-                    'python3 -m torch.distributed.launch '
-                    '--use_env --no_python --nproc_per_node=1 '
+                    'torchrun '
+                    '--no_python --nproc_per_node=1 '
                     '--nnodes=$NNODES --node_rank=$NODE_RANK '
                     '--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
                     f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo '
@@ -123,8 +123,8 @@ def test_get_mode_command(self):
                     'node_num': 1,
                 },
                 'expected_command': (
-                    'python3 -m torch.distributed.launch '
-                    '--use_env --no_python --nproc_per_node=8 '
+                    'torchrun '
+                    '--no_python --nproc_per_node=8 '
                     f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo '
                     'superbench.benchmarks.foo.parameters.distributed_impl=ddp '
                     'superbench.benchmarks.foo.parameters.distributed_backend=nccl'

From 6c0205cece527ed49959619e74865c1bd5e69e6e Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Fri, 18 Aug 2023 13:17:04 +0800
Subject: [PATCH 27/33] Benchmarks: micro benchmarks - add source code for
 DirectXRenderPerf (#549)

**Description**
add source code for DirectXRenderPerf.

---------

Co-authored-by: yukirora <yuting.jiang@microsoft.com>
---
 .github/workflows/build-image.yml             |   2 +-
 .../BenchmarkOptions.h                        |  80 ++
 .../BufferHelper.cpp                          | 137 +++
 .../directx_render_performance/BufferHelper.h | 126 +++
 .../DirectXRenderPerformance.vcxproj          | 159 +++
 .../GeometryHelper.cpp                        |  37 +
 .../GeometryHelper.h                          | 159 +++
 .../directx_render_performance/Main.cpp       | 148 +++
 .../directx_render_performance/RenderApp.cpp  | 388 ++++++++
 .../directx_render_performance/RenderApp.h    | 221 +++++
 .../RenderGeometryPass.cpp                    | 139 +++
 .../RenderGeometryPass.h                      |  99 ++
 .../RenderLightingPass.cpp                    | 217 +++++
 .../RenderLightingPass.h                      | 119 +++
 .../RenderShadowMapPass.cpp                   |  77 ++
 .../RenderShadowMapPass.h                     |  34 +
 .../Shaders/Base.hlsl                         | 134 +++
 .../Shaders/DefferredLightingPixel.hlsl       | 919 ++++++++++++++++++
 .../Shaders/DefferredLightingVertex.hlsl      | 199 ++++
 .../Shaders/ShadowMap.hlsl                    |  55 ++
 .../directx_third_party/DeviceResources.cpp   | 670 +++++++++++++
 .../directx_third_party/DeviceResources.h     | 138 +++
 .../directx_third_party/pch.h                 |  97 ++
 23 files changed, 4353 insertions(+), 1 deletion(-)
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/BenchmarkOptions.h
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/BufferHelper.cpp
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/BufferHelper.h
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/DirectXRenderPerformance.vcxproj
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/GeometryHelper.cpp
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/GeometryHelper.h
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/Main.cpp
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderApp.cpp
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderApp.h
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderGeometryPass.cpp
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderGeometryPass.h
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderLightingPass.cpp
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderLightingPass.h
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderShadowMapPass.cpp
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderShadowMapPass.h
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/Base.hlsl
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/DefferredLightingPixel.hlsl
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/DefferredLightingVertex.hlsl
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/ShadowMap.hlsl
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_third_party/DeviceResources.cpp
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_third_party/DeviceResources.h
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_third_party/pch.h

diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml
index 6b796830a..e2dad1a66 100644
--- a/.github/workflows/build-image.yml
+++ b/.github/workflows/build-image.yml
@@ -65,7 +65,7 @@ jobs:
           sudo docker rmi $(sudo docker images --format "{{.Repository}}:{{.Tag}}" --filter=reference="node" --filter=reference="buildpack-deps")
           df -h
       - name: Free Up GitHub Actions Ubuntu Runner Disk Space 🔧
-        uses: jlumbroso/free-disk-space@main
+        uses: hirnidrin/free-disk-space@main
         with:
           # This might remove tools that are actually needed, if set to "true" but frees about 6 GB
           tool-cache: false
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/BenchmarkOptions.h b/superbench/benchmarks/micro_benchmarks/directx_render_performance/BenchmarkOptions.h
new file mode 100644
index 000000000..5c93b9378
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/BenchmarkOptions.h
@@ -0,0 +1,80 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <algorithm>
+#include <codecvt>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "../directx_utils/Options.h"
+
+using namespace std;
+
+// enum class for pass type
+enum class PassType { GeometryPass, ShadowMapPass, LightingPass };
+
+class BenchmarkOptions : public Options {
+  public:
+    int m_textureSize = 0;
+    int m_textureNum = 10;
+    int m_vertexNum = 3000;
+    int m_indexNum = 3000;
+    int m_width = 1080;
+    int m_height = 720;
+    int m_warmup = 500;
+    int m_num_object = 1;
+    string m_outfile = "outfile.txt";
+    PassType m_pass_type = PassType::ShadowMapPass;
+    int m_num_frames = 3000;
+    int m_num_light = 1;
+    bool m_quiet = true;
+
+    BenchmarkOptions(int argc, char *argv[]) : Options(argc, argv) {}
+
+    virtual void get_option_usage() { // Print the command line flags that parse_arguments() reads.
+        cout << "Usage: " << endl;
+        cout << "  --width <int>        set the width of the window" << endl;
+        cout << "  --height <int>       set the height of the window" << endl;
+        cout << "  --warmup <int>       set the warmup frames" << endl;
+        cout << "  --vertex <int>       set the number of vertices" << endl;
+        cout << "  --index <int>        set the number of indices" << endl;
+        cout << "  --texture <int>      set the size of textures <x,x>" << endl; // flag name matches parse_arguments()
+        cout << "  --outfile <string>   set the output file name" << endl;
+        cout << "  --pass <string>      set the pass type" << endl;
+        cout << "  --object <int>       set the number of objects" << endl;
+        cout << "  --frame <int>        set the number of frames" << endl;
+        cout << "  --light <int>        set the number of lights" << endl;
+        cout << "  --quiet              disable window" << endl;
+    }
+
+    virtual void parse_arguments() {
+        m_width = get_cmd_line_argument_int("--width", 1080);
+        m_height = get_cmd_line_argument_int("--height", 720);
+        m_warmup = get_cmd_line_argument_int("--warmup", 500);
+        m_vertexNum = get_cmd_line_argument_int("--vertex", m_vertexNum);
+        m_indexNum = get_cmd_line_argument_int("--index", m_indexNum);
+        m_textureSize = get_cmd_line_argument_int("--texture", 3);
+        m_textureNum = get_cmd_line_argument_int("--texture_num", 3);
+        m_outfile = get_cmd_line_argument_string("--outfile");
+        auto pass = get_cmd_line_argument_string("--pass");
+        std::transform(pass.begin(), pass.end(), pass.begin(), [](unsigned char c) { return std::tolower(c); });
+        if (pass == "geometry") {
+            m_pass_type = PassType::GeometryPass;
+        } else if (pass == "shadow") {
+            m_pass_type = PassType::ShadowMapPass;
+        } else if (pass == "lighting") {
+            m_pass_type = PassType::LightingPass;
+        } else {
+            cout << "Error: Invalid pass type: " << pass << endl;
+            exit(1);
+        }
+        m_num_object = get_cmd_line_argument_int("--object", m_num_object);
+        m_num_frames = get_cmd_line_argument_int("--frame", m_num_frames);
+        m_num_light = get_cmd_line_argument_int("--light", m_num_light);
+        m_quiet = get_cmd_line_argument_bool("--quiet");
+    };
+};
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/BufferHelper.cpp b/superbench/benchmarks/micro_benchmarks/directx_render_performance/BufferHelper.cpp
new file mode 100644
index 000000000..84cb4e294
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/BufferHelper.cpp
@@ -0,0 +1,137 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include "BufferHelper.h"
+
+// Function to calculate the byte size of the constant buffer,
+// which must be a multiple of 256 bytes.
+UINT CalcConstantBufferByteSize(UINT byteSize) {
+    // Calculate the aligned size.
+    return (byteSize + 255) & ~255;
+}
+
+/*
+ * @brief: Create a default buffer.
+ * @param: device the device of GPU object.
+ * @param: cmdList the command list of GPU object.
+ * @param: initData the data to be copied to the default buffer.
+ * @param: byteSize the size of data.
+ * @return: the default buffer.
+ */
+Microsoft::WRL::ComPtr<ID3D12Resource> CreateDefaultBuffer(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList,
+                                                           const void *initData, UINT64 byteSize,
+                                                           Microsoft::WRL::ComPtr<ID3D12Resource> &uploadBuffer) {
+    ComPtr<ID3D12Resource> defaultBuffer;
+
+    // Create the actual default buffer resource.
+    ThrowIfFailed(device->CreateCommittedResource(&CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT),
+                                                  D3D12_HEAP_FLAG_NONE, &CD3DX12_RESOURCE_DESC::Buffer(byteSize),
+                                                  D3D12_RESOURCE_STATE_COMMON, nullptr,
+                                                  IID_PPV_ARGS(defaultBuffer.GetAddressOf())));
+
+    // In order to copy CPU memory data into our default buffer, we need to create
+    // an intermediate upload heap.
+    ThrowIfFailed(device->CreateCommittedResource(&CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_UPLOAD),
+                                                  D3D12_HEAP_FLAG_NONE, &CD3DX12_RESOURCE_DESC::Buffer(byteSize),
+                                                  D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
+                                                  IID_PPV_ARGS(uploadBuffer.GetAddressOf())));
+
+    // Describe the data we want to copy into the default buffer.
+    D3D12_SUBRESOURCE_DATA subResourceData = {};
+    subResourceData.pData = initData;
+    subResourceData.RowPitch = byteSize;
+    subResourceData.SlicePitch = subResourceData.RowPitch;
+
+    // Schedule to copy the data to the default buffer resource.  At a high level, the helper function
+    // UpdateSubresources will copy the CPU memory into the intermediate upload heap.  Then the command
+    // list copies the intermediate upload heap data into defaultBuffer once it is executed.
+    cmdList->ResourceBarrier(1, &CD3DX12_RESOURCE_BARRIER::Transition(defaultBuffer.Get(), D3D12_RESOURCE_STATE_COMMON,
+                                                                      D3D12_RESOURCE_STATE_COPY_DEST));
+    UpdateSubresources<1>(cmdList, defaultBuffer.Get(), uploadBuffer.Get(), 0, 0, 1, &subResourceData);
+    cmdList->ResourceBarrier(1,
+                             &CD3DX12_RESOURCE_BARRIER::Transition(defaultBuffer.Get(), D3D12_RESOURCE_STATE_COPY_DEST,
+                                                                   D3D12_RESOURCE_STATE_GENERIC_READ));
+
+    // Note: uploadBuffer has to be kept alive after the above function calls because
+    // the command list has not been executed yet that performs the actual copy.
+    // The caller can Release the uploadBuffer after it knows the copy has been executed.
+
+    return defaultBuffer;
+}
+
+std::vector<UINT8> CreateRandomTexture(const UINT width, const UINT height, const UINT texturePixelSize) {
+    // Create a buffer to store the texture data
+    std::vector<unsigned char> textureData(width * height * texturePixelSize);
+
+    // Initialize the random number generator
+    std::random_device rd;
+    std::mt19937 generator(rd());
+    std::uniform_int_distribution<int> distribution(0, 255);
+
+    // Generate random data for the texture
+    for (UINT i = 0; i < width * height * texturePixelSize; ++i) {
+        textureData[i] = static_cast<unsigned char>(distribution(generator));
+    }
+    return textureData;
+}
+
+void UploadTexture(ID3D12Device *device, ID3D12GraphicsCommandList *pCmdList, const std::vector<UINT8> &textureData,
+                   Microsoft::WRL::ComPtr<ID3D12Resource> &texture, const UINT width, const UINT height,
+                   const UINT texturePixelSize) {
+    // Create the GPU upload buffer.
+    const UINT64 uploadBufferSize = GetRequiredIntermediateSize(texture.Get(), 0, 1);
+
+    ID3D12Resource *textureUploadHeap;
+    ThrowIfFailed(
+        device->CreateCommittedResource(&CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_UPLOAD), D3D12_HEAP_FLAG_NONE,
+                                        &CD3DX12_RESOURCE_DESC::Buffer(uploadBufferSize),
+                                        D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, IID_PPV_ARGS(&textureUploadHeap)));
+
+    // Copy data to the intermediate upload heap and then schedule a copy
+    // from the upload heap to the Texture2D.
+    D3D12_SUBRESOURCE_DATA textureDataDesc = {};
+    textureDataDesc.pData = textureData.data();
+    textureDataDesc.RowPitch = width * texturePixelSize;
+    textureDataDesc.SlicePitch = textureDataDesc.RowPitch * height;
+
+    UpdateSubresources(pCmdList, texture.Get(), textureUploadHeap, 0, 0, 1, &textureDataDesc);
+    pCmdList->ResourceBarrier(1, &CD3DX12_RESOURCE_BARRIER::Transition(texture.Get(), D3D12_RESOURCE_STATE_COPY_DEST,
+                                                                       D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE));
+}
+
+void CreateTextureResource(ID3D12Device *device, UINT width, UINT height, DXGI_FORMAT format,
+                           Microsoft::WRL::ComPtr<ID3D12Resource> &textureResource, UINT16 arraySize) {
+    D3D12_RESOURCE_DESC textureDesc = {};
+    textureDesc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D;
+    textureDesc.Width = width;
+    textureDesc.Height = height;
+    textureDesc.DepthOrArraySize = arraySize;
+    textureDesc.MipLevels = 1;
+    textureDesc.Format = format;
+    textureDesc.SampleDesc.Count = 1;
+    textureDesc.SampleDesc.Quality = 0;
+    textureDesc.Layout = D3D12_TEXTURE_LAYOUT_UNKNOWN;
+    textureDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
+
+    ThrowIfFailed(device->CreateCommittedResource(&CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT),
+                                                  D3D12_HEAP_FLAG_NONE, &textureDesc, D3D12_RESOURCE_STATE_COPY_DEST,
+                                                  nullptr, IID_PPV_ARGS(&textureResource)));
+}
+
+void Texture2D(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList,
+               Microsoft::WRL::ComPtr<ID3D12Resource> &textureResource, int width, int height, DXGI_FORMAT format) {
+    CreateTextureResource(device, width, height, format, textureResource, 1);
+    auto textureData = CreateRandomTexture(width, height);
+    UploadTexture(device, cmdList, textureData, textureResource, width, height);
+}
+
+void TextureCube(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList,
+                 Microsoft::WRL::ComPtr<ID3D12Resource> &textureResource, int width, int height, DXGI_FORMAT format) {
+    CreateTextureResource(device, width, height, format, textureResource, 6);
+    std::vector<UINT8> textureCubeData;
+    for (int i = 0; i < 6; ++i) {
+        auto textureData = CreateRandomTexture(width, height);
+        textureCubeData.insert(textureCubeData.end(), textureData.begin(), textureData.end());
+    }
+    UploadTexture(device, cmdList, textureCubeData, textureResource, width, height);
+}
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/BufferHelper.h b/superbench/benchmarks/micro_benchmarks/directx_render_performance/BufferHelper.h
new file mode 100644
index 000000000..1ab91ddb6
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/BufferHelper.h
@@ -0,0 +1,126 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <random>
+
+#include "../directx_third_party/DXSampleHelper.h"
+#include "../directx_third_party/d3dx12.h"
+
+// Helper class for creating and uploading resources to the GPU.
+template <typename T> class UploadBuffer {
+  public:
+    UploadBuffer(ID3D12Device *device, UINT elementCount, bool isConstantBuffer)
+        : m_isConstantBuffer(isConstantBuffer) {
+        m_elementByteSize = sizeof(T);
+
+        if (isConstantBuffer)
+            m_elementByteSize = CalcConstantBufferByteSize(sizeof(T));
+
+        ThrowIfFailed(
+            device->CreateCommittedResource(&CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_UPLOAD), D3D12_HEAP_FLAG_NONE,
+                                            &CD3DX12_RESOURCE_DESC::Buffer(m_elementByteSize * elementCount),
+                                            D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, IID_PPV_ARGS(&m_uploadBuffer)));
+    }
+
+    UploadBuffer(const UploadBuffer &rhs) = delete;
+    UploadBuffer &operator=(const UploadBuffer &rhs) = delete;
+    ~UploadBuffer() {
+        if (m_uploadBuffer != nullptr)
+            m_uploadBuffer->Unmap(0, nullptr);
+
+        m_mappedData = nullptr;
+    }
+
+    ID3D12Resource *Resource() const { return m_uploadBuffer.Get(); }
+
+    void CopyData(int elementIndex, const T &data) {
+        ThrowIfFailed(m_uploadBuffer->Map(0, nullptr, reinterpret_cast<void **>(&m_mappedData)));
+        memcpy(&m_mappedData[elementIndex * m_elementByteSize], &data, sizeof(T));
+        m_uploadBuffer->Unmap(0, nullptr);
+    }
+
+  private:
+    Microsoft::WRL::ComPtr<ID3D12Resource> m_uploadBuffer;
+    BYTE *m_mappedData = nullptr;
+
+    UINT m_elementByteSize = 0;
+    bool m_isConstantBuffer = false;
+};
+
+/*
+ * @brief: Create a default buffer.
+ * @param: device the device of GPU object.
+ * @param: cmdList the command list of GPU object.
+ * @param: initData the data to be copied to the default buffer.
+ * @param: byteSize the size of data.
+ * @return: the default buffer; uploadBuffer receives the upload heap and must stay alive until the copy runs.
+ */
+Microsoft::WRL::ComPtr<ID3D12Resource> CreateDefaultBuffer(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList,
+                                                           const void *initData, UINT64 byteSize,
+                                                           Microsoft::WRL::ComPtr<ID3D12Resource> &uploadBuffer);
+
+/*
+ * @brief: Calculate the size of constant buffer.
+ */
+UINT CalcConstantBufferByteSize(UINT byteSize);
+
+/*
+ * @brief: Create a random texture.
+ * @param: width the width of texture.
+ * @param: height the height of texture.
+ * @param: texturePixelSize the size of texture pixel.
+ * @return: the random texture data.
+ */
+std::vector<UINT8> CreateRandomTexture(const UINT width, const UINT height, const UINT texturePixelSize = 4);
+
+/*
+ * @brief: Upload the texture to GPU.
+ * @param: device the device of GPU object.
+ * @param: pCmdList the command list of GPU object.
+ * @param: textureData the texture data to be uploaded.
+ * @param: texture the texture resource.
+ * @param: width the width of texture.
+ * @param: height the height of texture.
+ * @param: texturePixelSize the size of texture pixel.
+ */
+void UploadTexture(ID3D12Device *device, ID3D12GraphicsCommandList *pCmdList, const std::vector<UINT8> &textureData,
+                   Microsoft::WRL::ComPtr<ID3D12Resource> &texture, const UINT width, const UINT height,
+                   const UINT texturePixelSize = 4);
+
+/*
+ * @brief: Create a texture resource.
+ * @param: device the device of GPU object.
+ * @param: width the width of texture.
+ * @param: height the height of texture.
+ * @param: format the format of texture.
+ * @param: textureResource the texture resource.
+ * @param: arraySize the size of texture array.
+ */
+void CreateTextureResource(ID3D12Device *device, UINT width, UINT height, DXGI_FORMAT format,
+                           Microsoft::WRL::ComPtr<ID3D12Resource> &textureResource, UINT16 arraySize);
+
+/*
+ * @brief: Create a random texture resource and upload it to GPU.
+ * @param: device the device of GPU object.
+ * @param: cmdList the command list of GPU object.
+ * @param: textureResource the texture resource.
+ * @param: width the width of texture.
+ * @param: height the height of texture.
+ * @param: format the format of texture.
+ */
+void Texture2D(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList,
+               Microsoft::WRL::ComPtr<ID3D12Resource> &textureResource, int width, int height, DXGI_FORMAT format);
+
+/*
+ * @brief: Create a random texture cube resource and upload it to GPU.
+ * @param: device the device of GPU object.
+ * @param: cmdList the command list of GPU object.
+ * @param: textureResource the texture resource.
+ * @param: width the width of texture.
+ * @param: height the height of texture.
+ * @param: format the format of texture.
+ */
+void TextureCube(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList,
+                 Microsoft::WRL::ComPtr<ID3D12Resource> &textureResource, int width, int height, DXGI_FORMAT format);
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/DirectXRenderPerformance.vcxproj b/superbench/benchmarks/micro_benchmarks/directx_render_performance/DirectXRenderPerformance.vcxproj
new file mode 100644
index 000000000..46a40606f
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/DirectXRenderPerformance.vcxproj
@@ -0,0 +1,159 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>16.0</VCProjectVersion>
+    <Keyword>Win32Proj</Keyword>
+    <ProjectGuid>{627418c9-578a-47a9-8579-45c0e08fe528}</ProjectGuid>
+    <RootNamespace>DirectXRenderPerformance</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <SDLCheck>true</SDLCheck>
+      <ConformanceMode>false</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>Windows</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+    <FxCompile>
+      <ShaderModel>5.1</ShaderModel>
+    </FxCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <ConformanceMode>false</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>Windows</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="BufferHelper.cpp" />
+    <ClCompile Include="..\directx_third_party\DeviceResources.cpp" />
+    <ClCompile Include="..\directx_utils\D3D12Timer.cpp" />
+    <ClCompile Include="GeometryHelper.cpp" />
+    <ClCompile Include="RenderApp.cpp" />
+    <ClCompile Include="Main.cpp" />
+    <ClCompile Include="RenderGeometryPass.cpp" />
+    <ClCompile Include="RenderLightingPass.cpp" />
+    <ClCompile Include="RenderShadowMapPass.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\directx_third_party\d3dx12.h" />
+    <ClInclude Include="..\directx_third_party\DeviceResources.h" />
+    <ClInclude Include="..\directx_third_party\DXSampleHelper.h" />
+    <ClInclude Include="..\directx_third_party\pch.h" />
+    <ClInclude Include="..\directx_utils\D3D12Timer.h" />
+    <ClInclude Include="..\directx_utils\Options.h" />
+    <ClInclude Include="BufferHelper.h" />
+    <ClInclude Include="GeometryHelper.h" />
+    <ClInclude Include="RenderApp.h" />
+    <ClInclude Include="BenchmarkOptions.h" />
+    <ClInclude Include="RenderGeometryPass.h" />
+    <ClInclude Include="RenderLightingPass.h" />
+    <ClInclude Include="RenderShadowMapPass.h" />
+  </ItemGroup>
+    <ItemGroup>
+    <CustomBuild Include="Shaders\Base.hlsl">
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Compute</ShaderType>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">4.0</ShaderModel>
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Compute</ShaderType>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">4.0</ShaderModel>
+      <FileType>Document</FileType>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">copy %(Identity) "$(OutDir)\Shaders" &gt; NUL</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(OutDir)\%(Identity)</Outputs>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy %(Identity) "$(OutDir)\Shaders" &gt; NUL</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(OutDir)\%(Identity)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="Shaders\ShadowMap.hlsl">
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Compute</ShaderType>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">4.0</ShaderModel>
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Compute</ShaderType>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">4.0</ShaderModel>
+      <FileType>Document</FileType>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">copy %(Identity) "$(OutDir)\Shaders" &gt; NUL</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(OutDir)\%(Identity)</Outputs>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy %(Identity) "$(OutDir)\Shaders" &gt; NUL</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(OutDir)\%(Identity)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="Shaders\DefferredLightingPixel.hlsl">
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Compute</ShaderType>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">4.0</ShaderModel>
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Compute</ShaderType>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">4.0</ShaderModel>
+      <FileType>Document</FileType>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">copy %(Identity) "$(OutDir)\Shaders" &gt; NUL</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(OutDir)\%(Identity)</Outputs>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy %(Identity) "$(OutDir)\Shaders" &gt; NUL</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(OutDir)\%(Identity)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="Shaders\DefferredLightingVertex.hlsl">
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Compute</ShaderType>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">4.0</ShaderModel>
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Compute</ShaderType>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">4.0</ShaderModel>
+      <FileType>Document</FileType>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">copy %(Identity) "$(OutDir)\Shaders" &gt; NUL</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(OutDir)\%(Identity)</Outputs>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy %(Identity) "$(OutDir)\Shaders" &gt; NUL</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(OutDir)\%(Identity)</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="packages\directxtex_uwp.2023.3.30.1\build\native\directxtex_uwp.targets" Condition="Exists('packages\directxtex_uwp.2023.3.30.1\build\native\directxtex_uwp.targets')" />
+  </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/GeometryHelper.cpp b/superbench/benchmarks/micro_benchmarks/directx_render_performance/GeometryHelper.cpp
new file mode 100644
index 000000000..fe5a5bdee
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/GeometryHelper.cpp
@@ -0,0 +1,37 @@
+
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include "GeometryHelper.h"
+
+namespace MathHelper {
+// Returns the 4x4 identity matrix stored as an XMFLOAT4X4.
+DirectX::XMFLOAT4X4 Identity4x4() {
+    DirectX::XMFLOAT4X4 result;
+    DirectX::XMStoreFloat4x4(&result, DirectX::XMMatrixIdentity());
+    return result;
+}
+
+// Returns a uniformly distributed random float in [0, n); returns 0 when n <= 0.
+float genRand2N_f(int n) {
+    if (n <= 0) {
+        return 0.0f;
+    }
+    // Seed one engine per thread on first use instead of re-seeding on every call.
+    static thread_local std::default_random_engine generator{std::random_device{}()};
+    std::uniform_real_distribution<float> distribution(0.0f, static_cast<float>(n));
+    return distribution(generator);
+}
+
+// Returns a uniformly distributed random uint16_t in [0, n), clamped to the uint16_t range.
+uint16_t genRand2N_large(int n) {
+    if (n <= 1) {
+        return 0; // Empty or single-value range: 0 is the only possible result.
+    }
+    // The distribution bounds are inclusive, so the largest value that may be drawn is n - 1.
+    const int maxValue = (n - 1 < 0xFFFF) ? n - 1 : 0xFFFF;
+    static thread_local std::default_random_engine generator{std::random_device{}()};
+    std::uniform_int_distribution<uint16_t> distribution(0, static_cast<uint16_t>(maxValue));
+    return distribution(generator);
+}
+} // namespace MathHelper
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/GeometryHelper.h b/superbench/benchmarks/micro_benchmarks/directx_render_performance/GeometryHelper.h
new file mode 100644
index 000000000..1445ca5fd
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/GeometryHelper.h
@@ -0,0 +1,159 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <d3d12.h>
+#include <dxgi1_4.h>
+#include <random>
+#include <wrl.h>
+
+#include <DirectXMath.h>
+
+#include "../directx_third_party/DXSampleHelper.h"
+#include "../directx_third_party/pch.h"
+#include "BufferHelper.h"
+
+using Microsoft::WRL::ComPtr;
+
+namespace MathHelper {
+const float Infinity = FLT_MAX; // FLT_MAX is used as the stand-in for infinity.
+const float Pi = 3.1415926535f;
+// Returns a 4x4 identity matrix.
+DirectX::XMFLOAT4X4 Identity4x4();
+// Returns random float in [0, n).
+float genRand2N_f(int n);
+// Returns random uint16_t in [0, n).
+uint16_t genRand2N_large(int n);
+} // namespace MathHelper
+
+// Simple class to represent a vertex; only a position is stored.
+class Vertex {
+  public:
+    Vertex() { // Default: each coordinate is set to genRand2N_f(2) - 1.
+        x = MathHelper::genRand2N_f(2) - 1;
+        y = MathHelper::genRand2N_f(2) - 1;
+        z = MathHelper::genRand2N_f(2) - 1;
+    }
+    Vertex(float x, float y, float z) : x(x), y(y), z(z) {}
+    Vertex(const Vertex &v) : x(v.x), y(v.y), z(v.z) {}
+    float x, y, z; // Position
+    // Derived vertex types can add other attributes such as color, normal, texture coordinates etc.
+};
+
+// Simple struct to represent a Geometry object: CPU-side vertex/index data plus its sizes.
+struct Geometry {
+    std::unique_ptr<Vertex[]> vertexData = nullptr;
+    std::vector<uint16_t> indexData;
+    UINT vertexNum = 0;        // Number of vertices in vertexData.
+    UINT indexNum = 0;         // Number of indices in indexData.
+    UINT vertexByteSize = 0;   // Total size of the vertex data in bytes.
+    UINT indexByteSize = 0;    // Total size of the index data in bytes.
+    UINT vertexByteStride = 0; // Size of one vertex in bytes.
+};
+
+// Create a Geometry holding vertexNum default-constructed (random) vertices of type T and
+// indexNum random 16-bit indices, and record the element counts, byte sizes and vertex stride.
+template <class T> std::unique_ptr<Geometry> CreateRandomGeometry(const UINT vertexNum, const UINT indexNum) {
+    static_assert(std::is_base_of<Vertex, T>::value, "T must be a Vertex or derived from Vertex");
+    std::unique_ptr<Geometry> geo = std::make_unique<Geometry>();
+    // Keep a typed pointer so elements are addressed with a stride of sizeof(T); indexing through
+    // the Vertex view would use sizeof(Vertex) and hit the wrong offsets for derived types.
+    // NOTE(review): vertexData releases the array as Vertex[]; confirm this is acceptable when
+    // T is a type derived from Vertex rather than Vertex itself.
+    T *vertices = new T[vertexNum];
+    geo->vertexData.reset(reinterpret_cast<Vertex *>(vertices));
+    for (UINT i = 0; i < vertexNum; i++) {
+        vertices[i] = T();
+    }
+
+    // Fill in the random index data; every index is drawn by genRand2N_large(vertexNum).
+    geo->indexData.reserve(indexNum);
+    for (UINT i = 0; i < indexNum; i++) {
+        geo->indexData.push_back(MathHelper::genRand2N_large(vertexNum));
+    }
+    geo->vertexNum = vertexNum;
+    geo->indexNum = indexNum;
+    geo->vertexByteStride = sizeof(T);
+    geo->vertexByteSize = sizeof(T) * vertexNum;
+    geo->indexByteSize = sizeof(std::uint16_t) * indexNum;
+    return geo;
+}
+
+// Helper struct to manage geometry data buffers: CPU blobs, GPU buffers and their upload buffers.
+struct GeometryResource {
+    ComPtr<ID3DBlob> VertexBufferCPU = nullptr;
+    ComPtr<ID3DBlob> IndexBufferCPU = nullptr;
+
+    ComPtr<ID3D12Resource> VertexBufferGPU = nullptr;
+    ComPtr<ID3D12Resource> IndexBufferGPU = nullptr;
+
+    ComPtr<ID3D12Resource> VertexBufferUploader = nullptr;
+    ComPtr<ID3D12Resource> IndexBufferUploader = nullptr;
+
+    // Data about the buffers.
+    UINT VertexByteStride = 0;
+    UINT VertexBufferByteSize = 0;
+    DXGI_FORMAT IndexFormat = DXGI_FORMAT_R16_UINT;
+    UINT IndexBufferByteSize = 0;
+
+    D3D12_PRIMITIVE_TOPOLOGY PrimitiveType = D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST;
+
+    UINT IndexCount = 0;
+    UINT StartIndexLocation = 0;
+    INT BaseVertexLocation = 0;
+
+    /*
+     * @brief Get the vertex buffer view; VertexBufferGPU must already be set (see Create).
+     */
+    D3D12_VERTEX_BUFFER_VIEW VertexBufferView() const {
+        D3D12_VERTEX_BUFFER_VIEW vbv;
+        vbv.BufferLocation = VertexBufferGPU->GetGPUVirtualAddress();
+        vbv.StrideInBytes = VertexByteStride;
+        vbv.SizeInBytes = VertexBufferByteSize;
+        return vbv;
+    }
+
+    /*
+     * @brief Get the index buffer view; IndexBufferGPU must already be set (see Create).
+     */
+    D3D12_INDEX_BUFFER_VIEW IndexBufferView() const {
+        D3D12_INDEX_BUFFER_VIEW ibv;
+        ibv.BufferLocation = IndexBufferGPU->GetGPUVirtualAddress();
+        ibv.Format = IndexFormat;
+        ibv.SizeInBytes = IndexBufferByteSize;
+        return ibv;
+    }
+
+    /*
+     * @brief Upload geometry data and record its sizes; throws std::runtime_error on a null argument.
+     */
+    void Create(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, std::unique_ptr<Geometry> &geoData) {
+        if (device == nullptr) {
+            throw std::runtime_error("device is nullptr");
+        }
+        if (cmdList == nullptr) {
+            throw std::runtime_error("cmdList is nullptr");
+        }
+        if (geoData == nullptr) {
+            throw std::runtime_error("geoData is nullptr");
+        }
+        auto geometry = geoData.get();
+        ThrowIfFailed(D3DCreateBlob(geometry->vertexByteSize, &this->VertexBufferCPU));
+        CopyMemory(this->VertexBufferCPU->GetBufferPointer(), geometry->vertexData.get(), geometry->vertexByteSize);
+
+        ThrowIfFailed(D3DCreateBlob(geometry->indexByteSize, &this->IndexBufferCPU));
+        CopyMemory(this->IndexBufferCPU->GetBufferPointer(), geometry->indexData.data(), geometry->indexByteSize);
+
+        this->VertexBufferGPU = CreateDefaultBuffer(device, cmdList, geometry->vertexData.get(),
+                                                    geometry->vertexByteSize, this->VertexBufferUploader);
+
+        this->IndexBufferGPU = CreateDefaultBuffer(device, cmdList, geometry->indexData.data(), geometry->indexByteSize,
+                                                   this->IndexBufferUploader);
+
+        this->VertexByteStride = geometry->vertexByteStride;
+        this->VertexBufferByteSize = geometry->vertexByteSize;
+        this->IndexBufferByteSize = geometry->indexByteSize;
+        this->IndexCount = geometry->indexNum;
+    }
+};
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/Main.cpp b/superbench/benchmarks/micro_benchmarks/directx_render_performance/Main.cpp
new file mode 100644
index 000000000..3add03fc0
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/Main.cpp
@@ -0,0 +1,148 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include "RenderGeometryPass.h"
+#include "RenderLightingPass.h"
+#include "RenderShadowMapPass.h"
+#include <codecvt>
+#include <cstdio>
+#include <iostream>
+#include <locale>
+#include <string>
+#include <tuple>
+#include <windows.h>
+
+/*
+ * @brief: Message handler for the sample: destroys the window on WM_CLOSE, posts the quit
+ *         message on WM_DESTROY and forwards every other message to DefWindowProc.
+ */
+LRESULT WindowProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam) {
+    // Handle window event.
+    if (message == WM_CLOSE) {
+        DestroyWindow(hWnd);
+        return 0;
+    }
+    if (message == WM_DESTROY) {
+        PostQuitMessage(0);
+        return 0;
+    }
+    // Anything not handled above goes to the default window procedure.
+    return DefWindowProc(hWnd, message, wParam, lParam);
+}
+
+/*
+ * @brief: Window procedure registered for the main window class; delegates to WindowProc.
+ */
+static LRESULT CALLBACK MainWndProc(HWND hwnd, UINT msg, WPARAM wParam, LPARAM lParam) {
+    // Forward hwnd on because we can get messages (e.g., WM_CREATE)
+    // before CreateWindow returns, and thus before the caller's window handle is valid.
+    return WindowProc(hwnd, msg, wParam, lParam);
+}
+
+/*
+ * @brief: Register the window class and create the main window for the requested client size.
+ * @return: true on success, false if class registration or window creation fails.
+ */
+bool InitMainWindow(HINSTANCE hInstance, int width, int height, HWND &hMainWnd, const std::wstring &winTitle,
+                    bool quiet_mode) {
+    // Zero-initialize so cbClsExtra, cbWndExtra and lpszMenuName keep the value 0.
+    WNDCLASS windowClass = {};
+    windowClass.style = CS_HREDRAW | CS_VREDRAW;
+    windowClass.lpfnWndProc = MainWndProc;
+    windowClass.hInstance = hInstance;
+    windowClass.hIcon = LoadIcon(0, IDI_APPLICATION);
+    windowClass.hCursor = LoadCursor(0, IDC_ARROW);
+    windowClass.hbrBackground = (HBRUSH)GetStockObject(NULL_BRUSH);
+    windowClass.lpszClassName = L"MainWnd";
+
+    if (!RegisterClass(&windowClass)) {
+        return false;
+    }
+
+    // Compute the window rectangle dimensions from the requested client area dimensions.
+    RECT windowRect = {0, 0, width, height};
+    AdjustWindowRect(&windowRect, WS_OVERLAPPEDWINDOW, false);
+    const int outerWidth = windowRect.right - windowRect.left;
+    const int outerHeight = windowRect.bottom - windowRect.top;
+
+    hMainWnd = CreateWindow(windowClass.lpszClassName, winTitle.c_str(), WS_OVERLAPPEDWINDOW, CW_USEDEFAULT,
+                            CW_USEDEFAULT, outerWidth, outerHeight, 0, 0, hInstance, 0);
+    if (hMainWnd == nullptr) {
+        return false;
+    }
+
+    // In quiet mode the window is created but never shown.
+    if (quiet_mode) {
+        return true;
+    }
+    ShowWindow(hMainWnd, SW_SHOW);
+    UpdateWindow(hMainWnd);
+    return true;
+}
+
+/*
+ * @brief: Load the render microbenchmark for the pass type; throws std::runtime_error if it is unknown.
+ */
+std::unique_ptr<RenderApp> get_render_pointer(BenchmarkOptions &args, HINSTANCE hInstance, HWND hMainWnd,
+                                              std::wstring &winTitle) {
+    if (args.m_pass_type == PassType::GeometryPass) {
+        return std::make_unique<RenderGeometryPass>(&args, hInstance, hMainWnd, winTitle);
+    } else if (args.m_pass_type == PassType::ShadowMapPass) {
+        return std::make_unique<RenderShadowMapPass>(&args, hInstance, hMainWnd, winTitle);
+    } else if (args.m_pass_type == PassType::LightingPass) {
+        return std::make_unique<RenderLightingPass>(&args, hInstance, hMainWnd, winTitle);
+    }
+    throw std::runtime_error("invalid pass name"); // A std::exception, so WinMain's handler catches it.
+}
+
+/*
+ * @brief: Main entry point for a Windows application; returns -1 when setup or rendering fails.
+ */
+int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine, int nCmdShow) {
+    // Enable console attach and redirect stdout/stderr to console.
+    if (AttachConsole(ATTACH_PARENT_PROCESS) || AllocConsole()) {
+        FILE *stream;
+        if (freopen_s(&stream, "CONOUT$", "w", stdout) == 0) {
+            printf("Hello, Console!\n");
+        }
+        if (freopen_s(&stream, "CONOUT$", "w", stderr) == 0) {
+            fprintf(stderr, "Hello, Error Console!\n");
+        }
+        // Or use std::cout
+        std::cout << "Hello from std::cout" << std::endl;
+    }
+
+    MSG msg = {0};
+    try {
+        // Parse command line arguments.
+        BenchmarkOptions args(__argc, __argv);
+        args.init();
+        // Create the main window; it uses an empty title.
+        HWND hMainWnd = nullptr;
+        std::wstring winTitle;
+        if (!InitMainWindow(hInstance, args.m_width, args.m_height, hMainWnd, winTitle, args.m_quiet))
+            return -1;
+
+        // Create the render microbenchmark.
+        auto app_sample = get_render_pointer(args, hInstance, hMainWnd, winTitle);
+        app_sample->Initialize();
+        app_sample->LoadAssets();
+
+        while (msg.message != WM_QUIT) {
+            // If there are Window messages then process them.
+            // We need to handle message here otherwise it is no response.
+            if (PeekMessage(&msg, 0, 0, 0, PM_REMOVE)) {
+                TranslateMessage(&msg);
+                DispatchMessage(&msg);
+            } else {
+                // Update and render per frame.
+                app_sample->Tick();
+            }
+        }
+    } catch (const std::exception &e) {
+        std::cerr << e.what() << '\n';
+        return -1; // Report the failure through the exit code instead of returning msg.wParam.
+    }
+
+    return (int)msg.wParam;
+}
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderApp.cpp b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderApp.cpp
new file mode 100644
index 000000000..fa4a8f0eb
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderApp.cpp
@@ -0,0 +1,388 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include "RenderApp.h"
+#include "../directx_third_party/d3dx12.h"
+
+RenderApp::RenderApp(BenchmarkOptions *args) {
+    if (!args) { // The options are dereferenced below, so reject a null pointer up front.
+        throw std::runtime_error("BenchmarkOptions is nullptr");
+    }
+    m_deviceResources = std::make_unique<DX::DeviceResources>(DXGI_FORMAT_B8G8R8A8_UNORM_SRGB, DXGI_FORMAT_D32_FLOAT,
+                                                              m_swapChainBufferCount, D3D_FEATURE_LEVEL_11_0,
+                                                              DX::DeviceResources::c_AllowTearing);
+    m_opts = args;
+    m_width = args->m_width;
+    m_height = args->m_height;
+}
+
+RenderApp::RenderApp(BenchmarkOptions *args, HINSTANCE hInstance, HWND hMainWnd, std::wstring &winTitle)
+    : RenderApp(args) { // Delegates to RenderApp(args), then records the window handles and title.
+    m_hinstance = hInstance;
+    m_hMainWnd = hMainWnd;
+    m_winTitle = winTitle;
+}
+
+RenderApp::~RenderApp() {
+    if (m_outfile.is_open()) { // Close the output file if it is still open.
+        m_outfile.close();
+    }
+    if (m_deviceResources) { // NOTE(review): WaitForGpu presumably blocks until queued GPU work is done.
+        m_deviceResources->WaitForGpu();
+    }
+}
+
+void RenderApp::Initialize() { // Creates device/window resources, then sets up the GPU timer and output file.
+    if (m_deviceResources == nullptr) {
+        throw std::runtime_error("DeviceResources is nullptr");
+    }
+    m_deviceResources->SetWindow(m_hMainWnd, m_width, m_height);
+    m_deviceResources->CreateDeviceResources();
+    CreateDeviceDependentResources();
+
+    m_deviceResources->CreateWindowSizeDependentResources();
+    CreateWindowSizeDependentResources();
+
+    // Wait until initialization is complete, i.e. until the commands submitted by
+    // CreateWindowSizeDependentResources() above have been executed.
+    m_deviceResources->WaitForGpu();
+
+    auto device = m_deviceResources->GetD3DDevice();
+    auto commandQueue = m_deviceResources->GetCommandQueue();
+    m_gpuTimer.init(device, commandQueue, m_maxTimerNum, D3D12::QueueType::compute);
+    m_outfile.open(m_opts->m_outfile, std::ios_base::out); // Output path comes from the benchmark options.
+}
+
+void RenderApp::CreateDeviceDependentResources() {
+    auto device = m_deviceResources->GetD3DDevice();
+    if (device == nullptr) {
+        throw std::runtime_error("D3D12Device is nullptr");
+    }
+    // Create a fence for synchronizing between frames, initialized to the current frame index.
+    const uint64_t currentIdx = m_deviceResources->GetCurrentFrameIndex();
+    ThrowIfFailed(device->CreateFence(currentIdx, D3D12_FENCE_FLAG_NONE,
+                                      IID_PPV_ARGS(m_fence.ReleaseAndGetAddressOf())));
+
+    // Signal() returns an HRESULT; surface a failure instead of silently ignoring it.
+    ThrowIfFailed(m_deviceResources->GetCommandQueue()->Signal(m_fence.Get(), currentIdx));
+
+    CreateRootSignatures(device);
+    BuildPipelineStates(device);
+}
+
+void RenderApp::CreateWindowSizeDependentResources() { // Throws std::runtime_error if a required object is null.
+    auto device = m_deviceResources->GetD3DDevice();
+    auto rtvHeap = m_deviceResources->m_rtvDescriptorHeap.Get();
+    auto pCmdList = m_deviceResources->GetCommandList();
+    auto cmdListAlloc = m_deviceResources->GetCommandAllocator();
+    auto cmdQueue = m_deviceResources->GetCommandQueue();
+    if (device == nullptr) {
+        throw std::runtime_error("D3D12Device is nullptr");
+    }
+    if (rtvHeap == nullptr) {
+        throw std::runtime_error("RTVDescriptorHeap is nullptr");
+    }
+    if (pCmdList == nullptr) {
+        throw std::runtime_error("CommandList is nullptr");
+    }
+    if (cmdListAlloc == nullptr) {
+        throw std::runtime_error("CommandAllocator is nullptr");
+    }
+    if (cmdQueue == nullptr) {
+        throw std::runtime_error("CommandQueue is nullptr");
+    }
+
+    ThrowIfFailed(cmdListAlloc->Reset());
+    ThrowIfFailed(pCmdList->Reset(cmdListAlloc, nullptr));
+
+    // Size the render-target and shader-resource containers, then create their views.
+    if (m_numPassRenderTargets > 0)
+        m_renderTargets.resize(m_numPassRenderTargets);
+    if (m_numShaderResource > 0)
+        m_shaderResources.resize(m_numShaderResource);
+    CreateRenderTargetView(device, m_width, m_height, rtvHeap);
+    CreateShaderResourceView(device, pCmdList, m_width, m_height);
+
+    // Close the command list and submit it to the queue; this function does not wait for completion.
+    ThrowIfFailed(pCmdList->Close());
+    ID3D12CommandList *commandLists[] = {pCmdList};
+    cmdQueue->ExecuteCommandLists(1, commandLists);
+}
+
+// Serialize and create m_rootSignature from DefineRootParameters() and DefineStaticSamplers().
+void RenderApp::CreateRootSignatures(ID3D12Device *device) {
+    std::vector<CD3DX12_ROOT_PARAMETER> rootParameters;
+    int numRootParameters = DefineRootParameters(rootParameters);
+    CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc = {};
+    rootSignatureDesc.NumParameters = numRootParameters;
+    rootSignatureDesc.pParameters = rootParameters.data();
+    std::vector<CD3DX12_STATIC_SAMPLER_DESC> samplers;
+    auto numSamplers = DefineStaticSamplers(samplers);
+    rootSignatureDesc.NumStaticSamplers = (UINT)numSamplers;
+    rootSignatureDesc.pStaticSamplers = samplers.data();
+    rootSignatureDesc.Flags = D3D12_ROOT_SIGNATURE_FLAG_ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT;
+    ComPtr<ID3DBlob> serializedRootSignature; // ComPtr releases both blobs on every exit path.
+    ComPtr<ID3DBlob> errorBlob;
+    auto hr = D3D12SerializeRootSignature(&rootSignatureDesc, D3D_ROOT_SIGNATURE_VERSION_1_0,
+                                          &serializedRootSignature, &errorBlob);
+    if (errorBlob != nullptr) { // Only dereference the error blob when one was actually returned.
+        std::cout << ((char *)errorBlob->GetBufferPointer()) << std::endl;
+    }
+    ThrowIfFailed(hr); // Do not touch serializedRootSignature when serialization failed.
+    ThrowIfFailed(device->CreateRootSignature(0, serializedRootSignature->GetBufferPointer(),
+                                              serializedRootSignature->GetBufferSize(),
+                                              IID_PPV_ARGS(&m_rootSignature)));
+}
+
+void RenderApp::CreateRenderTargetResource(ID3D12Device *device, UINT width, UINT height, DXGI_FORMAT format,
+                                           D3D12_RESOURCE_FLAGS flags,
+                                           Microsoft::WRL::ComPtr<ID3D12Resource> &renderTarget) {
+    // Describe a single-mip, non-multisampled 2D texture of the requested size and format.
+    D3D12_RESOURCE_DESC textureDesc = {};
+    textureDesc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D;
+    textureDesc.Alignment = 0;
+    textureDesc.Width = width;
+    textureDesc.Height = height;
+    textureDesc.DepthOrArraySize = 1;
+    textureDesc.MipLevels = 1;
+    textureDesc.Format = format;
+    textureDesc.SampleDesc.Count = 1;
+    textureDesc.SampleDesc.Quality = 0;
+    textureDesc.Layout = D3D12_TEXTURE_LAYOUT_UNKNOWN;
+    textureDesc.Flags = flags;
+
+    // The resource is placed in a default heap.
+    D3D12_HEAP_PROPERTIES defaultHeap = {};
+    defaultHeap.Type = D3D12_HEAP_TYPE_DEFAULT;
+
+    // Clear value passed at creation: RGBA = (0, 0, 0, 1) in the texture's format.
+    D3D12_CLEAR_VALUE clearValue = {};
+    clearValue.Format = format;
+    clearValue.Color[0] = 0.0f;
+    clearValue.Color[1] = 0.0f;
+    clearValue.Color[2] = 0.0f;
+    clearValue.Color[3] = 1.0f;
+    ThrowIfFailed(device->CreateCommittedResource(&defaultHeap, D3D12_HEAP_FLAG_NONE, &textureDesc,
+                                                  D3D12_RESOURCE_STATE_COMMON, &clearValue,
+                                                  IID_PPV_ARGS(&renderTarget)));
+}
+
+CD3DX12_CPU_DESCRIPTOR_HANDLE RenderApp::GetRenderTargetView(ID3D12Device *device) {
+    // Handle to the first descriptor of m_rtvDescriptorHeap; `device` is not used here.
+    auto heapStart = m_rtvDescriptorHeap->GetCPUDescriptorHandleForHeapStart();
+    return CD3DX12_CPU_DESCRIPTOR_HANDLE(heapStart);
+}
+
+void RenderApp::CreateRenderTargetView(ID3D12Device *device, UINT width, UINT height, ID3D12DescriptorHeap *rtvHeap) {
+    D3D12_DESCRIPTOR_HEAP_DESC rtvDescriptorHeapDesc = {}; // Note: the rtvHeap parameter is not used below.
+    rtvDescriptorHeapDesc.NumDescriptors = m_numPassRenderTargets;
+    rtvDescriptorHeapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_RTV;
+    ThrowIfFailed(device->CreateDescriptorHeap(&rtvDescriptorHeapDesc,
+                                               IID_PPV_ARGS(m_rtvDescriptorHeap.ReleaseAndGetAddressOf())));
+    m_rtvDescriptorSize = device->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_RTV);
+
+    // Define the render target properties shared by every render target of this pass.
+    DXGI_FORMAT format = m_renderTargetFormat;                            // Pixel format of the render target
+    D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_ALLOW_RENDER_TARGET; // Specify the resource flags
+
+    // Create one render target resource per entry of m_renderTargets.
+    for (int i = 0; i < m_numPassRenderTargets; ++i) {
+        CreateRenderTargetResource(device, width, height, format, flags, m_renderTargets[i]);
+    }
+
+    auto rtvHandle = GetRenderTargetView(device);
+
+    // Create a RTV for each custom render target, in consecutive slots of m_rtvDescriptorHeap.
+    for (UINT i = 0; i < m_numPassRenderTargets; ++i) {
+        // Create the RTV descriptor
+        device->CreateRenderTargetView(m_renderTargets[i].Get(), nullptr, rtvHandle);
+        // Increment the handle to the next descriptor
+        rtvHandle.Offset(1, m_rtvDescriptorSize);
+    }
+}
+
+// Fills in a graphics pipeline state description for the given input layout and shaders.
+// Rasterizer, blend and depth-stencil states use the D3D12 defaults, no depth-stencil format is
+// set, and every pass render target uses m_renderTargetFormat with a single sample.
+D3D12_GRAPHICS_PIPELINE_STATE_DESC RenderApp::DefinePSODesc(const std::vector<D3D12_INPUT_ELEMENT_DESC> &inputLayout,
+                                                            ComPtr<ID3DBlob> vertexShader,
+                                                            ComPtr<ID3DBlob> pixelShader) {
+    D3D12_GRAPHICS_PIPELINE_STATE_DESC desc = {};
+    ZeroMemory(&desc, sizeof(desc));
+
+    // Root signature, vertex input layout and shader byte code.
+    desc.InputLayout = {inputLayout.data(), static_cast<UINT>(inputLayout.size())};
+    desc.pRootSignature = m_rootSignature.Get();
+    desc.VS = {reinterpret_cast<UINT8 *>(vertexShader->GetBufferPointer()), vertexShader->GetBufferSize()};
+    desc.PS = {reinterpret_cast<UINT8 *>(pixelShader->GetBufferPointer()), pixelShader->GetBufferSize()};
+
+    // Fixed-function state: library defaults, triangles, all samples enabled.
+    desc.RasterizerState = CD3DX12_RASTERIZER_DESC(D3D12_DEFAULT);
+    desc.BlendState = CD3DX12_BLEND_DESC(D3D12_DEFAULT);
+    desc.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC(D3D12_DEFAULT);
+    desc.SampleMask = UINT_MAX;
+    desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE;
+
+    // Output: no depth-stencil buffer, one format entry per pass render target, no multisampling.
+    desc.DSVFormat = DXGI_FORMAT_UNKNOWN;
+    desc.NumRenderTargets = m_numPassRenderTargets;
+    for (UINT target = 0; target < m_numPassRenderTargets; ++target) {
+        desc.RTVFormats[target] = m_renderTargetFormat;
+    }
+    desc.SampleDesc.Count = 1;
+    desc.SampleDesc.Quality = 0;
+
+    return desc;
+}
+
+// Records the one-time setup commands (constant buffers and geometry), submits them to the
+// command queue and waits for the GPU to finish them.
+void RenderApp::LoadAssets() {
+    auto d3dDevice = m_deviceResources->GetD3DDevice();
+    auto commandList = m_deviceResources->GetCommandList();
+    auto allocator = m_deviceResources->GetCommandAllocator();
+    auto queue = m_deviceResources->GetCommandQueue();
+
+    // Open the command list for recording.
+    ThrowIfFailed(allocator->Reset());
+    ThrowIfFailed(commandList->Reset(allocator, nullptr));
+
+    // Pass-specific resources are created through the virtual hooks.
+    CreateConstantBufferResources(d3dDevice);
+    UpdateConstantBufferData();
+    BuildShapeGeometry(d3dDevice, commandList);
+
+    // Submit the recorded work and wait for it to complete.
+    ThrowIfFailed(commandList->Close());
+    ID3D12CommandList *submitted[] = {commandList};
+    queue->ExecuteCommandLists(1, submitted);
+
+    m_deviceResources->WaitForGpu();
+}
+
+// Runs one complete frame: throttle against the GPU, record and submit the frame, wait for the
+// GPU to finish, then collect the frame statistics.
+void RenderApp::Tick() {
+    Update();
+    Render();
+    // Wait for the GPU before CalculateFrameStats() reads the GPU timer for this frame.
+    m_deviceResources->WaitForGpu();
+    CalculateFrameStats();
+}
+
+// Blocks the CPU while it is more than GetBackBufferCount() frames ahead of the last fence
+// value completed by the GPU.
+void RenderApp::Update() {
+    auto const currentFrame = m_frameIndex;
+    auto const backBufferCount = m_deviceResources->GetBackBufferCount();
+    const uint64_t gpuCompleted = m_fence->GetCompletedValue();
+
+    // If the frame index is reset to zero it may temporarily be smaller than the last GPU signal;
+    // there is nothing to wait for in that case.
+    if (currentFrame <= gpuCompleted) {
+        return;
+    }
+    // The GPU is keeping up as long as the gap does not exceed the back buffer count.
+    if (currentFrame - gpuCompleted <= backBufferCount) {
+        return;
+    }
+
+    // GPU not caught up, wait for at least one available frame.
+    ThrowIfFailed(m_fence->SetEventOnCompletion(currentFrame - backBufferCount, m_fenceEvent.Get()));
+    WaitForSingleObjectEx(m_fenceEvent.Get(), INFINITE, FALSE);
+}
+
+// Records the GPU time of the frame that just finished. Warm-up frames are discarded; every
+// later frame time is written to stdout and to m_outfile. After warmup + num_frames frames the
+// median of the measured frame times is reported and the window is asked to close.
+void RenderApp::CalculateFrameStats() {
+    auto timeInMs = m_gpuTimer.getElapsedMsByTimestampPair(m_gpuTimerIdx);
+    m_frameTimeList.push_back(timeInMs);
+
+    // Timer slots are reused cyclically.
+    m_gpuTimerIdx++;
+    if (m_gpuTimerIdx == m_maxTimerNum) {
+        m_gpuTimerIdx = 0;
+    }
+
+    m_frameIndex++;
+    // The comparison is inclusive so that the sample of frame number m_warmup (the last warm-up
+    // frame) is discarded too; exactly m_num_frames samples remain at the end.
+    if (m_frameIndex <= m_opts->m_warmup) {
+        m_frameTimeList.clear();
+    } else {
+        cout << m_frameTimeList.back() << endl;
+        m_outfile << m_frameTimeList.back() << endl;
+    }
+
+    if (m_frameIndex == m_opts->m_warmup + m_opts->m_num_frames) {
+        // Calculate the median; it stays 0 when no frame was measured.
+        double median = 0;
+        if (!m_frameTimeList.empty()) {
+            std::sort(m_frameTimeList.begin(), m_frameTimeList.end());
+            const auto count = m_frameTimeList.size();
+            const auto middle = count / 2;
+            if (count % 2 == 0) {
+                median = (m_frameTimeList[middle - 1] + m_frameTimeList[middle]) / 2;
+            } else {
+                median = m_frameTimeList[middle];
+            }
+        }
+        // NOTE: the value is a median although the label says "Mean"; the label is left untouched
+        // so that anything parsing this output keeps working.
+        m_outfile << "Mean: " << median << std::endl;
+        std::cout << "Mean: " << median << std::endl;
+        PostMessage(m_hMainWnd, WM_CLOSE, 0, 0);
+    }
+}
+
+// Clears every pass render target and the depth buffer, binds the pass render targets to the
+// output merger (without a depth-stencil view) and sets the viewport and scissor rectangle.
+void RenderApp::ClearRenderTargetView() {
+    auto commandList = m_deviceResources->GetCommandList();
+    auto device = m_deviceResources->GetD3DDevice();
+
+    auto rtvDescriptor = GetRenderTargetView(device);
+    auto const dsvDescriptor = m_deviceResources->GetDepthStencilView();
+    const float clearColor[4] = {0.0f, 0.0f, 0.0f, 1.0f};
+
+    // Clear each render target and remember its handle for OMSetRenderTargets below.
+    std::vector<CD3DX12_CPU_DESCRIPTOR_HANDLE> rtvHandles(m_numPassRenderTargets);
+    for (UINT i = 0; i < m_numPassRenderTargets; i++) {
+        commandList->ClearRenderTargetView(rtvDescriptor, clearColor, 0, nullptr);
+        rtvHandles[i] = rtvDescriptor;
+        rtvDescriptor.Offset(1, m_rtvDescriptorSize);
+    }
+
+    commandList->ClearDepthStencilView(dsvDescriptor, D3D12_CLEAR_FLAG_DEPTH, 1.0f, 0, 0, nullptr);
+    // Bind the pass render targets; the handles are passed individually (FALSE) and no
+    // depth-stencil view is bound.
+    commandList->OMSetRenderTargets(m_numPassRenderTargets, rtvHandles.data(), FALSE, nullptr);
+
+    // Set the viewport and scissor rect.
+    auto const viewport = m_deviceResources->GetScreenViewport();
+    auto const scissorRect = m_deviceResources->GetScissorRect();
+    commandList->RSSetViewports(1, &viewport);
+    commandList->RSSetScissorRects(1, &scissorRect);
+}
+
+// Records one COMMON -> RENDER_TARGET transition barrier per pass render target.
+void RenderApp::PrepareRenderTarget(ID3D12GraphicsCommandList *pCommandList) {
+    // Everything except the resource pointer is the same for all targets, so fill it in once.
+    D3D12_RESOURCE_BARRIER toRenderTarget = {};
+    toRenderTarget.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
+    toRenderTarget.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
+    toRenderTarget.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
+    toRenderTarget.Transition.StateBefore = D3D12_RESOURCE_STATE_COMMON;
+    toRenderTarget.Transition.StateAfter = D3D12_RESOURCE_STATE_RENDER_TARGET;
+
+    for (UINT target = 0; target < m_numPassRenderTargets; ++target) {
+        toRenderTarget.Transition.pResource = m_renderTargets[target].Get();
+        pCommandList->ResourceBarrier(1, &toRenderTarget);
+    }
+}
+
+// Records one RENDER_TARGET -> COMMON transition barrier per pass render target, undoing
+// PrepareRenderTarget().
+void RenderApp::RestoreRenderTarget(ID3D12GraphicsCommandList *pCommandList) {
+    for (UINT i = 0; i < m_numPassRenderTargets; i++) {
+        // The barrier is kept in a named local: taking the address of the temporary returned by
+        // Transition() is a compiler extension and is rejected by conforming compilers.
+        const CD3DX12_RESOURCE_BARRIER toCommon = CD3DX12_RESOURCE_BARRIER::Transition(
+            m_renderTargets[i].Get(), D3D12_RESOURCE_STATE_RENDER_TARGET, D3D12_RESOURCE_STATE_COMMON);
+        pCommandList->ResourceBarrier(1, &toCommon);
+    }
+}
+
+// Records and submits one frame: prepare the device resources, transition and clear the pass
+// render targets, issue the timed draw calls, restore the render target state, present, and
+// signal the frame fence.
+void RenderApp::Render() {
+    auto cmdList = m_deviceResources->GetCommandList();
+
+    m_deviceResources->Prepare();
+    PrepareRenderTarget(cmdList);
+    ClearRenderTargetView();
+    SetStatesBeforeDraw(cmdList);
+
+    // Only the draw calls are bracketed by the GPU timer.
+    eventStart(cmdList);
+    Draw(cmdList);
+    eventEnd(cmdList);
+
+    RestoreRenderTarget(cmdList);
+    m_deviceResources->Present();
+
+    // GPU will signal an increasing value each frame. The queue is fetched after Present(), as
+    // before, and a failed Signal is reported instead of being ignored because Update() waits
+    // on this fence.
+    ThrowIfFailed(m_deviceResources->GetCommandQueue()->Signal(m_fence.Get(), m_frameIndex));
+}
+
+// Issues drawNum identical indexed draw calls (one instance each) for the geometry held in
+// m_geometry. Nothing is drawn when drawNum is zero or negative.
+void RenderApp::DrawRenderItems(ID3D12GraphicsCommandList *pCmdList, int drawNum) {
+    auto geometry = m_geometry.get();
+    for (int call = 0; call < drawNum; ++call) {
+        pCmdList->DrawIndexedInstanced(geometry->IndexCount, 1, geometry->StartIndexLocation,
+                                       geometry->BaseVertexLocation, 0);
+    }
+}
+
+// Generates random geometry with the vertex and index counts from the benchmark options
+// (using the plain Vertex type) and creates the geometry resource for it in m_geometry.
+void RenderApp::BuildShapeGeometry(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList) {
+    std::unique_ptr<Geometry> randomGeometry =
+        CreateRandomGeometry<Vertex>(m_opts->m_vertexNum, m_opts->m_indexNum);
+
+    m_geometry = std::make_unique<GeometryResource>();
+    m_geometry->Create(device, cmdList, randomGeometry);
+}
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderApp.h b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderApp.h
new file mode 100644
index 000000000..3912010e6
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderApp.h
@@ -0,0 +1,221 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <fstream>
+#include <iostream>
+#include <system_error>
+#include <tuple>
+#include <unordered_map>
+#include <windows.h>
+
+#include "../directx_third_party/DeviceResources.h"
+#include "../directx_utils/D3D12Timer.h"
+
+#include "BenchmarkOptions.h"
+#include "GeometryHelper.h"
+
+using Microsoft::WRL::ComPtr;
+using namespace DirectX;
+using namespace std;
+
+/*
+ * @brief: Base class of the render benchmark passes. It owns the device resources, the pass
+ *         render targets, the GPU timer and the per-frame loop; concrete passes supply the root
+ *         signature layout, shader resources, constant buffers and draw calls through the pure
+ *         virtual hooks.
+ */
+class RenderApp {
+  public:
+    RenderApp(BenchmarkOptions *args);
+    RenderApp(BenchmarkOptions *args, HINSTANCE hInstance, HWND hMainWnd, std::wstring &winTitle);
+    RenderApp(const RenderApp &rhs) = delete;
+    RenderApp &operator=(const RenderApp &rhs) = delete;
+    /*
+     * @brief: Virtual so that a derived pass is destroyed correctly through a RenderApp pointer.
+     */
+    virtual ~RenderApp();
+
+    /*
+     * @brief: Execute the update and render per frame.
+     */
+    void Tick();
+    /*
+     * @brief: Initialize the application.
+     */
+    virtual void Initialize();
+    /*
+     * @brief: Prepare the data assets needed for render.
+     */
+    virtual void LoadAssets();
+    /*
+     * @brief: Calculate the frame stats.
+     */
+    void CalculateFrameStats();
+    /*
+     * @brief: Update to run next frame.
+     */
+    void Update();
+    /*
+     * @brief: Execute the basic render loop.
+     */
+    void Render();
+
+  protected:
+    /*
+     * @brief: Define the root parameters.
+     * @param: rootParameters The root parameters to be defined.
+     * @return: The number of root parameters.
+     */
+    virtual int DefineRootParameters(std::vector<CD3DX12_ROOT_PARAMETER> &rootParameters) = 0;
+    /*
+     * @brief: Define the static samplers.
+     * @param: samplers The static samplers to be defined.
+     * @return: The number of static samplers.
+     */
+    virtual int DefineStaticSamplers(std::vector<CD3DX12_STATIC_SAMPLER_DESC> &samplers) = 0;
+    /*
+     * @brief: Build the pipeline states.
+     * @param: device The device to build the pipeline states.
+     */
+    virtual void BuildPipelineStates(ID3D12Device *device) = 0;
+    /*
+     * @brief: Create the shader resource view.
+     * @param: device The device to create the shader resource view.
+     * @param: cmdList The command list to create the shader resource view.
+     * @param: width The width of the shader resource view.
+     * @param: height The height of the shader resource view.
+     */
+    virtual void CreateShaderResourceView(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, int width,
+                                          int height) = 0;
+    /*
+     * @brief: Create the constant buffer resources.
+     * @param: device The device to create the constant buffer resources.
+     */
+    virtual void CreateConstantBufferResources(ID3D12Device *device) = 0;
+    /*
+     * @brief: Update the constant buffer data.
+     */
+    virtual void UpdateConstantBufferData() = 0;
+    /*
+     * @brief: Render and draw defined by pass.
+     * @param: cmdList The command list to draw the render items.
+     */
+    virtual void Draw(ID3D12GraphicsCommandList *cmdList) = 0;
+    /*
+     * @brief: Set the states before draw.
+     * @param: cmdList The command list to set the states before draw.
+     */
+    virtual void SetStatesBeforeDraw(ID3D12GraphicsCommandList *cmdList) = 0;
+    /*
+     * @brief: Create the device dependent resources.
+     */
+    virtual void CreateDeviceDependentResources();
+    /*
+     * @brief: Create the window size dependent resources.
+     */
+    virtual void CreateWindowSizeDependentResources();
+    /*
+     * @brief: Create the root signature.
+     * @param: device The device to create the root signature.
+     */
+    virtual void CreateRootSignatures(ID3D12Device *device);
+    /*
+     * @brief: Build the geometry.
+     * @param: device The device to create the geometry resource.
+     * @param: cmdList The command list to create the geometry resource.
+     */
+    virtual void BuildShapeGeometry(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList);
+    /*
+     * @brief: Draw the render items.
+     * @param: pCmdList The command list to draw the render item.
+     * @param: drawTimes The times to draw the render item.
+     */
+    virtual void DrawRenderItems(ID3D12GraphicsCommandList *pCmdList, int drawTimes);
+    /*
+     * @brief: Create the render target view.
+     * @param: device The device to create the render target view.
+     * @param: width The width of the render target view.
+     * @param: height The height of the render target view.
+     * @param: rtvHeap The descriptor heap to create the render target view.
+     */
+    virtual void CreateRenderTargetView(ID3D12Device *device, UINT width, UINT height, ID3D12DescriptorHeap *rtvHeap);
+    /*
+     * @brief: Create the render target resource.
+     * @param: device The device to create the render target resource.
+     * @param: width The width of the render target resource.
+     * @param: height The height of the render target resource.
+     * @param: format The format of the render target resource.
+     * @param: flags The flags of the render target resource.
+     * @param: renderTarget The render target resource to be created.
+     */
+    virtual void CreateRenderTargetResource(ID3D12Device *device, UINT width, UINT height, DXGI_FORMAT format,
+                                            D3D12_RESOURCE_FLAGS flags,
+                                            Microsoft::WRL::ComPtr<ID3D12Resource> &renderTarget);
+    /*
+     * @brief: Define the pipeline state description.
+     * @param: inputLayout The input layout of the pipeline state description.
+     * @param: vertexShader The vertex shader of the pipeline state description.
+     * @param: pixelShader The pixel shader of the pipeline state description.
+     * @return: The pipeline state description.
+     */
+    D3D12_GRAPHICS_PIPELINE_STATE_DESC DefinePSODesc(const std::vector<D3D12_INPUT_ELEMENT_DESC> &inputLayout,
+                                                     ComPtr<ID3DBlob> vertexShader, ComPtr<ID3DBlob> pixelShader);
+    /*
+     * @brief: Prepare the render target state to draw.
+     * @param: pCommandList The command list to record the state transitions.
+     */
+    void PrepareRenderTarget(ID3D12GraphicsCommandList *pCommandList);
+    /*
+     * @brief: Restore the render target state.
+     * @param: pCommandList The command list to record the state transitions.
+     */
+    void RestoreRenderTarget(ID3D12GraphicsCommandList *pCommandList);
+    /*
+     * @brief: Clear, bind the render target view and set the viewport and scissor rect.
+     */
+    void ClearRenderTargetView();
+    /*
+     * @brief: Get the first render target view of the pass.
+     * @param: device The device (not used by the base implementation).
+     * @return: The CPU descriptor handle at the start of the RTV descriptor heap.
+     */
+    CD3DX12_CPU_DESCRIPTOR_HANDLE GetRenderTargetView(ID3D12Device *device);
+
+    // Window info.
+    std::wstring m_winTitle;
+    int m_width = 1280;
+    int m_height = 720;
+    HINSTANCE m_hinstance = nullptr;
+    HWND m_hMainWnd = nullptr;
+    int m_swapChainBufferCount = 2;
+    // Device resources.
+    std::unique_ptr<DX::DeviceResources> m_deviceResources;
+    D3D_DRIVER_TYPE m_d3dDriverType = D3D_DRIVER_TYPE_HARDWARE;
+    // Root signature.
+    ComPtr<ID3D12RootSignature> m_rootSignature = nullptr;
+    // Render target view.
+    ComPtr<ID3D12DescriptorHeap> m_rtvDescriptorHeap = nullptr;
+    DXGI_FORMAT m_renderTargetFormat = DXGI_FORMAT_R16G16B16A16_FLOAT;
+    DXGI_FORMAT m_colorFormat = DXGI_FORMAT_R8G8B8A8_UNORM;
+    UINT m_numPassRenderTargets = 1;                                     // Number of render targets
+    std::vector<Microsoft::WRL::ComPtr<ID3D12Resource>> m_renderTargets; // Array of render target resources
+    UINT m_rtvDescriptorSize = 0;
+    // Shader resource view.
+    UINT m_numShaderResource = 0; // Number of ShaderResources
+    std::vector<Microsoft::WRL::ComPtr<ID3D12Resource>> m_shaderResources;
+    UINT m_cbvSrvDescriptorSize = 0;
+    ComPtr<ID3D12DescriptorHeap> m_srvDescriptorHeap = nullptr;
+    // PSO objects.
+    std::unordered_map<std::string, ComPtr<ID3D12PipelineState>> m_PSOs;
+    std::unique_ptr<GeometryResource> m_geometry;
+    // A synchronization fence and an event. These members will be used
+    // to synchronize the CPU with the GPU so that there will be no
+    // contention for the constant buffers.
+    Microsoft::WRL::ComPtr<ID3D12Fence> m_fence;
+    Microsoft::WRL::Wrappers::Event m_fenceEvent;
+
+    // Frame
+    UINT64 m_frameIndex = 0;
+    vector<double> m_frameTimeList;
+    // Benchmark options (not owned by this class).
+    BenchmarkOptions *m_opts = nullptr;
+    ofstream m_outfile;
+    // GPU timer
+    D3D12::D3D12Timer m_gpuTimer;
+    int m_maxTimerNum = 500;
+    int m_gpuTimerIdx = 0;
+
+    /*
+     * @brief: Start the GPU timer slot of the current frame.
+     * @param: pCommandList The command list to record the timer start.
+     */
+    void eventStart(ID3D12GraphicsCommandList *pCommandList) { m_gpuTimer.start(pCommandList, m_gpuTimerIdx); }
+
+    /*
+     * @brief: Stop the GPU timer slot of the current frame and resolve its query for CPU readback.
+     * @param: pCommandList The command list to record the timer stop and the query resolve.
+     */
+    void eventEnd(ID3D12GraphicsCommandList *pCommandList) {
+        m_gpuTimer.stop(pCommandList, m_gpuTimerIdx);
+        m_gpuTimer.resolveQueryToCPU(pCommandList, m_gpuTimerIdx);
+    }
+};
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderGeometryPass.cpp b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderGeometryPass.cpp
new file mode 100644
index 000000000..3b3537e5d
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderGeometryPass.cpp
@@ -0,0 +1,139 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include "RenderGeometryPass.h"
+
+// Defines the five root parameters of the geometry pass: three root constant buffer views
+// (registers b0-b2) followed by two pixel-shader-visible SRV descriptor tables.
+// Returns the number of root parameters.
+int RenderGeometryPass::DefineRootParameters(std::vector<CD3DX12_ROOT_PARAMETER> &rootParameters) {
+    const int rootParamCount = 5;
+    rootParameters.resize(rootParamCount);
+
+    // A single SRV at register t0.
+    auto cubeRange = std::make_unique<CD3DX12_DESCRIPTOR_RANGE>();
+    cubeRange->Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0, 0);
+    // The remaining SRVs, starting at register t1.
+    auto textureArrayRange = std::make_unique<CD3DX12_DESCRIPTOR_RANGE>();
+    textureArrayRange->Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, m_numShaderResource - 1, 1, 0);
+
+    rootParameters[0].InitAsConstantBufferView(0); // obj cb
+    rootParameters[1].InitAsConstantBufferView(1); // pass cb
+    rootParameters[2].InitAsConstantBufferView(2); // material cb
+    // Ownership of both ranges is released here and they are never freed in this function: the
+    // root parameters keep raw pointers to them after it returns.
+    rootParameters[3].InitAsDescriptorTable(1, cubeRange.release(), D3D12_SHADER_VISIBILITY_PIXEL); // cube texture
+    rootParameters[4].InitAsDescriptorTable(1, textureArrayRange.release(),
+                                            D3D12_SHADER_VISIBILITY_PIXEL); // texture array
+
+    return rootParamCount;
+}
+
+// Defines the single static sampler of the geometry pass: anisotropic filtering with wrap
+// addressing on all three axes, bound to shader register 0. Returns the number of samplers.
+int RenderGeometryPass::DefineStaticSamplers(std::vector<CD3DX12_STATIC_SAMPLER_DESC> &samplers) {
+    const int samplerCount = 1;
+    samplers.resize(samplerCount);
+
+    samplers[0] = CD3DX12_STATIC_SAMPLER_DESC(0,                               // shaderRegister
+                                              D3D12_FILTER_ANISOTROPIC,        // filter
+                                              D3D12_TEXTURE_ADDRESS_MODE_WRAP, // addressU
+                                              D3D12_TEXTURE_ADDRESS_MODE_WRAP, // addressV
+                                              D3D12_TEXTURE_ADDRESS_MODE_WRAP, // addressW
+                                              0.0f,                            // mipLODBias
+                                              8);                              // maxAnisotropy
+
+    return samplerCount;
+}
+
+// Creates the shader-visible SRV heap of the geometry pass and fills it: slot 0 holds a cube
+// texture of the window size (m_width x m_height), slots 1..m_numShaderResource-1 hold 2D
+// textures of the requested width and height.
+void RenderGeometryPass::CreateShaderResourceView(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, int width,
+                                                  int height) {
+    // Create a descriptor heap that will store the SRV:
+    D3D12_DESCRIPTOR_HEAP_DESC srvHeapDesc = {};
+    srvHeapDesc.NumDescriptors = m_numShaderResource;
+    srvHeapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
+    srvHeapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
+    ThrowIfFailed(device->CreateDescriptorHeap(&srvHeapDesc, IID_PPV_ARGS(&m_srvDescriptorHeap)));
+
+    CD3DX12_CPU_DESCRIPTOR_HANDLE cpuHandle(m_srvDescriptorHeap->GetCPUDescriptorHandleForHeapStart());
+    m_cbvSrvDescriptorSize = device->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
+
+    // Whole screen texture.
+    TextureCube(device, cmdList, m_shaderResources[0], m_width, m_height, m_colorFormat);
+    D3D12_SHADER_RESOURCE_VIEW_DESC cubeSrvDesc = {};
+    cubeSrvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
+    cubeSrvDesc.Format = m_shaderResources[0]->GetDesc().Format;
+    cubeSrvDesc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURECUBE;
+    // A TEXTURECUBE view is described by the TextureCube union member, not Texture2D.
+    cubeSrvDesc.TextureCube.MipLevels = m_shaderResources[0]->GetDesc().MipLevels;
+    device->CreateShaderResourceView(m_shaderResources[0].Get(), &cubeSrvDesc, cpuHandle);
+    cpuHandle.Offset(m_cbvSrvDescriptorSize);
+
+    // Small texture.
+    for (UINT i = 1; i < m_numShaderResource; i++) {
+        Texture2D(device, cmdList, m_shaderResources[i], width, height, m_colorFormat);
+        D3D12_SHADER_RESOURCE_VIEW_DESC textureSrvDesc = {};
+        textureSrvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
+        textureSrvDesc.Format = m_shaderResources[i]->GetDesc().Format;
+        textureSrvDesc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D;
+        textureSrvDesc.Texture2D.MipLevels = m_shaderResources[i]->GetDesc().MipLevels;
+        device->CreateShaderResourceView(m_shaderResources[i].Get(), &textureSrvDesc, cpuHandle);
+        cpuHandle.Offset(m_cbvSrvDescriptorSize);
+    }
+}
+
+// Allocates the three upload buffers of the geometry pass (view, object and material
+// constants), each created with the arguments (device, 1, true).
+void RenderGeometryPass::CreateConstantBufferResources(ID3D12Device *device) {
+    constexpr int elementCount = 1;
+    constexpr bool constantBufferFlag = true;
+
+    m_viewCB = std::make_unique<UploadBuffer<BaseViewConstantBuffer>>(device, elementCount, constantBufferFlag);
+    m_objectCB = std::make_unique<UploadBuffer<ObjectConstantBuffer>>(device, elementCount, constantBufferFlag);
+    m_materialCB = std::make_unique<UploadBuffer<MaterialConstantBuffer>>(device, elementCount, constantBufferFlag);
+}
+
+// Copies default-constructed constant data into element 0 of the view, object and material
+// upload buffers.
+void RenderGeometryPass::UpdateConstantBufferData() {
+    BaseViewConstantBuffer defaultView;
+    ObjectConstantBuffer defaultObject;
+    MaterialConstantBuffer defaultMaterial;
+
+    const int firstElement = 0;
+    m_viewCB->CopyData(firstElement, defaultView);
+    m_objectCB->CopyData(firstElement, defaultObject);
+    m_materialCB->CopyData(firstElement, defaultMaterial);
+}
+
+// Same as the base implementation, except that the random geometry is generated with the
+// GeometryVertex vertex type.
+void RenderGeometryPass::BuildShapeGeometry(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList) {
+    std::unique_ptr<Geometry> randomGeometry =
+        CreateRandomGeometry<GeometryVertex>(m_opts->m_vertexNum, m_opts->m_indexNum);
+
+    m_geometry = std::make_unique<GeometryResource>();
+    m_geometry->Create(device, cmdList, randomGeometry);
+}
+
+// Compiles the vertex and pixel shaders from Shaders/Base.hlsl and creates the pipeline state
+// stored under the key "deferredBase".
+void RenderGeometryPass::BuildPipelineStates(ID3D12Device *device) {
+    // TEXTURECOUNT is handed to the shader compiler as a preprocessor define. The string must
+    // stay alive while the defines array is in use, so it is kept in a named local.
+    const std::string textureCountText = std::to_string(m_numShaderResource - 1);
+    D3D_SHADER_MACRO shaderDefines[] = {
+        {"TEXTURECOUNT", textureCountText.c_str()},
+        {nullptr, nullptr}}; // The last entry must be nullptr to indicate the end of the array
+    ComPtr<ID3DBlob> vsBlob = CompileShader(L"Shaders/Base.hlsl", shaderDefines, "VS", "vs_5_1");
+    ComPtr<ID3DBlob> psBlob = CompileShader(L"Shaders/Base.hlsl", shaderDefines, "PS", "ps_5_1");
+
+    // Per-vertex input elements, all read from slot 0 at byte offsets 0, 12, 24 and 32.
+    std::vector<D3D12_INPUT_ELEMENT_DESC> vertexLayout = {
+        {"POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0, D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0},
+        {"NORMAL", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 12, D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0},
+        {"TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 24, D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0},
+        {"TANGENT", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 32, D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0},
+    };
+
+    auto pipelineDesc = DefinePSODesc(vertexLayout, vsBlob, psBlob);
+    ThrowIfFailed(device->CreateGraphicsPipelineState(&pipelineDesc, IID_PPV_ARGS(&m_PSOs["deferredBase"])));
+}
+
+// Draws the geometry m_num_object times, as configured in the benchmark options.
+void RenderGeometryPass::Draw(ID3D12GraphicsCommandList *cmdList) {
+    const auto drawCount = m_opts->m_num_object;
+    DrawRenderItems(cmdList, drawCount);
+}
+
+// Binds everything the geometry pass needs before drawing: pipeline state, root signature, SRV
+// heap, the three root constant buffers, the two SRV descriptor tables, and the vertex/index
+// buffers with their primitive topology.
+void RenderGeometryPass::SetStatesBeforeDraw(ID3D12GraphicsCommandList *cmdList) {
+    cmdList->SetPipelineState(m_PSOs["deferredBase"].Get());
+    cmdList->SetGraphicsRootSignature(m_rootSignature.Get());
+    ID3D12DescriptorHeap *heaps[] = {m_srvDescriptorHeap.Get()};
+    cmdList->SetDescriptorHeaps(_countof(heaps), heaps);
+
+    cmdList->SetGraphicsRootConstantBufferView(0, m_objectCB.get()->Resource()->GetGPUVirtualAddress());
+    cmdList->SetGraphicsRootConstantBufferView(1, m_viewCB.get()->Resource()->GetGPUVirtualAddress());
+    cmdList->SetGraphicsRootConstantBufferView(2, m_materialCB.get()->Resource()->GetGPUVirtualAddress());
+
+    // Root slot 3: the cube texture SRV in heap slot 0.
+    CD3DX12_GPU_DESCRIPTOR_HANDLE srvHandle(m_srvDescriptorHeap->GetGPUDescriptorHandleForHeapStart());
+    cmdList->SetGraphicsRootDescriptorTable(3, srvHandle);
+    // Root slot 4: the texture array, which starts one descriptor after the cube texture. The
+    // offset handle must be bound here; binding the heap start again would make the texture
+    // array alias the cube texture descriptor.
+    srvHandle.Offset(1, m_cbvSrvDescriptorSize);
+    cmdList->SetGraphicsRootDescriptorTable(4, srvHandle);
+
+    auto ri = m_geometry.get();
+    // Set vertex and index buffers. The views are stored in named locals because taking the
+    // address of a returned temporary is a non-standard compiler extension.
+    const auto vertexBufferView = ri->VertexBufferView();
+    const auto indexBufferView = ri->IndexBufferView();
+    cmdList->IASetVertexBuffers(0, 1, &vertexBufferView);
+    cmdList->IASetIndexBuffer(&indexBufferView);
+    cmdList->IASetPrimitiveTopology(ri->PrimitiveType);
+}
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderGeometryPass.h b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderGeometryPass.h
new file mode 100644
index 000000000..db1056844
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderGeometryPass.h
@@ -0,0 +1,99 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include "RenderApp.h"
+
+/*
+ * @brief: Vertex that adds a normal, a texture coordinate and a tangent to the position
+ *         inherited from Vertex.
+ */
+class GeometryVertex : Vertex {
+  public:
+    /*
+     * @brief: Default-construct the base vertex and fill normal, tangent and texture coordinate
+     *         with random values.
+     */
+    GeometryVertex() : Vertex() {
+        // The generator is called in this fixed order: normal, tangent, texture coordinate.
+        const float normalX = MathHelper::genRand2N_f(2) - 1;
+        const float normalY = MathHelper::genRand2N_f(2) - 1;
+        const float normalZ = MathHelper::genRand2N_f(2) - 1;
+
+        const float tangentX = MathHelper::genRand2N_f(2) - 1;
+        const float tangentY = MathHelper::genRand2N_f(2) - 1;
+        const float tangentZ = MathHelper::genRand2N_f(2) - 1;
+
+        const float texU = MathHelper::genRand2N_f(1);
+        const float texV = MathHelper::genRand2N_f(1);
+
+        Normal = {normalX, normalY, normalZ};
+        TangentU = {tangentX, tangentY, tangentZ};
+        TexC = {texU, texV};
+    }
+    /*
+     * @brief: Construct from position, normal, texture coordinate and tangent vectors.
+     */
+    GeometryVertex(const DirectX::XMFLOAT3 &p, const DirectX::XMFLOAT3 &n, const DirectX::XMFLOAT2 &uv,
+                   const DirectX::XMFLOAT3 &t)
+        : Vertex(p.x, p.y, p.z), Normal(n), TexC(uv), TangentU(t) {}
+    /*
+     * @brief: Construct from individual position, normal, tangent and texture coordinate components.
+     */
+    GeometryVertex(float px, float py, float pz, float nx, float ny, float nz, float tx, float ty, float tz, float u,
+                   float v)
+        : Vertex(px, py, pz), Normal(nx, ny, nz), TexC(u, v), TangentU(tx, ty, tz) {}
+    /*
+     * @brief: Copy all attributes; the base part is default-constructed first and its position
+     *         is then overwritten with the position of rhs.
+     */
+    GeometryVertex(const GeometryVertex &rhs) : Normal(rhs.Normal), TexC(rhs.TexC), TangentU(rhs.TangentU) {
+        x = rhs.x;
+        y = rhs.y;
+        z = rhs.z;
+    }
+
+    // Declared in the order NORMAL, TEXCOORD, TANGENT; presumably this has to match the input
+    // layout in RenderGeometryPass::BuildPipelineStates - confirm before reordering.
+    DirectX::XMFLOAT3 Normal;
+    DirectX::XMFLOAT2 TexC;
+    DirectX::XMFLOAT3 TangentU;
+};
+
+/*
+ * @brief: Per-object constant data. Every member has a default so that a default-constructed
+ *         instance can be copied into an upload buffer without reading uninitialized memory.
+ */
+struct ObjectConstantBuffer {
+    DirectX::XMFLOAT4X4 World = MathHelper::Identity4x4();
+    DirectX::XMFLOAT4X4 TexTransform = MathHelper::Identity4x4();
+    UINT MaterialIndex = 0;
+};
+
+/*
+ * @brief: Per-view constant data; both matrices default to identity and the eye position to
+ *         the origin.
+ */
+struct BaseViewConstantBuffer {
+    DirectX::XMFLOAT4X4 View{MathHelper::Identity4x4()};
+    DirectX::XMFLOAT4X4 ViewProj{MathHelper::Identity4x4()};
+    DirectX::XMFLOAT3 EyePosW{0.0f, 0.0f, 0.0f};
+};
+
+/*
+ * @brief: Material constant data with fixed default values.
+ */
+struct MaterialConstantBuffer {
+    DirectX::XMFLOAT4 DiffuseAlbedo{1.0f, 1.0f, 1.0f, 1.0f};
+    DirectX::XMFLOAT3 FresnelR0{0.01f, 0.01f, 0.01f};
+    float Roughness{0.5f};
+
+    // Used in texture mapping.
+    DirectX::XMFLOAT4X4 MatTransform{MathHelper::Identity4x4()};
+
+    UINT DiffuseMapIndex{0};
+    UINT NormalMapIndex{1};
+};
+
+/*
+ * @brief: Render pass that draws the benchmark geometry into three render targets and reads
+ *         m_textureNum + 1 shader resources.
+ */
+class RenderGeometryPass : public RenderApp {
+  public:
+    /*
+     * @brief: Construct the pass from the benchmark options.
+     */
+    RenderGeometryPass(BenchmarkOptions *args) : RenderApp(args) {
+        m_numPassRenderTargets = 3;
+        // screen + texture size
+        m_numShaderResource = args->m_textureNum + 1;
+    }
+    /*
+     * @brief: Construct the pass from the benchmark options and an existing window.
+     */
+    RenderGeometryPass(BenchmarkOptions *args, HINSTANCE hInstance, HWND hMainWnd, std::wstring &winTitle)
+        : RenderApp(args, hInstance, hMainWnd, winTitle) {
+        m_numPassRenderTargets = 3;
+        // screen + texture size
+        m_numShaderResource = args->m_textureNum + 1;
+    }
+    RenderGeometryPass(const RenderGeometryPass &rhs) = delete;
+    RenderGeometryPass &operator=(const RenderGeometryPass &rhs) = delete;
+    ~RenderGeometryPass() = default;
+
+  protected:
+    // Overrides of the RenderApp hooks.
+    int DefineRootParameters(std::vector<CD3DX12_ROOT_PARAMETER> &rootParameters) override;
+    int DefineStaticSamplers(std::vector<CD3DX12_STATIC_SAMPLER_DESC> &samplers) override;
+    void CreateShaderResourceView(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, int width,
+                                  int height) override;
+    void CreateConstantBufferResources(ID3D12Device *device) override;
+    void UpdateConstantBufferData() override;
+    void BuildPipelineStates(ID3D12Device *device) override;
+    void Draw(ID3D12GraphicsCommandList *cmdList) override;
+    void SetStatesBeforeDraw(ID3D12GraphicsCommandList *cmdList) override;
+    void BuildShapeGeometry(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList) override;
+
+    // Upload buffers holding the object, view and material constants.
+    std::unique_ptr<UploadBuffer<ObjectConstantBuffer>> m_objectCB = nullptr;
+    std::unique_ptr<UploadBuffer<BaseViewConstantBuffer>> m_viewCB = nullptr;
+    std::unique_ptr<UploadBuffer<MaterialConstantBuffer>> m_materialCB = nullptr;
+};
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderLightingPass.cpp b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderLightingPass.cpp
new file mode 100644
index 000000000..418657149
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderLightingPass.cpp
@@ -0,0 +1,217 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include "RenderLightingPass.h"
+
+void RenderLightingPass::CreateConstantBufferResources(ID3D12Device *device) { // one upload buffer per root CBV
+    m_stencilingCB = std::make_unique<UploadBuffer<StencilingConstantBuffer>>(device, 1, true); // root slot 0 (b0)
+    m_viewCB = std::make_unique<UploadBuffer<ViewConstantBuffer>>(device, 1, true); // root slot 1 (b1)
+    m_lightingCB = std::make_unique<UploadBuffer<DeferredLightUniformsConstantBuffer>>(device, 1, true); // slot 2 (b2)
+    m_shadowProjectionCB = std::make_unique<UploadBuffer<ShadowProjectionConstantBuffer>>(device, 1, true); // slot 3 (b3)
+}
+
+void RenderLightingPass::UpdateConstantBufferData() {
+    // Each buffer receives a default-constructed struct, i.e. the fixed values from RenderLightingPass.h.
+    StencilingConstantBuffer stencilingData;
+    m_stencilingCB->CopyData(0, stencilingData);
+    ViewConstantBuffer viewData;
+    m_viewCB->CopyData(0, viewData);
+    DeferredLightUniformsConstantBuffer lightingData;
+    m_lightingCB->CopyData(0, lightingData);
+    ShadowProjectionConstantBuffer shadowProjectionData;
+    m_shadowProjectionCB->CopyData(0, shadowProjectionData);
+}
+
+void RenderLightingPass::CreateShaderResourceView(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, int width,
+                                                  int height) {
+    // Shader-visible CBV/SRV/UAV heap with one slot per shader resource of this pass.
+    D3D12_DESCRIPTOR_HEAP_DESC heapDesc = {};
+    heapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
+    heapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
+    heapDesc.NumDescriptors = m_numShaderResource;
+    ThrowIfFailed(device->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&m_srvDescriptorHeap)));
+
+    m_cbvSrvDescriptorSize = device->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
+    CD3DX12_CPU_DESCRIPTOR_HANDLE nextSlot(m_srvDescriptorHeap->GetCPUDescriptorHandleForHeapStart());
+    // Texture2D() is expected to populate m_shaderResources[i]; its SRV is then written to consecutive heap slots.
+    for (int i = 0; i < m_numShaderResource; ++i) {
+        Texture2D(device, cmdList, m_shaderResources[i], width, height, m_colorFormat);
+        const D3D12_RESOURCE_DESC textureDesc = m_shaderResources[i]->GetDesc();
+        D3D12_SHADER_RESOURCE_VIEW_DESC viewDesc = {};
+        viewDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
+        viewDesc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D;
+        viewDesc.Format = textureDesc.Format;
+        viewDesc.Texture2D.MipLevels = textureDesc.MipLevels;
+        device->CreateShaderResourceView(m_shaderResources[i].Get(), &viewDesc, nextSlot);
+        nextSlot.Offset(m_cbvSrvDescriptorSize);
+    }
+}
+
+int RenderLightingPass::DefineRootParameters(std::vector<CD3DX12_ROOT_PARAMETER> &rootParameters) {
+    // Root signature layout: four root CBVs (b0-b3) followed by one SRV descriptor table starting at t0.
+    const int numRootParameters = 5;
+    rootParameters.resize(numRootParameters);
+
+    // Shader stages allowed to read each constant buffer; the array index is both root slot and register.
+    const D3D12_SHADER_VISIBILITY cbvVisibility[] = {
+        D3D12_SHADER_VISIBILITY_VERTEX, // b0: stenciling
+        D3D12_SHADER_VISIBILITY_ALL,    // b1: view
+        D3D12_SHADER_VISIBILITY_PIXEL,  // b2: deferred light uniforms
+        D3D12_SHADER_VISIBILITY_PIXEL,  // b3: shadow projection
+    };
+    int slot = 0;
+    for (const D3D12_SHADER_VISIBILITY visibility : cbvVisibility) {
+        rootParameters[slot].InitAsConstantBufferView(slot, 0, visibility);
+        slot++;
+    }
+
+    // SRV table covering m_numShaderResource descriptors, pixel-shader visible.
+    // NOTE: the range array is released on purpose and never freed here; the root parameter keeps only a raw
+    // pointer to it, and that pointer is presumably still read by the caller after this function returns.
+    CD3DX12_DESCRIPTOR_RANGE *srvRange = std::make_unique<CD3DX12_DESCRIPTOR_RANGE[]>(1).release();
+    srvRange[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, m_numShaderResource, 0, 0);
+    rootParameters[slot].InitAsDescriptorTable(1, srvRange, D3D12_SHADER_VISIBILITY_PIXEL);
+
+    return numRootParameters;
+}
+
+void RenderLightingPass::BuildPipelineStates(ID3D12Device *device) {
+    // Define shader input layout.
+    std::vector<D3D12_INPUT_ELEMENT_DESC> inputLayout = {D3D12_INPUT_ELEMENT_DESC{
+        "POSITION",                                 // SemanticName
+        0,                                          // SemanticIndex
+        DXGI_FORMAT_R32G32B32_FLOAT,                // Format
+        0,                                          // InputSlot
+        D3D12_APPEND_ALIGNED_ELEMENT,               // AlignedByteOffset
+        D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, // InputSlotClass
+        0                                           // InstanceDataStepRate
+    }};
+    // Both PSOs use the same vertex shader, so it is compiled once; only the pixel entry point differs.
+    ComPtr<ID3DBlob> vertexShader =
+        CompileShader(L"Shaders/DefferredLightingVertex.hlsl", nullptr, "RadialVertexMain", "vs_5_1");
+    ComPtr<ID3DBlob> pixelShader =
+        CompileShader(L"Shaders/DefferredLightingPixel.hlsl", nullptr, "DeferredLightPixelMain", "ps_5_1");
+    auto psoDesc = DefinePSODesc(inputLayout, vertexShader, pixelShader);
+    ThrowIfFailed(device->CreateGraphicsPipelineState(&psoDesc, IID_PPV_ARGS(&m_PSOs["deferredLighting"])));
+
+    // Shadow-projection variant: reuse vertexShader and swap in the point-light shadow pixel shader.
+    pixelShader =
+        CompileShader(L"Shaders/DefferredLightingPixel.hlsl", nullptr, "MainOnePassPointLightShadowPS", "ps_5_1");
+    psoDesc = DefinePSODesc(inputLayout, vertexShader, pixelShader);
+    ThrowIfFailed(device->CreateGraphicsPipelineState(&psoDesc, IID_PPV_ARGS(&m_PSOs["ShadowProjection"])));
+}
+
+void RenderLightingPass::SetStatesBeforeDraw(ID3D12GraphicsCommandList *cmdList) {
+    ID3D12DescriptorHeap *ppHeaps[] = {m_srvDescriptorHeap.Get()};
+    cmdList->SetDescriptorHeaps(ARRAYSIZE(ppHeaps), ppHeaps);
+    cmdList->SetGraphicsRootSignature(m_rootSignature.Get());
+    cmdList->SetGraphicsRootConstantBufferView(0, m_stencilingCB->Resource()->GetGPUVirtualAddress());       // b0
+    cmdList->SetGraphicsRootConstantBufferView(1, m_viewCB->Resource()->GetGPUVirtualAddress());             // b1
+    cmdList->SetGraphicsRootConstantBufferView(2, m_lightingCB->Resource()->GetGPUVirtualAddress());         // b2
+    cmdList->SetGraphicsRootConstantBufferView(3, m_shadowProjectionCB->Resource()->GetGPUVirtualAddress()); // b3
+    cmdList->SetGraphicsRootDescriptorTable(4, m_srvDescriptorHeap->GetGPUDescriptorHandleForHeapStart());
+    // Bind geometry through named locals; `&obj->View()` on a by-value return is only accepted as an MSVC extension.
+    const auto vertexBufferView = m_geometry->VertexBufferView();
+    const auto indexBufferView = m_geometry->IndexBufferView();
+    cmdList->IASetVertexBuffers(0, 1, &vertexBufferView);
+    cmdList->IASetIndexBuffer(&indexBufferView);
+    cmdList->IASetPrimitiveTopology(m_geometry->PrimitiveType);
+}
+
+void RenderLightingPass::Draw(ID3D12GraphicsCommandList *cmdList) {
+    DrawShadowProjection(cmdList); // first sub-pass: "ShadowProjection" PSO
+    DrawLighting(cmdList);         // second sub-pass: "deferredLighting" PSO
+}
+
+void RenderLightingPass::DrawShadowProjection(ID3D12GraphicsCommandList *cmdList) {
+    cmdList->SetPipelineState(m_PSOs["ShadowProjection"].Get()); // PSO created in BuildPipelineStates
+    DrawRenderItems(cmdList, m_opts->m_num_light); // presumably one draw per configured light - confirm
+}
+
+void RenderLightingPass::DrawLighting(ID3D12GraphicsCommandList *cmdList) {
+    // Select the deferred-lighting PSO created in BuildPipelineStates, then draw with the light count.
+    cmdList->SetPipelineState(m_PSOs["deferredLighting"].Get());
+    DrawRenderItems(cmdList, m_opts->m_num_light);
+}
+
+/*
+ * @brief: Fill samplerData with the ten static samplers of this pass (shader registers 0 through 9).
+ * @return: The number of samplers written to samplerData (10).
+ */
+int RenderLightingPass::DefineStaticSamplers(std::vector<CD3DX12_STATIC_SAMPLER_DESC> &samplerData)
+
+{
+    int samplersCount = 10;
+    samplerData.resize(samplersCount);
+
+    int samplerIndex = 0;
+    CD3DX12_STATIC_SAMPLER_DESC SceneTexturesStruct_SceneDepthTextureSampler(
+        samplerIndex,                      // shaderRegister
+        D3D12_FILTER_MIN_MAG_MIP_POINT,    // filter
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP,  // addressU
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP,  // addressV
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP); // addressW
+
+    samplerData[samplerIndex++] = SceneTexturesStruct_SceneDepthTextureSampler;
+    const CD3DX12_STATIC_SAMPLER_DESC SceneTexturesStruct_GBufferATextureSampler(
+        samplerIndex,                      // shaderRegister
+        D3D12_FILTER_MIN_MAG_MIP_POINT,    // filter
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP,  // addressU
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP,  // addressV
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP); // addressW
+    samplerData[samplerIndex++] = SceneTexturesStruct_GBufferATextureSampler;
+    const CD3DX12_STATIC_SAMPLER_DESC SceneTexturesStruct_GBufferBTextureSampler(
+        samplerIndex,                      // shaderRegister
+        D3D12_FILTER_MIN_MAG_MIP_POINT,    // filter
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP,  // addressU
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP,  // addressV
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP); // addressW
+    samplerData[samplerIndex++] = SceneTexturesStruct_GBufferBTextureSampler;
+    const CD3DX12_STATIC_SAMPLER_DESC SceneTexturesStruct_GBufferCTextureSampler(
+        samplerIndex,                      // shaderRegister
+        D3D12_FILTER_MIN_MAG_MIP_POINT,    // filter
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP,  // addressU
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP,  // addressV
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP); // addressW
+    samplerData[samplerIndex++] = SceneTexturesStruct_GBufferCTextureSampler;
+    const CD3DX12_STATIC_SAMPLER_DESC SceneTexturesStruct_GBufferDTextureSampler(
+        samplerIndex,                      // shaderRegister
+        D3D12_FILTER_MIN_MAG_MIP_POINT,    // filter
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP,  // addressU
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP,  // addressV
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP); // addressW
+    samplerData[samplerIndex++] = SceneTexturesStruct_GBufferDTextureSampler;
+    const CD3DX12_STATIC_SAMPLER_DESC SceneTexturesStruct_GBufferETextureSampler(
+        samplerIndex,                      // shaderRegister
+        D3D12_FILTER_MIN_MAG_MIP_POINT,    // filter
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP,  // addressU
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP,  // addressV
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP); // addressW
+    samplerData[samplerIndex++] = SceneTexturesStruct_GBufferETextureSampler;
+    const CD3DX12_STATIC_SAMPLER_DESC SceneTexturesStruct_ScreenSpaceAOTextureSampler(
+        samplerIndex,                      // shaderRegister
+        D3D12_FILTER_MIN_MAG_MIP_POINT,    // filter
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP,  // addressU
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP,  // addressV
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP); // addressW
+    samplerData[samplerIndex++] = SceneTexturesStruct_ScreenSpaceAOTextureSampler;
+    const CD3DX12_STATIC_SAMPLER_DESC LightAttenuationTextureSampler(
+        samplerIndex, D3D12_FILTER_MIN_MAG_MIP_POINT, D3D12_TEXTURE_ADDRESS_MODE_WRAP, D3D12_TEXTURE_ADDRESS_MODE_WRAP,
+        D3D12_TEXTURE_ADDRESS_MODE_WRAP);
+    samplerData[samplerIndex++] = LightAttenuationTextureSampler;
+    const CD3DX12_STATIC_SAMPLER_DESC SceneTexturesStruct_CustomDepthTextureSampler(
+        8,                                 // shaderRegister (hard-coded; samplerIndex is also 8 here)
+        D3D12_FILTER_MIN_MAG_MIP_POINT,    // filter
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP,  // addressU
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP,  // addressV
+        D3D12_TEXTURE_ADDRESS_MODE_CLAMP); // addressW
+    samplerData[samplerIndex++] = SceneTexturesStruct_CustomDepthTextureSampler;
+    const CD3DX12_STATIC_SAMPLER_DESC ShadowDepthCubeTextureSampler(9, // shaderRegister (samplerIndex is also 9 here)
+                                                                    D3D12_FILTER_MIN_MAG_LINEAR_MIP_POINT, // filter
+                                                                    D3D12_TEXTURE_ADDRESS_MODE_CLAMP,      // addressU
+                                                                    D3D12_TEXTURE_ADDRESS_MODE_CLAMP,      // addressV
+                                                                    D3D12_TEXTURE_ADDRESS_MODE_CLAMP);     // addressW
+    samplerData[samplerIndex++] = ShadowDepthCubeTextureSampler;
+
+    return samplersCount;
+}
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderLightingPass.h b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderLightingPass.h
new file mode 100644
index 000000000..8a2388bf1
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderLightingPass.h
@@ -0,0 +1,119 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "RenderApp.h"
+
+struct DeferredLightUniformsConstantBuffer { // CPU-side data for HLSL cbuffer DeferredLightUniformsConstantBuffer (b2)
+    XMFLOAT4 DeferredLightUniforms_ShadowMapChannelMask = {0.00, 0.00, 0.00, 0.00};
+    XMFLOAT2 DeferredLightUniforms_DistanceFadeMAD = {0.00, 0.00};
+    float DeferredLightUniforms_ContactShadowLength = 0.00;
+    float DeferredLightUniforms_VolumetricScatteringIntensity = 1.00;
+    UINT DeferredLightUniforms_ShadowedBits = 3;
+    UINT DeferredLightUniforms_LightingChannelMask = 1;
+    float PrePadding_DeferredLightUniforms_40 = 0.00; // byte offset 40: explicit padding, no HLSL counterpart
+    float PrePadding_DeferredLightUniforms_44 = 0.00; // byte offset 44: puts Position at offset 48 (16-byte boundary)
+    XMFLOAT3 DeferredLightUniforms_Position = {722.74805, 2515.36084, 94.87169};
+    float DeferredLightUniforms_InvRadius = 0.00195;
+    XMFLOAT3 DeferredLightUniforms_Color = {8.64818, 6.97867, 4.4531};
+    float DeferredLightUniforms_FalloffExponent = 8.00;
+    XMFLOAT3 DeferredLightUniforms_Direction = {1.00, 0.00, 0.00};
+    float DeferredLightUniforms_SpecularScale = 1.00;
+    XMFLOAT3 DeferredLightUniforms_Tangent = {0.00, 0.00, 1.00};
+    float DeferredLightUniforms_SourceRadius = 0.00;
+    XMFLOAT2 DeferredLightUniforms_SpotAngles = {2.00, 1.00};
+    float DeferredLightUniforms_SoftSourceRadius = 0.00;
+    float DeferredLightUniforms_SourceLength = 0.00;
+    float DeferredLightUniforms_RectLightBarnCosAngle = 2652.84375;
+    float DeferredLightUniforms_RectLightBarnLength = 5.89947E-43;
+};
+
+struct ViewConstantBuffer { // CPU-side data for HLSL cbuffer ViewConstantBuffer (b1); member order must match it
+    XMFLOAT4 View_InvDeviceZToWorldZTransform = {0.00, 0.00, 0.10, -1.00000E-08};
+    XMFLOAT4 View_TemporalAAParams = {0.00, 1.00, 0.00, 0.00};
+    XMFLOAT4 View_BufferSizeAndInvSize = {1384.00, 676.00, 0.00072, 0.00148};
+    XMFLOAT4 View_DiffuseOverrideParameter = {0.00, 0.00, 0.00, 1.00};
+    XMFLOAT4 View_SpecularOverrideParameter = {0.00, 0.00, 0.00, 1.00};
+    XMFLOAT4X4 View_ClipToView = {0.00, 0.48821, 0.00, 0.00, 0.00, 0.00, 0.00, 0.10,
+                                  0.00, 0.00,    1.00, 0.00, 0,    0,    0,    0};
+    XMFLOAT4X4 View_ViewToClip = {
+        1.00, 0.00, 0.00, 0.00, 0.00, 2.04831, 0.00, 0.00, 0.00, 0.00, 0.00, 1.00, 0.00, 0.00, 10.00, 0.00,
+    };
+    XMFLOAT4X4 View_ScreenToWorld = {-0.04472, 0.00587,    0.48612,   0.00, -0.98725, 0.12963, -0.09239, 0.00,
+                                     -7.70195, 2584.20215, 184.65012, 1.00, 0,        0,       0,        0};
+    XMFLOAT3 View_WorldCameraOrigin = {-7.70195, 2584.20215, 184.65012};
+    float padding0 = 0; // corresponds to Padding0 in the HLSL cbuffer
+
+    XMFLOAT3 View_PreViewTranslation = {7.70195, -2584.20215, -184.65012};
+    float padding1 = 0; // corresponds to Padding1 in the HLSL cbuffer
+    XMFLOAT4 View_ScreenPositionScaleBias = {0.49928, -0.49926, 0.49926, 0.49928};
+
+    XMFLOAT4X4 View_TranslatedWorldToClip = {
+        1.00, 0.00, 0.00, 0.00, 0.00, 2.04831, 0.00, 0.00, 0.00, 0.00, 0.00, 1.00, 0.00, 0.00, 10.00, 0.00,
+    };
+    UINT View_StateFrameIndexMod8View_StateFrameIndexMod8 = 1; // HLSL name is View_StateFrameIndexMod8 (doubled here)
+    XMFLOAT3 Padding = {0, 0, 0}; // Add padding to maintain 16-byte alignment
+};
+
+struct StencilingConstantBuffer { // uploaded via m_stencilingCB; bound at root slot 0 (b0, vertex-shader visible)
+    XMFLOAT4 StencilingGeometryPosAndScale = {715.04608, -68.84131, -89.77843, 530.0614};
+    XMFLOAT4 StencilingConeParameters = {0.00, 0.00, 0.00, 0.00};
+    XMFLOAT4X4 StencilingConeTransform = {0.00, 0.00, -0.005,   0.00,  -1.00, 1.00,  0.50, 1.00,
+                                          1.00, 0.00, -1.00143, -1.00, 0.00,  -1.00, 0.00, 0.00};
+    // NOTE(review): the struct ends on an XMFLOAT3 with no trailing pad - confirm the HLSL side expects that.
+    XMFLOAT3 StencilingPreViewTranslation = {1.00, 0.00, 0.00};
+};
+
+struct ShadowProjectionConstantBuffer { // CPU-side data for HLSL cbuffer ShadowConstantBuffer (b3)
+    XMFLOAT4 LightPositionAndInvRadius = {722.74805, 2515.36084, 94.87169, 0.00195};
+    XMFLOAT4 PointLightDepthBiasAndProjParameters = {0.025, 0.00, -0.99805, -1.00};
+    XMFLOAT4X4 ShadowViewProjectionMatrices[6] = {{0.00, 0.00, -1.00196, -1.00, 0.00, -1.00, 0.00, 0.00, 1.00, 0.00,
+                                                   0.00, 0.00, -94.87168, 2515.3606, -725.16437, -722.74805},
+                                                  {0.00, 0.00, 1.00196, 1.00, 0.00, -1.00, 0.00, 0.00, -1.00, 0.00,
+                                                   0.00, 0.00, 94.87168, 2515.3606, 723.16046, 722.74805},
+                                                  {-1.00, 0.00, 0.00, 0.00, 0.00, 0.00, -1.00196, -1.00, 0.00, 1.00,
+                                                   0.00, 0.00, -722.74799, -94.87168, 2519.28125, 2515.36084},
+                                                  {-1.00, 0.00, 0.00, 0.00, 0.00, 0.00, 1.00196, 1.00, 0.00, -1.00,
+                                                   0.00, 0.00, -722.74799, 94.87168, -2521.28516, -2515.36084},
+                                                  {-1.00, 0.00, 0.00, 0.00, 0.00, -1.00, 0.00, 0.00, 0.00, 0.00,
+                                                   -1.00196, -1.00, -722.74799, 2515.3606, 94.05539, 94.87169},
+                                                  {1.00, 0.00, 0.00, 0.00, 0.00, -1.00, 0.00, 0.00, 0.00, 0.00, 1.00196,
+                                                   1.00, 722.74799, 2515.3606, -96.05931, -94.87169}};
+    // NOTE(review): six matrices, presumably one per cube-map face of the point-light shadow - confirm.
+    float ShadowSharpen = 1;
+    float ShadowFadeFraction = 1;
+    float InvShadowmapResolution = 0.00098;
+};
+
+class RenderLightingPass : public RenderApp { // lighting pass: shadow-projection draw followed by lighting draw
+  public:
+    RenderLightingPass(BenchmarkOptions *opts) : RenderApp(opts) {
+        m_numShaderResource = 10;   // SRV count used by CreateShaderResourceView / DefineRootParameters
+        m_numPassRenderTargets = 1; // keep both constructors' counts in sync
+    }
+    RenderLightingPass(BenchmarkOptions *args, HINSTANCE hInstance, HWND hMainWnd, std::wstring &winTitle)
+        : RenderApp(args, hInstance, hMainWnd, winTitle) { m_numShaderResource = 10; m_numPassRenderTargets = 1; }
+    RenderLightingPass(const RenderLightingPass &rhs) = delete;
+    RenderLightingPass &operator=(const RenderLightingPass &rhs) = delete;
+    ~RenderLightingPass() = default;
+    // Sub-passes invoked by Draw(); each selects its PSO, then calls DrawRenderItems with m_opts->m_num_light.
+    void DrawShadowProjection(ID3D12GraphicsCommandList *cmdList);
+    void DrawLighting(ID3D12GraphicsCommandList *cmdList);
+
+  protected:
+    virtual int DefineRootParameters(std::vector<CD3DX12_ROOT_PARAMETER> &rootParameters) override;
+    virtual int DefineStaticSamplers(std::vector<CD3DX12_STATIC_SAMPLER_DESC> &samplers) override;
+    virtual void CreateShaderResourceView(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, int width,
+                                          int height) override;
+    virtual void CreateConstantBufferResources(ID3D12Device *device) override;
+    virtual void UpdateConstantBufferData() override;
+    virtual void BuildPipelineStates(ID3D12Device *device) override;
+    virtual void Draw(ID3D12GraphicsCommandList *cmdList) override;
+    virtual void SetStatesBeforeDraw(ID3D12GraphicsCommandList *cmdList) override;
+    // Upload buffers bound as root CBVs 0-3 (in this order) by SetStatesBeforeDraw.
+    std::unique_ptr<UploadBuffer<StencilingConstantBuffer>> m_stencilingCB = nullptr;
+    std::unique_ptr<UploadBuffer<ViewConstantBuffer>> m_viewCB = nullptr;
+    std::unique_ptr<UploadBuffer<DeferredLightUniformsConstantBuffer>> m_lightingCB = nullptr;
+    std::unique_ptr<UploadBuffer<ShadowProjectionConstantBuffer>> m_shadowProjectionCB = nullptr;
+};
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderShadowMapPass.cpp b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderShadowMapPass.cpp
new file mode 100644
index 000000000..d058c3937
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderShadowMapPass.cpp
@@ -0,0 +1,77 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include "RenderShadowMapPass.h"
+
+int RenderShadowMapPass::DefineRootParameters(std::vector<CD3DX12_ROOT_PARAMETER> &rootParameters) {
+    const int numRootParameters = 1;
+    // Single root CBV at register b0; no descriptor tables. Returns the number of root parameters written.
+    rootParameters.resize(numRootParameters);
+    rootParameters[0].InitAsConstantBufferView(0); // fed from m_viewCB (ShadowViewConstantBuffer) in SetStatesBeforeDraw
+    return numRootParameters;
+}
+
+int RenderShadowMapPass::DefineStaticSamplers(std::vector<CD3DX12_STATIC_SAMPLER_DESC> &samplers) { return 0; } // no samplers
+
+void RenderShadowMapPass::CreateShaderResourceView(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, int width,
+                                                   int height) {
+    return; // intentionally a no-op: this pass declares no SRV table in DefineRootParameters
+}
+
+void RenderShadowMapPass::CreateConstantBufferResources(ID3D12Device *device) {
+    m_viewCB = std::make_unique<UploadBuffer<ShadowViewConstantBuffer>>(device, 1, true); // bound at root slot 0 (b0)
+}
+
+void RenderShadowMapPass::UpdateConstantBufferData() {
+    ShadowViewConstantBuffer shadowView; // both matrices are uploaded as identity
+    shadowView.projection = MathHelper::Identity4x4();
+    shadowView.world = MathHelper::Identity4x4();
+    m_viewCB->CopyData(0, shadowView);
+}
+
+void RenderShadowMapPass::BuildPipelineStates(ID3D12Device *device) {
+    // Define shader input layout.
+    std::vector<D3D12_INPUT_ELEMENT_DESC> inputLayout = {
+        {"POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0, D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0},
+        {"TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 24, D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0}
+        // NOTE(review): TEXCOORD at byte 24 leaves bytes 12-23 unread - presumably a float3 normal; confirm layout.
+    };
+    // Create the pipeline state, which includes compiling and loading shaders.
+    ComPtr<ID3DBlob> vertexShader = CompileShader(L"Shaders/ShadowMap.hlsl", nullptr, "VS", "vs_5_1");
+    ComPtr<ID3DBlob> pixelShader = CompileShader(L"Shaders/ShadowMap.hlsl", nullptr, "PS", "ps_5_1");
+
+    CD3DX12_DEPTH_STENCIL_DESC depthStencilDesc(D3D12_DEFAULT);
+    depthStencilDesc.DepthEnable = true;
+    depthStencilDesc.DepthWriteMask = D3D12_DEPTH_WRITE_MASK_ALL;
+    depthStencilDesc.DepthFunc = D3D12_COMPARISON_FUNC_LESS;
+    depthStencilDesc.StencilEnable = false; // depth test and write only; stencil is not used by this PSO
+
+    auto psoDesc = DefinePSODesc(inputLayout, vertexShader, pixelShader);
+    psoDesc.DSVFormat = m_deviceResources->m_depthBufferFormat; // must match the depth buffer owned by m_deviceResources
+    psoDesc.RasterizerState.DepthBias = 100000;                 // rasterizer depth-bias settings for this pass
+    psoDesc.RasterizerState.DepthBiasClamp = 0.0f;
+    psoDesc.RasterizerState.SlopeScaledDepthBias = 1.0f;
+    psoDesc.DepthStencilState = depthStencilDesc;
+
+    ThrowIfFailed(device->CreateGraphicsPipelineState(&psoDesc, IID_PPV_ARGS(&m_PSOs["ShadowMap"])));
+}
+
+void RenderShadowMapPass::SetStatesBeforeDraw(ID3D12GraphicsCommandList *cmdList) {
+    // Handles and views are kept in named locals: `&f()` on a by-value return compiles only as an MSVC extension.
+    cmdList->SetPipelineState(m_PSOs["ShadowMap"].Get());
+    cmdList->SetGraphicsRootSignature(m_rootSignature.Get());
+    const auto dsv = m_deviceResources->GetDepthStencilView();
+    cmdList->ClearDepthStencilView(dsv, D3D12_CLEAR_FLAG_DEPTH | D3D12_CLEAR_FLAG_STENCIL, 1.0f, 0, 0, nullptr);
+    auto device = m_deviceResources->GetD3DDevice();
+    const auto rtv = GetRenderTargetView(device);
+    cmdList->OMSetRenderTargets(1, &rtv, true, &dsv);
+    cmdList->SetGraphicsRootConstantBufferView(0, m_viewCB->Resource()->GetGPUVirtualAddress()); // root slot 0 (b0)
+
+    const auto vertexBufferView = m_geometry->VertexBufferView();
+    const auto indexBufferView = m_geometry->IndexBufferView();
+    cmdList->IASetVertexBuffers(0, 1, &vertexBufferView);
+    cmdList->IASetIndexBuffer(&indexBufferView);
+    cmdList->IASetPrimitiveTopology(m_geometry->PrimitiveType);
+}
+
+void RenderShadowMapPass::Draw(ID3D12GraphicsCommandList *cmdList) { DrawRenderItems(cmdList, m_opts->m_num_object); } // count = m_num_object
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderShadowMapPass.h b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderShadowMapPass.h
new file mode 100644
index 000000000..c6f3a6909
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderShadowMapPass.h
@@ -0,0 +1,34 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "RenderApp.h"
+
+struct ShadowViewConstantBuffer { // uploaded via RenderShadowMapPass::m_viewCB, bound at root slot 0 (b0)
+    XMFLOAT4X4 world;      // set to identity in RenderShadowMapPass::UpdateConstantBufferData
+    XMFLOAT4X4 projection; // set to identity in RenderShadowMapPass::UpdateConstantBufferData
+};
+
+class RenderShadowMapPass : public RenderApp { // shadow-map render pass built on the RenderApp hooks below
+  public:
+    RenderShadowMapPass(BenchmarkOptions *opts) : RenderApp(opts) {}
+    RenderShadowMapPass(BenchmarkOptions *args, HINSTANCE hInstance, HWND hMainWnd, std::wstring &winTitle)
+        : RenderApp(args, hInstance, hMainWnd, winTitle) {}
+    RenderShadowMapPass(const RenderShadowMapPass &rhs) = delete;
+    RenderShadowMapPass &operator=(const RenderShadowMapPass &rhs) = delete;
+    ~RenderShadowMapPass() = default;
+
+  protected:
+    virtual int DefineRootParameters(std::vector<CD3DX12_ROOT_PARAMETER> &rootParameters) override; // one CBV (b0)
+    virtual int DefineStaticSamplers(std::vector<CD3DX12_STATIC_SAMPLER_DESC> &samplers) override;  // none, returns 0
+    virtual void CreateShaderResourceView(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, int width,
+                                          int height) override; // no-op for this pass
+    virtual void CreateConstantBufferResources(ID3D12Device *device) override;
+    virtual void UpdateConstantBufferData() override;
+    virtual void BuildPipelineStates(ID3D12Device *device) override; // creates the "ShadowMap" PSO
+    virtual void Draw(ID3D12GraphicsCommandList *cmdList) override;
+    virtual void SetStatesBeforeDraw(ID3D12GraphicsCommandList *cmdList) override;
+
+    std::unique_ptr<UploadBuffer<ShadowViewConstantBuffer>> m_viewCB = nullptr; // created in CreateConstantBufferResources
+};
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/Base.hlsl b/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/Base.hlsl
new file mode 100644
index 000000000..3f7643d3c
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/Base.hlsl
@@ -0,0 +1,134 @@
+// Scene cube.
+TextureCube gCubeMap : register(t0);
+
+// An array of textures, which is only supported in shader model 5.1+.  Unlike Texture2DArray, the textures
+// in this array can be different sizes and formats, making it more flexible than texture arrays.
+Texture2D gTextureMaps[TEXTURECOUNT] : register(t1);
+
+
+SamplerState gsamAnisotropicWrap : register(s0);
+
+
+// Constant data that varies per object (world/texture transforms and material index).
+cbuffer D3DObjectConstantBuffer : register(b0)
+{
+    float4x4 gWorld;
+    float4x4 gTexTransform;
+    uint gMaterialIndex;
+};
+
+// Constant data that varies per pass (view and view-projection matrices, eye position).
+cbuffer PassConstantBuffer : register(b1)
+{
+    float4x4 gView;
+    float4x4 gViewProj;
+    float3 gEyePosW;
+};
+
+cbuffer MaterialDataConstantBuffer : register(b2)
+{
+    float4 DiffuseAlbedo;
+    float3 FresnelR0;
+    float Roughness;
+    float4x4 MatTransform;
+    uint DiffuseMapIndex;
+    uint NormalMapIndex;
+};
+
+
+//---------------------------------------------------------------------------------------
+// Transforms a normal map sample to world space.
+//---------------------------------------------------------------------------------------
+float3 NormalSampleToWorldSpace(float3 normalMapSample, float3 unitNormalW, float3 tangentW)
+{
+    // Transform from [0,1] to [-1,1].
+    float3 normalT = 2.0f * normalMapSample - 1.0f;
+
+    float3 N = unitNormalW;
+    float3 T = normalize(tangentW - dot(tangentW, N) * N); // remove the component of the tangent along N
+    float3 B = cross(N, T);
+
+    float3x3 TBN = float3x3(T, B, N); // rows are the tangent-space basis vectors
+
+    // Transform the tangent-space normal to world space (row vector times TBN); the result is not renormalized.
+    float3 bumpedNormalW = mul(normalT, TBN);
+
+    return bumpedNormalW;
+}
+
+
+struct VertexIn
+{
+	float3 PosL    : POSITION; // multiplied by gWorld in VS
+    float3 NormalL : NORMAL;
+	float2 TexC    : TEXCOORD;
+	float3 TangentU : TANGENT;
+};
+
+struct VertexOut
+{
+	float4 PosH    : SV_POSITION; // world position times gViewProj
+    float3 PosW    : POSITION;    // world position (PosL times gWorld)
+    float3 NormalW : NORMAL;
+	float3 TangentW : TANGENT;
+	float2 TexC    : TEXCOORD;    // UV after gTexTransform and MatTransform
+};
+
+struct PixelOut
+{
+    float4 position : SV_Target0; // xyz: world position, w: FresnelR0.x (as written by PS)
+    float4 normal : SV_Target1;   // xyz: bumped world normal, w: shininess
+    float4 color : SV_Target2;    // rgb: diffuse plus reflection term, a: 1
+};
+
+float3 SchlickFresnel(float3 R0, float3 normal, float3 lightVec)
+{
+    // Schlick's approximation: R0 + (1 - R0) * (1 - cos(theta))^5, with cos(theta) clamped to [0, 1].
+    const float cosTheta = saturate(dot(normal, lightVec));
+    const float m = 1.0f - cosTheta;
+    const float m5 = m * m * m * m * m;
+
+    return R0 + (1.0f - R0) * m5;
+}
+
+VertexOut VS(VertexIn vin)
+{
+    VertexOut vout = (VertexOut)0.0f;
+    // Local position -> world (gWorld) -> clip space (gViewProj).
+    float4 posW = mul(float4(vin.PosL, 1.0f), gWorld);
+    vout.PosW = posW.xyz;
+    vout.PosH = mul(posW, gViewProj);
+    vout.NormalW = mul(vin.NormalL, (float3x3) gWorld); // upper 3x3 of gWorld, not its inverse-transpose
+    vout.TangentW = mul(vin.TangentU, (float3x3) gWorld);
+    float4 texC = mul(float4(vin.TexC, 0.0f, 1.0f), gTexTransform); // per-object UV transform, then per-material
+    vout.TexC = mul(texC, MatTransform).xy;
+
+    return vout;
+}
+
+PixelOut PS(VertexOut pin)
+{
+    // Renormalize the interpolated vertex normal.
+    pin.NormalW = normalize(pin.NormalW);
+    // Sample the normal map (its alpha scales shininess below) and perturb the surface normal.
+    float4 normalSample = gTextureMaps[NormalMapIndex].Sample(gsamAnisotropicWrap, pin.TexC);
+    float3 bumpedNormalW = NormalSampleToWorldSpace(normalSample.xyz, pin.NormalW, pin.TangentW);
+    float4 diffuseAlbedo = DiffuseAlbedo * 
+    gTextureMaps[DiffuseMapIndex].Sample(gsamAnisotropicWrap, pin.TexC);
+    
+    const float shininess = (1.0f - Roughness) * normalSample.a;
+
+    PixelOut pout;
+    pout.position = float4(pin.PosW, FresnelR0.x);
+    pout.normal = float4(bumpedNormalW, shininess);
+    // Cube-map reflection. NOTE(review): SchlickFresnel receives the reflection vector as lightVec - confirm intent.
+    float3 toEyeW = normalize(gEyePosW - pin.PosW);
+    float3 ref = reflect(-toEyeW, bumpedNormalW);
+    float4 reflectColor = gCubeMap.Sample(gsamAnisotropicWrap, ref);
+    float3 fresnelFactor = SchlickFresnel(FresnelR0, bumpedNormalW, ref);
+    pout.color = float4(diffuseAlbedo.xyz + shininess * fresnelFactor * reflectColor.xyz, 1.0f);
+    
+    return pout;
+}
+
+
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/DefferredLightingPixel.hlsl b/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/DefferredLightingPixel.hlsl
new file mode 100644
index 000000000..29aa971a1
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/DefferredLightingPixel.hlsl
@@ -0,0 +1,919 @@
+
+
+// Constant buffers
+cbuffer ViewConstantBuffer : register(b1)  // per-view constants
+{  
+    float4 View_InvDeviceZToWorldZTransform;  // coefficients consumed by ConvertFromDeviceZ()
+    float4 View_TemporalAAParams;  // .x is read as the sample index by CheckerFromPixelPos()
+    float4 View_BufferSizeAndInvSize;  // .xy scales UV to pixels; .zw scales pixels to UV
+    float4 View_DiffuseOverrideParameter;  // diffuse = diffuse * .w + .xyz (DecodeGBufferData)
+    float4 View_SpecularOverrideParameter;  // specular = specular * .w + .xyz (DecodeGBufferData)
+    float4x4 View_ClipToView;  // [1][1] is read by GetShadowTerms()
+    float4x4 View_ViewToClip;  // [3][3] < 1 selects depth-scaled clip XY in DeferredLightPixelMain()
+    float4x4 View_ScreenToWorld;  // applied to (clip XY, depth, 1) to obtain the world position
+    float3 View_WorldCameraOrigin;
+    float Padding0; // presumably fills the float3 above to a full 16-byte register
+    float3 View_PreViewTranslation;  // not referenced by the functions in this shader file
+    float Padding1; // presumably fills the float3 above to a full 16-byte register
+    float4 View_ScreenPositionScaleBias;  // UV = projected XY * .xy + .wz
+    float4x4 View_TranslatedWorldToClip;  // not referenced by the functions in this shader file
+    uint View_StateFrameIndexMod8;  // frame index passed to InterleavedGradientNoise()
+    float3 Padding; // Add padding to maintain 16-byte alignment  
+};  
+
+
+
+cbuffer DeferredLightUniformsConstantBuffer : register(b2)  // parameters of the light being shaded
+{  
+    float4 DeferredLightUniforms_ShadowMapChannelMask;  // dotted with the precomputed shadow factors in GetShadowTerms()
+    float2 DeferredLightUniforms_DistanceFadeMAD;  
+    float DeferredLightUniforms_ContactShadowLength;  // magnitude is the length; a negative sign sets ContactShadowLengthInWS
+    float DeferredLightUniforms_VolumetricScatteringIntensity;  // not read by the functions in this shader file
+    uint DeferredLightUniforms_ShadowedBits;  // zero disables the shadow-term computation
+    uint DeferredLightUniforms_LightingChannelMask;  // not read by the functions in this shader file
+    float3 DeferredLightUniforms_Position;  // NOTE(review): HLSL packing starts a new 16-byte register here (a float3 cannot follow two uints) - confirm the CPU-side layout pads to match
+    float DeferredLightUniforms_InvRadius;  
+    float3 DeferredLightUniforms_Color;  
+    float DeferredLightUniforms_FalloffExponent;  // exponent used by RadialAttenuation()
+    float3 DeferredLightUniforms_Direction;  
+    float DeferredLightUniforms_SpecularScale;  // multiplies the specular term in GetDynamicLighting()
+    float3 DeferredLightUniforms_Tangent;  // axis along which GetCapsule() spreads the capsule endpoints
+    float DeferredLightUniforms_SourceRadius;  
+    float2 DeferredLightUniforms_SpotAngles;  // SpotAttenuation(): (cos - .x) * .y, clamped then squared
+    float DeferredLightUniforms_SoftSourceRadius;  
+    float DeferredLightUniforms_SourceLength;  
+    float DeferredLightUniforms_RectLightBarnCosAngle;  
+    float DeferredLightUniforms_RectLightBarnLength;  
+};  
+
+cbuffer ShadowConstantBuffer : register(b3) // inputs of MainOnePassPointLightShadowPS / CubemapHardwarePCF
+{
+    float4 LightPositionAndInvRadius; // .xyz = light position, .w = inverse radius
+    float4 PointLightDepthBiasAndProjParameters; // only .x (depth bias) is read in this file
+    float4x4 ShadowViewProjectionMatrices[6]; // indexed by the cube face index (0..5) chosen in CubemapHardwarePCF()
+    float ShadowSharpen; // contrast factor applied around 0.5 to the filtered shadow
+    float ShadowFadeFraction; // lerp weight between 1 and the squared shadow value
+    float InvShadowmapResolution; // scales the side/up offsets of the PCF sample disc
+}
+
+
+
+// Texture declarations (slots t0-t9); each has a sampler in the matching s# slot below.
+Texture2D<float4> SceneTexturesStruct_SceneDepthTexture: register(t0); // .r holds device Z (see CalcSceneDepth)
+Texture2D<float4> SceneTexturesStruct_GBufferATexture : register(t1); // .xyz encoded normal, .a per-object data
+Texture2D<float4> SceneTexturesStruct_GBufferBTexture : register(t2); // .r metallic, .g specular, .b roughness, .a packed shading model + output mask
+Texture2D<float4> SceneTexturesStruct_GBufferCTexture : register(t3); // .rgb base color, .a encoded indirect irradiance
+Texture2D<float4> SceneTexturesStruct_GBufferDTexture : register(t4); // custom data
+Texture2D<float4> SceneTexturesStruct_GBufferETexture : register(t5); // precomputed shadow factors
+Texture2D<float4> SceneTexturesStruct_ScreenSpaceAOTexture : register(t6); // .r is the ambient occlusion term
+Texture2D<float4> LightAttenuationTexture: register(t7); // read and squared by GetPerPixelLightAttenuation()
+Texture2D<float4> SceneTexturesStruct_CustomDepthTexture : register(t8); // .r holds custom device Z
+Texture2D<float4> ShadowDepthCubeTexture : register(t9); // NOTE(review): named "Cube" but declared as Texture2D - confirm intended
+
+
+
+// Sampler declarations: one per texture above, using the same slot number (s0-s9).
+SamplerState SceneTexturesStruct_SceneDepthTextureSampler : register(s0);
+SamplerState SceneTexturesStruct_GBufferATextureSampler : register(s1);
+SamplerState SceneTexturesStruct_GBufferBTextureSampler : register(s2);
+SamplerState SceneTexturesStruct_GBufferCTextureSampler : register(s3);
+SamplerState SceneTexturesStruct_GBufferDTextureSampler : register(s4);
+SamplerState SceneTexturesStruct_GBufferETextureSampler : register(s5);
+SamplerState SceneTexturesStruct_ScreenSpaceAOTextureSampler : register(s6);
+SamplerState LightAttenuationTextureSampler : register(s7);
+SamplerState SceneTexturesStruct_CustomDepthTextureSampler : register(s8);
+SamplerComparisonState ShadowDepthCubeTextureSampler : register(s9); // comparison sampler used with SampleCmpLevelZero()
+
+const static float PI = 3.1415926535897932f; // used by Diffuse_Lambert()
+const static float MaxHalfFloat = 65504.0f; // largest finite half-precision value; not referenced in this file
+
+struct FLightAccumulator { // running light total built up in GetDynamicLighting()
+	float3 Diffuse; // never read by the code in this file
+	float3 Specular; // never read by the code in this file
+	float3 Transmission; // never read by the code in this file
+	float EstimatedCost; // incremented in GetDynamicLighting(); never read back
+	float3 TotalLight; // sum accumulated by LightAccumulator_Add()
+};
+
+struct FGBufferData { // decoded G-buffer sample, filled by DecodeGBufferData()
+	float3 WorldNormal; // GBufferA.xyz mapped through DecodeNormal(), optionally normalized
+	float PerObjectGBufferData; // GBufferA.a
+	float Metallic; // GBufferB.r (forced to 0 for shading model 9)
+	float Specular; // GBufferB.g
+	float Roughness; // GBufferB.b
+	uint ShadingModelID; // low 4 bits of the 8-bit value packed in GBufferB.a
+	uint SelectiveOutputMask; // high 4 bits of the same packed value
+	float3 BaseColor; // GBufferC.rgb
+	float GBufferAO; // always set to 1 by the decoder
+	float IndirectIrradiance; // decoded from GBufferC.a
+	float4 CustomData; // GBufferD, or 0 when mask bit 4 is set
+	float4 PrecomputedShadowFactors; // GBufferE, or a constant 0/1 chosen by mask bits 5 and 6
+	float CustomDepth; // custom device Z converted by ConvertFromDeviceZ()
+	uint CustomStencil;
+	float Depth; // scene depth supplied by the caller
+	float3 StoredBaseColor; // copy of BaseColor taken before any overrides
+	float StoredMetallic; // copy of Metallic taken before the shading-model-9 override
+	float StoredSpecular; // copy of Specular
+	float3 SpecularColor; // ComputeF0() result after the view specular override
+	float3 DiffuseColor; // BaseColor * (1 - Metallic) after the view diffuse override
+	float4 Velocity; // caller-provided velocity, or 0 when mask bit 7 is set
+};
+
+
+struct FDeferredLightData { // light description built by SetupLightDataForStandardDeferred()
+	float3 Position;
+	float InvRadius; // scales ToLight before the radial falloff
+	float3 Color;
+	float FalloffExponent; // exponent of the non-inverse-squared falloff
+	float3 Direction;
+	float3 Tangent; // capsule axis used by GetCapsule()
+	float2 SpotAngles; // consumed by SpotAttenuation()
+	float SourceRadius;
+	float SourceLength;
+	float SoftSourceRadius;
+	float SpecularScale;
+	float ContactShadowLength; // absolute value of the uniform
+	bool ContactShadowLengthInWS; // true when the uniform was negative
+	float2 DistanceFadeMAD;
+	float4 ShadowMapChannelMask;
+	uint ShadowedBits; // non-zero enables GetShadowTerms(); > 1 additionally enables the contact-shadow length
+	bool bInverseSquared; // selects the falloff model in GetLocalLightAttenuation()
+	bool bRadialLight;
+	bool bSpotLight;
+	bool bRectLight;
+	float RectLightBarnCosAngle;
+	float RectLightBarnLength;
+};
+
+
+struct FShadowTerms { // shadow factors seeded in GetDynamicLighting() and refined by GetShadowTerms()
+	float SurfaceShadow; // multiplies the diffuse + specular contribution
+	float TransmissionShadow; // multiplies the transmission contribution
+	float TransmissionThickness; // written but not read by the code in this file
+};
+
+struct FDirectLighting { // per-light BxDF result returned by DefaultLitBxDF()
+	float3 Diffuse;
+	float3 Specular;
+	float3 Transmission; // always 0 in DefaultLitBxDF()
+};
+
+struct FRectTexture { // placeholder type; InitRectTexture() only zeroes the dummy member
+	float Dummy;
+};
+
+struct FCapsuleLight { // capsule light description built by GetCapsule()
+	float Length; // copied from the light's SourceLength
+	float Radius; // copied from the light's SourceRadius
+	float SoftRadius; // copied from the light's SoftSourceRadius
+	float DistBiasSqr; // added to the squared distance in the inverse-square falloff
+	float3 LightPos[2]; // capsule end points relative to the shaded position
+};
+
+struct FRect { // placeholder rectangle type with a single dummy member
+	float Dummy;
+};
+
+
+struct FAreaLight // area-light parameters assembled in IntegrateBxDF()
+{
+	float SphereSinAlpha;
+	float SphereSinAlphaSoft;
+	float LineCosSubtended; // always 1 in IntegrateBxDF()
+	float FalloffColor; // always 1 in IntegrateBxDF(); scales diffuse and specular in DefaultLitBxDF()
+	FRect Rect; // placeholder rectangle (FRect only holds a dummy member)
+	bool bIsRect; // always false in IntegrateBxDF()
+	FRectTexture Texture;
+};
+
+
+struct BxDFContext { // dot products filled by Init(); NoH and VoH are saturated there, the others are raw
+	float NoL;  // Normal dot Light
+	float NoV;  // Normal dot View
+	float VoL;  // View dot Light
+	float NoH;  // Normal dot Half
+	float VoH;  // View dot Half
+};
+
+struct FScreenSpaceData { // result of GetScreenSpaceData()
+	FGBufferData GBuffer; // decoded G-buffer at the queried UV
+	float AmbientOcclusion; // red channel of the screen-space AO texture
+};
+
+Texture2D DummyRectLightTextureForCapsuleCompilerWarning; // only passed to InitRectTexture(), which ignores it
+Texture2D DeferredLightUniforms_SourceTexture; // only passed to InitRectTexture(), which ignores it
+
+FLightAccumulator LightAccumulator_Init() {
+	// Returns an accumulator with every member zeroed. Diffuse, Specular and
+	// Transmission used to be returned uninitialized; zero the whole struct instead.
+	FLightAccumulator acc = (FLightAccumulator)0;
+	return acc;
+}
+
+FRectTexture InitRectTexture(Texture2D SourceTexture) {
+	// Placeholder: SourceTexture is ignored and only the dummy member is set.
+	FRectTexture RectTex;
+	RectTex.Dummy = 0;
+	return RectTex;
+}
+
+float4 Texture2DSampleLevel(Texture2D Tex, SamplerState Sampler, float2 UV, float Mip) {
+	const float4 Texel = Tex.SampleLevel(Sampler, UV, Mip); // fetch at an explicit mip level
+	return Texel;
+}
+
+float ConvertFromDeviceZ(float DeviceZ) {
+	// Evaluates Z * T.x + T.y + 1 / (Z * T.z - T.w) with T = View_InvDeviceZToWorldZTransform.
+	const float4 T = View_InvDeviceZToWorldZTransform;
+	const float LinearTerm = DeviceZ * T.x + T.y;
+	const float ReciprocalTerm = 1.0f / (DeviceZ * T.z - T.w);
+	return LinearTerm + ReciprocalTerm;
+}
+
+float CalcSceneDepth(float2 ScreenUV) {
+	// Fetch mip 0 of the scene depth texture and convert its red channel
+	// from device Z with ConvertFromDeviceZ().
+	const float4 DepthTexel = SceneTexturesStruct_SceneDepthTexture.SampleLevel(
+		SceneTexturesStruct_SceneDepthTextureSampler, ScreenUV, 0);
+	const float DeviceZ = DepthTexel.r;
+	return ConvertFromDeviceZ(DeviceZ);
+}
+
+
+bool CheckerFromPixelPos(uint2 PixelPos) {
+	// Checkerboard parity of x + y + sample index: true when the sum is odd.
+	uint SampleIndex = View_TemporalAAParams.x;
+	uint Sum = PixelPos.x + PixelPos.y + SampleIndex;
+	return Sum % 2;
+}
+
+bool UseSubsurfaceProfile(int ShadingModel) { // true only for shading model IDs 5 and 9
+	return !(ShadingModel != 5 && ShadingModel != 9);
+}
+
+bool CheckerFromSceneColorUV(float2 UVSceneColor) {
+	// Scale the UV by the buffer size (.xy), convert to integer pixel
+	// coordinates, then reuse the pixel-position checker.
+	const float2 ScaledUV = UVSceneColor * View_BufferSizeAndInvSize.xy;
+	return CheckerFromPixelPos(uint2(ScaledUV));
+}
+
+float3 DecodeNormal(float3 N) { const float3 Doubled = N * 2; return Doubled - 1; } // maps each component from [0, 1] to [-1, 1]
+
+uint DecodeShadingModelId(float InPackedChannel) { // low 4 bits of the channel rescaled to 0..255
+	return 0xF & (uint)round(255.0f * InPackedChannel);
+}
+
+uint DecodeSelectiveOutputMask(float InPackedChannel) { // channel rescaled to 0..255 with the low 4 bits cleared
+	return ~0xF & (uint)round(255.0f * InPackedChannel);
+}
+
+float3 DecodeBaseColor(float3 BaseColor) { return BaseColor; } // identity: the base color is returned unchanged
+
+float DecodeIndirectIrradiance(float IndirectIrradiance) {
+	// Evaluates exp2(16 * v - 8) minus a small black point, scaled by a
+	// pre-exposure factor that is fixed at 1 here.
+	const float InvPreExposure = 1.f;
+	const float BlackPoint = 0.00390625; // 1 / 256
+	const float Decoded = exp2(IndirectIrradiance * 16 - 8) - BlackPoint;
+	return InvPreExposure * Decoded;
+}
+
+float DielectricSpecularToF0(float Specular) { return Specular * 0.08f; } // scales the specular input by 0.08
+
+float Lerp(float a, float b, float t) { const float Span = b - a; return a + Span * t; } // a at t = 0, b at t = 1
+
+float3 ComputeF0(float Specular, float3 BaseColor, float Metallic) {
+	const float DielectricF0 = DielectricSpecularToF0(Specular); // result when Metallic is 0
+	return lerp(DielectricF0.xxx, BaseColor, Metallic.xxx);
+}
+
+FGBufferData DecodeGBufferData(float4 InGBufferA, float4 InGBufferB,
+	float4 InGBufferC, float4 InGBufferD,
+	float4 InGBufferE, float4 InGBufferVelocity,
+	float CustomNativeDepth, uint CustomStencil,
+	float SceneDepth, bool bGetNormalizedNormal,
+	bool bChecker) { // bChecker is accepted but never read
+	FGBufferData GBuffer;
+	// Unpack the raw G-buffer texels into an FGBufferData; A.xyz holds the encoded normal.
+	GBuffer.WorldNormal = DecodeNormal(InGBufferA.xyz);
+	if (bGetNormalizedNormal) {
+		GBuffer.WorldNormal = normalize(GBuffer.WorldNormal);
+	}
+
+	GBuffer.PerObjectGBufferData = InGBufferA.a;
+	GBuffer.Metallic = InGBufferB.r;
+	GBuffer.Specular = InGBufferB.g;
+	GBuffer.Roughness = InGBufferB.b;
+	// B.a packs two fields in 8 bits: shading model (low nibble) and output mask (high nibble).
+	GBuffer.ShadingModelID = DecodeShadingModelId(InGBufferB.a);
+	GBuffer.SelectiveOutputMask = DecodeSelectiveOutputMask(InGBufferB.a);
+
+	GBuffer.BaseColor = DecodeBaseColor(InGBufferC.rgb);
+
+	GBuffer.GBufferAO = 1;
+	GBuffer.IndirectIrradiance = DecodeIndirectIrradiance(InGBufferC.a);
+	// A set mask bit means the matching input is not used: bit 4 -> custom data.
+	GBuffer.CustomData =
+		!(GBuffer.SelectiveOutputMask & (1 << 4)) ? InGBufferD : 0;
+	// Bit 5 -> shadow factors not used; bit 6 then picks the constant fallback (0 if set, else 1).
+	GBuffer.PrecomputedShadowFactors =
+		!(GBuffer.SelectiveOutputMask & (1 << 5))
+		? InGBufferE
+		: ((GBuffer.SelectiveOutputMask & (1 << 6)) ? 0 : 1);
+	GBuffer.CustomDepth = ConvertFromDeviceZ(CustomNativeDepth);
+	GBuffer.CustomStencil = CustomStencil;
+	GBuffer.Depth = SceneDepth;
+	// Keep the values as stored before the overrides below change them.
+	GBuffer.StoredBaseColor = GBuffer.BaseColor;
+	GBuffer.StoredMetallic = GBuffer.Metallic;
+	GBuffer.StoredSpecular = GBuffer.Specular;
+	// Shading model 9 is always treated as non-metallic.
+	[flatten] if (GBuffer.ShadingModelID == 9) { GBuffer.Metallic = 0.0; }
+
+	{
+		GBuffer.SpecularColor =
+			ComputeF0(GBuffer.Specular, GBuffer.BaseColor, GBuffer.Metallic);
+		// Diffuse color is the base color scaled by (1 - Metallic).
+		GBuffer.DiffuseColor =
+			GBuffer.BaseColor - GBuffer.BaseColor * GBuffer.Metallic;
+
+		{
+			// View-level overrides: color = color * override.w + override.xyz.
+			GBuffer.DiffuseColor =
+				GBuffer.DiffuseColor * View_DiffuseOverrideParameter.www +
+				View_DiffuseOverrideParameter.xyz;
+			GBuffer.SpecularColor =
+				GBuffer.SpecularColor * View_SpecularOverrideParameter.w +
+				View_SpecularOverrideParameter.xyz;
+		}
+	}
+	// Bit 7 set -> velocity not used, so it is zeroed.
+	GBuffer.Velocity =
+		!(GBuffer.SelectiveOutputMask & (1 << 7)) ? InGBufferVelocity : 0;
+
+	return GBuffer;
+}
+
+FGBufferData GetGBufferData(float2 UV, bool bGetNormalizedNormal = true) { // samples every G-buffer input at UV (mip 0) and decodes it
+	float4 GBufferA =
+		Texture2DSampleLevel(SceneTexturesStruct_GBufferATexture,
+			SceneTexturesStruct_GBufferATextureSampler, UV, 0);
+	float4 GBufferB =
+		Texture2DSampleLevel(SceneTexturesStruct_GBufferBTexture,
+			SceneTexturesStruct_GBufferBTextureSampler, UV, 0);
+	float4 GBufferC =
+		Texture2DSampleLevel(SceneTexturesStruct_GBufferCTexture,
+			SceneTexturesStruct_GBufferCTextureSampler, UV, 0);
+	float4 GBufferD =
+		Texture2DSampleLevel(SceneTexturesStruct_GBufferDTexture,
+			SceneTexturesStruct_GBufferDTextureSampler, UV, 0);
+	float CustomNativeDepth =
+		Texture2DSampleLevel(SceneTexturesStruct_CustomDepthTexture,
+			SceneTexturesStruct_CustomDepthTextureSampler, UV, 0)
+		.r;
+	// No stencil value is sampled in this shader, so pass a defined value (0)
+	// instead of forwarding an uninitialized variable to the decoder.
+	uint CustomStencil = 0;
+	float4 GBufferE =
+		Texture2DSampleLevel(SceneTexturesStruct_GBufferETexture,
+			SceneTexturesStruct_GBufferETextureSampler, UV, 0);
+	// Velocity is not sampled either; it is passed as zero.
+	float4 GBufferVelocity = 0;
+	float SceneDepth = CalcSceneDepth(UV);
+
+	return DecodeGBufferData(GBufferA, GBufferB, GBufferC, GBufferD, GBufferE,
+		GBufferVelocity, CustomNativeDepth, CustomStencil,
+		SceneDepth, bGetNormalizedNormal,
+		CheckerFromSceneColorUV(UV));
+}
+
+
+FScreenSpaceData GetScreenSpaceData(float2 UV,
+	bool bGetNormalizedNormal = true) {
+	// Bundles the decoded G-buffer with the red channel of the
+	// screen-space AO texture (mip 0) sampled at the same UV.
+	const float4 AOTexel = SceneTexturesStruct_ScreenSpaceAOTexture.SampleLevel(
+		SceneTexturesStruct_ScreenSpaceAOTextureSampler, UV, 0);
+
+	FScreenSpaceData Result;
+	Result.GBuffer = GetGBufferData(UV, bGetNormalizedNormal);
+	Result.AmbientOcclusion = AOTexel.r;
+
+	return Result;
+}
+
+FDeferredLightData SetupLightDataForStandardDeferred() {
+	// Copies the DeferredLightUniforms constants into an FDeferredLightData value.
+	FDeferredLightData LightData;
+	LightData.Position = DeferredLightUniforms_Position;
+	LightData.InvRadius = DeferredLightUniforms_InvRadius;
+	LightData.Color = DeferredLightUniforms_Color;
+	LightData.FalloffExponent = DeferredLightUniforms_FalloffExponent;
+	LightData.Direction = DeferredLightUniforms_Direction;
+	LightData.Tangent = DeferredLightUniforms_Tangent;
+	LightData.SpotAngles = DeferredLightUniforms_SpotAngles;
+	LightData.SourceRadius = DeferredLightUniforms_SourceRadius; // was terminated by a stray comma operator
+	LightData.SourceLength = DeferredLightUniforms_SourceLength;
+	LightData.SoftSourceRadius = DeferredLightUniforms_SoftSourceRadius;
+	LightData.SpecularScale = DeferredLightUniforms_SpecularScale;
+	LightData.ContactShadowLength =
+		abs(DeferredLightUniforms_ContactShadowLength);
+	LightData.ContactShadowLengthInWS =
+		DeferredLightUniforms_ContactShadowLength < 0.0f; // a negative uniform sets the world-space flag
+	LightData.DistanceFadeMAD = DeferredLightUniforms_DistanceFadeMAD;
+	LightData.ShadowMapChannelMask = DeferredLightUniforms_ShadowMapChannelMask;
+	LightData.ShadowedBits = DeferredLightUniforms_ShadowedBits;
+	// The light-type flags are constants in this shader: a radial spot light without inverse-squared falloff.
+	LightData.bInverseSquared = false;
+	LightData.bRadialLight = true;
+
+	LightData.bSpotLight = true;
+	LightData.bRectLight = false;
+
+	LightData.RectLightBarnCosAngle = DeferredLightUniforms_RectLightBarnCosAngle;
+	LightData.RectLightBarnLength = DeferredLightUniforms_RectLightBarnLength;
+
+	return LightData;
+}
+
+
+float InterleavedGradientNoise(float2 uv, float FrameId) {
+	// Shift the position by a per-frame offset, then take a nested frac() of a dot product.
+	const float2 FrameOffset = FrameId * (float2(47, 17) * 0.695f);
+	const float2 Shifted = uv + FrameOffset;
+	const float Inner = frac(dot(Shifted, float2(0.06711056f, 0.00583715f)));
+	return frac(52.9829189f * Inner);
+}
+
+float Square(float x) { // scalar: x * x
+	return x * x;
+}
+
+float2 Square(float2 x) { // component-wise square
+	return x * x;
+}
+
+float3 Square(float3 x) { // component-wise square
+	return x * x;
+}
+
+float4 Square(float4 x) { // component-wise square
+	return x * x;
+}
+
+float4 GetPerPixelLightAttenuation(float2 UV) {
+	const float4 Stored = LightAttenuationTexture.SampleLevel(LightAttenuationTextureSampler, UV, 0);
+	return Stored * Stored; // component-wise square of the mip-0 sample
+}
+
+float RadialAttenuation(float3 WorldLightVector, float FalloffExponent) {
+	// pow(1 - saturate(|v|^2), exponent); the caller passes v already scaled by the inverse radius.
+	const float Base = 1.0f - saturate(dot(WorldLightVector, WorldLightVector));
+	return pow(Base, FalloffExponent);
+}
+
+float SpotAttenuation(float3 L, float3 SpotDirection, float2 SpotAngles) {
+	// Squared, clamped ramp: saturate((dot(L, -SpotDirection) - SpotAngles.x) * SpotAngles.y)^2.
+	const float Ramp = saturate((dot(L, -SpotDirection) - SpotAngles.x) * SpotAngles.y);
+	return Ramp * Ramp;
+}
+
+
+float GetLocalLightAttenuation(float3 WorldPosition,
+	FDeferredLightData LightData,
+	inout float3 ToLight, inout float3 L) {
+	// Outputs: ToLight = light position - surface position, L = ToLight scaled to unit length.
+	ToLight = LightData.Position - WorldPosition;
+	const float DistSq = dot(ToLight, ToLight);
+	L = ToLight * rsqrt(DistSq);
+	// Distance falloff: windowed inverse-square mask or exponent-based radial falloff.
+	float Attenuation;
+	if (!LightData.bInverseSquared) {
+		Attenuation = RadialAttenuation(ToLight * LightData.InvRadius,
+			LightData.FalloffExponent);
+	}
+	else {
+		const float NormalizedDistSq = DistSq * Square(LightData.InvRadius);
+		Attenuation = Square(saturate(1 - Square(NormalizedDistSq)));
+	}
+
+	// Spot cone falloff.
+	if (LightData.bSpotLight) {
+		Attenuation *= SpotAttenuation(L, -LightData.Direction, LightData.SpotAngles);
+	}
+
+	// Rect lights contribute nothing when dot(Direction, L) is negative.
+	if (LightData.bRectLight && dot(LightData.Direction, L) < 0) {
+		Attenuation = 0;
+	}
+	return Attenuation;
+}
+
+
+// Example fade: 1 - |worldPosition - cameraOrigin| / depth, clamped to [0, 1].
+float DistanceFromCameraFade(float depth, FDeferredLightData lightData, float3 worldPosition, float3 cameraOrigin)
+{
+	// lightData is part of the signature but is not read.
+	const float3 cameraToPoint = worldPosition - cameraOrigin;
+	const float cameraDistance = length(cameraToPoint);
+
+	const float normalized = cameraDistance / depth;
+	const float fade = saturate(1.0f - normalized);
+	return fade;
+}
+
+
+// Fills Shadow from the light-attenuation sample when LightData.ShadowedBits is non-zero; otherwise Shadow is left as passed in.
+void GetShadowTerms(FGBufferData GBuffer, FDeferredLightData LightData,
+	float3 WorldPosition, float3 L, float4 LightAttenuation,
+	float Dither, inout FShadowTerms Shadow) {
+	float ContactShadowLength = 0.0f;
+	const float ContactShadowLengthScreenScale =
+		View_ClipToView[1][1] * GBuffer.Depth;
+	// L and Dither are not read; ContactShadowLength is computed below but never consumed.
+	if (LightData.ShadowedBits) {
+		// Static shadowing: lerp from 1 towards the masked precomputed factors by the sum of the channel mask.
+		float UsesStaticShadowMap =
+			dot(LightData.ShadowMapChannelMask, float4(1, 1, 1, 1));
+		float StaticShadowing = lerp(
+			1,
+			dot(GBuffer.PrecomputedShadowFactors, LightData.ShadowMapChannelMask),
+			UsesStaticShadowMap);
+
+		if (LightData.bRadialLight) {
+			// Radial lights read the surface term from .z and the transmission term from .w.
+			Shadow.SurfaceShadow = LightAttenuation.z * StaticShadowing;
+
+			Shadow.TransmissionShadow = LightAttenuation.w * StaticShadowing;
+
+			Shadow.TransmissionThickness = LightAttenuation.w;
+		}
+		else {
+			// Other lights blend the .x / .y terms towards static shadowing by the camera-distance fade.
+			float DynamicShadowFraction = DistanceFromCameraFade(
+				GBuffer.Depth, LightData, WorldPosition, View_WorldCameraOrigin);
+
+			Shadow.SurfaceShadow =
+				lerp(LightAttenuation.x, StaticShadowing, DynamicShadowFraction);
+
+			Shadow.TransmissionShadow =
+				min(lerp(LightAttenuation.y, StaticShadowing, DynamicShadowFraction),
+					LightAttenuation.w);
+			// Both terms are additionally scaled by the .z term.
+			Shadow.SurfaceShadow *= LightAttenuation.z;
+			Shadow.TransmissionShadow *= LightAttenuation.z;
+
+			Shadow.TransmissionThickness =
+				min(LightAttenuation.y, LightAttenuation.w);
+		}
+		// Contact-shadow length: world-space value as is, otherwise scaled by the screen scale above.
+		if (LightData.ShadowedBits > 1 &&
+			LightData.ContactShadowLength > 0) {
+			ContactShadowLength =
+				LightData.ContactShadowLength *
+				(LightData.ContactShadowLengthInWS ? 1.0f
+					: ContactShadowLengthScreenScale);
+		}
+	}
+}
+
+
+void Init(inout BxDFContext Context, float3 N, float3 V, float3 L) {
+	// Fills the context without forming the half vector: rsqrt(2 + 2 * VoL) is 1 / |V + L| for unit V and L.
+	const float NdotL = dot(N, L), NdotV = dot(N, V), VdotL = dot(V, L);
+	const float RcpLenH = rsqrt(2 + 2 * VdotL);
+	Context.NoL = NdotL; Context.NoV = NdotV; Context.VoL = VdotL;
+	Context.NoH = saturate((NdotL + NdotV) * RcpLenH);
+	Context.VoH = saturate(RcpLenH + RcpLenH * VdotL);
+}
+
+float3 Diffuse_Lambert(float3 DiffuseColor) {
+    return (1 / PI) * DiffuseColor; // diffuse color scaled by 1 / pi
+}
+
+float3 SpecularGGX(float Roughness, float3 SpecularColor, BxDFContext Context, float NoL, FAreaLight AreaLight) {
+	// Simplified stand-in for a GGX microfacet BRDF of the form D * G * F / (4 * NoL * NoV).
+	// Roughness and AreaLight are part of the signature but are not read; every term is a placeholder.
+	float D = max(0.0, Context.NoH); // Placeholder distribution term (D)
+	float G = min(1.0, Context.NoV * Context.NoL); // Placeholder geometric term (G)
+	float3 F = SpecularColor; // Placeholder Fresnel term (F)
+
+	// Clamp the denominator: NoL is saturated by the caller and can be exactly 0, which used to give Inf/NaN.
+	return (D * G * F) / max(4 * NoL * Context.NoV, 1e-5f);
+}
+
+FDirectLighting DefaultLitBxDF(FGBufferData GBuffer, float3 N, float3 V,
+	float3 L, float Falloff, float NoL,
+	FAreaLight AreaLight, FShadowTerms Shadow) {
+	// Default lit shading: Lambert diffuse plus the SpecularGGX term, both scaled by
+	// FalloffColor * (Falloff * NoL). Shadow is not read here; Transmission is always 0.
+	BxDFContext Ctx;
+	Init(Ctx, N, V, L);
+	// Replace NoV by its absolute value plus a small epsilon, clamped to [0, 1].
+	Ctx.NoV = saturate(abs(Ctx.NoV) + 1e-5);
+
+	const float CommonScale = AreaLight.FalloffColor * (Falloff * NoL);
+
+	FDirectLighting Result;
+	Result.Diffuse = CommonScale * Diffuse_Lambert(GBuffer.DiffuseColor);
+	Result.Specular = CommonScale * SpecularGGX(GBuffer.Roughness, GBuffer.SpecularColor,
+		Ctx, NoL, AreaLight);
+	Result.Transmission = 0;
+	return Result;
+}
+
+float Pow2(float x) {
+	return Square(x); // x * x, via the scalar Square overload
+}
+
+FDirectLighting IntegrateBxDF(FGBufferData GBuffer, float3 N, float3 V, FCapsuleLight Capsule, FShadowTerms Shadow, bool bInverseSquared) {
+	// Evaluates the direct lighting of a capsule light. The capsule is treated
+	// as a point located at Capsule.LightPos[0]; LightPos[1] and Length are
+	// not read.
+	const float3 ToLight = Capsule.LightPos[0];
+	const float DistSqr = dot(ToLight, ToLight);
+	const float InvDist = rsqrt(DistSqr);
+	const float3 L = ToLight * InvDist;
+
+	// Clamped cosine between the surface normal and the light direction.
+	const float NoL = saturate(dot(N, L));
+
+	// Biased inverse-square falloff when requested, otherwise no distance falloff.
+	float Falloff = 1;
+	if (bInverseSquared) {
+		Falloff = rcp(DistSqr + Capsule.DistBiasSqr);
+	}
+
+	// Clamp roughness from below before squaring it.
+	GBuffer.Roughness = max(GBuffer.Roughness, 0.02);
+	const float RoughnessSqr = Pow2(GBuffer.Roughness);
+
+	// Area-light description handed to the BxDF; the rect members are placeholders.
+	FAreaLight AreaLight;
+	AreaLight.SphereSinAlpha = saturate(Capsule.Radius * InvDist * (1 - RoughnessSqr));
+	AreaLight.SphereSinAlphaSoft = saturate(Capsule.SoftRadius * InvDist);
+	AreaLight.LineCosSubtended = 1;
+	AreaLight.FalloffColor = 1;
+	AreaLight.Rect = (FRect)0;
+	AreaLight.bIsRect = false;
+	AreaLight.Texture = InitRectTexture(DummyRectLightTextureForCapsuleCompilerWarning);
+
+	return DefaultLitBxDF(GBuffer, N, V, L, Falloff, NoL, AreaLight, Shadow);
+}
+
+FLightAccumulator LightAccumulator_Add(
+	FLightAccumulator In, float3 TotalLight, float3 ScatterableLight,
+	float3 CommonMultiplier,
+	const bool bNeedsSeparateSubsurfaceLightAccumulation) {
+	// Only TotalLight is accumulated; ScatterableLight and the subsurface flag are ignored.
+	In.TotalLight = In.TotalLight + TotalLight * CommonMultiplier;
+	return In;
+}
+
+float4 LightAccumulator_GetResult(FLightAccumulator In) {
+	// Pack the accumulated RGB light into a float4 whose alpha is 0.
+	const float3 Rgb = In.TotalLight;
+	const float Alpha = 0;
+	return float4(Rgb, Alpha);
+}
+
+FCapsuleLight GetCapsule(float3 ToLight, FDeferredLightData LightData) {
+	FCapsuleLight Result;
+	Result.Length = LightData.SourceLength;
+	Result.LightPos[0] = ToLight - 0.5 * Result.Length * LightData.Tangent; // end points lie half a length
+	Result.LightPos[1] = ToLight + 0.5 * Result.Length * LightData.Tangent; // either side along the tangent
+	Result.Radius = LightData.SourceRadius;
+	Result.SoftRadius = LightData.SoftSourceRadius;
+	Result.DistBiasSqr = 1.0f; // constant bias for the inverse-square falloff
+	return Result;
+}
+
+
+float4 GetDynamicLighting(
+	float3 WorldPosition,
+	float3 CameraVector,
+	FGBufferData GBuffer,
+	float AmbientOcclusion,
+	uint ShadingModelID,
+	FDeferredLightData LightData,
+	float4 LightAttenuation,
+	float Dither,
+	uint2 SVPos,
+	FRectTexture SourceTexture
+) {
+	FLightAccumulator LightAccumulator = LightAccumulator_Init();
+	LightAccumulator.EstimatedCost += 0.3f;
+	// Returns the light's contribution as RGB with alpha 0. ShadingModelID, SVPos and SourceTexture are not read.
+	float3 V = -CameraVector;
+	float3 N = GBuffer.WorldNormal;
+	// Defaults for non-radial lights; radial lights overwrite both through the inout parameters below.
+	float3 L = LightData.Direction;
+	float3 ToLight = L;
+
+	float LightMask = 1;
+	if (LightData.bRadialLight) {
+		LightMask = GetLocalLightAttenuation(WorldPosition, LightData, ToLight, L);
+	}
+	// Skip all shading work when the attenuation mask is zero.
+	if (LightMask > 0) {
+		FShadowTerms Shadow;
+		Shadow.SurfaceShadow = AmbientOcclusion;
+		Shadow.TransmissionShadow = 1;
+		Shadow.TransmissionThickness = 1;
+		GetShadowTerms(GBuffer, LightData, WorldPosition, L, LightAttenuation, Dither, Shadow);
+
+		LightAccumulator.EstimatedCost += 0.3f;
+		// Skip the BxDF when both shadow terms are zero.
+		if (Shadow.SurfaceShadow + Shadow.TransmissionShadow > 0) {
+			bool bNeedsSeparateSubsurfaceLightAccumulation = UseSubsurfaceProfile(GBuffer.ShadingModelID);
+			float3 LightColor = LightData.Color;
+
+			FDirectLighting Lighting;
+
+			FCapsuleLight Capsule = GetCapsule(ToLight, LightData);
+			Lighting = IntegrateBxDF(GBuffer, N, V, Capsule, Shadow, LightData.bInverseSquared);
+
+			Lighting.Specular *= LightData.SpecularScale;
+			// Surface lighting is weighted by SurfaceShadow, transmission by TransmissionShadow.
+			LightAccumulator = LightAccumulator_Add(
+				LightAccumulator,
+				Lighting.Diffuse + Lighting.Specular,
+				Lighting.Diffuse,
+				LightColor * LightMask * Shadow.SurfaceShadow,
+				bNeedsSeparateSubsurfaceLightAccumulation
+			);
+			LightAccumulator = LightAccumulator_Add(
+				LightAccumulator,
+				Lighting.Transmission,
+				Lighting.Transmission,
+				LightColor * LightMask * Shadow.TransmissionShadow,
+				bNeedsSeparateSubsurfaceLightAccumulation
+			);
+
+			LightAccumulator.EstimatedCost += 0.4f;
+		}
+	}
+
+	return LightAccumulator_GetResult(LightAccumulator);
+}
+
+
+float ComputeLightProfileMultiplier(
+	float3 WorldPosition, float3 LightPosition,
+	float3 LightDirection, float3 LightTangent
+) {
+	// Stub: no light profile is applied, so every argument is ignored.
+	const float NeutralMultiplier = 1.0f;
+	return NeutralMultiplier;
+}
+
+
+struct VertexOutput // input of both pixel entry points in this file
+{
+    float4 OutScreenPosition : TEXCOORD0; // .xy / .w is used to derive the screen UV in DeferredLightPixelMain()
+    float4 OutPosition : SV_POSITION; // .xy is used as the pixel position (dither seed, shadow-pass UV)
+};
+
+
+float4 DeferredLightPixelMain(VertexOutput vout) : SV_TARGET0
+{
+	// Pixel entry point of the deferred light pass: rebuilds the world position from
+	// scene depth and shades it with the light described by the DeferredLightUniforms.
+	// Pixels whose G-buffer shading model ID is 0 output zero.
+	const float4 ScreenPos = vout.OutScreenPosition;
+	const float2 PixelXY = vout.OutPosition.xy;
+	const float2 ProjectedXY = ScreenPos.xy / ScreenPos.w;
+	const float2 ScreenUV = ProjectedXY * View_ScreenPositionScaleBias.xy + View_ScreenPositionScaleBias.wz;
+
+	FScreenSpaceData Surface = GetScreenSpaceData(ScreenUV);
+	float4 OutColor = 0;
+
+	if (Surface.GBuffer.ShadingModelID > 0)
+	{
+		const float SceneDepth = CalcSceneDepth(ScreenUV);
+
+		// Clip XY is scaled by depth only when View_ViewToClip[3][3] < 1 (presumably a perspective projection).
+		const float DepthScale = View_ViewToClip[3][3] < 1.0f ? SceneDepth : 1.0f;
+		const float3 WorldPosition = mul(float4(ProjectedXY * DepthScale, SceneDepth, 1), View_ScreenToWorld).xyz;
+		const float3 CameraVector = normalize(WorldPosition - View_WorldCameraOrigin);
+
+		FDeferredLightData LightData = SetupLightDataForStandardDeferred();
+		const float Dither = InterleavedGradientNoise(PixelXY, View_StateFrameIndexMod8);
+		FRectTexture RectTexture = InitRectTexture(DeferredLightUniforms_SourceTexture);
+
+		OutColor = GetDynamicLighting(WorldPosition, CameraVector, Surface.GBuffer, Surface.AmbientOcclusion, Surface.GBuffer.ShadingModelID, LightData, GetPerPixelLightAttenuation(ScreenUV), Dither, uint2(PixelXY), RectTexture);
+		OutColor *= ComputeLightProfileMultiplier(WorldPosition, DeferredLightUniforms_Position, -DeferredLightUniforms_Direction, DeferredLightUniforms_Tangent);
+
+	}
+
+	return OutColor;
+}
+
+
+
+static const float2 DiscSamples29[]= // 29 2D offsets used as the PCF kernel in CubemapHardwarePCF()
+{
+	float2(0.000000, 2.500000), // first 15 entries: points on a circle of radius 2.5
+	float2(1.016842, 2.283864),
+	float2(1.857862, 1.672826),
+	float2(2.377641, 0.772542),
+	float2(2.486305, -0.261321),
+	float2(2.165063, -1.250000),
+	float2(1.469463, -2.022543),
+	float2(0.519779, -2.445369),
+	float2(-0.519779, -2.445369),
+	float2(-1.469463, -2.022542),
+	float2(-2.165064, -1.250000),
+	float2(-2.486305, -0.261321),
+	float2(-2.377641, 0.772543),
+	float2(-1.857862, 1.672827),
+	float2(-1.016841, 2.283864),
+	float2(0.091021, -0.642186), // remaining 14 entries: points inside that circle
+	float2(0.698035, 0.100940),
+	float2(0.959731, -1.169393),
+	float2(-1.053880, 1.180380),
+	float2(-1.479156, -0.606937),
+	float2(-0.839488, -1.320002),
+	float2(1.438566, 0.705359),
+	float2(0.067064, -1.605197),
+	float2(0.728706, 1.344722),
+	float2(1.521424, -0.380184),
+	float2(-0.199515, 1.590091),
+	float2(-1.524323, 0.364010),
+	float2(-0.692694, -0.086749),
+	float2(-0.082476, 0.654088),
+};
+
+
+float CubemapHardwarePCF(float3 WorldPosition, float3 LightPosition, float LightInvRadius, float DepthBias) // averaged 29-tap comparison lookup
+{
+	float Shadow = 1; // returned unchanged when the position is outside the light radius
+	float3 LightVector = LightPosition - WorldPosition.xyz;
+	float Distance = length(LightVector);
+	[branch]
+	if (Distance * LightInvRadius < 1.0f) // only filter inside the light's radius
+	{
+		float3 NormalizedLightVector = LightVector / Distance;
+		float3 SideVector = normalize(cross(NormalizedLightVector, float3(0, 0, 1))); // NOTE(review): the cross product is zero when the light vector is parallel to Z - confirm acceptable
+		float3 UpVector = cross(SideVector, NormalizedLightVector);
+		SideVector *= InvShadowmapResolution; // scale both basis vectors by the inverse shadow-map resolution
+		UpVector *= InvShadowmapResolution;
+		float3 AbsLightVector = abs(LightVector);
+		float MaxCoordinate = max(AbsLightVector.x, max(AbsLightVector.y, AbsLightVector.z));
+		int CubeFaceIndex = 0; // face by dominant axis and sign of LightVector: 0/1 = +x/-x, 2/3 = +y/-y, 4/5 = +z/-z
+		if (MaxCoordinate == AbsLightVector.x)
+		{
+			CubeFaceIndex = AbsLightVector.x == LightVector.x ? 0 : 1;
+		}
+		else if (MaxCoordinate == AbsLightVector.y)
+		{
+			CubeFaceIndex = AbsLightVector.y == LightVector.y ? 2 : 3;
+		}
+		else
+		{
+			CubeFaceIndex = AbsLightVector.z == LightVector.z ? 4 : 5;
+		}
+		float4 ShadowPosition = mul(float4(WorldPosition.xyz, 1), ShadowViewProjectionMatrices[CubeFaceIndex]);
+		float CompareDistance = ShadowPosition.z / ShadowPosition.w; // reference depth for the comparison
+		float ShadowDepthBias = - DepthBias / ShadowPosition.w;
+		Shadow = 0;
+		[unroll]  for(int i = 0; i < 29; ++i)
+		{
+			float3 SamplePos = NormalizedLightVector + SideVector * DiscSamples29[i].x + UpVector * DiscSamples29[i].y;
+			Shadow += ShadowDepthCubeTexture.SampleCmpLevelZero( // NOTE(review): the texture is declared Texture2D, so only SamplePos.xy is used - confirm a TextureCube lookup was not intended
+				ShadowDepthCubeTextureSampler,
+				SamplePos.xy,
+				CompareDistance + ShadowDepthBias * length(DiscSamples29[i])).r; // the bias grows with the offset's length
+		}
+		Shadow /= 29; // average of the 29 comparison results
+	}
+	return Shadow;
+}
+
+
+float EncodeLightAttenuation(float InColor) {
+	const float Encoded = sqrt(InColor); // square-root encoding of the attenuation value
+	return Encoded;
+}
+
+
+float4 MainOnePassPointLightShadowPS(
+	VertexOutput vout
+	): SV_TARGET0
+{
+	// Pixel entry point of the one-pass point-light shadow projection.
+	// Writes the sharpened, faded and sqrt-encoded shadow factor to .b and .a; .r and .g are 1.
+	const float2 ScreenUV = float2( vout.OutPosition.xy * View_BufferSizeAndInvSize.zw );
+	const float SceneW = CalcSceneDepth( ScreenUV );
+	const float2 ScreenPosition = ( ScreenUV.xy - View_ScreenPositionScaleBias.wz ) / View_ScreenPositionScaleBias.xy;
+	const float3 WorldPosition = mul(float4(ScreenPosition.xy * SceneW, SceneW, 1), View_ScreenToWorld).xyz;
+
+	const float3 LightPosition = LightPositionAndInvRadius.xyz;
+	const float LightInvRadius = LightPositionAndInvRadius.w;
+	const float RawShadow = CubemapHardwarePCF(WorldPosition, LightPosition, LightInvRadius, PointLightDepthBiasAndProjParameters.x);
+	const float SharpShadow = saturate( (RawShadow - 0.5) * ShadowSharpen + 0.5 );
+	const float FadedShadow = lerp(1.0f, Square(SharpShadow), ShadowFadeFraction);
+	const float Encoded = EncodeLightAttenuation(FadedShadow);
+	return float4(1, 1, Encoded, Encoded);
+}
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/DefferredLightingVertex.hlsl b/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/DefferredLightingVertex.hlsl
new file mode 100644
index 000000000..6ad7bea59
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/DefferredLightingVertex.hlsl
@@ -0,0 +1,199 @@
+
+// Constant buffer (register b0) holding the stenciling geometry parameters read by the vertex shaders in this file.
+cbuffer StencilingParametersConstantBuffer : register(b0)
+{
+    float4 StencilingGeometryPosAndScale; // xyz: offset added to the input position, w: scale applied to it
+    float4 StencilingConeParameters; // x: NumSides, y: NumSlices, z: ConeAngle, w: SphereRadius
+    float4x4 StencilingConeTransform; // applied to the generated local cone/cap position
+    float3 StencilingPreViewTranslation; // added after StencilingConeTransform
+    float padding; // presumably fills the slot left after the float3 above - TODO confirm against the CPU-side struct
+};
+
+// View constant buffer (register b1). Only View_TranslatedWorldToClip is read by the shaders in this file.
+cbuffer ViewConstantBuffer : register(b1)  
+{  
+    float4 View_InvDeviceZToWorldZTransform;  
+    float4 View_TemporalAAParams;  
+    float4 View_BufferSizeAndInvSize;  
+    float4 View_DiffuseOverrideParameter;  
+    float4 View_SpecularOverrideParameter;  
+    float4x4 View_ClipToView;  
+    float4x4 View_ViewToClip;  
+    float4x4 View_ScreenToWorld;  
+    float3 View_WorldCameraOrigin;
+    float Padding0; // explicit float after the float3 above (presumably to fill its 16-byte slot)
+    float3 View_PreViewTranslation;  
+    float Padding1; // explicit float after the float3 above (presumably to fill its 16-byte slot)
+    float4 View_ScreenPositionScaleBias;  
+    float4x4 View_TranslatedWorldToClip;  
+    uint View_StateFrameIndexMod8;  
+    float3 Padding; // Add padding to maintain 16-byte alignment  
+};  
+
+// Per-vertex input. InPosition is only read on the NumSides == 0 path of the vertex shaders below.
+struct VertexInput  
+{  
+    float3 InPosition : POSITION;  
+};  
+
+// Vertex shader output. Both members are assigned the same View_TranslatedWorldToClip-transformed position.
+struct VertexOutput
+{
+    float4 OutScreenPosition : TEXCOORD0;
+    float4 OutPosition : SV_POSITION;
+};
+
+const static float PI = 3.1415926535897932f; // used for the per-side angle computations below
+const static float MaxHalfFloat = 65504.0f; // largest finite 16-bit float value; not referenced in this file
+// Variant of RadialVertexMain (defined later in this file) that performs the same position computation.
+VertexOutput RadialVertexMain1(VertexInput input, uint InVertexId : SV_VertexID)
+{
+    // NumSides != 0: the position is generated from InVertexId; NumSides == 0: the input position is used.
+    VertexOutput output;
+
+    float3 WorldPosition;
+    uint NumSides = StencilingConeParameters.x;
+
+    if (NumSides != 0)
+    {
+        float SphereRadius = StencilingConeParameters.w;
+        float ConeAngle = StencilingConeParameters.z;
+
+        const float InvCosRadiansPerSide = 1.0f / cos(PI / (float)NumSides);
+
+        const float ZRadius = SphereRadius * cos(ConeAngle);
+        const float TanConeAngle = tan(ConeAngle);
+
+        uint NumSlices = StencilingConeParameters.y;
+        uint CapIndexStart = NumSides * NumSlices; // vertex ids below this use the cone-side branch, the rest the cap branch
+
+        if (InVertexId < CapIndexStart)
+        {
+            uint SliceIndex = InVertexId / NumSides;
+            uint SideIndex = InVertexId % NumSides;
+
+            const float CurrentAngle = SideIndex * 2 * PI / (float)NumSides;
+            // NOTE(review): NumSlices == 1 makes the divisor below zero - confirm callers never pass it.
+            const float DistanceDownConeDirection =
+                ZRadius * SliceIndex / (float)(NumSlices - 1);
+
+            const float SliceRadius =
+                DistanceDownConeDirection * TanConeAngle * InvCosRadiansPerSide;
+
+            const float3 LocalPosition = float3(
+                ZRadius * SliceIndex / (float)(NumSlices - 1),
+                SliceRadius * sin(CurrentAngle), SliceRadius * cos(CurrentAngle));
+
+            WorldPosition =
+                mul(float4(LocalPosition, 1), StencilingConeTransform).xyz +
+                StencilingPreViewTranslation;
+        }
+        else
+        {
+            const float CapRadius = ZRadius * tan(ConeAngle);
+
+            uint VertexId = InVertexId - CapIndexStart; // index relative to the first cap vertex
+            uint SliceIndex = VertexId / NumSides;
+            uint SideIndex = VertexId % NumSides;
+
+            const float UnadjustedSliceRadius =
+                CapRadius * SliceIndex / (float)(NumSlices - 1);
+
+            const float SliceRadius = UnadjustedSliceRadius * InvCosRadiansPerSide;
+
+            // x coordinate on the sphere of radius SphereRadius at distance UnadjustedSliceRadius from the axis
+            const float ZDistance =
+                sqrt(SphereRadius * SphereRadius -
+                    UnadjustedSliceRadius * UnadjustedSliceRadius);
+
+            const float CurrentAngle = SideIndex * 2 * PI / (float)NumSides;
+            const float3 LocalPosition =
+                float3(ZDistance, SliceRadius * sin(CurrentAngle),
+                    SliceRadius * cos(CurrentAngle));
+            WorldPosition =
+                mul(float4(LocalPosition, 1), StencilingConeTransform).xyz +
+                StencilingPreViewTranslation;
+        }
+    }
+    else
+    {
+        WorldPosition = input.InPosition * StencilingGeometryPosAndScale.w +
+            StencilingGeometryPosAndScale.xyz;
+    }
+
+    output.OutScreenPosition = output.OutPosition =
+        mul(float4(WorldPosition, 1), View_TranslatedWorldToClip);
+
+    return output;
+}
+// Vertex shader: generates a cone-side or cap vertex from InVertexId when NumSides != 0, else scales/offsets the input.
+VertexOutput RadialVertexMain(VertexInput input, uint InVertexId : SV_VertexID)
+{
+    VertexOutput   output;
+    float3 WorldPosition = {0,0, 0};
+    uint NumSides = StencilingConeParameters.x;
+    // Every branch below assigns WorldPosition, so the zero initializer above is never the final value.
+    if (NumSides != 0)
+    {
+        float SphereRadius = StencilingConeParameters.w;
+        float ConeAngle = StencilingConeParameters.z;
+        const float InvCosRadiansPerSide = 1.0f / cos(PI / (float)NumSides);
+
+        const float ZRadius = SphereRadius * cos(ConeAngle);
+        const float TanConeAngle = tan(ConeAngle);
+
+        uint NumSlices = StencilingConeParameters.y;
+        uint CapIndexStart = NumSides * NumSlices; // vertex ids below this use the cone-side branch, the rest the cap branch
+        if (InVertexId < CapIndexStart)
+        {
+            uint SliceIndex = InVertexId / NumSides;
+            uint SideIndex = InVertexId % NumSides;
+
+            const float CurrentAngle = SideIndex * 2 * PI / (float)NumSides;
+            // NOTE(review): NumSlices == 1 makes the divisor below zero - confirm callers never pass it.
+            const float DistanceDownConeDirection =
+                ZRadius * SliceIndex / (float)(NumSlices - 1);
+
+            const float SliceRadius =
+                DistanceDownConeDirection * TanConeAngle * InvCosRadiansPerSide;
+
+            const float3 LocalPosition = float3(
+                ZRadius * SliceIndex / (float)(NumSlices - 1),
+                SliceRadius * sin(CurrentAngle), SliceRadius * cos(CurrentAngle));
+            float4 position = mul(float4(LocalPosition, 1), StencilingConeTransform);
+            WorldPosition = position.xyz + StencilingPreViewTranslation;
+        }
+        else
+        {
+            const float CapRadius = ZRadius * tan(ConeAngle);
+
+            uint VertexId = InVertexId - CapIndexStart; // index relative to the first cap vertex
+            uint SliceIndex = VertexId / NumSides;
+            uint SideIndex = VertexId % NumSides;
+
+            const float UnadjustedSliceRadius =
+                CapRadius * SliceIndex / (float)(NumSlices - 1);
+
+            const float SliceRadius = UnadjustedSliceRadius * InvCosRadiansPerSide;
+
+            // x coordinate on the sphere of radius SphereRadius at distance UnadjustedSliceRadius from the axis
+            const float ZDistance =
+                sqrt(SphereRadius * SphereRadius -
+                    UnadjustedSliceRadius * UnadjustedSliceRadius);
+
+            const float CurrentAngle = SideIndex * 2 * PI / (float)NumSides;
+            const float3 LocalPosition =
+                float3(ZDistance, SliceRadius * sin(CurrentAngle),
+                    SliceRadius * cos(CurrentAngle));
+            float4 position = mul(float4(LocalPosition, 1), StencilingConeTransform);
+            WorldPosition = position.xyz + StencilingPreViewTranslation;
+        }
+    }
+    else
+    {
+        WorldPosition = input.InPosition * StencilingGeometryPosAndScale.w +
+            StencilingGeometryPosAndScale.xyz;
+    }
+
+    // The same transformed position is written to both output members.
+    output.OutScreenPosition = output.OutPosition =
+        mul(float4(WorldPosition, 1), View_TranslatedWorldToClip);
+    
+    return output;
+}
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/ShadowMap.hlsl b/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/ShadowMap.hlsl
new file mode 100644
index 000000000..b31cdc1e3
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/ShadowMap.hlsl
@@ -0,0 +1,55 @@
+// Constant buffer (register b0) with the two matrices consumed by VS below.
+cbuffer ObjectConstantBuffer : register(b0)
+{
+    float4x4 gWorld; // applied to the local-space vertex position in VS
+    float4x4 gViewProj; // passed by VS to SetShadowDepthOutputs
+};
+// Vertex input layout. VS in this file reads PosL only; TexC is declared but not used.
+struct VertexIn
+{
+    float3 PosL : POSITION;
+    float2 TexC : TEXCOORD0;
+};
+// Vertex shader output: only the SV_POSITION value is passed on.
+struct VertexOut
+{
+    float4 PosH : SV_POSITION;
+};
+// Transforms WorldPosition by gViewProj and replaces the output z with a biased linear depth (also returned in ShadowDepth).
+void SetShadowDepthOutputs(float4 WorldPosition, float4x4 gViewProj, out float4 OutPosition, out float ShadowDepth)
+{
+    // Transform the vertex position by the gViewProj parameter (it shadows the cbuffer member of the same name)
+    OutPosition = mul(WorldPosition, gViewProj);
+
+    float DepthBias = 0.01;
+    float InvMaxSubjectDepth = 0.001;
+
+    // Output linear, normalized depth; z is pre-multiplied by w so that z / w equals ShadowDepth
+    ShadowDepth = OutPosition.z * InvMaxSubjectDepth + DepthBias;
+    OutPosition.z = ShadowDepth * OutPosition.w;
+}
+
+// Shadow-pass vertex shader: transforms the vertex by gWorld, then by gViewProj via SetShadowDepthOutputs.
+VertexOut VS(VertexIn vin)
+{
+    VertexOut vout;
+
+    // Transform the vertex position from object / local space to world space
+    float4 WorldPos = mul(float4(vin.PosL, 1.0), gWorld);
+
+    float dummy; // Receives the ShadowDepth output, which is not used afterwards
+
+    SetShadowDepthOutputs(
+        WorldPos,
+        gViewProj,
+        vout.PosH,
+        dummy
+    );
+
+    return vout;
+}
+// Pixel shader with a void return type and no output parameters.
+void PS(VertexOut pin)
+{
+    // Empty body: no value is returned and nothing is written here
+}
diff --git a/superbench/benchmarks/micro_benchmarks/directx_third_party/DeviceResources.cpp b/superbench/benchmarks/micro_benchmarks/directx_third_party/DeviceResources.cpp
new file mode 100644
index 000000000..9fa3a3bd6
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_third_party/DeviceResources.cpp
@@ -0,0 +1,670 @@
+//
+// DeviceResources.cpp - A wrapper for the Direct3D 12 device and swapchain
+//
+
+#include "DeviceResources.h"
+
+using namespace DirectX;
+using namespace DX;
+
+using Microsoft::WRL::ComPtr;
+
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wcovered-switch-default"
+#pragma clang diagnostic ignored "-Wswitch-enum"
+#endif
+
+#pragma warning(disable : 4061)
+
+namespace { // File-local helpers.
+inline DXGI_FORMAT NoSRGB(DXGI_FORMAT fmt) noexcept { // Maps the listed *_UNORM_SRGB formats to *_UNORM.
+    switch (fmt) {
+    case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB:
+        return DXGI_FORMAT_R8G8B8A8_UNORM;
+    case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB:
+        return DXGI_FORMAT_B8G8R8A8_UNORM;
+    case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB:
+        return DXGI_FORMAT_B8G8R8X8_UNORM;
+    default:
+        return fmt; // Any other format is returned unchanged.
+    }
+}
+// Area of overlap between rectangles (ax1, ay1)-(ax2, ay2) and (bx1, by1)-(bx2, by2); 0 when they do not overlap.
+inline long ComputeIntersectionArea(long ax1, long ay1, long ax2, long ay2, long bx1, long by1, long bx2,
+                                    long by2) noexcept {
+    return std::max(0l, std::min(ax2, bx2) - std::max(ax1, bx1)) *
+           std::max(0l, std::min(ay2, by2) - std::max(ay1, by1));
+}
+} // namespace
+
+// Constructor: stores the requested formats, buffer count, feature level and option flags; creates no D3D objects.
+DeviceResources::DeviceResources(DXGI_FORMAT backBufferFormat, DXGI_FORMAT depthBufferFormat, UINT backBufferCount,
+                                 D3D_FEATURE_LEVEL minFeatureLevel, unsigned int flags) noexcept(false)
+    : m_backBufferIndex(0), m_fenceValues{}, m_rtvDescriptorSize(0), m_screenViewport{}, m_scissorRect{},
+      m_backBufferFormat(backBufferFormat), m_depthBufferFormat(depthBufferFormat), m_backBufferCount(backBufferCount),
+      m_d3dMinFeatureLevel(minFeatureLevel), m_window(nullptr), m_d3dFeatureLevel(D3D_FEATURE_LEVEL_11_0),
+      m_dxgiFactoryFlags(0), m_outputSize{0, 0, 1, 1}, m_colorSpace(DXGI_COLOR_SPACE_RGB_FULL_G22_NONE_P709),
+      m_options(flags), m_deviceNotify(nullptr) {
+    if (backBufferCount < 2 || backBufferCount > MAX_BACK_BUFFER_COUNT) { // valid range is [2, MAX_BACK_BUFFER_COUNT]
+        throw std::out_of_range("invalid backBufferCount");
+    }
+
+    if (minFeatureLevel < D3D_FEATURE_LEVEL_11_0) { // levels below 11_0 are rejected
+        throw std::out_of_range("minFeatureLevel too low");
+    }
+}
+
+// Destructor: calls WaitForGpu() before the members are destroyed.
+DeviceResources::~DeviceResources() {
+    // Ensure that the GPU is no longer referencing resources that are about to be destroyed.
+    WaitForGpu();
+}
+
+// Creates the DXGI factory, D3D12 device, command queue, descriptor heaps, command allocators/list and fence objects.
+void DeviceResources::CreateDeviceResources() {
+#if defined(_DEBUG)
+    // Enable the debug layer (requires the Graphics Tools "optional feature").
+    //
+    // NOTE: Enabling the debug layer after device creation will invalidate the active device.
+    {
+        ComPtr<ID3D12Debug> debugController;
+        if (SUCCEEDED(D3D12GetDebugInterface(IID_PPV_ARGS(debugController.GetAddressOf())))) {
+            debugController->EnableDebugLayer();
+        } else {
+            OutputDebugStringA("WARNING: Direct3D Debug Device is not available\n");
+        }
+
+        ComPtr<IDXGIInfoQueue> dxgiInfoQueue;
+        if (SUCCEEDED(DXGIGetDebugInterface1(0, IID_PPV_ARGS(dxgiInfoQueue.GetAddressOf())))) {
+            m_dxgiFactoryFlags = DXGI_CREATE_FACTORY_DEBUG;
+
+            dxgiInfoQueue->SetBreakOnSeverity(DXGI_DEBUG_ALL, DXGI_INFO_QUEUE_MESSAGE_SEVERITY_ERROR, true);
+            dxgiInfoQueue->SetBreakOnSeverity(DXGI_DEBUG_ALL, DXGI_INFO_QUEUE_MESSAGE_SEVERITY_CORRUPTION, true);
+
+            DXGI_INFO_QUEUE_MESSAGE_ID hide[] = {
+                80 /* IDXGISwapChain::GetContainingOutput: The swapchain's adapter does not control the output on which
+                      the swapchain's window resides. */
+                ,
+            };
+            DXGI_INFO_QUEUE_FILTER filter = {};
+            filter.DenyList.NumIDs = static_cast<UINT>(std::size(hide));
+            filter.DenyList.pIDList = hide;
+            dxgiInfoQueue->AddStorageFilterEntries(DXGI_DEBUG_DXGI, &filter);
+        }
+    }
+#endif
+
+    ThrowIfFailed(CreateDXGIFactory2(m_dxgiFactoryFlags, IID_PPV_ARGS(m_dxgiFactory.ReleaseAndGetAddressOf())));
+
+    // Determines whether tearing support is available for fullscreen borderless windows.
+    if (m_options & c_AllowTearing) {
+        BOOL allowTearing = FALSE;
+        HRESULT hr =
+            m_dxgiFactory->CheckFeatureSupport(DXGI_FEATURE_PRESENT_ALLOW_TEARING, &allowTearing, sizeof(allowTearing));
+        if (FAILED(hr) || !allowTearing) {
+            m_options &= ~c_AllowTearing; // Drop the option so later swap-chain and Present calls do not request tearing.
+#ifdef _DEBUG
+            OutputDebugStringA("WARNING: Variable refresh rate displays not supported");
+#endif
+        }
+    }
+    // Select an adapter via GetAdapter, which throws if none is found.
+    ComPtr<IDXGIAdapter1> adapter;
+    GetAdapter(adapter.GetAddressOf());
+
+    // Create the DX12 API device object.
+    HRESULT hr =
+        D3D12CreateDevice(adapter.Get(), m_d3dMinFeatureLevel, IID_PPV_ARGS(m_d3dDevice.ReleaseAndGetAddressOf()));
+    ThrowIfFailed(hr);
+
+    m_d3dDevice->SetName(L"DeviceResources");
+
+#ifndef NDEBUG
+    // Configure debug device (if active).
+    ComPtr<ID3D12InfoQueue> d3dInfoQueue;
+    if (SUCCEEDED(m_d3dDevice.As(&d3dInfoQueue))) {
+#ifdef _DEBUG
+        d3dInfoQueue->SetBreakOnSeverity(D3D12_MESSAGE_SEVERITY_CORRUPTION, true);
+        d3dInfoQueue->SetBreakOnSeverity(D3D12_MESSAGE_SEVERITY_ERROR, true);
+#endif
+        D3D12_MESSAGE_ID hide[] = {
+            D3D12_MESSAGE_ID_MAP_INVALID_NULLRANGE,
+            D3D12_MESSAGE_ID_UNMAP_INVALID_NULLRANGE,
+            // Workarounds for debug layer issues on hybrid-graphics systems
+            D3D12_MESSAGE_ID_EXECUTECOMMANDLISTS_WRONGSWAPCHAINBUFFERREFERENCE,
+            D3D12_MESSAGE_ID_RESOURCE_BARRIER_MISMATCHING_COMMAND_LIST_TYPE,
+        };
+        D3D12_INFO_QUEUE_FILTER filter = {};
+        filter.DenyList.NumIDs = static_cast<UINT>(std::size(hide));
+        filter.DenyList.pIDList = hide;
+        d3dInfoQueue->AddStorageFilterEntries(&filter);
+    }
+#endif
+
+    // Determine maximum supported feature level for this device
+    static const D3D_FEATURE_LEVEL s_featureLevels[] = {
+#if defined(NTDDI_WIN10_FE) || defined(USING_D3D12_AGILITY_SDK)
+        D3D_FEATURE_LEVEL_12_2,
+#endif
+        D3D_FEATURE_LEVEL_12_1,
+        D3D_FEATURE_LEVEL_12_0,
+        D3D_FEATURE_LEVEL_11_1,
+        D3D_FEATURE_LEVEL_11_0,
+    };
+
+    D3D12_FEATURE_DATA_FEATURE_LEVELS featLevels = {static_cast<UINT>(std::size(s_featureLevels)), s_featureLevels,
+                                                    D3D_FEATURE_LEVEL_11_0};
+
+    hr = m_d3dDevice->CheckFeatureSupport(D3D12_FEATURE_FEATURE_LEVELS, &featLevels, sizeof(featLevels));
+    if (SUCCEEDED(hr)) {
+        m_d3dFeatureLevel = featLevels.MaxSupportedFeatureLevel;
+    } else {
+        m_d3dFeatureLevel = m_d3dMinFeatureLevel; // Query failed: fall back to the requested minimum level.
+    }
+
+    // Create the command queue.
+    D3D12_COMMAND_QUEUE_DESC queueDesc = {};
+    queueDesc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE;
+    queueDesc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
+
+    ThrowIfFailed(m_d3dDevice->CreateCommandQueue(&queueDesc, IID_PPV_ARGS(m_commandQueue.ReleaseAndGetAddressOf())));
+
+    m_commandQueue->SetName(L"DeviceResources");
+
+    // Create descriptor heaps for render target views and depth stencil views.
+    D3D12_DESCRIPTOR_HEAP_DESC rtvDescriptorHeapDesc = {};
+    rtvDescriptorHeapDesc.NumDescriptors = m_backBufferCount; // one RTV per back buffer
+    rtvDescriptorHeapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_RTV;
+
+    ThrowIfFailed(m_d3dDevice->CreateDescriptorHeap(&rtvDescriptorHeapDesc,
+                                                    IID_PPV_ARGS(m_rtvDescriptorHeap.ReleaseAndGetAddressOf())));
+
+    m_rtvDescriptorHeap->SetName(L"DeviceResources");
+
+    m_rtvDescriptorSize = m_d3dDevice->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_RTV);
+
+    if (m_depthBufferFormat != DXGI_FORMAT_UNKNOWN) { // DXGI_FORMAT_UNKNOWN means no depth buffer, so no DSV heap
+        D3D12_DESCRIPTOR_HEAP_DESC dsvDescriptorHeapDesc = {};
+        dsvDescriptorHeapDesc.NumDescriptors = 1;
+        dsvDescriptorHeapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_DSV;
+
+        ThrowIfFailed(m_d3dDevice->CreateDescriptorHeap(&dsvDescriptorHeapDesc,
+                                                        IID_PPV_ARGS(m_dsvDescriptorHeap.ReleaseAndGetAddressOf())));
+
+        m_dsvDescriptorHeap->SetName(L"DeviceResources");
+    }
+
+    // Create a command allocator for each back buffer that will be rendered to.
+    for (UINT n = 0; n < m_backBufferCount; n++) {
+        ThrowIfFailed(m_d3dDevice->CreateCommandAllocator(
+            D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(m_commandAllocators[n].ReleaseAndGetAddressOf())));
+
+        wchar_t name[25] = {};
+        swprintf_s(name, L"Render target %u", n);
+        m_commandAllocators[n]->SetName(name);
+    }
+
+    // Create a command list for recording graphics commands.
+    ThrowIfFailed(m_d3dDevice->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, m_commandAllocators[0].Get(),
+                                                 nullptr, IID_PPV_ARGS(m_commandList.ReleaseAndGetAddressOf())));
+    ThrowIfFailed(m_commandList->Close()); // Closed here; Prepare() calls Reset() on it before recording.
+
+    m_commandList->SetName(L"DeviceResources");
+
+    // Create a fence for tracking GPU execution progress.
+    ThrowIfFailed(m_d3dDevice->CreateFence(m_fenceValues[m_backBufferIndex], D3D12_FENCE_FLAG_NONE,
+                                           IID_PPV_ARGS(m_fence.ReleaseAndGetAddressOf())));
+    m_fenceValues[m_backBufferIndex]++; // The incremented value is the next one signaled for this frame slot.
+
+    m_fence->SetName(L"DeviceResources");
+
+    m_fenceEvent.Attach(CreateEventEx(nullptr, nullptr, 0, EVENT_MODIFY_STATE | SYNCHRONIZE));
+    if (!m_fenceEvent.IsValid()) {
+        throw std::system_error(std::error_code(static_cast<int>(GetLastError()), std::system_category()),
+                                "CreateEventEx");
+    }
+}
+
+// Creates or resizes the swap chain and rebuilds the RTVs, depth buffer, viewport and scissor for the output size.
+void DeviceResources::CreateWindowSizeDependentResources() {
+    if (!m_window) {
+        throw std::logic_error("Call SetWindow with a valid Win32 window handle");
+    }
+
+    // Wait until all previous GPU work is complete.
+    WaitForGpu();
+
+    // Release resources that are tied to the swap chain and update fence values.
+    for (UINT n = 0; n < m_backBufferCount; n++) {
+        m_renderTargets[n].Reset();
+        m_fenceValues[n] = m_fenceValues[m_backBufferIndex];
+    }
+
+    // Determine the render target size in pixels.
+    const UINT backBufferWidth = std::max<UINT>(static_cast<UINT>(m_outputSize.right - m_outputSize.left), 1u);
+    const UINT backBufferHeight = std::max<UINT>(static_cast<UINT>(m_outputSize.bottom - m_outputSize.top), 1u);
+    const DXGI_FORMAT backBufferFormat = NoSRGB(m_backBufferFormat); // Swap-chain buffers use the non-sRGB format.
+
+    // If the swap chain already exists, resize it, otherwise create one.
+    if (m_swapChain) {
+        // If the swap chain already exists, resize it.
+        HRESULT hr = m_swapChain->ResizeBuffers(m_backBufferCount, backBufferWidth, backBufferHeight, backBufferFormat,
+                                                (m_options & c_AllowTearing) ? DXGI_SWAP_CHAIN_FLAG_ALLOW_TEARING : 0u);
+
+        if (hr == DXGI_ERROR_DEVICE_REMOVED || hr == DXGI_ERROR_DEVICE_RESET) {
+#ifdef _DEBUG
+            char buff[64] = {};
+            sprintf_s(buff, "Device Lost on ResizeBuffers: Reason code 0x%08X\n",
+                      static_cast<unsigned int>(
+                          (hr == DXGI_ERROR_DEVICE_REMOVED) ? m_d3dDevice->GetDeviceRemovedReason() : hr));
+            OutputDebugStringA(buff);
+#endif
+            // If the device was removed for any reason, a new device and swap chain will need to be created.
+            HandleDeviceLost();
+
+            // Everything is set up now. Do not continue execution of this method. HandleDeviceLost will reenter this
+            // method and correctly set up the new device.
+            return;
+        } else {
+            ThrowIfFailed(hr);
+        }
+    } else {
+        // Create a descriptor for the swap chain.
+        DXGI_SWAP_CHAIN_DESC1 swapChainDesc = {};
+        swapChainDesc.Width = backBufferWidth;
+        swapChainDesc.Height = backBufferHeight;
+        swapChainDesc.Format = backBufferFormat;
+        swapChainDesc.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT;
+        swapChainDesc.BufferCount = m_backBufferCount;
+        swapChainDesc.SampleDesc.Count = 1;
+        swapChainDesc.SampleDesc.Quality = 0;
+        swapChainDesc.Scaling = DXGI_SCALING_STRETCH;
+        swapChainDesc.SwapEffect = DXGI_SWAP_EFFECT_FLIP_DISCARD;
+        swapChainDesc.AlphaMode = DXGI_ALPHA_MODE_IGNORE;
+        swapChainDesc.Flags = (m_options & c_AllowTearing) ? DXGI_SWAP_CHAIN_FLAG_ALLOW_TEARING : 0u;
+
+        DXGI_SWAP_CHAIN_FULLSCREEN_DESC fsSwapChainDesc = {};
+        fsSwapChainDesc.Windowed = TRUE;
+
+        // Create a swap chain for the window.
+        ComPtr<IDXGISwapChain1> swapChain;
+        ThrowIfFailed(m_dxgiFactory->CreateSwapChainForHwnd(m_commandQueue.Get(), m_window, &swapChainDesc,
+                                                            &fsSwapChainDesc, nullptr, swapChain.GetAddressOf()));
+
+        ThrowIfFailed(swapChain.As(&m_swapChain));
+
+        // This class does not support exclusive full-screen mode and prevents DXGI from responding to the ALT+ENTER
+        // shortcut
+        ThrowIfFailed(m_dxgiFactory->MakeWindowAssociation(m_window, DXGI_MWA_NO_ALT_ENTER));
+    }
+
+    // Handle color space settings for HDR
+    UpdateColorSpace();
+
+    // Obtain the back buffers for this window which will be the final render targets
+    // and create render target views for each of them.
+    for (UINT n = 0; n < m_backBufferCount; n++) {
+        ThrowIfFailed(m_swapChain->GetBuffer(n, IID_PPV_ARGS(m_renderTargets[n].GetAddressOf())));
+
+        wchar_t name[25] = {};
+        swprintf_s(name, L"Render target %u", n);
+        m_renderTargets[n]->SetName(name);
+
+        D3D12_RENDER_TARGET_VIEW_DESC rtvDesc = {};
+        rtvDesc.Format = m_backBufferFormat; // The view keeps the requested format, not the NoSRGB one used above.
+        rtvDesc.ViewDimension = D3D12_RTV_DIMENSION_TEXTURE2D;
+
+        const CD3DX12_CPU_DESCRIPTOR_HANDLE rtvDescriptor(m_rtvDescriptorHeap->GetCPUDescriptorHandleForHeapStart(),
+                                                          static_cast<INT>(n), m_rtvDescriptorSize);
+        m_d3dDevice->CreateRenderTargetView(m_renderTargets[n].Get(), &rtvDesc, rtvDescriptor);
+    }
+
+    // Reset the index to the current back buffer.
+    m_backBufferIndex = m_swapChain->GetCurrentBackBufferIndex();
+
+    if (m_depthBufferFormat != DXGI_FORMAT_UNKNOWN) {
+        // Allocate a 2-D surface as the depth/stencil buffer and create a depth/stencil view
+        // on this surface.
+        const CD3DX12_HEAP_PROPERTIES depthHeapProperties(D3D12_HEAP_TYPE_DEFAULT);
+
+        D3D12_RESOURCE_DESC depthStencilDesc =
+            CD3DX12_RESOURCE_DESC::Tex2D(m_depthBufferFormat, backBufferWidth, backBufferHeight,
+                                         1, // This depth stencil view has only one texture.
+                                         1  // Use a single mipmap level.
+            );
+        depthStencilDesc.Flags |= D3D12_RESOURCE_FLAG_ALLOW_DEPTH_STENCIL;
+
+        // The optimized clear depth is 0.0 when c_ReverseDepth is set and 1.0 otherwise.
+        const CD3DX12_CLEAR_VALUE depthOptimizedClearValue(m_depthBufferFormat,
+                                                           (m_options & c_ReverseDepth) ? 0.0f : 1.0f, 0u);
+
+        ThrowIfFailed(m_d3dDevice->CreateCommittedResource(
+            &depthHeapProperties, D3D12_HEAP_FLAG_NONE, &depthStencilDesc, D3D12_RESOURCE_STATE_DEPTH_WRITE,
+            &depthOptimizedClearValue, IID_PPV_ARGS(m_depthStencil.ReleaseAndGetAddressOf())));
+
+        m_depthStencil->SetName(L"Depth stencil");
+
+        D3D12_DEPTH_STENCIL_VIEW_DESC dsvDesc = {};
+        dsvDesc.Format = m_depthBufferFormat;
+        dsvDesc.ViewDimension = D3D12_DSV_DIMENSION_TEXTURE2D;
+
+        m_d3dDevice->CreateDepthStencilView(m_depthStencil.Get(), &dsvDesc,
+                                            m_dsvDescriptorHeap->GetCPUDescriptorHandleForHeapStart());
+    }
+
+    // Set the 3D rendering viewport and scissor rectangle to target the entire window.
+    m_screenViewport.TopLeftX = m_screenViewport.TopLeftY = 0.f;
+    m_screenViewport.Width = static_cast<float>(backBufferWidth);
+    m_screenViewport.Height = static_cast<float>(backBufferHeight);
+    m_screenViewport.MinDepth = D3D12_MIN_DEPTH;
+    m_screenViewport.MaxDepth = D3D12_MAX_DEPTH;
+
+    m_scissorRect.left = m_scissorRect.top = 0;
+    m_scissorRect.right = static_cast<LONG>(backBufferWidth);
+    m_scissorRect.bottom = static_cast<LONG>(backBufferHeight);
+}
+
+// Stores the Win32 window handle and the output size (origin at 0,0); does not create or resize any resources.
+void DeviceResources::SetWindow(HWND window, int width, int height) noexcept {
+    m_window = window;
+
+    m_outputSize.left = m_outputSize.top = 0;
+    m_outputSize.right = static_cast<long>(width);
+    m_outputSize.bottom = static_cast<long>(height);
+}
+
+// Handles a window resize. Returns true only when the size changed and the size-dependent resources were recreated.
+bool DeviceResources::WindowSizeChanged(int width, int height) {
+    if (!m_window)
+        return false;
+
+    RECT newRc;
+    newRc.left = newRc.top = 0;
+    newRc.right = static_cast<long>(width);
+    newRc.bottom = static_cast<long>(height);
+    if (newRc.right == m_outputSize.right && newRc.bottom == m_outputSize.bottom) { // size unchanged
+        // Handle color space settings for HDR
+        UpdateColorSpace();
+
+        return false;
+    }
+
+    m_outputSize = newRc;
+    CreateWindowSizeDependentResources();
+    return true;
+}
+
+// Releases every device-dependent object, recreates them, and notifies m_deviceNotify (if set) before and after.
+void DeviceResources::HandleDeviceLost() {
+    if (m_deviceNotify) {
+        m_deviceNotify->OnDeviceLost();
+    }
+
+    for (UINT n = 0; n < m_backBufferCount; n++) {
+        m_commandAllocators[n].Reset();
+        m_renderTargets[n].Reset();
+    }
+
+    m_depthStencil.Reset();
+    m_commandQueue.Reset();
+    m_commandList.Reset();
+    m_fence.Reset();
+    m_rtvDescriptorHeap.Reset();
+    m_dsvDescriptorHeap.Reset();
+    m_swapChain.Reset();
+    m_d3dDevice.Reset();
+    m_dxgiFactory.Reset();
+
+#ifdef _DEBUG
+    {
+        // Debug builds only: report DXGI objects that are still alive after the resets above.
+        ComPtr<IDXGIDebug1> dxgiDebug;
+        if (SUCCEEDED(DXGIGetDebugInterface1(0, IID_PPV_ARGS(&dxgiDebug)))) {
+            dxgiDebug->ReportLiveObjects(DXGI_DEBUG_ALL,
+                                         DXGI_DEBUG_RLO_FLAGS(DXGI_DEBUG_RLO_SUMMARY | DXGI_DEBUG_RLO_IGNORE_INTERNAL));
+        }
+    }
+#endif
+    // Rebuild everything in the same order used for initial setup.
+    CreateDeviceResources();
+    CreateWindowSizeDependentResources();
+
+    if (m_deviceNotify) {
+        m_deviceNotify->OnDeviceRestored();
+    }
+}
+
+// Resets the current frame's allocator and the command list, then transitions the back buffer if the states differ.
+void DeviceResources::Prepare(D3D12_RESOURCE_STATES beforeState, D3D12_RESOURCE_STATES afterState) {
+    // Reset command list and allocator.
+    ThrowIfFailed(m_commandAllocators[m_backBufferIndex]->Reset());
+    ThrowIfFailed(m_commandList->Reset(m_commandAllocators[m_backBufferIndex].Get(), nullptr));
+
+    if (beforeState != afterState) { // equal states: no barrier is recorded
+        // Transition the render target into the correct state to allow for drawing into it.
+        const D3D12_RESOURCE_BARRIER barrier =
+            CD3DX12_RESOURCE_BARRIER::Transition(m_renderTargets[m_backBufferIndex].Get(), beforeState, afterState);
+        m_commandList->ResourceBarrier(1, &barrier);
+    }
+}
+
+// Closes and submits the command list, presents the swap chain, then advances the frame or handles a lost device.
+void DeviceResources::Present(D3D12_RESOURCE_STATES beforeState) {
+    if (beforeState != D3D12_RESOURCE_STATE_PRESENT) {
+        // Transition the render target to the state that allows it to be presented to the display.
+        const D3D12_RESOURCE_BARRIER barrier = CD3DX12_RESOURCE_BARRIER::Transition(
+            m_renderTargets[m_backBufferIndex].Get(), beforeState, D3D12_RESOURCE_STATE_PRESENT);
+        m_commandList->ResourceBarrier(1, &barrier);
+    }
+
+    // Send the command list off to the GPU for processing.
+    ThrowIfFailed(m_commandList->Close());
+    m_commandQueue->ExecuteCommandLists(1, CommandListCast(m_commandList.GetAddressOf()));
+
+    HRESULT hr;
+    if (m_options & c_AllowTearing) {
+        // Recommended to always use tearing if supported when using a sync interval of 0.
+        // Note this will fail if in true 'fullscreen' mode.
+        hr = m_swapChain->Present(0, DXGI_PRESENT_ALLOW_TEARING);
+    } else {
+        // The first argument instructs DXGI to block until VSync, putting the application
+        // to sleep until the next VSync. This ensures we don't waste any cycles rendering
+        // frames that will never be displayed to the screen.
+        hr = m_swapChain->Present(1, 0);
+    }
+
+    // If the device was reset we must completely reinitialize the renderer.
+    if (hr == DXGI_ERROR_DEVICE_REMOVED || hr == DXGI_ERROR_DEVICE_RESET) {
+#ifdef _DEBUG
+        char buff[64] = {};
+        sprintf_s(
+            buff, "Device Lost on Present: Reason code 0x%08X\n",
+            static_cast<unsigned int>((hr == DXGI_ERROR_DEVICE_REMOVED) ? m_d3dDevice->GetDeviceRemovedReason() : hr));
+        OutputDebugStringA(buff);
+#endif
+        HandleDeviceLost();
+    } else {
+        ThrowIfFailed(hr);
+        // Present succeeded: signal the fence for this frame and move to the next back buffer.
+        MoveToNextFrame();
+        // A non-current factory triggers a color-space refresh.
+        if (!m_dxgiFactory->IsCurrent()) {
+            UpdateColorSpace();
+        }
+    }
+}
+
+// Signals the fence and waits on it. Does nothing if the queue, fence or event is missing; failed calls are skipped.
+void DeviceResources::WaitForGpu() noexcept {
+    if (m_commandQueue && m_fence && m_fenceEvent.IsValid()) {
+        // Schedule a Signal command in the GPU queue.
+        const UINT64 fenceValue = m_fenceValues[m_backBufferIndex];
+        if (SUCCEEDED(m_commandQueue->Signal(m_fence.Get(), fenceValue))) {
+            // Wait until the Signal has been processed.
+            if (SUCCEEDED(m_fence->SetEventOnCompletion(fenceValue, m_fenceEvent.Get()))) {
+                std::ignore = WaitForSingleObjectEx(m_fenceEvent.Get(), INFINITE, FALSE); // wait result is discarded
+
+                // Increment the fence value for the current frame.
+                m_fenceValues[m_backBufferIndex]++;
+            }
+        }
+    }
+}
+
+// Signals the fence for the finished frame, switches to the next back buffer and waits if its fence value is pending.
+void DeviceResources::MoveToNextFrame() {
+    // Schedule a Signal command in the queue.
+    const UINT64 currentFenceValue = m_fenceValues[m_backBufferIndex];
+    ThrowIfFailed(m_commandQueue->Signal(m_fence.Get(), currentFenceValue));
+
+    // Update the back buffer index.
+    m_backBufferIndex = m_swapChain->GetCurrentBackBufferIndex();
+
+    // If the next frame is not ready to be rendered yet, wait until it is ready.
+    if (m_fence->GetCompletedValue() < m_fenceValues[m_backBufferIndex]) {
+        ThrowIfFailed(m_fence->SetEventOnCompletion(m_fenceValues[m_backBufferIndex], m_fenceEvent.Get()));
+        std::ignore = WaitForSingleObjectEx(m_fenceEvent.Get(), INFINITE, FALSE); // wait result is discarded
+    }
+
+    // Set the fence value for the next frame.
+    m_fenceValues[m_backBufferIndex] = currentFenceValue + 1;
+}
+
+// Returns in *ppAdapter the first non-software adapter (high-performance order) that passes the D3D12 device check.
+// WARP is tried only when NDEBUG is not defined; throws std::runtime_error if no adapter is found.
+void DeviceResources::GetAdapter(IDXGIAdapter1 **ppAdapter) {
+    *ppAdapter = nullptr;
+
+    ComPtr<IDXGIAdapter1> adapter;
+    for (UINT adapterIndex = 0; SUCCEEDED(m_dxgiFactory->EnumAdapterByGpuPreference(
+             adapterIndex, DXGI_GPU_PREFERENCE_HIGH_PERFORMANCE, IID_PPV_ARGS(adapter.ReleaseAndGetAddressOf())));
+         adapterIndex++) {
+        DXGI_ADAPTER_DESC1 desc;
+        ThrowIfFailed(adapter->GetDesc1(&desc));
+
+        if (desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE) {
+            // Don't select the Basic Render Driver adapter.
+            continue;
+        }
+
+        // Check to see if the adapter supports Direct3D 12, but don't create the actual device yet.
+        if (SUCCEEDED(D3D12CreateDevice(adapter.Get(), m_d3dMinFeatureLevel, __uuidof(ID3D12Device), nullptr))) {
+#ifdef _DEBUG
+            wchar_t buff[256] = {};
+            swprintf_s(buff, L"Direct3D Adapter (%u): VID:%04X, PID:%04X - %ls\n", adapterIndex, desc.VendorId,
+                       desc.DeviceId, desc.Description);
+            OutputDebugStringW(buff);
+#endif
+            break;
+        }
+    }
+
+#if !defined(NDEBUG)
+    if (!adapter) {
+        // Try WARP12 instead
+        if (FAILED(m_dxgiFactory->EnumWarpAdapter(IID_PPV_ARGS(adapter.ReleaseAndGetAddressOf())))) {
+            throw std::runtime_error("WARP12 not available. Enable the 'Graphics Tools' optional feature");
+        }
+
+        OutputDebugStringA("Direct3D Adapter - WARP12\n");
+    }
+#endif
+
+    if (!adapter) {
+        throw std::runtime_error("No Direct3D 12 device found");
+    }
+
+    *ppAdapter = adapter.Detach(); // Detach() hands the reference to the caller instead of releasing it here.
+}
+
+// Sets the color space for the swap chain in order to handle HDR output.
+//
+// Steps, as implemented below:
+//   1. Recreate the DXGI factory when IsCurrent() reports it stale.
+//   2. When a swap chain exists, find the output whose desktop rectangle overlaps the window the most and check
+//      whether that output reports DXGI_COLOR_SPACE_RGB_FULL_G2084_NONE_P2020 (HDR10).
+//   3. Start from DXGI_COLOR_SPACE_RGB_FULL_G22_NONE_P709 and switch to an HDR color space only when the
+//      c_EnableHDR option is set, the display is HDR10 and the back buffer format is one of the two handled below.
+//   4. Store the result in m_colorSpace and apply it to the swap chain when the swap chain reports that the color
+//      space is supported for presentation.
+//
+// Does nothing when m_dxgiFactory is null. Throws std::system_error when GetWindowRect fails; failures of the
+// checked DXGI calls are raised through ThrowIfFailed.
+void DeviceResources::UpdateColorSpace() {
+    if (!m_dxgiFactory)
+        return;
+
+    if (!m_dxgiFactory->IsCurrent()) {
+        // Output information is cached on the DXGI Factory. If it is stale we need to create a new factory.
+        ThrowIfFailed(CreateDXGIFactory2(m_dxgiFactoryFlags, IID_PPV_ARGS(m_dxgiFactory.ReleaseAndGetAddressOf())));
+    }
+
+    // Fallback used whenever HDR is not enabled, not available, or the back buffer format is not handled.
+    DXGI_COLOR_SPACE_TYPE colorSpace = DXGI_COLOR_SPACE_RGB_FULL_G22_NONE_P709;
+
+    bool isDisplayHDR10 = false;
+
+    if (m_swapChain) {
+        // To detect HDR support, we will need to check the color space in the primary
+        // DXGI output associated with the app at this point in time
+        // (using window/display intersection).
+
+        // Get the rectangle bounds of the app window.
+        RECT windowBounds;
+        if (!GetWindowRect(m_window, &windowBounds))
+            throw std::system_error(std::error_code(static_cast<int>(GetLastError()), std::system_category()),
+                                    "GetWindowRect");
+
+        const long ax1 = windowBounds.left;
+        const long ay1 = windowBounds.top;
+        const long ax2 = windowBounds.right;
+        const long ay2 = windowBounds.bottom;
+
+        ComPtr<IDXGIOutput> bestOutput;
+        // Starts below zero, presumably so that an output with zero overlap can still be selected;
+        // ComputeIntersectionArea is defined outside this view - TODO confirm it never returns a negative area.
+        long bestIntersectArea = -1;
+
+        // Walk every output of every adapter and remember the one with the largest overlap.
+        ComPtr<IDXGIAdapter> adapter;
+        for (UINT adapterIndex = 0;
+             SUCCEEDED(m_dxgiFactory->EnumAdapters(adapterIndex, adapter.ReleaseAndGetAddressOf())); ++adapterIndex) {
+            ComPtr<IDXGIOutput> output;
+            for (UINT outputIndex = 0; SUCCEEDED(adapter->EnumOutputs(outputIndex, output.ReleaseAndGetAddressOf()));
+                 ++outputIndex) {
+                // Get the rectangle bounds of current output.
+                DXGI_OUTPUT_DESC desc;
+                ThrowIfFailed(output->GetDesc(&desc));
+                const auto &r = desc.DesktopCoordinates;
+
+                // Compute the intersection
+                const long intersectArea =
+                    ComputeIntersectionArea(ax1, ay1, ax2, ay2, r.left, r.top, r.right, r.bottom);
+                if (intersectArea > bestIntersectArea) {
+                    // Swap keeps the winning output alive; the previous best moves into `output` and is
+                    // released by the next ReleaseAndGetAddressOf call or when `output` goes out of scope.
+                    bestOutput.Swap(output);
+                    bestIntersectArea = intersectArea;
+                }
+            }
+        }
+
+        if (bestOutput) {
+            // The color space is only exposed through IDXGIOutput6; if the query fails the display is
+            // treated as non-HDR.
+            ComPtr<IDXGIOutput6> output6;
+            if (SUCCEEDED(bestOutput.As(&output6))) {
+                DXGI_OUTPUT_DESC1 desc;
+                ThrowIfFailed(output6->GetDesc1(&desc));
+
+                if (desc.ColorSpace == DXGI_COLOR_SPACE_RGB_FULL_G2084_NONE_P2020) {
+                    // Display output is HDR10.
+                    isDisplayHDR10 = true;
+                }
+            }
+        }
+    }
+
+    if ((m_options & c_EnableHDR) && isDisplayHDR10) {
+        switch (m_backBufferFormat) {
+        case DXGI_FORMAT_R10G10B10A2_UNORM:
+            // The application creates the HDR10 signal.
+            colorSpace = DXGI_COLOR_SPACE_RGB_FULL_G2084_NONE_P2020;
+            break;
+
+        case DXGI_FORMAT_R16G16B16A16_FLOAT:
+            // The system creates the HDR10 signal; application uses linear values.
+            colorSpace = DXGI_COLOR_SPACE_RGB_FULL_G10_NONE_P709;
+            break;
+
+        default:
+            // Any other back buffer format keeps the fallback color space.
+            break;
+        }
+    }
+
+    // Recorded even if the swap chain below does not accept the color space.
+    m_colorSpace = colorSpace;
+
+    UINT colorSpaceSupport = 0;
+    if (m_swapChain && SUCCEEDED(m_swapChain->CheckColorSpaceSupport(colorSpace, &colorSpaceSupport)) &&
+        (colorSpaceSupport & DXGI_SWAP_CHAIN_COLOR_SPACE_SUPPORT_FLAG_PRESENT)) {
+        ThrowIfFailed(m_swapChain->SetColorSpace1(colorSpace));
+    }
+}
diff --git a/superbench/benchmarks/micro_benchmarks/directx_third_party/DeviceResources.h b/superbench/benchmarks/micro_benchmarks/directx_third_party/DeviceResources.h
new file mode 100644
index 000000000..792e533e8
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_third_party/DeviceResources.h
@@ -0,0 +1,138 @@
+//
+// DeviceResources.h - A wrapper for the Direct3D 12 device and swapchain
+//
+
+#pragma once
+
+#include <system_error>
+#include <tuple>
+
+#include "pch.h"
+
+namespace DX {
+// Provides an interface for an application that owns DeviceResources to be notified of the device being lost or
+// created.
+// An implementation is attached with DeviceResources::RegisterDeviceNotify, which stores the raw pointer without
+// taking ownership.
+interface IDeviceNotify {
+    // Called when the device has been lost (the call site is outside this header - presumably
+    // DeviceResources::HandleDeviceLost; TODO confirm).
+    virtual void OnDeviceLost() = 0;
+    // Called once the device is available again (call site outside this header - TODO confirm).
+    virtual void OnDeviceRestored() = 0;
+
+  protected:
+    // Protected, non-virtual destructor: implementations cannot be destroyed through an IDeviceNotify pointer.
+    ~IDeviceNotify() = default;
+};
+
+// Controls all the DirectX device resources.
+//
+// Holds the Direct3D 12 device, the command queue / list / allocators, the DXGI factory and swap chain, the
+// per-back-buffer render targets, the depth stencil, the presentation fence and the RTV/DSV descriptor heaps
+// (see the members below). The type is move-only: copy operations are deleted, move operations are defaulted.
+//
+// NOTE: no access specifier follows the initial `public:`, so MoveToNextFrame, GetAdapter and every data member
+// are publicly accessible.
+class DeviceResources {
+  public:
+    // Bit flags for the `flags` constructor argument (presumably what ends up in m_options - the constructor
+    // body is outside this header). c_EnableHDR is the flag UpdateColorSpace tests before choosing an HDR
+    // color space.
+    static constexpr unsigned int c_AllowTearing = 0x1;
+    static constexpr unsigned int c_EnableHDR = 0x2;
+    static constexpr unsigned int c_ReverseDepth = 0x4;
+
+    // Defaults: BGRA8 back buffer, D32 float depth buffer, 2 back buffers, feature level 11.0, no option flags.
+    // Declared noexcept(false): construction may throw.
+    DeviceResources(DXGI_FORMAT backBufferFormat = DXGI_FORMAT_B8G8R8A8_UNORM,
+                    DXGI_FORMAT depthBufferFormat = DXGI_FORMAT_D32_FLOAT, UINT backBufferCount = 2,
+                    D3D_FEATURE_LEVEL minFeatureLevel = D3D_FEATURE_LEVEL_11_0, unsigned int flags = 0) noexcept(false);
+    ~DeviceResources();
+
+    DeviceResources(DeviceResources &&) = default;
+    DeviceResources &operator=(DeviceResources &&) = default;
+
+    DeviceResources(DeviceResources const &) = delete;
+    DeviceResources &operator=(DeviceResources const &) = delete;
+
+    // Lifecycle and per-frame operations; the definitions live in DeviceResources.cpp.
+    void CreateDeviceResources();
+    void CreateWindowSizeDependentResources();
+    void SetWindow(HWND window, int width, int height) noexcept;
+    bool WindowSizeChanged(int width, int height);
+    void HandleDeviceLost();
+    // Stores the observer pointer as-is; no ownership is taken.
+    void RegisterDeviceNotify(IDeviceNotify *deviceNotify) noexcept { m_deviceNotify = deviceNotify; }
+    void Prepare(D3D12_RESOURCE_STATES beforeState = D3D12_RESOURCE_STATE_PRESENT,
+                 D3D12_RESOURCE_STATES afterState = D3D12_RESOURCE_STATE_RENDER_TARGET);
+    void Present(D3D12_RESOURCE_STATES beforeState = D3D12_RESOURCE_STATE_RENDER_TARGET);
+    // Blocks until the fence signaled on the command queue has been reached; never throws.
+    void WaitForGpu() noexcept;
+    // Re-evaluates HDR support of the display showing the window and updates m_colorSpace / the swap chain.
+    void UpdateColorSpace();
+
+    // Device Accessors.
+    RECT GetOutputSize() const noexcept { return m_outputSize; }
+
+    // Direct3D Accessors.
+    // All pointer accessors return non-owning raw pointers obtained with ComPtr::Get().
+    auto GetD3DDevice() const noexcept { return m_d3dDevice.Get(); }
+    auto GetSwapChain() const noexcept { return m_swapChain.Get(); }
+    auto GetDXGIFactory() const noexcept { return m_dxgiFactory.Get(); }
+    HWND GetWindow() const noexcept { return m_window; }
+    D3D_FEATURE_LEVEL GetDeviceFeatureLevel() const noexcept { return m_d3dFeatureLevel; }
+    // Render target / command allocator belonging to the current back buffer index.
+    ID3D12Resource *GetRenderTarget() const noexcept { return m_renderTargets[m_backBufferIndex].Get(); }
+    ID3D12Resource *GetDepthStencil() const noexcept { return m_depthStencil.Get(); }
+    ID3D12CommandQueue *GetCommandQueue() const noexcept { return m_commandQueue.Get(); }
+    ID3D12CommandAllocator *GetCommandAllocator() const noexcept {
+        return m_commandAllocators[m_backBufferIndex].Get();
+    }
+    auto GetCommandList() const noexcept { return m_commandList.Get(); }
+    DXGI_FORMAT GetBackBufferFormat() const noexcept { return m_backBufferFormat; }
+    DXGI_FORMAT GetDepthBufferFormat() const noexcept { return m_depthBufferFormat; }
+    D3D12_VIEWPORT GetScreenViewport() const noexcept { return m_screenViewport; }
+    D3D12_RECT GetScissorRect() const noexcept { return m_scissorRect; }
+    UINT GetCurrentFrameIndex() const noexcept { return m_backBufferIndex; }
+    UINT GetBackBufferCount() const noexcept { return m_backBufferCount; }
+    DXGI_COLOR_SPACE_TYPE GetColorSpace() const noexcept { return m_colorSpace; }
+    unsigned int GetDeviceOptions() const noexcept { return m_options; }
+
+    // RTV handle of the current back buffer: the heap start offset by m_backBufferIndex descriptors of
+    // m_rtvDescriptorSize each.
+    CD3DX12_CPU_DESCRIPTOR_HANDLE GetRenderTargetView() const noexcept {
+        return CD3DX12_CPU_DESCRIPTOR_HANDLE(m_rtvDescriptorHeap->GetCPUDescriptorHandleForHeapStart(),
+                                             static_cast<INT>(m_backBufferIndex), m_rtvDescriptorSize);
+    }
+    // DSV handle: the first descriptor of the DSV heap.
+    CD3DX12_CPU_DESCRIPTOR_HANDLE GetDepthStencilView() const noexcept {
+        return CD3DX12_CPU_DESCRIPTOR_HANDLE(m_dsvDescriptorHeap->GetCPUDescriptorHandleForHeapStart());
+    }
+
+    // Signals the fence for the finished frame and waits until the next back buffer can be reused.
+    void MoveToNextFrame();
+    // Picks the adapter to create the device on; throws when none is found.
+    void GetAdapter(IDXGIAdapter1 **ppAdapter);
+
+    // Capacity of the per-back-buffer arrays below (command allocators, render targets, fence values).
+    static constexpr size_t MAX_BACK_BUFFER_COUNT = 3;
+
+    // Index of the back buffer currently in use; refreshed from the swap chain in MoveToNextFrame.
+    UINT m_backBufferIndex;
+
+    // Direct3D objects.
+    Microsoft::WRL::ComPtr<ID3D12Device> m_d3dDevice;
+    Microsoft::WRL::ComPtr<ID3D12GraphicsCommandList> m_commandList;
+    Microsoft::WRL::ComPtr<ID3D12CommandQueue> m_commandQueue;
+    Microsoft::WRL::ComPtr<ID3D12CommandAllocator> m_commandAllocators[MAX_BACK_BUFFER_COUNT];
+
+    // Swap chain objects.
+    Microsoft::WRL::ComPtr<IDXGIFactory6> m_dxgiFactory;
+    Microsoft::WRL::ComPtr<IDXGISwapChain3> m_swapChain;
+    Microsoft::WRL::ComPtr<ID3D12Resource> m_renderTargets[MAX_BACK_BUFFER_COUNT];
+    Microsoft::WRL::ComPtr<ID3D12Resource> m_depthStencil;
+
+    // Presentation fence objects.
+    Microsoft::WRL::ComPtr<ID3D12Fence> m_fence;
+    // One fence value per back buffer, indexed by m_backBufferIndex.
+    UINT64 m_fenceValues[MAX_BACK_BUFFER_COUNT];
+    // Event armed with SetEventOnCompletion and waited on by WaitForGpu / MoveToNextFrame.
+    Microsoft::WRL::Wrappers::Event m_fenceEvent;
+
+    // Direct3D rendering objects.
+    Microsoft::WRL::ComPtr<ID3D12DescriptorHeap> m_rtvDescriptorHeap;
+    Microsoft::WRL::ComPtr<ID3D12DescriptorHeap> m_dsvDescriptorHeap;
+    UINT m_rtvDescriptorSize;
+    D3D12_VIEWPORT m_screenViewport;
+    D3D12_RECT m_scissorRect;
+
+    // Direct3D properties.
+    DXGI_FORMAT m_backBufferFormat;
+    DXGI_FORMAT m_depthBufferFormat;
+    UINT m_backBufferCount;
+    // Feature level used for the D3D12CreateDevice probe in GetAdapter.
+    D3D_FEATURE_LEVEL m_d3dMinFeatureLevel;
+
+    // Cached device properties.
+    HWND m_window;
+    D3D_FEATURE_LEVEL m_d3dFeatureLevel;
+    // Flags passed to CreateDXGIFactory2 when UpdateColorSpace has to recreate the factory.
+    DWORD m_dxgiFactoryFlags;
+    RECT m_outputSize;
+
+    // HDR Support
+    DXGI_COLOR_SPACE_TYPE m_colorSpace;
+
+    // DeviceResources options (see flags above)
+    unsigned int m_options;
+
+    // The IDeviceNotify can be held directly as it owns the DeviceResources.
+    IDeviceNotify *m_deviceNotify;
+};
+} // namespace DX
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/directx_third_party/pch.h b/superbench/benchmarks/micro_benchmarks/directx_third_party/pch.h
new file mode 100644
index 000000000..c51edb428
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_third_party/pch.h
@@ -0,0 +1,97 @@
+//--------------------------------------------------------------------------------------
+// pch.h
+//
+// Header for standard system include files.
+//
+// Advanced Technology Group (ATG)
+// Copyright (C) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#pragma once
+
+#include <WinSDKVer.h>
+#define _WIN32_WINNT 0x0A00
+#include <SDKDDKVer.h>
+
+// Use the C++ standard templated min/max
+#define NOMINMAX
+
+// DirectX apps don't need GDI
+#define NODRAWTEXT
+#define NOGDI
+#define NOBITMAP
+
+// Include <mcx.h> if you need this
+#define NOMCX
+
+// Include <winsvc.h> if you need this
+#define NOSERVICE
+
+// WinHelp is deprecated
+#define NOHELP
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+#include <wrl/client.h>
+#include <wrl/event.h>
+
+#include <d3d12.h>
+
+#if defined(NTDDI_WIN10_RS2)
+#include <dxgi1_6.h>
+#else
+#include <dxgi1_5.h>
+#endif
+
+#include <DirectXColors.h>
+#include <DirectXMath.h>
+
+#include "d3dx12.h"
+
+#include <algorithm>
+#include <exception>
+#include <memory>
+#include <stdexcept>
+
+#ifdef _DEBUG
+#include <dxgidebug.h>
+#endif
+
+#include <stdio.h>
+
+// To use graphics and CPU markup events with the latest version of PIX, change this to include <pix3.h>
+// then add the NuGet package WinPixEventRuntime to the project.
+#include <pix.h>
+
+#include <D3Dcompiler.h>
+#include <DirectXMath.h>
+
+#pragma comment(lib, "D3Dcompiler.lib")
+#pragma comment(lib, "d3d12.lib")
+#pragma comment(lib, "dxgi.lib")
+#pragma comment(lib, "dxguid.lib")
+
+namespace DX {
+// Helper class for COM exceptions.
+//
+// Wraps a failed HRESULT and exposes it as text through what(). The message is formatted once, at construction
+// time, into a buffer owned by the exception object. The pointer returned by what() therefore stays valid for the
+// lifetime of that object and is never overwritten by another exception or another thread (the previous
+// implementation formatted into a single function-local static buffer shared by all instances).
+class com_exception : public std::exception {
+  public:
+    // hr: the failing HRESULT to report.
+    com_exception(HRESULT hr) noexcept : result(hr) {
+        // 24 characters of fixed text plus 8 hex digits always fit into the 64-byte buffer.
+        sprintf_s(message, "Failure with HRESULT of %08X", static_cast<unsigned int>(result));
+    }
+
+    // Returns "Failure with HRESULT of XXXXXXXX", where XXXXXXXX is the wrapped HRESULT in hexadecimal.
+    const char *what() const override { return message; }
+
+  private:
+    HRESULT result;
+    // Per-object message storage; declared after `result` because the constructor body formats from it.
+    char message[64] = {};
+};
+
+// Turns a failed HRESULT into a C++ exception.
+// hr: result code of a D3D/DXGI/COM call. Success codes return normally; failure codes throw
+//     DX::com_exception carrying hr.
+inline void ThrowIfFailed(HRESULT hr) {
+    if (SUCCEEDED(hr)) {
+        return;
+    }
+    throw com_exception(hr);
+}
+} // namespace DX

From 27a10811afb4f2f9c5404d02b1056391f14f4b1a Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Tue, 22 Aug 2023 18:56:33 +0800
Subject: [PATCH 28/33] Benchmarks:  micro benchmark - source code for
 evaluating NVDEC decoding performance (#560)

**Description**
source code for evaluating NVDEC decoding performance.

---------

Co-authored-by: yukirora <yuting.jiang@microsoft.com>
---
 .azure-pipelines/cuda-unit-test.yml           |    6 +-
 .github/workflows/codeql-analysis.yml         |    4 +
 .gitignore                                    |    3 -
 dockerfile/cuda11.1.1.dockerfile              |    5 +
 dockerfile/cuda12.1.dockerfile                |    5 +
 .../cuda_decode_performance/AppDecPerf.cpp    |  454 +++++++
 .../cuda_decode_performance/CMakeLists.txt    |  117 ++
 .../OptimizedNvDecoder.cpp                    |  263 ++++
 .../OptimizedNvDecoder.h                      |   52 +
 .../cuda_decode_performance/ThreadPoolUtils.h |   99 ++
 .../Video_Codec_SDK/Interface/cuviddec.h      | 1173 +++++++++++++++++
 .../Video_Codec_SDK/Interface/nvcuvid.h       |  486 +++++++
 .../Lib/linux/stubs/x86_64/libnvcuvid.so      |  Bin 0 -> 3528 bytes
 .../Samples/NvCodec/NvDecoder/NvDecoder.cpp   |  709 ++++++++++
 .../Samples/NvCodec/NvDecoder/NvDecoder.h     |  528 ++++++++
 .../Samples/Utils/FFmpegDemuxer.h             |  379 ++++++
 .../Samples/Utils/FFmpegStreamer.h            |  148 +++
 .../Video_Codec_SDK/Samples/Utils/Logger.h    |  235 ++++
 .../Samples/Utils/NvCodecUtils.h              |  547 ++++++++
 19 files changed, 5208 insertions(+), 5 deletions(-)
 create mode 100644 superbench/benchmarks/micro_benchmarks/cuda_decode_performance/AppDecPerf.cpp
 create mode 100644 superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt
 create mode 100644 superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.cpp
 create mode 100644 superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.h
 create mode 100644 superbench/benchmarks/micro_benchmarks/cuda_decode_performance/ThreadPoolUtils.h
 create mode 100644 third_party/Video_Codec_SDK/Interface/cuviddec.h
 create mode 100644 third_party/Video_Codec_SDK/Interface/nvcuvid.h
 create mode 100644 third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/libnvcuvid.so
 create mode 100644 third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.cpp
 create mode 100644 third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.h
 create mode 100644 third_party/Video_Codec_SDK/Samples/Utils/FFmpegDemuxer.h
 create mode 100644 third_party/Video_Codec_SDK/Samples/Utils/FFmpegStreamer.h
 create mode 100644 third_party/Video_Codec_SDK/Samples/Utils/Logger.h
 create mode 100644 third_party/Video_Codec_SDK/Samples/Utils/NvCodecUtils.h

diff --git a/.azure-pipelines/cuda-unit-test.yml b/.azure-pipelines/cuda-unit-test.yml
index 3afcd49fd..2d953d659 100644
--- a/.azure-pipelines/cuda-unit-test.yml
+++ b/.azure-pipelines/cuda-unit-test.yml
@@ -11,7 +11,7 @@ pool:
 
 container:
   image: nvcr.io/nvidia/pytorch:20.12-py3
-  options: '-v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker'
+  options: '-v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker -v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/:/usr/lib/sudo/'
 
 steps:
   - script: |
@@ -21,6 +21,8 @@ steps:
       python3 -m pip install --upgrade pip setuptools==65.7
       python3 -m pip install .[test,nvworker]
       make postinstall
+      sudo DEBIAN_FRONTEND=noninteractive apt-get update
+      sudo DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev
     displayName: Install dependencies
   - script: |
       python3 setup.py lint
@@ -31,7 +33,7 @@ steps:
   - script: |
       SB_MICRO_PATH=$PWD python3 setup.py test
     displayName: Run unit tests
-    timeoutInMinutes: 15
+    timeoutInMinutes: 30
   - script: |
       bash <(curl -s https://codecov.io/bash) -cF cuda-unit-test
     displayName: Report coverage results
diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index ef9f652b7..e53acebf6 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -49,6 +49,10 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v3
+      - name: Install Dependency
+        run: |
+          DEBIAN_FRONTEND=noninteractive apt-get update
+          DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo
       - name: Initialize CodeQL
         uses: github/codeql-action/init@v2
         with:
diff --git a/.gitignore b/.gitignore
index e1ab18ca4..5888455a8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,9 +9,6 @@ __pycache__/
 *.py[cod]
 *$py.class
 
-# C extensions
-*.so
-
 # Distribution / packaging
 .Python
 build/
diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile
index 8b92c5463..d7feb2baa 100644
--- a/dockerfile/cuda11.1.1.dockerfile
+++ b/dockerfile/cuda11.1.1.dockerfile
@@ -26,13 +26,18 @@ RUN apt-get update && \
     build-essential \
     curl \
     dmidecode \
+    ffmpeg \
     git \
     iproute2 \
     jq \
     libaio-dev \
+    libavcodec-dev \
+    libavformat-dev \
+    libavutil-dev \
     libcap2 \
     libnuma-dev \
     libpci-dev \
+    libswresample-dev \
     libtinfo5 \
     libtool \
     lshw \
diff --git a/dockerfile/cuda12.1.dockerfile b/dockerfile/cuda12.1.dockerfile
index 4a257bf43..2f9e430fa 100644
--- a/dockerfile/cuda12.1.dockerfile
+++ b/dockerfile/cuda12.1.dockerfile
@@ -25,14 +25,19 @@ RUN apt-get update && \
     build-essential \
     curl \
     dmidecode \
+    ffmpeg \
     git \
     iproute2 \
     jq \
     libaio-dev \
+    libavcodec-dev \
+    libavformat-dev \
+    libavutil-dev \
     libboost-program-options-dev \
     libcap2 \
     libnuma-dev \
     libpci-dev \
+    libswresample-dev \
     libtinfo5 \
     libtool \
     lshw \
diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/AppDecPerf.cpp b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/AppDecPerf.cpp
new file mode 100644
index 000000000..1ae5ae121
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/AppDecPerf.cpp
@@ -0,0 +1,454 @@
+// Copyright(c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include <algorithm>
+#include <chrono>
+#include <cuda.h>
+#include <cudaProfiler.h>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <numeric>
+#include <stdio.h>
+#include <string.h>
+#include <string>
+#include <thread>
+
+#include "../Utils/FFmpegDemuxer.h"
+#include "../Utils/NvCodecUtils.h"
+#include "OptimizedNvDecoder.h"
+#include "ThreadPoolUtils.h"
+
+// Global console logger required by the third-party sample utilities (the LOG macro used in this file);
+// presumably they reference a symbol with exactly this name - TODO confirm against Logger.h.
+simplelogger::Logger *logger = simplelogger::LoggerFactory::CreateConsoleLogger();
+
+// Maps the codec names accepted by the -codec option to the matching cudaVideoCodec value.
+// Keys are lower-case; ParseCommandLine lower-cases the user's input before looking it up here.
+std::map<std::string, cudaVideoCodec_enum> codecMap = {
+    {"mpeg1", cudaVideoCodec_MPEG1},       {"mpeg2", cudaVideoCodec_MPEG2},       {"mpeg4", cudaVideoCodec_MPEG4},
+    {"vc1", cudaVideoCodec_VC1},           {"h264", cudaVideoCodec_H264},         {"jpeg", cudaVideoCodec_JPEG},
+    {"h264_svc", cudaVideoCodec_H264_SVC}, {"h264_mvc", cudaVideoCodec_H264_MVC}, {"hevc", cudaVideoCodec_HEVC},
+    {"vp8", cudaVideoCodec_VP8},           {"vp9", cudaVideoCodec_VP9},           {"av1", cudaVideoCodec_AV1}};
+
+/**
+ *   @brief  Demux one video file and feed every packet to the given decoder
+ *   @param  pDec         - Handle to the OptimizedNvDecoder that decodes the packets
+ *   @param  szInFilePath - Path of the video file; a FFmpegDemuxer is created for it inside this function
+ *   @param  pnFrame      - Receives the number of frames decoded; only written when the whole file was
+ *                          processed without an exception
+ *   @param  ex           - Receives the current exception when a std::exception is caught
+ */
+void DecProc(OptimizedNvDecoder *pDec, const char *szInFilePath, int *pnFrame, std::exception_ptr &ex) {
+    try {
+        std::unique_ptr<FFmpegDemuxer> pDemuxer(new FFmpegDemuxer(szInFilePath));
+        uint8_t *pPacket = NULL;
+        int nPacketBytes = 0;
+        int nDecodedTotal = 0;
+        for (;;) {
+            // Fetch the next packet from the file.
+            pDemuxer->Demux(&pPacket, &nPacketBytes);
+            // Hand it to the decoder; the zero-length packet that ends the loop is passed on as well.
+            const int nDecoded = pDec->Decode(pPacket, nPacketBytes);
+            // Log the stream description once, when the first frames come back.
+            if (nDecodedTotal == 0 && nDecoded != 0) {
+                LOG(INFO) << pDec->GetVideoInfo();
+            }
+            nDecodedTotal += nDecoded;
+            if (nPacketBytes == 0) {
+                break;
+            }
+        }
+        *pnFrame = nDecodedTotal;
+    } catch (std::exception &) {
+        ex = std::current_exception();
+    }
+}
+
+/**
+ *   @brief  Print the option summary and exit, or report a bad option
+ *   @param  szBadOption - NULL to print the help text to stdout and terminate with exit(0); otherwise the
+ *                         offending option, in which case std::invalid_argument is thrown with
+ *                         "Error parsing ..." followed by the help text as its message
+ */
+void ShowHelpAndExit(const char *szBadOption = NULL) {
+    const bool bBadOption = (szBadOption != NULL);
+    std::ostringstream help;
+    if (bBadOption) {
+        help << "Error parsing \"" << szBadOption << "\"" << std::endl;
+    }
+    help << "Options:" << std::endl
+         << "-i           Input file path. No default value. One of -i and -multi_input is required." << std::endl
+         << "-o           Output file path of raw data. No default value. Optional." << std::endl
+         << "-gpu         Ordinal of GPU to use. Default 0. Optional." << std::endl
+         << "-thread      Number of decoding thread. Default 5. Optional." << std::endl
+         << "-total       Number of total video to test. Default 100. Optional." << std::endl
+         << "-single      (No value) Use single cuda context for every thread. Default is multi-context, one context "
+            "per thread."
+         << std::endl
+         << "-host        (No value) Copy frame to host memory .Default is device memory)" << std::endl
+         << "-multi_input The file path which lists the path of multiple video in each line." << std::endl
+         << "-codec       The codec of video to test. Default H264." << std::endl;
+    if (!bBadOption) {
+        std::cout << help.str();
+        exit(0);
+    }
+    throw std::invalid_argument(help.str());
+}
+
+/**
+ *   @brief  Parse the command line into the caller's option variables
+ *
+ *   Options are matched case-insensitively. Variables whose option is absent are left untouched, so the
+ *   caller supplies the defaults. "-h" prints the help and exits; an unknown option, or an option that is
+ *   missing its value, is reported through ShowHelpAndExit (which throws std::invalid_argument); an unknown
+ *   codec name prints a message and terminates with exit(1).
+ *
+ *   @param  szInputFileName    - Receives the -i value. NOTE(review): copied with an unbounded sprintf, so the
+ *                                caller's buffer must be large enough for the longest argument
+ *   @param  iGpu               - Receives the -gpu value (atoi)
+ *   @param  nThread            - Receives the -thread value (atoi)
+ *   @param  nTotalVideo        - Receives the -total value (atoi)
+ *   @param  bSingle            - Set to true by -single
+ *   @param  bHost              - Set to true by -host
+ *   @param  inputFilesListPath - Receives the -multi_input value
+ *   @param  outputFile         - Receives the -o value
+ *   @param  codec              - Receives the codecMap entry for the lower-cased -codec value
+ */
+void ParseCommandLine(int argc, char *argv[], char *szInputFileName, int &iGpu, int &nThread, int &nTotalVideo,
+                      bool &bSingle, bool &bHost, std::string &inputFilesListPath, std::string &outputFile,
+                      cudaVideoCodec &codec) {
+    int i = 1;
+    // Advances to the value that follows the current option. A missing value is reported through
+    // ShowHelpAndExit, which throws for a non-NULL option, so argv[argc] is never returned.
+    auto nextValue = [&](const char *szOption) -> char * {
+        if (++i == argc) {
+            ShowHelpAndExit(szOption);
+        }
+        return argv[i];
+    };
+
+    for (; i < argc; i++) {
+        const char *szArg = argv[i];
+        if (!_stricmp(szArg, "-h")) {
+            ShowHelpAndExit();
+        }
+
+        if (!_stricmp(szArg, "-i")) {
+            sprintf(szInputFileName, "%s", nextValue("-i"));
+        } else if (!_stricmp(szArg, "-o")) {
+            outputFile = std::string(nextValue("-o"));
+        } else if (!_stricmp(szArg, "-gpu")) {
+            iGpu = atoi(nextValue("-gpu"));
+        } else if (!_stricmp(szArg, "-thread")) {
+            nThread = atoi(nextValue("-thread"));
+        } else if (!_stricmp(szArg, "-total")) {
+            nTotalVideo = atoi(nextValue("-total"));
+        } else if (!_stricmp(szArg, "-multi_input")) {
+            inputFilesListPath = std::string(nextValue("-multi_input"));
+        } else if (!_stricmp(szArg, "-single")) {
+            bSingle = true;
+        } else if (!_stricmp(szArg, "-host")) {
+            bHost = true;
+        } else if (!_stricmp(szArg, "-codec")) {
+            std::string codecName(nextValue("-codec"));
+            // codecMap keys are lower-case.
+            std::transform(codecName.begin(), codecName.end(), codecName.begin(),
+                           [](unsigned char c) { return std::tolower(c); });
+            const auto itCodec = codecMap.find(codecName);
+            if (itCodec == codecMap.end()) {
+                std::cout << "Codec name not found in the map." << std::endl;
+                exit(1);
+            }
+            codec = itCodec->second;
+        } else {
+            // Not a known option.
+            ShowHelpAndExit(argv[i]);
+        }
+    }
+}
+
+/**
+ *  @brief  Create one decoder, giving it its own cuda context unless single-context mode is requested
+ *  @param  i          - Session id assigned to the new decoder
+ *  @param  cuDevice   - Device on which a new context is created in multi-context mode
+ *  @param  cuContext  - Context handed to the decoder; overwritten with a newly created context when bSingle
+ *                       is false
+ *  @param  bSingle    - true to reuse the incoming cuContext, false to create a fresh one
+ *  @param  bHost      - Negated and forwarded as the decoder's second constructor argument
+ *  @param  codec      - Codec forwarded to the decoder
+ *  @param  decodecaps - Decoder capabilities forwarded to the decoder
+ *  @return The heap-allocated decoder; the caller owns it
+ */
+OptimizedNvDecoder *InitOptimizedNvDecoder(int i, const CUdevice &cuDevice, CUcontext &cuContext, bool bSingle,
+                                           bool bHost, cudaVideoCodec codec, CUVIDDECODECAPS decodecaps) {
+    const bool bOwnContext = !bSingle;
+    if (bOwnContext) {
+        ck(cuCtxCreate(&cuContext, 0, cuDevice));
+    }
+
+    OptimizedNvDecoder *pDecoder = new OptimizedNvDecoder(cuContext, !bHost, codec, decodecaps);
+    pDecoder->setDecoderSessionID(i);
+    return pDecoder;
+}
+
+/**
+ *  @brief  Decode one video on the decoder at index i and measure how long it takes
+ *  @param  i            - Index into vDec of the decoder to use (not range-checked; presumably the index of
+ *                         the calling worker thread - confirm against ThreadPoolUtils.h)
+ *  @param  vDec         - Decoders created by InitializeContext
+ *  @param  szInFilePath - Path of the video file to decode
+ *  @param  pnFrame      - Receives the number of decoded frames (written by DecProc)
+ *  @param  ex           - Receives the exception caught inside DecProc, if any
+ *  @return Elapsed wall-clock time in seconds, measured with millisecond resolution; 0 when an exception
+ *          reaches this function
+ *  @note   DecProc catches std::exception itself and stores it in ex; in that case this function still
+ *          prints the summary line and returns the elapsed time.
+ */
+double DecodeVideo(size_t i, const std::vector<OptimizedNvDecoder *> &vDec, const char *szInFilePath, int *pnFrame,
+                   std::exception_ptr &ex) {
+    try {
+        OptimizedNvDecoder *pDec = vDec[i];
+        auto start = std::chrono::high_resolution_clock::now();
+        DecProc(pDec, szInFilePath, pnFrame, ex);
+        auto end = std::chrono::high_resolution_clock::now();
+        auto elapsedTime = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+        std::cout << "Decode finished --"
+                  << " duration:" << elapsedTime << " frames:" << *pnFrame << std::endl;
+        // Divide in double precision: a float divisor would round the result to about 7 significant digits
+        // before it is widened to the double return type.
+        return static_cast<double>(elapsedTime) / 1000.0;
+    } catch (const std::exception &e) {
+        std::cerr << "Exception in deocding: " << e.what() << std::endl;
+        return 0;
+    }
+}
+
+/**
+ *  @brief  Read a list of video paths, one per line, from a text file
+ *  @param  filepath - Path of the list file
+ *  @return The non-empty lines in file order; a trailing '\r' left over from CRLF line endings is removed
+ *  @note   Prints an error and terminates the process with exit(1) when the file cannot be opened
+ */
+std::vector<std::string> ReadMultipleVideoFiles(const std::string &filepath) {
+    std::ifstream file(filepath);
+    if (!file) {
+        std::cerr << "Error opening the file." << std::endl;
+        exit(1);
+    }
+    std::vector<std::string> paths;
+    std::string line;
+    while (std::getline(file, line)) {
+        // Tolerate list files written with Windows line endings.
+        if (!line.empty() && line.back() == '\r') {
+            line.pop_back();
+        }
+        // A blank line (for example a trailing one) cannot name a video file, so it is skipped.
+        if (line.empty()) {
+            continue;
+        }
+        paths.push_back(line);
+    }
+    return paths;
+}
+
+/**
+ * @brief  Query the decoder capabilities for a codec
+ * @param  decodecaps - Zeroed, filled with the query inputs below and then passed to cuvidGetDecoderCaps,
+ *                      which writes the capabilities back into it
+ * @param  codec      - Codec to query
+ * @note   The cuvidGetDecoderCaps result is checked by the NVDEC_API_CALL macro (defined outside this file).
+ */
+void GetDefaultDecoderCaps(CUVIDDECODECAPS &decodecaps, cudaVideoCodec codec) {
+    memset(&decodecaps, 0, sizeof(CUVIDDECODECAPS));
+
+    // Query inputs: bit depth minus 8 of 0, 4:2:0 chroma format, and the requested codec.
+    decodecaps.nBitDepthMinus8 = 0;
+    decodecaps.eChromaFormat = cudaVideoChromaFormat_420;
+    decodecaps.eCodecType = codec;
+
+    NVDEC_API_CALL(cuvidGetDecoderCaps(&decodecaps));
+}
+
+/**
+ * @brief  Initialize cuda, select the GPU, create a context, query the decoder capabilities and create
+ *         nThread decoders on a thread pool
+ * @param  vDec    - Receives the created decoders, appended in the order the tasks were enqueued
+ * @param  iGpu    - Ordinal of the GPU to use; an out-of-range value prints a message and exits with code 1
+ * @param  nThread - Number of pool threads and of decoders to create
+ * @param  bSingle - Forwarded to InitOptimizedNvDecoder (single shared context vs. one context per decoder)
+ * @param  bHost   - Forwarded to InitOptimizedNvDecoder
+ * @param  codec   - Codec used for the capability query and for every decoder
+ */
+void InitializeContext(std::vector<OptimizedNvDecoder *> &vDec, int iGpu, int nThread, bool bSingle, bool bHost,
+                       cudaVideoCodec codec) {
+    ck(cuInit(0));
+
+    int nGpu = 0;
+    ck(cuDeviceGetCount(&nGpu));
+    const bool bValidOrdinal = (iGpu >= 0) && (iGpu < nGpu);
+    if (!bValidOrdinal) {
+        std::cout << "GPU ordinal out of range. Should be within [" << 0 << ", " << nGpu - 1 << "]" << std::endl;
+        exit(1);
+    }
+
+    CUdevice cuDevice = 0;
+    ck(cuDeviceGet(&cuDevice, iGpu));
+    char szDeviceName[80];
+    ck(cuDeviceGetName(szDeviceName, sizeof(szDeviceName), cuDevice));
+    std::cout << "GPU in use: " << szDeviceName << std::endl;
+
+    // This context is created in both modes, before the capability query below.
+    CUcontext cuContext = NULL;
+    ck(cuCtxCreate(&cuContext, 0, cuDevice));
+
+    CUVIDDECODECAPS decodecaps;
+    GetDefaultDecoderCaps(decodecaps, codec);
+
+    // One creation task per decoder. The call passes six arguments while InitOptimizedNvDecoder takes seven;
+    // presumably ThreadPool::enqueue supplies the leading index itself - confirm in ThreadPoolUtils.h.
+    ThreadPool threadPool(nThread);
+    std::vector<std::future<OptimizedNvDecoder *>> pending;
+    for (int n = 0; n < nThread; ++n) {
+        pending.emplace_back(
+            threadPool.enqueue(InitOptimizedNvDecoder, cuDevice, cuContext, bSingle, bHost, codec, decodecaps));
+    }
+
+    // Collect the decoders in submission order; get() blocks until the task has finished.
+    for (size_t n = 0; n < pending.size(); ++n) {
+        vDec.push_back(pending[n].get());
+    }
+}
+
+/**
+ * @brief  Write the per-frame latency, per-video latency and per-video FPS to a text file
+ *
+ * The file has three sections, "Frame Latency", "Video Latency" and "Video FPS". FPS is frames[i] / data[i]
+ * for every index present in both vectors.
+ *
+ * @param  vDec     - Decoders whose GetFrameLatency() entries are written to the first section
+ * @param  nThread  - Number of decoders to report; clamped to vDec.size()
+ * @param  data     - Latency of each video
+ * @param  frames   - Number of frames of each video, parallel to data
+ * @param  filename - Path of the output file; if it cannot be opened an error is printed to stderr and
+ *                    nothing is written
+ */
+void WriteRawData(std::vector<OptimizedNvDecoder *> &vDec, int nThread, const std::vector<double> &data,
+                  std::vector<int> &frames, std::string filename) {
+    // Open the output file stream
+    std::ofstream outputFile(filename);
+    if (!outputFile) {
+        // Without this check every write below would fail silently.
+        std::cerr << "Error opening the output file: " << filename << std::endl;
+        return;
+    }
+
+    // Never index past the decoders that actually exist.
+    const size_t nDecoder = std::min(vDec.size(), static_cast<size_t>(nThread > 0 ? nThread : 0));
+    outputFile << "Frame Latency" << '\n';
+    for (size_t i = 0; i < nDecoder; i++) {
+        for (const auto &tuple : vDec[i]->GetFrameLatency()) {
+            int frame = std::get<0>(tuple);
+            double latency = std::get<1>(tuple);
+            outputFile << "Frame: " << frame << ", Latency: " << latency << '\n';
+        }
+    }
+
+    outputFile << "Video Latency" << '\n';
+    for (size_t i = 0; i < data.size(); i++) {
+        outputFile << data[i] << '\n';
+    }
+
+    // frames and data are parallel; stop at the shorter one instead of reading out of bounds.
+    const size_t nVideo = std::min(data.size(), frames.size());
+    outputFile << "Video FPS" << '\n';
+    for (size_t i = 0; i < nVideo; i++) {
+        outputFile << frames[i] / data[i] << '\n';
+    }
+
+    // The stream is flushed and closed when outputFile goes out of scope.
+}
+
+/**
+ * @brief  Calculate summary statistics of a data series
+ * @param  originData - Samples in any order; the vector is not modified
+ * @return Tuple (sum, mean, min, max, p50, p90, p95, p99). Each percentile pXX is the element at index
+ *         floor(size * XX / 100) of the sorted samples. All eight values are 0 when originData is empty.
+ */
+std::tuple<double, double, double, double, double, double, double, double>
+CalMetrics(const std::vector<double> &originData) {
+    // Nothing to summarize: without this guard the code below would divide by zero and index an empty vector.
+    if (originData.empty()) {
+        return std::make_tuple(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
+    }
+
+    // Accumulate in the caller's order so the floating-point sum matches the previous implementation.
+    const double sum = std::accumulate(originData.begin(), originData.end(), 0.0);
+    const size_t count = originData.size();
+    const double mean = sum / count;
+
+    // Work on a sorted copy: min and max are its ends, percentiles are direct index lookups.
+    std::vector<double> data = originData;
+    std::sort(data.begin(), data.end());
+    const double min = data.front();
+    const double max = data.back();
+    const double p50 = data[count / 2];
+    const double p90 = data[static_cast<size_t>(count * 0.9)];
+    const double p95 = data[static_cast<size_t>(count * 0.95)];
+    const double p99 = data[static_cast<size_t>(count * 0.99)];
+    return std::make_tuple(sum, mean, min, max, p50, p90, p95, p99);
+}
+
+/**
+ * @brief  Function to generate the total file list for the given total number of videos.
+ *        If the input list has fewer videos than the total number of videos, the list will be repeated.
+ *        If the input list has more videos than the total number of videos, the list will be truncated.
+ *        If no input list is given, the single input file is repeated for the total number of videos.
+ *
+ * @param inputFilesListPath  path passed to ReadMultipleVideoFiles; empty to use szInFilePath instead
+ * @param nTotalVideo         number of entries of the returned list; a non-positive value gives an empty list
+ * @param szInFilePath        single input video path used when inputFilesListPath is empty
+ * @return the list of video file paths to decode
+ * @throws std::runtime_error when a file list is given but it contains no video
+ */
+std::vector<std::string> GenerateTotalFileList(const std::string &inputFilesListPath, int nTotalVideo,
+                                               const char *szInFilePath) {
+    std::vector<std::string> files;
+    // A negative count would otherwise turn into an out-of-range iterator below
+    const size_t nTotal = nTotalVideo > 0 ? static_cast<size_t>(nTotalVideo) : 0;
+    files.reserve(nTotal);
+    if (inputFilesListPath.size() != 0) {
+        auto videofiles = ReadMultipleVideoFiles(inputFilesListPath);
+        const size_t nListed = videofiles.size();
+        if (nListed == 0 && nTotal > 0) {
+            // Cycling through an empty list would divide by zero
+            throw std::runtime_error("No video file found in " + inputFilesListPath);
+        }
+
+        // Cycle through the list: this repeats it when it is too short and truncates it when it is too long
+        for (size_t i = 0; i < nTotal; i++) {
+            files.push_back(*(videofiles.begin() + (i % nListed)));
+        }
+
+        std::cout << "Multifile mode - " << nTotalVideo << " videos will be decoded" << std::endl;
+    } else {
+        for (size_t i = 0; i < nTotal; i++) {
+            files.push_back(std::string(szInFilePath));
+        }
+    }
+    return files;
+}
+
+/**
+ * @brief  Function to run the decoding tasks in parallel with thread pool to decode all the videos and record the total
+ * latency and the total number of frames
+ *
+ * @param vDec            decoders; a copy of this vector is passed to every DecodeVideo task
+ * @param nThread         number of worker threads of the pool and number of decoders read from vDec
+ * @param files           paths of the videos to decode, one task per entry
+ * @param vnFrame         per-video frame count slots, grown to files.size() if shorter
+ * @param vExceptionPtrs  per-video exception slots, grown to files.size() if shorter
+ * @param nTotalFrames    incremented by the frame count of every video
+ * @param vnLatency       receives the value returned by the task of each video, in files order
+ * @param frLatency       receives latency / frame for every decoder entry with a positive frame count
+ * @param vnFPS           receives vnFrame[i] / vnLatency[i] for every video with a non-zero latency
+ * @return elapsed wall-clock time in seconds from enqueueing the first task until all tasks finished
+ */
+float run(std::vector<OptimizedNvDecoder *> &vDec, int nThread, std::vector<std::string> &files,
+          std::vector<int> &vnFrame, std::vector<std::exception_ptr> &vExceptionPtrs, int *nTotalFrames,
+          std::vector<double> &vnLatency, std::vector<double> &frLatency, std::vector<double> &vnFPS) {
+    // Every task is given its own vnFrame / vExceptionPtrs slot, so both must cover all files. Grow them before
+    // any pointer or reference to an element is handed out, as resizing later would invalidate those.
+    if (vnFrame.size() < files.size()) {
+        vnFrame.resize(files.size());
+    }
+    if (vExceptionPtrs.size() < files.size()) {
+        vExceptionPtrs.resize(files.size());
+    }
+    std::vector<std::future<double>> decodeLatencyFutures;
+    ThreadPool threadPool(nThread);
+    // Enqueue the video decoding task into thread pool
+    auto start = std::chrono::high_resolution_clock::now();
+    for (size_t i = 0; i < files.size(); i++) {
+        // The pointer stays valid because files outlives the tasks: all futures are waited for below
+        auto filePath = files[i].c_str();
+        CheckInputFile(filePath);
+        decodeLatencyFutures.push_back(
+            threadPool.enqueue(DecodeVideo, vDec, filePath, &vnFrame[i], std::ref(vExceptionPtrs[i])));
+    }
+    // Wait until decoding tasks finished
+    for (size_t i = 0; i < files.size(); i++) {
+        auto decodeLatency = decodeLatencyFutures[i].get();
+        vnLatency.push_back(decodeLatency);
+        *nTotalFrames += vnFrame[i];
+    }
+    // Millisecond count divided by 1000 gives seconds
+    auto elapsedTime =
+        (std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - start)
+             .count()) /
+        1000.0f;
+    // Never read past the decoders that actually exist
+    for (int i = 0; i < nThread && i < static_cast<int>(vDec.size()); i++) {
+        for (const auto &tuple : vDec[i]->GetFrameLatency()) {
+            int frame = std::get<0>(tuple);
+            double latency = std::get<1>(tuple);
+            if (frame > 0) {
+                frLatency.push_back(latency / frame);
+            }
+        }
+    }
+    for (size_t i = 0; i < vnLatency.size() && i < vnFrame.size(); i++) {
+        if (vnLatency[i] != 0) {
+            vnFPS.push_back(vnFrame[i] / vnLatency[i]);
+        }
+    }
+
+    // Record the total time
+    return elapsedTime;
+}
+
+/**
+ * @brief  Entry point: parse the command line, decode all the videos with a pool of decoders and print the
+ *         total FPS and the per-video / per-frame metrics.
+ *
+ * @return 0 on success; the process exits with code 1 when a std::exception is caught
+ */
+int main(int argc, char **argv) {
+    char szInFilePath[256] = "";
+    int iGpu = 0;
+    int nThread = 5;
+    int nTotalVideo = 100;
+    bool bSingle = false;
+    bool bHost = false;
+    std::string inputFilesListPath = "";
+    std::string outputFilePath = "";
+    cudaVideoCodec codec = cudaVideoCodec_H264;
+    try {
+        // Parse the command line arguments
+        ParseCommandLine(argc, argv, szInFilePath, iGpu, nThread, nTotalVideo, bSingle, bHost, inputFilesListPath,
+                         outputFilePath, codec);
+        auto files = GenerateTotalFileList(inputFilesListPath, nTotalVideo, szInFilePath);
+
+        // One exception slot per video. It must be sized after the command line is parsed, because the number of
+        // videos may be larger than the default value.
+        std::vector<std::exception_ptr> vExceptionPtrs(files.size());
+
+        // Initialize and prepare the decoder context for each thread
+        std::vector<OptimizedNvDecoder *> vDec;
+        InitializeContext(vDec, iGpu, nThread, bSingle, bHost, codec);
+
+        // Decode all video with thread pool
+        std::vector<int> vnFrame(files.size());
+        int nTotalFrames = 0;
+        std::vector<double> vnLatency;
+        std::vector<double> frLatency;
+        std::vector<double> videoFPS;
+        auto elapsedTime =
+            run(vDec, nThread, files, vnFrame, vExceptionPtrs, &nTotalFrames, vnLatency, frLatency, videoFPS);
+
+        // Calculate and output the raw data into file and metrics into stdout
+        double sum, mean, min, max, p50, p90, p95, p99;
+        std::tie(sum, mean, min, max, p50, p90, p95, p99) = CalMetrics(vnLatency);
+        std::cout << "Total Frames Decoded=" << nTotalFrames << " FPS=" << nTotalFrames / elapsedTime << std::endl;
+        std::cout << "Mean Latency for each video=" << mean * 1000 << " P50 Latency=" << p50 * 1000
+                  << " P90 Latency=" << p90 * 1000 << " P95 Latency=" << p95 * 1000 << " P99 Latency=" << p99 * 1000
+                  << "ms" << std::endl;
+
+        std::tie(sum, mean, min, max, p50, p90, p95, p99) = CalMetrics(videoFPS);
+        std::cout << "Mean FPS for each video=" << mean << " P50 FPS=" << p50 << " P90 FPS=" << p90
+                  << " P95 FPS=" << p95 << " P99 FPS=" << p99 << std::endl;
+        std::tie(sum, mean, min, max, p50, p90, p95, p99) = CalMetrics(frLatency);
+        std::cout << "Mean Latency for each frame=" << mean * 1000 << " P50 Latency=" << p50 * 1000
+                  << " P90 Latency=" << p90 * 1000 << " P95 Latency=" << p95 * 1000 << " P99 Latency=" << p99 * 1000
+                  << "ms" << std::endl;
+        if (outputFilePath.size() != 0) {
+            WriteRawData(vDec, nThread, vnLatency, vnFrame, outputFilePath);
+        }
+        // Deinitialization
+        for (OptimizedNvDecoder *pDec : vDec) {
+            delete pDec;
+        }
+        // Surface the first failure of any video, not only of the first nThread ones
+        for (const auto &exceptionPtr : vExceptionPtrs) {
+            if (exceptionPtr) {
+                std::rethrow_exception(exceptionPtr);
+            }
+        }
+    } catch (const std::exception &ex) {
+        std::cout << ex.what();
+        exit(1);
+    }
+    return 0;
+}
diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt
new file mode 100644
index 000000000..83cb15067
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt
@@ -0,0 +1,117 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+cmake_minimum_required(VERSION 3.18)
+project(cuda_decode_performance)
+
+# Build the benchmark only when the CUDA toolkit is found; otherwise nothing below is configured.
+find_package(CUDA QUIET)
+if(CUDA_FOUND)
+  set(CMAKE_CXX_STANDARD 17)
+  set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+  # Directories of the Video Codec SDK located under third_party, relative to this file
+  set(THIRD_PARTY_SAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Samples)
+  set(NVCODEC_PUBLIC_INTERFACE_DIR ${THIRD_PARTY_SAMPLE_DIR}/../Interface)
+  set(NVCODEC_UTILS_DIR ${THIRD_PARTY_SAMPLE_DIR}/Utils)
+  set(NV_CODEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec)
+  set(NV_DEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec/NvDecoder)
+
+  # On Linux, locate the FFmpeg libraries (avcodec, avformat, avutil, swresample) through pkg-config.
+  # On other systems the *_LIB variables below stay unset.
+  if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+    find_package(PkgConfig REQUIRED)
+    pkg_check_modules(PC_AVCODEC REQUIRED IMPORTED_TARGET libavcodec)
+    pkg_check_modules(PC_AVFORMAT REQUIRED IMPORTED_TARGET libavformat)
+    pkg_check_modules(PC_AVUTIL REQUIRED IMPORTED_TARGET libavutil)
+    pkg_check_modules(PC_SWRESAMPLE REQUIRED IMPORTED_TARGET libswresample)
+
+    # Only the avcodec include directories are used as the FFmpeg header location
+    set(NV_FFMPEG_HDRS ${PC_AVCODEC_INCLUDE_DIRS})
+    find_library(AVCODEC_LIBRARY NAMES avcodec
+    HINTS
+    ${PC_AVCODEC_LIBDIR}
+    ${PC_AVCODEC_LIBRARY_DIRS}
+    )
+    find_library(AVFORMAT_LIBRARY NAMES avformat
+    HINTS
+    ${PC_AVFORMAT_LIBDIR}
+    ${PC_AVFORMAT_LIBRARY_DIRS}
+    )
+    find_library(AVUTIL_LIBRARY NAMES avutil
+    HINTS
+    ${PC_AVUTIL_LIBDIR}
+    ${PC_AVUTIL_LIBRARY_DIRS}
+    )
+    find_library(SWRESAMPLE_LIBRARY NAMES swresample
+    HINTS
+    ${PC_SWRESAMPLE_LIBDIR}
+    ${PC_SWRESAMPLE_LIBRARY_DIRS}
+    )
+    set(AVCODEC_LIB ${AVCODEC_LIBRARY})
+    set(AVFORMAT_LIB ${AVFORMAT_LIBRARY})
+    set(AVUTIL_LIB ${AVUTIL_LIBRARY})
+    set(SWRESAMPLE_LIB ${SWRESAMPLE_LIBRARY})
+  endif()
+
+  # Benchmark entry point
+  set(APP_SOURCES
+  ${CMAKE_CURRENT_SOURCE_DIR}/AppDecPerf.cpp
+  )
+
+  # Decoder sources: the SDK NvDecoder plus the OptimizedNvDecoder of this directory
+  set(NV_DEC_SOURCES
+  ${NV_DEC_DIR}/NvDecoder.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.cpp
+  )
+
+  set(NV_DEC_HDRS
+  ${NV_DEC_DIR}/NvDecoder.h
+  ${NVCODEC_PUBLIC_INTERFACE_DIR}/cuviddec.h
+  ${NVCODEC_PUBLIC_INTERFACE_DIR}/nvcuvid.h
+  ${NVCODEC_UTILS_DIR}/NvCodecUtils.h
+  ${NVCODEC_UTILS_DIR}/FFmpegDemuxer.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/ThreadPoolUtils.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.h
+  )
+
+  source_group( "headers" FILES ${NV_DEC_HDRS} )
+  source_group( "sources" FILES ${APP_SOURCES} ${NV_DEC_SOURCES})
+  # Prepend the CUDA toolkit library directories (stubs first) to the library search path, then look CUDA up
+  # again - presumably so that the CUDA libraries are resolved with these directories; TODO confirm
+  set(CMAKE_LIBRARY_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib64;${CUDA_TOOLKIT_ROOT_DIR}/lib;${CMAKE_LIBRARY_PATH}")
+  find_package(CUDA)
+  set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
+  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=\"sm_50,compute_50\")
+  # With GCC, make sure nvcc gets -std=c++11 unless that flag is already present
+  if ( CMAKE_COMPILER_IS_GNUCC )
+    if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" )
+      list(APPEND CUDA_NVCC_FLAGS -std=c++11)
+    endif()
+  endif()
+
+  # Check if the file exists
+  # NOTE(review): when the link is missing, this runs sudo at configure time and assumes the
+  # /usr/lib/x86_64-linux-gnu layout - confirm this is acceptable for every build environment
+  if (NOT EXISTS "/usr/local/lib/libnvcuvid.so" )
+      execute_process(
+        COMMAND sudo ln -s /usr/lib/x86_64-linux-gnu/libnvcuvid.so.1 /usr/local/lib/libnvcuvid.so
+        RESULT_VARIABLE result
+      )  
+      if(result)
+        message(FATAL_ERROR "Failed to create symbolic link for nvcuvid lib: ${result}")
+      endif()
+  endif ()
+
+  # Look for nvcuvid in /usr/local/lib and in the stub directory of the SDK under third_party
+  find_library(CUVID_LIB nvcuvid
+  HINTS
+  "/usr/local/lib/"
+  "${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/"
+  )
+
+  cuda_add_executable(${PROJECT_NAME} ${APP_SOURCES} ${NV_DEC_SOURCES} ${NV_DEC_HDRS})
+
+  set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
+  # NOTE(review): NV_APPDEC_COMMON_DIR is not set anywhere in this file, so it presumably expands to
+  # nothing here - confirm whether it can be dropped
+  target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS}
+  ${NVCODEC_PUBLIC_INTERFACE_DIR}
+  ${NVCODEC_UTILS_DIR}
+  ${NV_CODEC_DIR}
+  ${NV_APPDEC_COMMON_DIR}
+  ${NV_FFMPEG_HDRS}
+  ${THIRD_PARTY_SAMPLE_DIR}
+  )
+
+  target_link_libraries(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY} ${CMAKE_DL_LIBS} ${CUVID_LIB} ${AVCODEC_LIB}
+  ${AVFORMAT_LIB} ${AVUTIL_LIB} ${SWRESAMPLE_LIB})
+
+  install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin LIBRARY DESTINATION lib)
+endif()
diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.cpp b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.cpp
new file mode 100644
index 000000000..ee23391b7
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.cpp
@@ -0,0 +1,263 @@
+// Copyright(c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include <cmath>
+
+#include "OptimizedNvDecoder.h"
+
+/**
+ * @brief  Feed one packet to the video parser and record how long the parser call took.
+ *
+ * Appends (m_nDecodedFrame, elapsed seconds) to frameLatency. A null or empty payload is flagged as end of stream.
+ *
+ * @param pData       packet payload, may be NULL
+ * @param nSize       payload size in bytes
+ * @param nFlags      extra CUVID packet flags; CUVID_PKT_TIMESTAMP is always added
+ * @param nTimestamp  timestamp attached to the packet
+ * @return the value of m_nDecodedFrame after the parser call
+ */
+int OptimizedNvDecoder::Decode(const uint8_t *pData, int nSize, int nFlags, int64_t nTimestamp) {
+    // Reset the per-call frame counters before the parser runs
+    m_nDecodedFrameReturned = 0;
+    m_nDecodedFrame = 0;
+
+    const bool bEndOfStream = (pData == nullptr) || (nSize == 0);
+
+    CUVIDSOURCEDATAPACKET pkt = {0};
+    pkt.timestamp = nTimestamp;
+    pkt.payload_size = nSize;
+    pkt.payload = pData;
+    pkt.flags = nFlags | CUVID_PKT_TIMESTAMP;
+    if (bEndOfStream) {
+        pkt.flags |= CUVID_PKT_ENDOFSTREAM;
+    }
+
+    // Time only the parser call
+    const auto tBegin = std::chrono::high_resolution_clock::now();
+    NVDEC_API_CALL(cuvidParseVideoData(m_hParser, &pkt));
+    const auto tEnd = std::chrono::high_resolution_clock::now();
+
+    int64_t elapsedUs = std::chrono::duration_cast<std::chrono::microseconds>(tEnd - tBegin).count();
+    // Microseconds to seconds
+    frameLatency.push_back(std::make_tuple(m_nDecodedFrame, elapsedUs / 1000.0f / 1000.0f));
+    return m_nDecodedFrame;
+}
+
+/**
+ * @brief  Construct the decoder: store the configuration, create the context lock, the stream and the video
+ *         parser, and keep the decoder caps passed in by the caller.
+ *
+ * @param cuContext                 CUDA context made current while the driver objects are created
+ * @param bUseDeviceFrame           stored in m_bUseDeviceFrame
+ * @param eCodec                    codec used to create the video parser
+ * @param decodecaps                decoder caps queried by the caller beforehand, stored in m_decodecaps
+ * @param bLowLatency               selects a parser max display delay of 0 instead of 1
+ * @param bDeviceFramePitched       stored in m_bDeviceFramePitched
+ * @param pCropRect                 optional crop rectangle, copied when not NULL
+ * @param pResizeDim                optional resize dimension, copied when not NULL
+ * @param extract_user_SEI_Message  enables the SEI message callback and opens sei_message.txt
+ * @param maxWidth                  stored in m_nMaxWidth
+ * @param maxHeight                 stored in m_nMaxHeight
+ * @param clkRate                   clock rate passed to the video parser
+ * @param force_zero_latency        when set, no display picture callback is registered
+ */
+OptimizedNvDecoder::OptimizedNvDecoder(CUcontext &cuContext, bool bUseDeviceFrame, cudaVideoCodec eCodec,
+                                       CUVIDDECODECAPS decodecaps, bool bLowLatency, bool bDeviceFramePitched,
+                                       const Rect *pCropRect, const Dim *pResizeDim, bool extract_user_SEI_Message,
+                                       int maxWidth, int maxHeight, unsigned int clkRate, bool force_zero_latency) {
+    m_cuContext = cuContext;
+    m_bUseDeviceFrame = bUseDeviceFrame;
+    m_eCodec = eCodec;
+    m_bDeviceFramePitched = bDeviceFramePitched;
+    m_bExtractSEIMessage = extract_user_SEI_Message;
+    m_nMaxWidth = maxWidth;
+    m_nMaxHeight = maxHeight;
+    m_bForce_zero_latency = force_zero_latency;
+    if (pCropRect)
+        m_cropRect = *pCropRect;
+    if (pResizeDim)
+        m_resizeDim = *pResizeDim;
+
+    // The context is pushed here and popped at the end of the constructor
+    CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+    NVDEC_API_CALL(cuvidCtxLockCreate(&m_ctxLock, cuContext));
+
+    ck(cuStreamCreate(&m_cuvidStream, CU_STREAM_DEFAULT));
+
+    decoderSessionID = 0;
+
+    if (m_bExtractSEIMessage) {
+        // NOTE(review): the result of fopen is not checked here - confirm that the users of m_fpSEI handle NULL
+        m_fpSEI = fopen("sei_message.txt", "wb");
+        m_pCurrSEIMessage = new CUVIDSEIMESSAGEINFO;
+        memset(&m_SEIMessagesDisplayOrder, 0, sizeof(m_SEIMessagesDisplayOrder));
+    }
+    // Parser configuration: this object is the user data handed back to every callback
+    CUVIDPARSERPARAMS videoParserParameters = {};
+    videoParserParameters.CodecType = eCodec;
+    videoParserParameters.ulMaxNumDecodeSurfaces = 1;
+    videoParserParameters.ulClockRate = clkRate;
+    videoParserParameters.ulMaxDisplayDelay = bLowLatency ? 0 : 1;
+    videoParserParameters.pUserData = this;
+    // Sequence callback of this class, see HandleVideoSequenceProc in OptimizedNvDecoder.h
+    videoParserParameters.pfnSequenceCallback = HandleVideoSequenceProc;
+    videoParserParameters.pfnDecodePicture = HandlePictureDecodeProc;
+    videoParserParameters.pfnDisplayPicture = m_bForce_zero_latency ? NULL : HandlePictureDisplayProc;
+    videoParserParameters.pfnGetOperatingPoint = HandleOperatingPointProc;
+    videoParserParameters.pfnGetSEIMsg = m_bExtractSEIMessage ? HandleSEIMessagesProc : NULL;
+    NVDEC_API_CALL(cuvidCreateVideoParser(&m_hParser, &videoParserParameters));
+    // reuse the decodecaps queried before
+    m_decodecaps = decodecaps;
+    CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+}
+
+/**
+ * @brief  Handle the start of a video sequence: validate the format against the decoder caps, choose the
+ *         output surface format and create the decoder (or reconfigure the existing one).
+ *
+ * The decoder caps stored in m_decodecaps are only queried again when the codec, the chroma format or the luma
+ * bit depth of the sequence differs from the stored ones.
+ *
+ * @param pVideoFormat  format of the sequence reported by the parser
+ * @return pVideoFormat->min_num_decode_surfaces, or the result of ReconfigureDecoder when a decoder was
+ *         already created
+ */
+int OptimizedNvDecoder::HandleVideoSequence(CUVIDEOFORMAT *pVideoFormat) {
+    START_TIMER
+    m_videoInfo.str("");
+    m_videoInfo.clear();
+    m_videoInfo << "Video Input Information" << std::endl
+                << "\tCodec        : " << GetVideoCodecString(pVideoFormat->codec) << std::endl
+                << "\tFrame rate   : " << pVideoFormat->frame_rate.numerator << "/"
+                << pVideoFormat->frame_rate.denominator << " = "
+                << 1.0 * pVideoFormat->frame_rate.numerator / pVideoFormat->frame_rate.denominator << " fps"
+                << std::endl
+                << "\tSequence     : " << (pVideoFormat->progressive_sequence ? "Progressive" : "Interlaced")
+                << std::endl
+                << "\tCoded size   : [" << pVideoFormat->coded_width << ", " << pVideoFormat->coded_height << "]"
+                << std::endl
+                << "\tDisplay area : [" << pVideoFormat->display_area.left << ", " << pVideoFormat->display_area.top
+                << ", " << pVideoFormat->display_area.right << ", " << pVideoFormat->display_area.bottom << "]"
+                << std::endl
+                << "\tChroma       : " << GetVideoChromaFormatString(pVideoFormat->chroma_format) << std::endl
+                << "\tBit depth    : " << pVideoFormat->bit_depth_luma_minus8 + 8;
+    m_videoInfo << std::endl;
+
+    int nDecodeSurface = pVideoFormat->min_num_decode_surfaces;
+
+    // re-call the cuvidGetDecoderCaps when the video codec and format change
+    if (m_decodecaps.eCodecType != pVideoFormat->codec || m_decodecaps.eChromaFormat != pVideoFormat->chroma_format ||
+        m_decodecaps.nBitDepthMinus8 != pVideoFormat->bit_depth_luma_minus8) {
+        m_decodecaps.eCodecType = pVideoFormat->codec;
+        m_decodecaps.eChromaFormat = pVideoFormat->chroma_format;
+        m_decodecaps.nBitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8;
+
+        CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+        NVDEC_API_CALL(cuvidGetDecoderCaps(&m_decodecaps));
+        CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+    }
+
+    if (!m_decodecaps.bIsSupported) {
+        NVDEC_THROW_ERROR("Codec not supported on this GPU", CUDA_ERROR_NOT_SUPPORTED);
+        return nDecodeSurface;
+    }
+
+    if ((pVideoFormat->coded_width > m_decodecaps.nMaxWidth) ||
+        (pVideoFormat->coded_height > m_decodecaps.nMaxHeight)) {
+
+        std::ostringstream errorString;
+        errorString << std::endl
+                    << "Resolution          : " << pVideoFormat->coded_width << "x" << pVideoFormat->coded_height
+                    << std::endl
+                    << "Max Supported (wxh) : " << m_decodecaps.nMaxWidth << "x" << m_decodecaps.nMaxHeight << std::endl
+                    << "Resolution not supported on this GPU";
+
+        const std::string cErr = errorString.str();
+        NVDEC_THROW_ERROR(cErr, CUDA_ERROR_NOT_SUPPORTED);
+        return nDecodeSurface;
+    }
+    // The macroblock count is computed from 16x16 blocks of the coded size
+    if ((pVideoFormat->coded_width >> 4) * (pVideoFormat->coded_height >> 4) > m_decodecaps.nMaxMBCount) {
+
+        std::ostringstream errorString;
+        errorString << std::endl
+                    << "MBCount             : " << (pVideoFormat->coded_width >> 4) * (pVideoFormat->coded_height >> 4)
+                    << std::endl
+                    << "Max Supported mbcnt : " << m_decodecaps.nMaxMBCount << std::endl
+                    << "MBCount not supported on this GPU";
+        NVDEC_THROW_ERROR(errorString.str(), CUDA_ERROR_NOT_SUPPORTED);
+        return nDecodeSurface;
+    }
+
+    if (m_nWidth && m_nLumaHeight && m_nChromaHeight) {
+
+        // cuvidCreateDecoder() has been called before, and now there's possible config change
+        return ReconfigureDecoder(pVideoFormat);
+    }
+
+    // eCodec has been set in the constructor (for parser). Here it's set again for potential correction
+    m_eCodec = pVideoFormat->codec;
+    m_eChromaFormat = pVideoFormat->chroma_format;
+    m_nBitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8;
+    m_nBPP = m_nBitDepthMinus8 > 0 ? 2 : 1;
+
+    // Set the output surface format same as chroma format
+    // Both operands must be compared against m_eChromaFormat: a bare enum constant on the right of || would only
+    // be tested for being non-zero
+    if (m_eChromaFormat == cudaVideoChromaFormat_420 || m_eChromaFormat == cudaVideoChromaFormat_Monochrome)
+        m_eOutputFormat =
+            pVideoFormat->bit_depth_luma_minus8 ? cudaVideoSurfaceFormat_P016 : cudaVideoSurfaceFormat_NV12;
+    else if (m_eChromaFormat == cudaVideoChromaFormat_444)
+        m_eOutputFormat =
+            pVideoFormat->bit_depth_luma_minus8 ? cudaVideoSurfaceFormat_YUV444_16Bit : cudaVideoSurfaceFormat_YUV444;
+    else if (m_eChromaFormat == cudaVideoChromaFormat_422)
+        m_eOutputFormat = cudaVideoSurfaceFormat_NV12; // no 4:2:2 output format supported yet so make 420 default
+
+    // Check if output format supported. If not, check fallback options
+    if (!(m_decodecaps.nOutputFormatMask & (1 << m_eOutputFormat))) {
+        if (m_decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12))
+            m_eOutputFormat = cudaVideoSurfaceFormat_NV12;
+        else if (m_decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_P016))
+            m_eOutputFormat = cudaVideoSurfaceFormat_P016;
+        else if (m_decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444))
+            m_eOutputFormat = cudaVideoSurfaceFormat_YUV444;
+        else if (m_decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444_16Bit))
+            m_eOutputFormat = cudaVideoSurfaceFormat_YUV444_16Bit;
+        else
+            NVDEC_THROW_ERROR("No supported output format found", CUDA_ERROR_NOT_SUPPORTED);
+    }
+    m_videoFormat = *pVideoFormat;
+
+    CUVIDDECODECREATEINFO videoDecodeCreateInfo = {0};
+    videoDecodeCreateInfo.CodecType = pVideoFormat->codec;
+    videoDecodeCreateInfo.ChromaFormat = pVideoFormat->chroma_format;
+    videoDecodeCreateInfo.OutputFormat = m_eOutputFormat;
+    videoDecodeCreateInfo.bitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8;
+    if (pVideoFormat->progressive_sequence)
+        videoDecodeCreateInfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Weave;
+    else
+        videoDecodeCreateInfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Adaptive;
+    videoDecodeCreateInfo.ulNumOutputSurfaces = 2;
+    // With PreferCUVID, JPEG is still decoded by CUDA while video is decoded by NVDEC hardware
+    videoDecodeCreateInfo.ulCreationFlags = cudaVideoCreate_PreferCUVID;
+    videoDecodeCreateInfo.ulNumDecodeSurfaces = nDecodeSurface;
+    videoDecodeCreateInfo.vidLock = m_ctxLock;
+    videoDecodeCreateInfo.ulWidth = pVideoFormat->coded_width;
+    videoDecodeCreateInfo.ulHeight = pVideoFormat->coded_height;
+    // AV1 has max width/height of sequence in sequence header
+    if (pVideoFormat->codec == cudaVideoCodec_AV1 && pVideoFormat->seqhdr_data_length > 0) {
+        CUVIDEOFORMATEX *vidFormatEx = (CUVIDEOFORMATEX *)pVideoFormat;
+        if (m_nMaxWidth < pVideoFormat->coded_width) {
+            m_nMaxWidth = vidFormatEx->av1.max_width;
+        }
+        if (m_nMaxHeight < pVideoFormat->coded_height) {
+            m_nMaxHeight = vidFormatEx->av1.max_height;
+        }
+    }
+    // The maximum size is never smaller than the coded size of this sequence
+    if (m_nMaxWidth < (int)pVideoFormat->coded_width)
+        m_nMaxWidth = pVideoFormat->coded_width;
+    if (m_nMaxHeight < (int)pVideoFormat->coded_height)
+        m_nMaxHeight = pVideoFormat->coded_height;
+    videoDecodeCreateInfo.ulMaxWidth = m_nMaxWidth;
+    videoDecodeCreateInfo.ulMaxHeight = m_nMaxHeight;
+
+    if (!(m_cropRect.r && m_cropRect.b) && !(m_resizeDim.w && m_resizeDim.h)) {
+        // Neither crop nor resize requested: output the display area, target the coded size
+        m_nWidth = pVideoFormat->display_area.right - pVideoFormat->display_area.left;
+        m_nLumaHeight = pVideoFormat->display_area.bottom - pVideoFormat->display_area.top;
+        videoDecodeCreateInfo.ulTargetWidth = pVideoFormat->coded_width;
+        videoDecodeCreateInfo.ulTargetHeight = pVideoFormat->coded_height;
+    } else {
+        if (m_resizeDim.w && m_resizeDim.h) {
+            videoDecodeCreateInfo.display_area.left = pVideoFormat->display_area.left;
+            videoDecodeCreateInfo.display_area.top = pVideoFormat->display_area.top;
+            videoDecodeCreateInfo.display_area.right = pVideoFormat->display_area.right;
+            videoDecodeCreateInfo.display_area.bottom = pVideoFormat->display_area.bottom;
+            m_nWidth = m_resizeDim.w;
+            m_nLumaHeight = m_resizeDim.h;
+        }
+
+        // When both are requested, the crop rectangle overrides the values set for the resize above
+        if (m_cropRect.r && m_cropRect.b) {
+            videoDecodeCreateInfo.display_area.left = m_cropRect.l;
+            videoDecodeCreateInfo.display_area.top = m_cropRect.t;
+            videoDecodeCreateInfo.display_area.right = m_cropRect.r;
+            videoDecodeCreateInfo.display_area.bottom = m_cropRect.b;
+            m_nWidth = m_cropRect.r - m_cropRect.l;
+            m_nLumaHeight = m_cropRect.b - m_cropRect.t;
+        }
+        videoDecodeCreateInfo.ulTargetWidth = m_nWidth;
+        videoDecodeCreateInfo.ulTargetHeight = m_nLumaHeight;
+    }
+
+    m_nChromaHeight = (int)(ceil(m_nLumaHeight * GetChromaHeightFactor(m_eOutputFormat)));
+    m_nNumChromaPlanes = GetChromaPlaneCount(m_eOutputFormat);
+    m_nSurfaceHeight = videoDecodeCreateInfo.ulTargetHeight;
+    m_nSurfaceWidth = videoDecodeCreateInfo.ulTargetWidth;
+    m_displayRect.b = videoDecodeCreateInfo.display_area.bottom;
+    m_displayRect.t = videoDecodeCreateInfo.display_area.top;
+    m_displayRect.l = videoDecodeCreateInfo.display_area.left;
+    m_displayRect.r = videoDecodeCreateInfo.display_area.right;
+
+    m_videoInfo << "Video Decoding Params:" << std::endl
+                << "\tNum Surfaces : " << videoDecodeCreateInfo.ulNumDecodeSurfaces << std::endl
+                << "\tCrop         : [" << videoDecodeCreateInfo.display_area.left << ", "
+                << videoDecodeCreateInfo.display_area.top << ", " << videoDecodeCreateInfo.display_area.right << ", "
+                << videoDecodeCreateInfo.display_area.bottom << "]" << std::endl
+                << "\tResize       : " << videoDecodeCreateInfo.ulTargetWidth << "x"
+                << videoDecodeCreateInfo.ulTargetHeight << std::endl
+                << "\tDeinterlace  : "
+                << std::vector<const char *>{"Weave", "Bob", "Adaptive"}[videoDecodeCreateInfo.DeinterlaceMode];
+    m_videoInfo << std::endl;
+
+    CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+    NVDEC_API_CALL(cuvidCreateDecoder(&m_hDecoder, &videoDecodeCreateInfo));
+    CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+    STOP_TIMER("Session Initialization Time: ");
+    NvDecoder::addDecoderSessionOverHead(getDecoderSessionID(), elapsedTime);
+    return nDecodeSurface;
+}
diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.h b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.h
new file mode 100644
index 000000000..f9881c80d
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.h
@@ -0,0 +1,52 @@
+// Copyright(c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include "NvDecoder/NvDecoder.h"
+
+// This class is derived from NvDecoder class and is used to optimize the cuvidGetDecoderCaps overhead:
+// the decoder caps are passed in by the caller and only queried again when the video format changes.
+// It also records the latency of every Decode call.
+class OptimizedNvDecoder : public NvDecoder {
+
+  public:
+    OptimizedNvDecoder() {}
+    /**
+     *  @brief This function is used to initialize the decoder session.
+     *  Application must call this function to initialize the decoder, before
+     *  starting to decode any frames.
+     *  The only difference from the original function is to add a new member m_decodecaps.
+     *  Other part is the same as the original function, refer to NvDecoder.cpp in NVIDIA Video Codec SDK.
+     */
+    OptimizedNvDecoder(CUcontext &cuContext, bool bUseDeviceFrame, cudaVideoCodec eCodec, CUVIDDECODECAPS decodecaps,
+                       bool bLowLatency = false, bool bDeviceFramePitched = false, const Rect *pCropRect = NULL,
+                       const Dim *pResizeDim = NULL, bool extract_user_SEI_Message = false, int maxWidth = 0,
+                       int maxHeight = 0, unsigned int clkRate = 1000, bool force_zero_latency = false);
+
+    /**
+     * @brief This function is to overwrite the origin Decode function to record the latency on frame level.
+     */
+    int Decode(const uint8_t *pData, int nSize, int nFlags = 0, int64_t nTimestamp = 0);
+    /**
+     * @brief This function is used to Get the frameLatency vector
+     * @return reference to the (frame count, latency) entries recorded so far
+     */
+    std::vector<std::tuple<int, double>> &GetFrameLatency() { return frameLatency; }
+
+  protected:
+    /**
+     *   @brief  Callback function to be registered for getting a callback when decoding of sequence starts
+     *   @param  pUserData     the OptimizedNvDecoder instance registered with the parser
+     *   @param  pVideoFormat  format of the sequence, forwarded to HandleVideoSequence
+     *   @throws std::runtime_error when pUserData is nullptr
+     */
+    static int CUDAAPI HandleVideoSequenceProc(void *pUserData, CUVIDEOFORMAT *pVideoFormat) {
+        if (pUserData == nullptr) {
+            throw std::runtime_error("pUserData is nullptr");
+        }
+        return ((OptimizedNvDecoder *)pUserData)->HandleVideoSequence(pVideoFormat);
+    }
+    /**
+     *   @brief  Define the new handler when decoding of sequence starts.
+     *           The only change is to re-query decoder caps when the video codec or format change
+     *           Other part is the same as the original function, refer to NvDecoder.cpp in NVIDIA Video Codec SDK.
+     */
+    int HandleVideoSequence(CUVIDEOFORMAT *pVideoFormat);
+
+    // Decoder caps reused across sequences. Value-initialized so that a default-constructed decoder never
+    // compares indeterminate fields in HandleVideoSequence.
+    CUVIDDECODECAPS m_decodecaps = {};
+
+    // One (frame count, latency) entry is appended by every Decode call
+    std::vector<std::tuple<int, double>> frameLatency;
+};
diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/ThreadPoolUtils.h b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/ThreadPoolUtils.h
new file mode 100644
index 000000000..5592b76e7
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/ThreadPoolUtils.h
@@ -0,0 +1,99 @@
+// Copyright(c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include <condition_variable>
+#include <functional>
+#include <future>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <vector>
+
+// ThreadPool is a minimal fixed-size thread pool. Every task is invoked as task(thread_index, *args), where
+// thread_index is the index of the worker thread that picked the task up.
+class ThreadPool {
+  public:
+    /**
+     * @brief Start numThreads worker threads, each one bound to its own index.
+     */
+    ThreadPool(size_t numThreads) {
+        for (size_t workerIdx = 0; workerIdx != numThreads; ++workerIdx) {
+            threads.emplace_back(&ThreadPool::worker, this, workerIdx);
+        }
+    }
+    /**
+     * @brief Ask the workers to stop once the queue is drained, then join all of them.
+     */
+    ~ThreadPool() {
+        {
+            std::lock_guard<std::mutex> guard(mutex);
+            stop = true;
+        }
+        cv.notify_all();
+
+        for (std::thread &workerThread : threads) {
+            workerThread.join();
+        }
+    }
+    /**
+     * @brief TaskWrapper binds a callable and its arguments into a packaged task that only takes the index of
+     * the thread running it, i.e. it calls f(thread_index, *args).
+     */
+    template <typename R, typename F, typename... Args> struct TaskWrapper {
+        std::shared_ptr<std::packaged_task<R(size_t)>> task;
+
+        template <typename Callable, typename... CallableArgs> TaskWrapper(Callable &&f, CallableArgs &&...args) {
+            // The callable and the arguments are captured by copy
+            auto bound = [f, args...](size_t threadIdx) mutable { return f(threadIdx, args...); };
+            task = std::make_shared<std::packaged_task<R(size_t)>>(std::move(bound));
+        }
+
+        void operator()(size_t threadIdx) { (*task)(threadIdx); }
+    };
+    /**
+     * @brief Queue f(thread_index, args...) for execution and return the future of its result.
+     */
+    template <typename F, typename... Args>
+    auto enqueue(F &&f, Args &&...args) -> std::future<typename std::result_of<F(size_t, Args...)>::type> {
+        using Result = typename std::result_of<F(size_t, Args...)>::type;
+
+        TaskWrapper<Result, F, Args...> job(std::forward<F>(f), std::forward<Args>(args)...);
+        // Take the future before the wrapper is moved into the queue
+        std::future<Result> pending = job.task->get_future();
+
+        {
+            std::lock_guard<std::mutex> guard(mutex);
+            tasks.emplace(std::move(job));
+        }
+        cv.notify_one();
+
+        return pending;
+    }
+
+  private:
+    /**
+     * @brief Worker loop: wait for a task, run it with this thread's index, and exit once the pool is stopped
+     * and the queue is empty.
+     */
+    void worker(size_t threadIdx) {
+        for (;;) {
+            std::function<void(size_t)> job;
+            {
+                std::unique_lock<std::mutex> guard(mutex);
+                while (!stop && tasks.empty()) {
+                    cv.wait(guard);
+                }
+
+                // The wait above only ends with an empty queue when stop is set
+                if (tasks.empty()) {
+                    return;
+                }
+
+                job = tasks.front();
+                tasks.pop();
+            }
+
+            // Run the task outside of the lock
+            job(threadIdx);
+        }
+    }
+
+    std::vector<std::thread> threads;
+    std::queue<std::function<void(size_t)>> tasks;
+    std::mutex mutex;
+    std::condition_variable cv;
+    bool stop = false;
+};
diff --git a/third_party/Video_Codec_SDK/Interface/cuviddec.h b/third_party/Video_Codec_SDK/Interface/cuviddec.h
new file mode 100644
index 000000000..1d13eec83
--- /dev/null
+++ b/third_party/Video_Codec_SDK/Interface/cuviddec.h
@@ -0,0 +1,1173 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2023 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*****************************************************************************************************/
+//! \file cuviddec.h
+//! NVDECODE API provides video decoding interface to NVIDIA GPU devices.
+//! This file contains constants, structure definitions and function prototypes used for decoding.
+/*****************************************************************************************************/
+
+#if !defined(__CUDA_VIDEO_H__)
+#define __CUDA_VIDEO_H__
+
+#ifndef __cuda_cuda_h__
+#include <cuda.h>
+#endif // __cuda_cuda_h__
+
+#if defined(_WIN64) || defined(__LP64__) || defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
+#if (CUDA_VERSION >= 3020) && (!defined(CUDA_FORCE_API_VERSION) || (CUDA_FORCE_API_VERSION >= 3020))
+#define __CUVID_DEVPTR64
+#endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+typedef void *CUvideodecoder;                     // untyped (void *) decoder handle
+typedef struct _CUcontextlock_st *CUvideoctxlock; // context-lock handle, see CUVIDDECODECREATEINFO::vidLock
+
+/*********************************************************************************/
+//! \enum cudaVideoCodec
+//! Video codec enums
+//! These enums are used in CUVIDDECODECREATEINFO and CUVIDDECODECAPS structures
+/*********************************************************************************/
+typedef enum cudaVideoCodec_enum {
+    cudaVideoCodec_MPEG1 = 0, /**<  MPEG1             */
+    cudaVideoCodec_MPEG2,     /**<  MPEG2             */
+    cudaVideoCodec_MPEG4,     /**<  MPEG4             */
+    cudaVideoCodec_VC1,       /**<  VC1               */
+    cudaVideoCodec_H264,      /**<  H264              */
+    cudaVideoCodec_JPEG,      /**<  JPEG              */
+    cudaVideoCodec_H264_SVC,  /**<  H264-SVC          */
+    cudaVideoCodec_H264_MVC,  /**<  H264-MVC          */
+    cudaVideoCodec_HEVC,      /**<  HEVC              */
+    cudaVideoCodec_VP8,       /**<  VP8               */
+    cudaVideoCodec_VP9,       /**<  VP9               */
+    cudaVideoCodec_AV1,       /**<  AV1               */
+    cudaVideoCodec_NumCodecs, /**<  Count of the compressed codecs above (sentinel, not a codec) */
+    // Uncompressed YUV: each value packs four ASCII characters into a single integer code
+    cudaVideoCodec_YUV420 = (('I' << 24) | ('Y' << 16) | ('U' << 8) | ('V')), /**< Y,U,V (4:2:0)      */
+    cudaVideoCodec_YV12 = (('Y' << 24) | ('V' << 16) | ('1' << 8) | ('2')),   /**< Y,V,U (4:2:0)      */
+    cudaVideoCodec_NV12 = (('N' << 24) | ('V' << 16) | ('1' << 8) | ('2')),   /**< Y,UV  (4:2:0)      */
+    cudaVideoCodec_YUYV = (('Y' << 24) | ('U' << 16) | ('Y' << 8) | ('V')),   /**< YUYV/YUY2 (4:2:2)  */
+    cudaVideoCodec_UYVY = (('U' << 24) | ('Y' << 16) | ('V' << 8) | ('Y'))    /**< UYVY (4:2:2)       */
+} cudaVideoCodec;
+
+/*********************************************************************************/
+//! \enum cudaVideoSurfaceFormat
+//! Video surface format enums used for output format of decoded output
+//! These enums are used in CUVIDDECODECREATEINFO structure
+/*********************************************************************************/
+typedef enum cudaVideoSurfaceFormat_enum { // each value has a matching bit in CUVIDDECODECAPS::nOutputFormatMask
+    cudaVideoSurfaceFormat_NV12 = 0,         /**< Semi-Planar YUV [Y plane followed by interleaved UV plane]     */
+    cudaVideoSurfaceFormat_P016 = 1,         /**< 16 bit Semi-Planar YUV [Y plane followed by interleaved UV plane].
+                                                  Can be used for 10 bit(6LSB bits 0), 12 bit (4LSB bits 0)      */
+    cudaVideoSurfaceFormat_YUV444 = 2,       /**< Planar YUV [Y plane followed by U and V planes]                */
+    cudaVideoSurfaceFormat_YUV444_16Bit = 3, /**< 16 bit Planar YUV [Y plane followed by U and V planes].
+                                                  Can be used for 10 bit(6LSB bits 0), 12 bit (4LSB bits 0)      */
+} cudaVideoSurfaceFormat;
+
+/******************************************************************************************************************/
+//! \enum cudaVideoDeinterlaceMode
+//! Deinterlacing mode enums
+//! These enums are used in CUVIDDECODECREATEINFO structure
+//! Use cudaVideoDeinterlaceMode_Weave for progressive content and for content that doesn't need deinterlacing
+//! cudaVideoDeinterlaceMode_Adaptive needs more video memory than the other deinterlace modes
+/******************************************************************************************************************/
+typedef enum cudaVideoDeinterlaceMode_enum { // passed as CUVIDDECODECREATEINFO::DeinterlaceMode
+    cudaVideoDeinterlaceMode_Weave = 0, /**< Weave both fields (no deinterlacing); use for progressive content */
+    cudaVideoDeinterlaceMode_Bob,       /**< Drop one field                       */
+    cudaVideoDeinterlaceMode_Adaptive   /**< Adaptive deinterlacing (needs the most video memory) */
+} cudaVideoDeinterlaceMode;
+
+/**************************************************************************************************************/
+//! \enum cudaVideoChromaFormat
+//! Chroma format enums
+//! These enums are used in CUVIDDECODECREATEINFO and CUVIDDECODECAPS structures
+/**************************************************************************************************************/
+typedef enum cudaVideoChromaFormat_enum { // passed as CUVIDDECODECAPS::eChromaFormat / CUVIDDECODECREATEINFO::ChromaFormat
+    cudaVideoChromaFormat_Monochrome = 0, /**< MonoChrome */
+    cudaVideoChromaFormat_420,            /**< YUV 4:2:0  */
+    cudaVideoChromaFormat_422,            /**< YUV 4:2:2  */
+    cudaVideoChromaFormat_444             /**< YUV 4:4:4  */
+} cudaVideoChromaFormat;
+
+/*************************************************************************************************************/
+//! \enum cudaVideoCreateFlags
+//! Decoder flag enums to select preferred decode path
+//! cudaVideoCreate_Default and cudaVideoCreate_PreferCUVID are most optimized, use these whenever possible
+/*************************************************************************************************************/
+typedef enum cudaVideoCreateFlags_enum { // passed as CUVIDDECODECREATEINFO::ulCreationFlags
+    cudaVideoCreate_Default = 0x00, /**< Default operation mode: use dedicated video engines                        */
+    cudaVideoCreate_PreferCUDA =
+        0x01, /**< Use CUDA-based decoder (requires valid vidLock object for multi-threading) */
+    cudaVideoCreate_PreferDXVA = 0x02, /**< Go through DXVA internally if possible (requires D3D9 interop) */
+    cudaVideoCreate_PreferCUVID = 0x04 /**< Use dedicated video engines directly */
+} cudaVideoCreateFlags;
+
+/*************************************************************************/
+//! \enum cuvidDecodeStatus
+//! Decode status enums
+//! These enums are used in CUVIDGETDECODESTATUS structure
+/*************************************************************************/
+typedef enum cuvidDecodeStatus_enum {
+    cuvidDecodeStatus_Invalid = 0,    // Decode status is not valid (this is also the zero-initialized value)
+    cuvidDecodeStatus_InProgress = 1, // Decode is in progress
+    cuvidDecodeStatus_Success = 2,    // Decode is completed without any errors
+    // Values 3 to 7 are reserved for future use
+    cuvidDecodeStatus_Error = 8,           // Decode is completed with an error (error is not concealed)
+    cuvidDecodeStatus_Error_Concealed = 9, // Decode is completed with an error and error is concealed
+} cuvidDecodeStatus;
+
+/**************************************************************************************************************/
+//! \struct CUVIDDECODECAPS
+//! This structure is used in cuvidGetDecoderCaps API
+/**************************************************************************************************************/
+typedef struct _CUVIDDECODECAPS { // capability query: fields marked IN are set by the caller, OUT fields are results
+    cudaVideoCodec eCodecType;           /**< IN: cudaVideoCodec_XXX                                             */
+    cudaVideoChromaFormat eChromaFormat; /**< IN: cudaVideoChromaFormat_XXX                                      */
+    unsigned int nBitDepthMinus8;        /**< IN: The Value "BitDepth minus 8"                                   */
+    unsigned int reserved1[3];           /**< Reserved for future use - set to zero                              */
+
+    unsigned char bIsSupported;          /**< OUT: 1 if codec supported, 0 if not supported                      */
+    unsigned char nNumNVDECs;            /**< OUT: Number of NVDECs that can support IN params                   */
+    unsigned short nOutputFormatMask;    /**< OUT: each bit represents corresponding cudaVideoSurfaceFormat enum */
+    unsigned int nMaxWidth;              /**< OUT: Max supported coded width in pixels                           */
+    unsigned int nMaxHeight;             /**< OUT: Max supported coded height in pixels                          */
+    unsigned int nMaxMBCount;            /**< OUT: Max supported macroblock count
+                                                   CodedWidth*CodedHeight/256 must be <= nMaxMBCount             */
+    unsigned short nMinWidth;            /**< OUT: Min supported coded width in pixels                           */
+    unsigned short nMinHeight;           /**< OUT: Min supported coded height in pixels                          */
+    unsigned char bIsHistogramSupported; /**< OUT: 1 if Y component histogram output is supported, 0 if not
+                                                   Note: histogram is computed on original picture data before
+                                                   any post-processing like scaling, cropping, etc. is applied   */
+    unsigned char nCounterBitDepth;      /**< OUT: histogram counter bit depth                                   */
+    unsigned short nMaxHistogramBins;    /**< OUT: Max number of histogram bins                                  */
+    unsigned int reserved3[10];          /**< Reserved for future use - set to zero (no reserved2 member exists)  */
+} CUVIDDECODECAPS;
+
+/**************************************************************************************************************/
+//! \struct CUVIDDECODECREATEINFO
+//! This structure is used in cuvidCreateDecoder API
+/**************************************************************************************************************/
+typedef struct _CUVIDDECODECREATEINFO { // decoder creation parameters; every documented member is an input (IN)
+    unsigned long ulWidth;  /**< IN: Coded sequence width in pixels                                             */
+    unsigned long ulHeight; /**< IN: Coded sequence height in pixels                                            */
+    unsigned long ulNumDecodeSurfaces; /**< IN: Maximum number of internal decode surfaces */
+    cudaVideoCodec CodecType; /**< IN: cudaVideoCodec_XXX                                                         */
+    cudaVideoChromaFormat ChromaFormat; /**< IN: cudaVideoChromaFormat_XXX */
+    unsigned long ulCreationFlags;      /**< IN: Decoder creation flags (cudaVideoCreateFlags_XXX)      */
+    unsigned long bitDepthMinus8; /**< IN: The value "BitDepth minus 8"                                               */
+    unsigned long ulIntraDecodeOnly; /**< IN: Set 1 only if video has all intra frames (default value is 0). This will
+                                          optimize video memory for Intra frames only decoding. The support is limited
+                                          to specific codecs - H264, HEVC, VP9, the flag will be ignored for codecs
+                                        which are not supported. However decoding might fail if the flag is enabled in
+                                        case of supported codecs for regular bit streams having P and/or B frames. */
+    unsigned long ulMaxWidth;  /**< IN: Coded sequence max width in pixels used with reconfigure Decoder           */
+    unsigned long ulMaxHeight; /**< IN: Coded sequence max height in pixels used with reconfigure Decoder          */
+    unsigned long Reserved1;   /**< Reserved for future use - set to zero                                          */
+    /**
+     * IN: area of the frame that should be displayed
+     */
+    struct {
+        short left;
+        short top;
+        short right;
+        short bottom;
+    } display_area;
+
+    cudaVideoSurfaceFormat OutputFormat;      /**< IN: cudaVideoSurfaceFormat_XXX                                     */
+    cudaVideoDeinterlaceMode DeinterlaceMode; /**< IN: cudaVideoDeinterlaceMode_XXX                                   */
+    unsigned long ulTargetWidth;              /**< IN: Post-processed output width (Should be aligned to 2)           */
+    unsigned long ulTargetHeight;             /**< IN: Post-processed output height (Should be aligned to 2)          */
+    unsigned long ulNumOutputSurfaces;        /**< IN: Maximum number of output surfaces simultaneously mapped        */
+    CUvideoctxlock vidLock;                   /**< IN: If non-NULL, context lock used for synchronizing ownership of
+                                                   the cuda context. Needed for cudaVideoCreate_PreferCUDA decode     */
+    /**
+     * IN: target rectangle in the output frame (for aspect ratio conversion)
+     * if a null rectangle is specified, {0,0,ulTargetWidth,ulTargetHeight} will be used
+     */
+    struct {
+        short left;
+        short top;
+        short right;
+        short bottom;
+    } target_rect; // same four-short layout as display_area above
+
+    unsigned long enableHistogram; /**< IN: enable histogram output, if supported */
+    unsigned long Reserved2[4];    /**< Reserved for future use - set to zero */
+} CUVIDDECODECREATEINFO;
+
+/*********************************************************/
+//! \struct CUVIDH264DPBENTRY
+//! H.264 DPB entry
+//! This structure is used in CUVIDH264PICPARAMS structure
+/*********************************************************/
+typedef struct _CUVIDH264DPBENTRY { // element type of CUVIDH264PICPARAMS::dpb[16]
+    int PicIdx;             /**< picture index of reference frame                                        */
+    int FrameIdx;           /**< frame_num(short-term) or LongTermFrameIdx(long-term)                    */
+    int is_long_term;       /**< 0=short term reference, 1=long term reference                           */
+    int not_existing;       /**< non-existing reference frame (corresponding PicIdx should be set to -1) */
+    int used_for_reference; /**< 0=unused, 1=top_field, 2=bottom_field, 3=both_fields                    */
+    int FieldOrderCnt[2];   /**< field order count of top and bottom fields                              */
+} CUVIDH264DPBENTRY;
+
+/************************************************************/
+//! \struct CUVIDH264MVCEXT
+//! H.264 MVC picture parameters ext
+//! This structure is used in CUVIDH264PICPARAMS structure
+/************************************************************/
+typedef struct _CUVIDH264MVCEXT { // shares storage with CUVIDH264SVCEXT in the union closing CUVIDH264PICPARAMS
+    int num_views_minus1;          /**< Max number of coded views minus 1 in video : Range - 0 to 1023              */
+    int view_id;                   /**< view identifier                                                             */
+    unsigned char inter_view_flag; /**< 1 if used for inter-view prediction, 0 if not                               */
+    unsigned char num_inter_view_refs_l0; /**< number of inter-view ref pics in RefPicList0 */
+    unsigned char num_inter_view_refs_l1; /**< number of inter-view ref pics in RefPicList1 */
+    unsigned char MVCReserved8Bits; /**< Reserved bits                                                               */
+    int InterViewRefsL0[16];        /**< view id of the i-th view component for inter-view prediction in RefPicList0 */
+    int InterViewRefsL1[16];        /**< view id of the i-th view component for inter-view prediction in RefPicList1 */
+} CUVIDH264MVCEXT;
+
+/*********************************************************/
+//! \struct CUVIDH264SVCEXT
+//! H.264 SVC picture parameters ext
+//! This structure is used in CUVIDH264PICPARAMS structure
+/*********************************************************/
+typedef struct _CUVIDH264SVCEXT { // shares storage with CUVIDH264MVCEXT in the union closing CUVIDH264PICPARAMS
+    unsigned char profile_idc; // NOTE(review): member names look like H.264 SVC syntax elements - confirm with spec
+    unsigned char level_idc;
+    unsigned char DQId;
+    unsigned char DQIdMax;
+    unsigned char disable_inter_layer_deblocking_filter_idc;
+    unsigned char ref_layer_chroma_phase_y_plus1;
+    signed char inter_layer_slice_alpha_c0_offset_div2;
+    signed char inter_layer_slice_beta_offset_div2;
+
+    unsigned short DPBEntryValidFlag;
+    unsigned char inter_layer_deblocking_filter_control_present_flag;
+    unsigned char extended_spatial_scalability_idc;
+    unsigned char adaptive_tcoeff_level_prediction_flag;
+    unsigned char slice_header_restriction_flag;
+    unsigned char chroma_phase_x_plus1_flag;
+    unsigned char chroma_phase_y_plus1;
+
+    unsigned char tcoeff_level_prediction_flag;
+    unsigned char constrained_intra_resampling_flag;
+    unsigned char ref_layer_chroma_phase_x_plus1_flag;
+    unsigned char store_ref_base_pic_flag;
+    unsigned char Reserved8BitsA; // reserved byte
+    unsigned char Reserved8BitsB; // reserved byte
+
+    short scaled_ref_layer_left_offset;
+    short scaled_ref_layer_top_offset;
+    short scaled_ref_layer_right_offset;
+    short scaled_ref_layer_bottom_offset;
+    unsigned short Reserved16Bits; // reserved
+    struct _CUVIDPICPARAMS *pNextLayer; /**< Points to the picparams for the next layer to be decoded.
+                                             Linked list ends at the target layer. */
+    int bRefBaseLayer;                  /**< whether to store ref base pic */
+} CUVIDH264SVCEXT;
+
+/******************************************************/
+//! \struct CUVIDH264PICPARAMS
+//! H.264 picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/******************************************************/
+typedef struct _CUVIDH264PICPARAMS { // members are grouped by the section markers below (SPS, PPS, DPB, ...)
+    // SPS
+    int log2_max_frame_num_minus4;
+    int pic_order_cnt_type;
+    int log2_max_pic_order_cnt_lsb_minus4;
+    int delta_pic_order_always_zero_flag;
+    int frame_mbs_only_flag;
+    int direct_8x8_inference_flag;
+    int num_ref_frames; // NOTE: shall meet level 4.1 restrictions
+    unsigned char residual_colour_transform_flag;
+    unsigned char bit_depth_luma_minus8;   // Must be 0 (only 8-bit supported)
+    unsigned char bit_depth_chroma_minus8; // Must be 0 (only 8-bit supported)
+    unsigned char qpprime_y_zero_transform_bypass_flag;
+    // PPS
+    int entropy_coding_mode_flag;
+    int pic_order_present_flag;
+    int num_ref_idx_l0_active_minus1;
+    int num_ref_idx_l1_active_minus1;
+    int weighted_pred_flag;
+    int weighted_bipred_idc;
+    int pic_init_qp_minus26;
+    int deblocking_filter_control_present_flag;
+    int redundant_pic_cnt_present_flag;
+    int transform_8x8_mode_flag;
+    int MbaffFrameFlag;
+    int constrained_intra_pred_flag;
+    int chroma_qp_index_offset;
+    int second_chroma_qp_index_offset;
+    int ref_pic_flag;
+    int frame_num;
+    int CurrFieldOrderCnt[2];
+    // DPB
+    CUVIDH264DPBENTRY dpb[16]; // List of reference frames within the DPB
+    // Quantization Matrices (raster-order)
+    unsigned char WeightScale4x4[6][16]; // 6 matrices of 16 entries
+    unsigned char WeightScale8x8[2][64]; // 2 matrices of 64 entries
+    // FMO/ASO
+    unsigned char fmo_aso_enable;
+    unsigned char num_slice_groups_minus1;
+    unsigned char slice_group_map_type;
+    signed char pic_init_qs_minus26;
+    unsigned int slice_group_change_rate_minus1;
+    union { // the same storage viewed either as an integer address or as a byte pointer
+        unsigned long long slice_group_map_addr;
+        const unsigned char *pMb2SliceGroupMap;
+    } fmo;
+    unsigned int Reserved[12];
+    // SVC/MVC
+    union { // anonymous union: mvcext and svcext occupy the same storage
+        CUVIDH264MVCEXT mvcext;
+        CUVIDH264SVCEXT svcext;
+    };
+} CUVIDH264PICPARAMS;
+
+/********************************************************/
+//! \struct CUVIDMPEG2PICPARAMS
+//! MPEG-2 picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/********************************************************/
+typedef struct _CUVIDMPEG2PICPARAMS {
+    int ForwardRefIdx;  // Picture index of forward reference (P/B-frames)
+    int BackwardRefIdx; // Picture index of backward reference (B-frames)
+    int picture_coding_type;
+    int full_pel_forward_vector;
+    int full_pel_backward_vector;
+    int f_code[2][2]; // NOTE(review): presumably [forward/backward][horizontal/vertical] - confirm with MPEG-2 spec
+    int intra_dc_precision;
+    int frame_pred_frame_dct;
+    int concealment_motion_vectors;
+    int q_scale_type;
+    int intra_vlc_format;
+    int alternate_scan;
+    int top_field_first;
+    // Quantization matrices (raster order), 64 entries each
+    unsigned char QuantMatrixIntra[64];
+    unsigned char QuantMatrixInter[64];
+} CUVIDMPEG2PICPARAMS;
+
+// MPEG-4 has VOP types instead of Picture types
+#define I_VOP 0 // NOTE(review): presumably the values of CUVIDMPEG4PICPARAMS::vop_coding_type - confirm
+#define P_VOP 1
+#define B_VOP 2
+#define S_VOP 3 // NOTE(review): these four macro names are unprefixed and visible to every includer
+
+/*******************************************************/
+//! \struct CUVIDMPEG4PICPARAMS
+//! MPEG-4 picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/*******************************************************/
+typedef struct _CUVIDMPEG4PICPARAMS {
+    int ForwardRefIdx;  // Picture index of forward reference (P/B-frames)
+    int BackwardRefIdx; // Picture index of backward reference (B-frames)
+    // VOL
+    int video_object_layer_width;
+    int video_object_layer_height;
+    int vop_time_increment_bitcount;
+    int top_field_first;
+    int resync_marker_disable;
+    int quant_type;
+    int quarter_sample;
+    int short_video_header;
+    int divx_flags;
+    // VOP
+    int vop_coding_type; // NOTE(review): presumably one of the I_VOP/P_VOP/B_VOP/S_VOP macros - confirm
+    int vop_coded;
+    int vop_rounding_type;
+    int alternate_vertical_scan_flag;
+    int interlaced;
+    int vop_fcode_forward;
+    int vop_fcode_backward;
+    int trd[2];
+    int trb[2];
+    // Quantization matrices (raster order), 64 entries each
+    unsigned char QuantMatrixIntra[64];
+    unsigned char QuantMatrixInter[64];
+    int gmc_enabled; // declared after the matrices; keep this position, it is part of the struct layout
+} CUVIDMPEG4PICPARAMS;
+
+/********************************************************/
+//! \struct CUVIDVC1PICPARAMS
+//! VC1 picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/********************************************************/
+typedef struct _CUVIDVC1PICPARAMS { // all members are plain int; grouped by the section markers below
+    int ForwardRefIdx;  /**< Picture index of forward reference (P/B-frames) */
+    int BackwardRefIdx; /**< Picture index of backward reference (B-frames)  */
+    int FrameWidth;     /**< Actual frame width                              */
+    int FrameHeight;    /**< Actual frame height                             */
+    // PICTURE
+    int intra_pic_flag;  /**< Set to 1 for I,BI frames */
+    int ref_pic_flag;    /**< Set to 1 for I,P frames  */
+    int progressive_fcm; /**< Progressive frame        */
+    // SEQUENCE (NOTE(review): names look like VC-1 sequence-header syntax elements - confirm with spec)
+    int profile;
+    int postprocflag;
+    int pulldown;
+    int interlace;
+    int tfcntrflag;
+    int finterpflag;
+    int psf;
+    int multires;
+    int syncmarker;
+    int rangered;
+    int maxbframes;
+    // ENTRYPOINT (NOTE(review): names look like VC-1 entry-point header syntax elements - confirm with spec)
+    int panscan_flag;
+    int refdist_flag;
+    int extended_mv;
+    int dquant;
+    int vstransform;
+    int loopfilter;
+    int fastuvmc;
+    int overlap;
+    int quantizer;
+    int extended_dmv;
+    int range_mapy_flag;
+    int range_mapy;
+    int range_mapuv_flag;
+    int range_mapuv;
+    int rangeredfrm; // range reduction state
+} CUVIDVC1PICPARAMS;
+
+/***********************************************************/
+//! \struct CUVIDJPEGPICPARAMS
+//! JPEG picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/***********************************************************/
+typedef struct _CUVIDJPEGPICPARAMS {
+    int Reserved; // only member: no JPEG-specific parameters are declared in this struct
+} CUVIDJPEGPICPARAMS;
+
+/*******************************************************/
+//! \struct CUVIDHEVCPICPARAMS
+//! HEVC picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/*******************************************************/
+typedef struct _CUVIDHEVCPICPARAMS { // members are grouped by the section markers below (sps, pps, RefPicSets, ...)
+    // sps
+    int pic_width_in_luma_samples;
+    int pic_height_in_luma_samples;
+    unsigned char log2_min_luma_coding_block_size_minus3;
+    unsigned char log2_diff_max_min_luma_coding_block_size;
+    unsigned char log2_min_transform_block_size_minus2;
+    unsigned char log2_diff_max_min_transform_block_size;
+    unsigned char pcm_enabled_flag;
+    unsigned char log2_min_pcm_luma_coding_block_size_minus3;
+    unsigned char log2_diff_max_min_pcm_luma_coding_block_size;
+    unsigned char pcm_sample_bit_depth_luma_minus1;
+
+    unsigned char pcm_sample_bit_depth_chroma_minus1;
+    unsigned char pcm_loop_filter_disabled_flag;
+    unsigned char strong_intra_smoothing_enabled_flag;
+    unsigned char max_transform_hierarchy_depth_intra;
+    unsigned char max_transform_hierarchy_depth_inter;
+    unsigned char amp_enabled_flag;
+    unsigned char separate_colour_plane_flag;
+    unsigned char log2_max_pic_order_cnt_lsb_minus4;
+
+    unsigned char num_short_term_ref_pic_sets;
+    unsigned char long_term_ref_pics_present_flag;
+    unsigned char num_long_term_ref_pics_sps;
+    unsigned char sps_temporal_mvp_enabled_flag;
+    unsigned char sample_adaptive_offset_enabled_flag;
+    unsigned char scaling_list_enable_flag;
+    unsigned char IrapPicFlag;
+    unsigned char IdrPicFlag;
+
+    unsigned char bit_depth_luma_minus8;
+    unsigned char bit_depth_chroma_minus8;
+    // sps/pps extension fields
+    unsigned char log2_max_transform_skip_block_size_minus2;
+    unsigned char log2_sao_offset_scale_luma;
+    unsigned char log2_sao_offset_scale_chroma;
+    unsigned char high_precision_offsets_enabled_flag;
+    unsigned char reserved1[10]; // reserved bytes
+
+    // pps
+    unsigned char dependent_slice_segments_enabled_flag;
+    unsigned char slice_segment_header_extension_present_flag;
+    unsigned char sign_data_hiding_enabled_flag;
+    unsigned char cu_qp_delta_enabled_flag;
+    unsigned char diff_cu_qp_delta_depth;
+    signed char init_qp_minus26;
+    signed char pps_cb_qp_offset;
+    signed char pps_cr_qp_offset;
+
+    unsigned char constrained_intra_pred_flag;
+    unsigned char weighted_pred_flag;
+    unsigned char weighted_bipred_flag;
+    unsigned char transform_skip_enabled_flag;
+    unsigned char transquant_bypass_enabled_flag;
+    unsigned char entropy_coding_sync_enabled_flag;
+    unsigned char log2_parallel_merge_level_minus2;
+    unsigned char num_extra_slice_header_bits;
+
+    unsigned char loop_filter_across_tiles_enabled_flag;
+    unsigned char loop_filter_across_slices_enabled_flag;
+    unsigned char output_flag_present_flag;
+    unsigned char num_ref_idx_l0_default_active_minus1;
+    unsigned char num_ref_idx_l1_default_active_minus1;
+    unsigned char lists_modification_present_flag;
+    unsigned char cabac_init_present_flag;
+    unsigned char pps_slice_chroma_qp_offsets_present_flag;
+
+    unsigned char deblocking_filter_override_enabled_flag;
+    unsigned char pps_deblocking_filter_disabled_flag;
+    signed char pps_beta_offset_div2;
+    signed char pps_tc_offset_div2;
+    unsigned char tiles_enabled_flag;
+    unsigned char uniform_spacing_flag;
+    unsigned char num_tile_columns_minus1;
+    unsigned char num_tile_rows_minus1;
+
+    unsigned short column_width_minus1[21]; // room for 21 entries
+    unsigned short row_height_minus1[21];   // room for 21 entries
+
+    // sps and pps extension HEVC-main 444
+    unsigned char sps_range_extension_flag;
+    unsigned char transform_skip_rotation_enabled_flag;
+    unsigned char transform_skip_context_enabled_flag;
+    unsigned char implicit_rdpcm_enabled_flag;
+
+    unsigned char explicit_rdpcm_enabled_flag;
+    unsigned char extended_precision_processing_flag;
+    unsigned char intra_smoothing_disabled_flag;
+    unsigned char persistent_rice_adaptation_enabled_flag;
+
+    unsigned char cabac_bypass_alignment_enabled_flag;
+    unsigned char pps_range_extension_flag;
+    unsigned char cross_component_prediction_enabled_flag;
+    unsigned char chroma_qp_offset_list_enabled_flag;
+
+    unsigned char diff_cu_chroma_qp_offset_depth;
+    unsigned char chroma_qp_offset_list_len_minus1;
+    signed char cb_qp_offset_list[6];
+
+    signed char cr_qp_offset_list[6];
+    unsigned char reserved2[2]; // reserved bytes
+
+    unsigned int reserved3[8]; // reserved
+
+    // RefPicSets
+    int NumBitsForShortTermRPSInSlice;
+    int NumDeltaPocsOfRefRpsIdx;
+    int NumPocTotalCurr;
+    int NumPocStCurrBefore;
+    int NumPocStCurrAfter;
+    int NumPocLtCurr;
+    int CurrPicOrderCntVal;
+    int RefPicIdx[16];                      // [refpic] Indices of valid reference pictures (-1 if unused for reference)
+    int PicOrderCntVal[16];                 // [refpic]
+    unsigned char IsLongTerm[16];           // [refpic] 0=not a long-term reference, 1=long-term reference
+    unsigned char RefPicSetStCurrBefore[8]; // [0..NumPocStCurrBefore-1] -> refpic (0..15)
+    unsigned char RefPicSetStCurrAfter[8];  // [0..NumPocStCurrAfter-1] -> refpic (0..15)
+    unsigned char RefPicSetLtCurr[8];       // [0..NumPocLtCurr-1] -> refpic (0..15)
+    unsigned char RefPicSetInterLayer0[8];
+    unsigned char RefPicSetInterLayer1[8];
+    unsigned int reserved4[12]; // reserved
+
+    // scaling lists (diag order)
+    unsigned char ScalingList4x4[6][16];      // [matrixId][i]
+    unsigned char ScalingList8x8[6][64];      // [matrixId][i]
+    unsigned char ScalingList16x16[6][64];    // [matrixId][i], 64 entries per matrix as declared
+    unsigned char ScalingList32x32[2][64];    // [matrixId][i], 64 entries per matrix as declared
+    unsigned char ScalingListDCCoeff16x16[6]; // [matrixId]
+    unsigned char ScalingListDCCoeff32x32[2]; // [matrixId]
+} CUVIDHEVCPICPARAMS;
+
+/***********************************************************/
+//! \struct CUVIDVP8PICPARAMS
+//! VP8 picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/***********************************************************/
+typedef struct _CUVIDVP8PICPARAMS {
+    int width;
+    int height;
+    unsigned int first_partition_size;
+    // Frame Indexes
+    unsigned char LastRefIdx;
+    unsigned char GoldenRefIdx;
+    unsigned char AltRefIdx;
+    union { // anonymous union: the bit-fields (8 bits in total) and wFrameTagFlags share storage
+        struct {
+            unsigned char frame_type : 1; /**< 0 = KEYFRAME, 1 = INTERFRAME  */
+            unsigned char version : 3;
+            unsigned char show_frame : 1;
+            unsigned char update_mb_segmentation_data : 1; /**< Must be 0 if segmentation is not enabled */
+            unsigned char Reserved2Bits : 2;
+        } vp8_frame_tag;
+        unsigned char wFrameTagFlags; // whole-byte view of vp8_frame_tag
+    };
+    unsigned char Reserved1[4]; // reserved bytes
+    unsigned int Reserved2[3];  // reserved
+} CUVIDVP8PICPARAMS;
+
+/***********************************************************/
+//! \struct CUVIDVP9PICPARAMS
+//! VP9 picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/***********************************************************/
+typedef struct _CUVIDVP9PICPARAMS {
+    unsigned int width;
+    unsigned int height;
+
+    // Frame Indices
+    unsigned char LastRefIdx;
+    unsigned char GoldenRefIdx;
+    unsigned char AltRefIdx;
+    unsigned char colorSpace;
+
+    unsigned short profile : 3; // this run of unsigned short bit-fields adds up to 16 bits
+    unsigned short frameContextIdx : 2;
+    unsigned short frameType : 1;
+    unsigned short showFrame : 1;
+    unsigned short errorResilient : 1;
+    unsigned short frameParallelDecoding : 1;
+    unsigned short subSamplingX : 1;
+    unsigned short subSamplingY : 1;
+    unsigned short intraOnly : 1;
+    unsigned short allow_high_precision_mv : 1;
+    unsigned short refreshEntropyProbs : 1;
+    unsigned short reserved2Bits : 2;
+
+    unsigned short reserved16Bits; // reserved
+
+    unsigned char refFrameSignBias[4];
+
+    unsigned char bitDepthMinus8Luma;
+    unsigned char bitDepthMinus8Chroma;
+    unsigned char loopFilterLevel;
+    unsigned char loopFilterSharpness;
+
+    unsigned char modeRefLfEnabled;
+    unsigned char log2_tile_columns;
+    unsigned char log2_tile_rows;
+
+    unsigned char segmentEnabled : 1; // this run of unsigned char bit-fields adds up to 8 bits
+    unsigned char segmentMapUpdate : 1;
+    unsigned char segmentMapTemporalUpdate : 1;
+    unsigned char segmentFeatureMode : 1;
+    unsigned char reserved4Bits : 4;
+
+    unsigned char segmentFeatureEnable[8][4];
+    short segmentFeatureData[8][4]; // same [8][4] shape as segmentFeatureEnable
+    unsigned char mb_segment_tree_probs[7];
+    unsigned char segment_pred_probs[3];
+    unsigned char reservedSegment16Bits[2]; // two reserved bytes
+
+    int qpYAc;
+    int qpYDc;
+    int qpChDc;
+    int qpChAc;
+
+    unsigned int activeRefIdx[3];
+    unsigned int resetFrameContext;
+    unsigned int mcomp_filter_type;
+    unsigned int mbRefLfDelta[4];
+    unsigned int mbModeLfDelta[2];
+    unsigned int frameTagSize;
+    unsigned int offsetToDctParts;
+    unsigned int reserved128Bits[4]; // reserved
+
+} CUVIDVP9PICPARAMS;
+
+/***********************************************************/
+//! \struct CUVIDAV1PICPARAMS
+//! AV1 picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/***********************************************************/
+typedef struct _CUVIDAV1PICPARAMS {
+    unsigned int width;        // coded width, if superres enabled then it is upscaled width
+    unsigned int height;       // coded height
+    unsigned int frame_offset; // defined as order_hint in AV1 specification
+    int decodePicIdx; // decoded output pic index; if film grain is enabled, it keeps the decoded (without film grain)
+                      // output, which can be used as reference frame for future frames
+
+    // sequence header (the bit-fields below total 32 bits)
+    unsigned int profile : 3;                // 0 = profile0, 1 = profile1, 2 = profile2
+    unsigned int use_128x128_superblock : 1; // superblock size 0:64x64, 1: 128x128
+    unsigned int subsampling_x : 1;          // (subsampling_x, _y) 1,1 = 420, 1,0 = 422, 0,0 = 444
+    unsigned int subsampling_y : 1;
+    unsigned int mono_chrome : 1;      // for monochrome content, mono_chrome = 1 and (subsampling_x, _y) should be 1,1
+    unsigned int bit_depth_minus8 : 4; // bit depth minus 8
+    unsigned int enable_filter_intra : 1;        // tool enable in seq level, 0 : disable 1: frame header control
+    unsigned int enable_intra_edge_filter : 1;   // intra edge filtering process, 0 : disable 1: enabled
+    unsigned int enable_interintra_compound : 1; // interintra, 0 : not present 1: present
+    unsigned int enable_masked_compound : 1;     // 1: mode info for inter blocks may contain the syntax element
+                                                 // compound_type. 0: syntax element compound_type will not be present
+    unsigned int enable_dual_filter : 1;         // vertical and horiz filter selection, 1: enable and 0: disable
+    unsigned int enable_order_hint : 1;          // order hint, and related tools, 1: enable and 0: disable
+    unsigned int order_hint_bits_minus1 : 3;     // is used to compute OrderHintBits
+    unsigned int enable_jnt_comp : 1;            // joint compound modes, 1: enable and 0: disable
+    unsigned int enable_superres : 1;            // superres in seq level, 0 : disable 1: frame level control
+    unsigned int enable_cdef : 1;                // cdef filtering in seq level, 0 : disable 1: frame level control
+    unsigned int enable_restoration : 1; // loop restoration filtering in seq level, 0 : disable 1: frame level control
+    unsigned int enable_fgs : 1;         // defined as film_grain_params_present in AV1 specification
+    unsigned int reserved0_7bits : 7;    // reserved bits; must be set to 0
+
+    // frame header (the bit-fields below total 32 bits)
+    unsigned int frame_type : 2;         // 0:Key frame, 1:Inter frame, 2:intra only, 3:s-frame
+    unsigned int show_frame : 1;         // show_frame = 1 implies that frame should be immediately output once decoded
+    unsigned int disable_cdf_update : 1; // CDF update during symbol decoding, 1: disabled, 0: enabled
+    unsigned int
+        allow_screen_content_tools : 1; // 1: intra blocks may use palette encoding, 0: palette encoding is never used
+    unsigned int force_integer_mv : 1;  // 1: motion vectors will always be integers, 0: can contain fractional bits
+    unsigned int coded_denom : 3;       // coded_denom of the superres scale as specified in AV1 specification
+    unsigned int allow_intrabc : 1;     // 1: intra block copy may be used, 0: intra block copy is not allowed
+    unsigned int allow_high_precision_mv : 1; // 1/8 precision mv enable
+    unsigned int interp_filter : 3;           // interpolation filter. Refer to section 6.8.9 of the AV1 specification
+                                              // Version 1.0.0 with Errata 1
+    unsigned int switchable_motion_mode : 1;  // defined as is_motion_mode_switchable in AV1 specification
+    unsigned int use_ref_frame_mvs : 1; // 1: current frame can use the previous frame mv information, 0: will not use.
+    unsigned int disable_frame_end_update_cdf : 1; // 1: indicates that the end of frame CDF update is disabled
+    unsigned int delta_q_present : 1;              // quantizer index delta values are present in the block level
+    unsigned int delta_q_res : 2;    // left shift which should be applied to decoded quantizer index delta values
+    unsigned int using_qmatrix : 1;  // 1: quantizer matrix will be used to compute quantizers
+    unsigned int coded_lossless : 1; // 1: all segments use lossless coding
+    unsigned int use_superres : 1;   // 1: superres enabled for frame
+    unsigned int tx_mode : 2;        // 0: ONLY4x4,1:LARGEST,2:SELECT
+    unsigned int reference_mode : 1; // 0: SINGLE, 1: SELECT
+    unsigned int
+        allow_warped_motion : 1; // 1: allow_warped_motion may be present, 0: allow_warped_motion will not be present
+    unsigned int reduced_tx_set : 1;  // 1: frame is restricted to subset of the full set of transform types, 0: no such
+                                      // restriction
+    unsigned int skip_mode : 1;       // 1: most of the mode info is skipped, 0: mode info is not skipped
+    unsigned int reserved1_3bits : 3; // reserved bits; must be set to 0
+
+    // tiling info
+    unsigned int num_tile_cols : 8;           // number of tiles across the frame, max is 64
+    unsigned int num_tile_rows : 8;           // number of tiles down the frame, max is 64
+    unsigned int context_update_tile_id : 16; // specifies which tile to use for the CDF update
+    unsigned short tile_widths[64];           // Width of each column in superblocks
+    unsigned short tile_heights[64];          // height of each row in superblocks
+
+    // CDEF - refer to section 6.10.14 of the AV1 specification Version 1.0.0 with Errata 1
+    unsigned char cdef_damping_minus_3 : 2; // controls the amount of damping in the deringing filter
+    unsigned char cdef_bits : 2;            // the number of bits needed to specify which CDEF filter to apply
+    unsigned char reserved2_4bits : 4;      // reserved bits; must be set to 0
+    unsigned char cdef_y_strength[8];       // 0-3 bits: y_pri_strength, 4-7 bits y_sec_strength
+    unsigned char cdef_uv_strength[8];      // 0-3 bits: uv_pri_strength, 4-7 bits uv_sec_strength
+
+    // SkipModeFrames
+    unsigned char
+        SkipModeFrame0 : 4; // specifies the frames to use for compound prediction when skip_mode is equal to 1.
+    unsigned char SkipModeFrame1 : 4;
+
+    // qp information - refer to section 6.8.11 of the AV1 specification Version 1.0.0 with Errata 1
+    unsigned char base_qindex; // indicates the base frame qindex. Defined as base_q_idx in AV1 specification
+    char qp_y_dc_delta_q;      // indicates the Y DC quantizer relative to base_q_idx. Defined as DeltaQYDc in AV1
+                               // specification
+    char qp_u_dc_delta_q;      // indicates the U DC quantizer relative to base_q_idx. Defined as DeltaQUDc in AV1
+                               // specification
+    char qp_v_dc_delta_q;      // indicates the V DC quantizer relative to base_q_idx. Defined as DeltaQVDc in AV1
+                               // specification
+    char qp_u_ac_delta_q;      // indicates the U AC quantizer relative to base_q_idx. Defined as DeltaQUAc in AV1
+                               // specification
+    char qp_v_ac_delta_q;      // indicates the V AC quantizer relative to base_q_idx. Defined as DeltaQVAc in AV1
+                               // specification
+    unsigned char qm_y; // specifies the level in the quantizer matrix that should be used for luma plane decoding
+    unsigned char qm_u; // specifies the level in the quantizer matrix that should be used for chroma U plane decoding
+    unsigned char qm_v; // specifies the level in the quantizer matrix that should be used for chroma V plane decoding
+
+    // segmentation - refer to section 6.8.13 of the AV1 specification Version 1.0.0 with Errata 1
+    unsigned char segmentation_enabled : 1;    // 1 indicates that this frame makes use of the segmentation tool
+    unsigned char segmentation_update_map : 1; // 1 indicates that the segmentation map are updated during the decoding
+                                               // of this frame
+    unsigned char
+        segmentation_update_data : 1; // 1 indicates that new parameters are about to be specified for each segment
+    unsigned char segmentation_temporal_update : 1; // 1 indicates that the updates to the segmentation map are coded
+                                                    // relative to the existing segmentation map
+    unsigned char reserved3_4bits : 4;              // reserved bits; must be set to 0
+    short segmentation_feature_data[8][8];          // specifies the feature data for a segment feature
+    unsigned char
+        segmentation_feature_mask[8]; // indicates that the corresponding feature is unused or feature value is coded
+
+    // loopfilter - refer to section 6.8.10 of the AV1 specification Version 1.0.0 with Errata 1
+    unsigned char loop_filter_level[2];  // contains loop filter strength values
+    unsigned char loop_filter_level_u;   // loop filter strength value of U plane
+    unsigned char loop_filter_level_v;   // loop filter strength value of V plane
+    unsigned char loop_filter_sharpness; // indicates the sharpness level
+    char loop_filter_ref_deltas[8]; // contains the adjustment needed for the filter level based on the chosen reference
+                                    // frame
+    char loop_filter_mode_deltas[2]; // contains the adjustment needed for the filter level based on the chosen mode
+    unsigned char loop_filter_delta_enabled : 1; // indicates that the filter level depends on the mode and reference
+                                                 // frame used to predict a block
+    unsigned char loop_filter_delta_update : 1;  // indicates that additional syntax elements are present that specify
+                                                 // which mode and reference frame deltas are to be updated
+    unsigned char delta_lf_present : 1; // specifies whether loop filter delta values are present in the block level
+    unsigned char delta_lf_res : 2;     // specifies the left shift to apply to the decoded loop filter values
+    unsigned char delta_lf_multi : 1;   // separate loop filter deltas for Hy,Vy,U,V edges
+    unsigned char reserved4_2bits : 2;  // reserved bits; must be set to 0
+
+    // restoration - refer to section 6.10.15 of the AV1 specification Version 1.0.0 with Errata 1
+    unsigned char lr_unit_size[3]; // specifies the size of loop restoration units: 0: 32, 1: 64, 2: 128, 3: 256
+    unsigned char lr_type[3];      // used to compute FrameRestorationType
+
+    // reference frames
+    unsigned char primary_ref_frame; // specifies which reference frame contains the CDF values and other state that
+                                     // should be loaded at the start of the frame
+    unsigned char ref_frame_map[8];  // frames in dpb that can be used as reference for current or future frames
+
+    unsigned char temporal_layer_id : 4; // temporal layer id
+    unsigned char spatial_layer_id : 4;  // spatial layer id
+
+    unsigned char reserved5_32bits[4]; // reserved bits; must be set to 0
+
+    // ref frame list
+    struct {
+        unsigned int width;
+        unsigned int height;
+        unsigned char index;
+        unsigned char reserved24Bits[3]; // reserved bits; must be set to 0
+    } ref_frame[7];                      // frames used as reference frame for current frame.
+
+    // global motion
+    struct {
+        unsigned char invalid : 1;
+        unsigned char wmtype : 2;        // defined as GmType in AV1 specification
+        unsigned char reserved5Bits : 5; // reserved bits; must be set to 0
+        char reserved24Bits[3];          // reserved bits; must be set to 0
+        int wmmat[6];                    // defined as gm_params[] in AV1 specification
+    } global_motion[7];                  // global motion params for reference frames
+
+    // film grain params - refer to section 6.8.20 of the AV1 specification Version 1.0.0 with Errata 1
+    unsigned short apply_grain : 1; // first of a run of bit-fields whose widths total 16
+    unsigned short overlap_flag : 1;
+    unsigned short scaling_shift_minus8 : 2;
+    unsigned short chroma_scaling_from_luma : 1;
+    unsigned short ar_coeff_lag : 2;
+    unsigned short ar_coeff_shift_minus6 : 2;
+    unsigned short grain_scale_shift : 2;
+    unsigned short clip_to_restricted_range : 1;
+    unsigned short reserved6_4bits : 4; // reserved bits; must be set to 0
+    unsigned char num_y_points;
+    unsigned char scaling_points_y[14][2];
+    unsigned char num_cb_points;
+    unsigned char scaling_points_cb[10][2];
+    unsigned char num_cr_points;
+    unsigned char scaling_points_cr[10][2];
+    unsigned char reserved7_8bits; // reserved bits; must be set to 0
+    unsigned short random_seed;
+    short ar_coeffs_y[24];
+    short ar_coeffs_cb[25];
+    short ar_coeffs_cr[25];
+    unsigned char cb_mult;
+    unsigned char cb_luma_mult;
+    short cb_offset;
+    unsigned char cr_mult;
+    unsigned char cr_luma_mult;
+    short cr_offset;
+
+    int reserved[7]; // reserved bits; must be set to 0
+} CUVIDAV1PICPARAMS;
+
+/******************************************************************************************/
+//! \struct CUVIDPICPARAMS
+//! Picture parameters for decoding
+//! This structure is used in cuvidDecodePicture API
+//! IN  for cuvidDecodePicture
+/******************************************************************************************/
+typedef struct _CUVIDPICPARAMS {
+    int PicWidthInMbs;     /**< IN: Coded frame width in macroblocks                          */
+    int FrameHeightInMbs;  /**< IN: Coded frame height in macroblocks                         */
+    int CurrPicIdx;        /**< IN: Output index of the current picture                       */
+    int field_pic_flag;    /**< IN: 0=frame picture, 1=field picture                          */
+    int bottom_field_flag; /**< IN: 0=top field, 1=bottom field (ignored if field_pic_flag=0) */
+    int second_field;      /**< IN: Second field of a complementary field pair                */
+    // Bitstream data
+    unsigned int nBitstreamDataLen;        /**< IN: Number of bytes in bitstream data buffer                  */
+    const unsigned char *pBitstreamData;   /**< IN: Ptr to bitstream data for this picture (slice-layer)      */
+    unsigned int nNumSlices;               /**< IN: Number of slices in this picture                          */
+    const unsigned int *pSliceDataOffsets; /**< IN: nNumSlices entries, contains offset of each slice within
+                                                        the bitstream data buffer                             */
+    int ref_pic_flag;                      /**< IN: This picture is a reference picture                       */
+    int intra_pic_flag;                    /**< IN: This picture is entirely intra coded                      */
+    unsigned int Reserved[30];             /**< Reserved for future use                                       */
+    // IN: Codec-specific data
+    union {
+        CUVIDMPEG2PICPARAMS mpeg2; /**< Also used for MPEG-1 */
+        CUVIDH264PICPARAMS h264;
+        CUVIDVC1PICPARAMS vc1;
+        CUVIDMPEG4PICPARAMS mpeg4;
+        CUVIDJPEGPICPARAMS jpeg;
+        CUVIDHEVCPICPARAMS hevc;
+        CUVIDVP8PICPARAMS vp8;
+        CUVIDVP9PICPARAMS vp9;
+        CUVIDAV1PICPARAMS av1;
+        unsigned int CodecReserved[1024]; // sizes the union to at least 1024 unsigned ints regardless of codec
+    } CodecSpecific;
+} CUVIDPICPARAMS;
+
+/******************************************************/
+//! \struct CUVIDPROCPARAMS
+//! Picture parameters for postprocessing
+//! This structure is used in cuvidMapVideoFrame API
+/******************************************************/
+typedef struct _CUVIDPROCPARAMS {
+    int progressive_frame; /**< IN: Input is progressive (deinterlace_mode will be ignored)                */
+    int second_field;      /**< IN: Output the second field (ignored if deinterlace mode is Weave)         */
+    int top_field_first;   /**< IN: Input frame is top field first (1st field is top, 2nd field is bottom) */
+    int unpaired_field;    /**< IN: Input only contains one field (2nd field is invalid)                   */
+    // The fields below are used for the raw YUV input/output extensions
+    unsigned int reserved_flags; /**< Reserved for future use (set to zero)                                      */
+    unsigned int reserved_zero;  /**< Reserved (set to zero)                                                     */
+    unsigned long long raw_input_dptr; /**< IN: Input CUdeviceptr for raw YUV extensions */
+    unsigned int raw_input_pitch;  /**< IN: pitch in bytes of raw YUV input (should be aligned appropriately)      */
+    unsigned int raw_input_format; /**< IN: Input YUV format (cudaVideoCodec_enum)                                 */
+    unsigned long long raw_output_dptr; /**< IN: Output CUdeviceptr for raw YUV extensions */
+    unsigned int raw_output_pitch; /**< IN: pitch in bytes of raw YUV output (should be aligned appropriately)     */
+    unsigned int Reserved1;        /**< Reserved for future use (set to zero)                                      */
+    CUstream output_stream;        /**< IN: stream object used by cuvidMapVideoFrame                               */
+    unsigned int Reserved[46];     /**< Reserved for future use (set to zero)                                      */
+    unsigned long long *histogram_dptr; /**< OUT: Output CUdeviceptr for histogram extensions */
+    void *Reserved2[1]; /**< Reserved for future use (set to zero)                                      */
+} CUVIDPROCPARAMS;
+
+/*********************************************************************************************************/
+//! \struct CUVIDGETDECODESTATUS
+//! Struct for reporting decode status.
+//! This structure is used in cuvidGetDecodeStatus API.
+/*********************************************************************************************************/
+typedef struct _CUVIDGETDECODESTATUS {
+    cuvidDecodeStatus decodeStatus; // decode status reported through cuvidGetDecodeStatus
+    unsigned int reserved[31];      // reserved; NOTE(review): presumably should be zeroed by the caller - confirm
+    void *pReserved[8];             // reserved pointer slots
+} CUVIDGETDECODESTATUS;
+
+/****************************************************/
+//! \struct CUVIDRECONFIGUREDECODERINFO
+//! Struct for decoder reset
+//! This structure is used in cuvidReconfigureDecoder() API
+/****************************************************/
+typedef struct _CUVIDRECONFIGUREDECODERINFO {
+    unsigned int
+        ulWidth; /**< IN: Coded sequence width in pixels, MUST be <= ulMaxWidth defined in CUVIDDECODECREATEINFO  */
+    unsigned int
+        ulHeight; /**< IN: Coded sequence height in pixels, MUST be <= ulMaxHeight defined in CUVIDDECODECREATEINFO  */
+    unsigned int ulTargetWidth;       /**< IN: Post processed output width */
+    unsigned int ulTargetHeight;      /**< IN: Post Processed output height */
+    unsigned int ulNumDecodeSurfaces; /**< IN: Maximum number of internal decode surfaces */
+    unsigned int reserved1[12];       /**< Reserved for future use. Set to Zero */
+    /**
+     * IN: Area of frame to be displayed. Use-case : Source Cropping
+     */
+    struct {
+        short left;
+        short top;
+        short right;
+        short bottom;
+    } display_area;
+    /**
+     * IN: Target Rectangle in the OutputFrame. Use-case : Aspect ratio Conversion
+     */
+    struct {
+        short left;
+        short top;
+        short right;
+        short bottom;
+    } target_rect;
+    unsigned int reserved2[11]; /**< Reserved for future use. Set to Zero */
+} CUVIDRECONFIGUREDECODERINFO;
+
+/***********************************************************************************************************/
+//! VIDEO_DECODER
+//!
+//! In order to minimize decode latencies, there should always be at least 2 pictures in the decode
+//! queue at any time, in order to make sure that all decode engines are always busy.
+//!
+//! Overall data flow:
+//!  - cuvidGetDecoderCaps(...)
+//!  - cuvidCreateDecoder(...)
+//!  - For each picture:
+//!    + cuvidDecodePicture(N)
+//!    + cuvidMapVideoFrame(N-4)
+//!    + do some processing in cuda
+//!    + cuvidUnmapVideoFrame(N-4)
+//!    + cuvidDecodePicture(N+1)
+//!    + cuvidMapVideoFrame(N-3)
+//!    + ...
+//!  - cuvidDestroyDecoder(...)
+//!
+//! NOTE:
+//! - When the cuda context is created from a D3D device, the D3D device must also be created
+//!   with the D3DCREATE_MULTITHREADED flag.
+//! - There is a limit to how many pictures can be mapped simultaneously (ulNumOutputSurfaces)
+//! - cuvidDecodePicture may block the calling thread if there are too many pictures pending
+//!   in the decode queue
+/***********************************************************************************************************/
+
+/**********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidGetDecoderCaps(CUVIDDECODECAPS *pdc)
+//! Queries decode capabilities of NVDEC-HW based on CodecType, ChromaFormat and BitDepthMinus8 parameters.
+//! 1. Application fills IN parameters CodecType, ChromaFormat and BitDepthMinus8 of CUVIDDECODECAPS structure
+//! 2. On calling cuvidGetDecoderCaps, driver fills OUT parameters if the IN parameters are supported
+//!    If IN parameters passed to the driver are not supported by NVDEC-HW, then all OUT params are set to 0.
+//! E.g. on Geforce GTX 960:
+//!   App fills - eCodecType = cudaVideoCodec_H264; eChromaFormat = cudaVideoChromaFormat_420; nBitDepthMinus8 = 0;
+//!   Given IN parameters are supported, hence driver fills: bIsSupported = 1; nMinWidth   = 48; nMinHeight  = 16;
+//!   nMaxWidth = 4096; nMaxHeight = 4096; nMaxMBCount = 65536;
+//! CodedWidth*CodedHeight/256 must be less than or equal to nMaxMBCount
+/**********************************************************************************************************************/
+extern CUresult CUDAAPI cuvidGetDecoderCaps(CUVIDDECODECAPS *pdc);
+
+/*****************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCreateDecoder(CUvideodecoder *phDecoder, CUVIDDECODECREATEINFO *pdci)
+//! Create the decoder object based on pdci. A handle to the created decoder is returned
+/*****************************************************************************************************/
+extern CUresult CUDAAPI cuvidCreateDecoder(CUvideodecoder *phDecoder, CUVIDDECODECREATEINFO *pdci);
+
+/*****************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidDestroyDecoder(CUvideodecoder hDecoder)
+//! Destroy the decoder object
+/*****************************************************************************************************/
+extern CUresult CUDAAPI cuvidDestroyDecoder(CUvideodecoder hDecoder);
+
+/*****************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidDecodePicture(CUvideodecoder hDecoder, CUVIDPICPARAMS *pPicParams)
+//! Decode a single picture (field or frame)
+//! Kicks off HW decoding
+/*****************************************************************************************************/
+extern CUresult CUDAAPI cuvidDecodePicture(CUvideodecoder hDecoder, CUVIDPICPARAMS *pPicParams);
+
+/************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidGetDecodeStatus(CUvideodecoder hDecoder, int nPicIdx, CUVIDGETDECODESTATUS *pDecodeStatus)
+//! Get the decode status for frame corresponding to nPicIdx
+//! API is supported for Maxwell and above generation GPUs.
+//! API is currently supported for HEVC, H264 and JPEG codecs.
+//! API returns CUDA_ERROR_NOT_SUPPORTED error code for unsupported GPU or codec.
+/************************************************************************************************************/
+extern CUresult CUDAAPI cuvidGetDecodeStatus(CUvideodecoder hDecoder, int nPicIdx, CUVIDGETDECODESTATUS *pDecodeStatus);
+
+/*********************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidReconfigureDecoder(CUvideodecoder hDecoder,
+//!                                              CUVIDRECONFIGUREDECODERINFO *pDecReconfigParams)
+//! Used to reuse single decoder for multiple clips. Currently supports resolution change, resize params, display area
+//! params, target area params change for same codec. Must be called during CUVIDPARSERPARAMS::pfnSequenceCallback
+/*********************************************************************************************************/
+extern CUresult CUDAAPI cuvidReconfigureDecoder(CUvideodecoder hDecoder,
+                                                CUVIDRECONFIGUREDECODERINFO *pDecReconfigParams);
+
+#if !defined(__CUVID_DEVPTR64) || defined(__CUVID_INTERNAL)
+/************************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidMapVideoFrame(CUvideodecoder hDecoder, int nPicIdx, unsigned int *pDevPtr,
+//!                                         unsigned int *pPitch, CUVIDPROCPARAMS *pVPP);
+//! Post-process and map video frame corresponding to nPicIdx for use in cuda. Returns cuda device pointer and
+//! associated pitch of the video frame
+/************************************************************************************************************************/
+extern CUresult CUDAAPI cuvidMapVideoFrame(CUvideodecoder hDecoder, int nPicIdx, unsigned int *pDevPtr,
+                                           unsigned int *pPitch, CUVIDPROCPARAMS *pVPP);
+
+/*****************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidUnmapVideoFrame(CUvideodecoder hDecoder, unsigned int DevPtr)
+//! Unmap a previously mapped video frame
+/*****************************************************************************************************/
+extern CUresult CUDAAPI cuvidUnmapVideoFrame(CUvideodecoder hDecoder, unsigned int DevPtr);
+#endif
+
+/****************************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidMapVideoFrame64(CUvideodecoder hDecoder, int nPicIdx, unsigned long long *pDevPtr,
+//!                                           unsigned int * pPitch, CUVIDPROCPARAMS *pVPP);
+//! Post-process and map video frame corresponding to nPicIdx for use in cuda. Returns cuda device pointer and
+//! associated pitch of the video frame
+/****************************************************************************************************************************/
+extern CUresult CUDAAPI cuvidMapVideoFrame64(CUvideodecoder hDecoder, int nPicIdx, unsigned long long *pDevPtr,
+                                             unsigned int *pPitch, CUVIDPROCPARAMS *pVPP);
+
+/**************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidUnmapVideoFrame64(CUvideodecoder hDecoder, unsigned long long DevPtr);
+//! Unmap a previously mapped video frame
+/**************************************************************************************************/
+extern CUresult CUDAAPI cuvidUnmapVideoFrame64(CUvideodecoder hDecoder, unsigned long long DevPtr);
+
+#if defined(__CUVID_DEVPTR64) && !defined(__CUVID_INTERNAL)
+#define cuvidMapVideoFrame cuvidMapVideoFrame64
+#define cuvidUnmapVideoFrame cuvidUnmapVideoFrame64
+#endif
+
+/********************************************************************************************************************/
+//!
+//! Context-locking: to facilitate multi-threaded implementations, the following 4 functions
+//! provide a simple mutex-style host synchronization. If a non-NULL context is specified
+//! in CUVIDDECODECREATEINFO, the codec library will acquire the mutex associated with the given
+//! context before making any cuda calls.
+//! A multi-threaded application could create a lock associated with a context handle so that
+//! multiple threads can safely share the same cuda context:
+//!  - use cuCtxPopCurrent immediately after context creation in order to create a 'floating' context
+//!    that can be passed to cuvidCtxLockCreate.
+//!  - When using a floating context, all cuda calls should only be made within a cuvidCtxLock/cuvidCtxUnlock section.
+//!
+//! NOTE: This is a safer alternative to cuCtxPushCurrent and cuCtxPopCurrent, and is not related to video
+//! decoder in any way (implemented as a critical section associated with cuCtx{Push|Pop}Current calls).
+/********************************************************************************************************************/
+
+/********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCtxLockCreate(CUvideoctxlock *pLock, CUcontext ctx)
+//! This API is used to create CtxLock object
+/********************************************************************************************************************/
+extern CUresult CUDAAPI cuvidCtxLockCreate(CUvideoctxlock *pLock, CUcontext ctx);
+
+/********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCtxLockDestroy(CUvideoctxlock lck)
+//! This API is used to free CtxLock object
+/********************************************************************************************************************/
+extern CUresult CUDAAPI cuvidCtxLockDestroy(CUvideoctxlock lck);
+
+/********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCtxLock(CUvideoctxlock lck, unsigned int reserved_flags)
+//! This API is used to acquire ctxlock
+/********************************************************************************************************************/
+extern CUresult CUDAAPI cuvidCtxLock(CUvideoctxlock lck, unsigned int reserved_flags);
+
+/********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCtxUnlock(CUvideoctxlock lck, unsigned int reserved_flags)
+//! This API is used to release ctxlock
+/********************************************************************************************************************/
+extern CUresult CUDAAPI cuvidCtxUnlock(CUvideoctxlock lck, unsigned int reserved_flags);
+
+/**********************************************************************************************/
+
+#if defined(__cplusplus)
+}
+// Auto-lock helper for C++ applications: acquires the context lock on construction, releases it on destruction
+class CCtxAutoLock {
+  private:
+    CUvideoctxlock m_ctx; // lock handle passed to cuvidCtxLock/cuvidCtxUnlock
+    // NOTE(review): copy operations are not disabled, so copying an instance would call cuvidCtxUnlock once per copy
+  public:
+    CCtxAutoLock(CUvideoctxlock ctx) : m_ctx(ctx) { cuvidCtxLock(m_ctx, 0); } // return code is not checked
+    ~CCtxAutoLock() { cuvidCtxUnlock(m_ctx, 0); }
+};
+#endif /* __cplusplus */
+
+#endif // __CUDA_VIDEO_H__
diff --git a/third_party/Video_Codec_SDK/Interface/nvcuvid.h b/third_party/Video_Codec_SDK/Interface/nvcuvid.h
new file mode 100644
index 000000000..d4691672c
--- /dev/null
+++ b/third_party/Video_Codec_SDK/Interface/nvcuvid.h
@@ -0,0 +1,486 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2023 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/********************************************************************************************************************/
+//! \file nvcuvid.h
+//!   NVDECODE API provides video decoding interface to NVIDIA GPU devices.
+//! \date 2015-2022
+//!  This file contains the interface constants, structure definitions and function prototypes.
+/********************************************************************************************************************/
+
+#if !defined(__NVCUVID_H__)
+#define __NVCUVID_H__
+
+#include "cuviddec.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+#define MAX_CLOCK_TS 3
+
+/***********************************************/
+//!
+//! High-level helper APIs for video sources
+//!
+/***********************************************/
+
+typedef void *CUvideosource;
+typedef void *CUvideoparser;
+typedef long long CUvideotimestamp;
+
+/************************************************************************/
+//! \enum cudaVideoState
+//! Video source state enums
+//! Used in cuvidSetVideoSourceState and cuvidGetVideoSourceState APIs
+/************************************************************************/
+typedef enum {
+    cudaVideoState_Error = -1,  /**< Error state (invalid source)                  */
+    cudaVideoState_Stopped = 0, /**< Source is stopped (or reached end-of-stream)  */
+    cudaVideoState_Started = 1  /**< Source is running and delivering data         */
+} cudaVideoState;
+
+/************************************************************************/
+//! \enum cudaAudioCodec
+//! Audio compression enums
+//! Used in CUAUDIOFORMAT structure
+/************************************************************************/
+typedef enum {
+    cudaAudioCodec_MPEG1 = 0, /**< MPEG-1 Audio               */
+    cudaAudioCodec_MPEG2,     /**< MPEG-2 Audio               */
+    cudaAudioCodec_MP3,       /**< MPEG-1 Layer III Audio     */
+    cudaAudioCodec_AC3,       /**< Dolby Digital (AC3) Audio  */
+    cudaAudioCodec_LPCM,      /**< PCM Audio                  */
+    cudaAudioCodec_AAC,       /**< AAC Audio                  */
+} cudaAudioCodec;
+
+/************************************************************************/
+//! \ingroup STRUCTS
+//! \struct HEVCTIMECODESET
+//! Used to store Time code extracted from Time code SEI in HEVC codec
+/************************************************************************/
+typedef struct _HEVCTIMECODESET { // NOTE(review): members appear to mirror H.265 time code SEI syntax elements - confirm
+    unsigned int time_offset_value;
+    unsigned short n_frames;
+    unsigned char clock_timestamp_flag;
+    unsigned char units_field_based_flag;
+    unsigned char counting_type;
+    unsigned char full_timestamp_flag;
+    unsigned char discontinuity_flag;
+    unsigned char cnt_dropped_flag;
+    unsigned char seconds_value;
+    unsigned char minutes_value;
+    unsigned char hours_value;
+    unsigned char seconds_flag;
+    unsigned char minutes_flag;
+    unsigned char hours_flag;
+    unsigned char time_offset_length;
+    unsigned char reserved; // not described by this header; presumably unused padding - confirm
+} HEVCTIMECODESET;
+
+/************************************************************************/
+//! \ingroup STRUCTS
+//! \struct HEVCSEITIMECODE
+//! Used to extract Time code SEI in HEVC codec
+/************************************************************************/
+typedef struct _HEVCSEITIMECODE {
+    HEVCTIMECODESET time_code_set[MAX_CLOCK_TS]; /**< Time code sets; capacity is MAX_CLOCK_TS entries             */
+    unsigned char num_clock_ts; /**< presumably the number of valid entries in time_code_set - TODO confirm */
+} HEVCSEITIMECODE;
+
+/**********************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUSEIMESSAGE
+//! Used in CUVIDSEIMESSAGEINFO structure
+/**********************************************************************************/
+typedef struct _CUSEIMESSAGE {
+    unsigned char sei_message_type; /**< OUT: SEI Message Type      */
+    unsigned char reserved[3];
+    unsigned int sei_message_size; /**< OUT: SEI Message Size      */
+} CUSEIMESSAGE;
+
+/************************************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDEOFORMAT
+//! Video format
+//! Used in cuvidGetSourceVideoFormat API
+/************************************************************************************************/
+typedef struct {
+    cudaVideoCodec codec; /**< OUT: Compression format          */
+    /**
+     * OUT: frame rate = numerator / denominator (for example: 30000/1001)
+     */
+    struct {
+        /** OUT: frame rate numerator   (0 = unspecified or variable frame rate) */
+        unsigned int numerator;
+        /** OUT: frame rate denominator (0 = unspecified or variable frame rate) */
+        unsigned int denominator;
+    } frame_rate;
+    unsigned char progressive_sequence;    /**< OUT: 0=interlaced, 1=progressive                                      */
+    unsigned char bit_depth_luma_minus8;   /**< OUT: high bit depth luma. E.g., 2 for 10-bitdepth, 4 for 12-bitdepth  */
+    unsigned char bit_depth_chroma_minus8; /**< OUT: high bit depth chroma. E.g., 2 for 10-bitdepth, 4 for 12-bitdepth */
+    unsigned char min_num_decode_surfaces; /**< OUT: Minimum number of decode surfaces to be allocated for correct
+                                                     decoding. The client can send this value in ulNumDecodeSurfaces
+                                                     (in CUVIDDECODECREATEINFO structure).
+                                                     This guarantees correct functionality and optimal video memory
+                                                     usage but not necessarily the best performance, which depends on
+                                                     the design of the overall application. The optimal number of
+                                                     decode surfaces (in terms of performance and memory utilization)
+                                                     should be decided by experimentation for each application, but it
+                                                     cannot go below min_num_decode_surfaces.
+                                                     If this value is used for ulNumDecodeSurfaces then it must be
+                                                     returned to parser during sequence callback.                     */
+    unsigned int coded_width;              /**< OUT: coded frame width in pixels                                      */
+    unsigned int coded_height;             /**< OUT: coded frame height in pixels                                     */
+    /**
+     * OUT: area of the frame that should be displayed
+     * typical example:
+     * coded_width = 1920, coded_height = 1088
+     * display_area = { 0,0,1920,1080 }
+     */
+    struct {
+        int left;   /**< OUT: left position of display rect    */
+        int top;    /**< OUT: top position of display rect     */
+        int right;  /**< OUT: right position of display rect   */
+        int bottom; /**< OUT: bottom position of display rect  */
+    } display_area;
+    cudaVideoChromaFormat chroma_format; /**< OUT: Chroma format                    */
+    unsigned int bitrate;                /**< OUT: video bitrate (bps, 0=unknown)   */
+    /**
+     * OUT: Display Aspect Ratio = x:y (4:3, 16:9, etc)
+     */
+    struct {
+        int x; /**< OUT: x term of the x:y display aspect ratio */
+        int y; /**< OUT: y term of the x:y display aspect ratio */
+    } display_aspect_ratio;
+    /**
+     * Video Signal Description
+     * Refer to section E.2.1 (VUI parameters semantics) of the H.264 specification
+     */
+    struct {
+        unsigned char video_format : 3; /**< OUT: 0-Component, 1-PAL, 2-NTSC, 3-SECAM, 4-MAC, 5-Unspecified     */
+        unsigned char video_full_range_flag : 1; /**< OUT: indicates the black level and luma and chroma range */
+        unsigned char reserved_zero_bits : 4; /**< Reserved bits                                                      */
+        unsigned char color_primaries;        /**< OUT: chromaticity coordinates of source primaries                  */
+        unsigned char
+            transfer_characteristics;      /**< OUT: opto-electronic transfer characteristic of the source picture */
+        unsigned char matrix_coefficients; /**< OUT: used in deriving luma and chroma signals from RGB primaries   */
+    } video_signal_description;
+    unsigned int seqhdr_data_length; /**< OUT: Additional bytes following (CUVIDEOFORMATEX)                  */
+} CUVIDEOFORMAT;
+
+/****************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDOPERATINGPOINTINFO
+//! Operating point information of scalable bitstream
+/****************************************************************/
+typedef struct {
+    cudaVideoCodec codec; /**< Codec of the bitstream; only an av1 view is declared in the union below */
+    union {
+        struct {
+            unsigned char operating_points_cnt; /**< presumably the number of valid operating_points_idc entries - confirm */
+            unsigned char reserved24_bits[3];   /**< Reserved (3 bytes = 24 bits) */
+            unsigned short operating_points_idc[32]; /**< Per-operating-point idc values; room for 32 entries */
+        } av1;
+        unsigned char CodecReserved[1024]; /**< Reserves 1024 bytes for the codec-specific union */
+    };
+} CUVIDOPERATINGPOINTINFO;
+
+/**********************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDSEIMESSAGEINFO
+//! Used in cuvidParseVideoData API with PFNVIDSEIMSGCALLBACK pfnGetSEIMsg
+/**********************************************************************************/
+typedef struct _CUVIDSEIMESSAGEINFO {
+    void *pSEIData;                 /**< OUT: SEI Message Data      */
+    CUSEIMESSAGE *pSEIMessage;      /**< OUT: SEI Message Info      */
+    unsigned int sei_message_count; /**< OUT: SEI Message Count     */
+    unsigned int picIdx;            /**< OUT: SEI Message Pic Index */
+} CUVIDSEIMESSAGEINFO;
+
+/****************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDAV1SEQHDR
+//! AV1 specific sequence header information
+/****************************************************************/
+typedef struct {
+    unsigned int max_width;       /**< presumably the maximum frame width from the AV1 sequence header - confirm  */
+    unsigned int max_height;      /**< presumably the maximum frame height from the AV1 sequence header - confirm */
+    unsigned char reserved[1016]; /**< Reserved; with two 4-byte ints this totals 1024 bytes (raw_seqhdr_data size) */
+} CUVIDAV1SEQHDR;
+
+/****************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDEOFORMATEX
+//! Video format including raw sequence header information
+//! Used in cuvidGetSourceVideoFormat API
+/****************************************************************/
+typedef struct {
+    CUVIDEOFORMAT format; /**< OUT: CUVIDEOFORMAT structure */
+    union {
+        CUVIDAV1SEQHDR av1;                  /**< AV1 view; shares storage with raw_seqhdr_data */
+        unsigned char raw_seqhdr_data[1024]; /**< OUT: Sequence header data    */
+    };
+} CUVIDEOFORMATEX;
+
+/****************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUAUDIOFORMAT
+//! Audio formats
+//! Used in cuvidGetSourceAudioFormat API
+/****************************************************************/
+typedef struct {
+    cudaAudioCodec codec;       /**< OUT: Compression format                                              */
+    unsigned int channels;      /**< OUT: number of audio channels                                        */
+    unsigned int samplespersec; /**< OUT: sampling frequency                                              */
+    unsigned int bitrate;       /**< OUT: For uncompressed, can also be used to determine bits per sample */
+    unsigned int reserved1;     /**< Reserved for future use                                              */
+    unsigned int reserved2;     /**< Reserved for future use                                              */
+} CUAUDIOFORMAT;
+
+/***************************************************************/
+//! \enum CUvideopacketflags
+//! Data packet flags
+//! Used in CUVIDSOURCEDATAPACKET structure
+/***************************************************************/
+typedef enum {
+    CUVID_PKT_ENDOFSTREAM = 0x01, /**< Set when this is the last packet for this stream                              */
+    CUVID_PKT_TIMESTAMP = 0x02,   /**< Timestamp is valid                                                            */
+    CUVID_PKT_DISCONTINUITY = 0x04, /**< Set when a discontinuity has to be signalled */
+    CUVID_PKT_ENDOFPICTURE = 0x08, /**< Set when the packet contains exactly one frame or one field                   */
+    CUVID_PKT_NOTIFY_EOS = 0x10,   /**< If this flag is set along with CUVID_PKT_ENDOFSTREAM, an additional (dummy)
+                                        display callback will be invoked with null value of CUVIDPARSERDISPINFO which
+                                        should be interpreted as end of the stream.                                   */
+} CUvideopacketflags;
+
+/*****************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDSOURCEDATAPACKET
+//! Data Packet
+//! Used in cuvidParseVideoData API
+//! IN for cuvidParseVideoData
+/*****************************************************************************/
+typedef struct _CUVIDSOURCEDATAPACKET {
+    unsigned long flags;          /**< IN: Combination of CUVID_PKT_XXX flags                              */
+    unsigned long payload_size;   /**< IN: number of bytes in the payload (may be zero if EOS flag is set) */
+    const unsigned char *payload; /**< IN: Pointer to packet payload data (may be NULL if EOS flag is set) */
+    CUvideotimestamp timestamp;   /**< IN: Presentation time stamp (10MHz clock), only valid if
+                                           CUVID_PKT_TIMESTAMP flag is set                                 */
+} CUVIDSOURCEDATAPACKET;
+
+// Callback for packet delivery
+typedef int(CUDAAPI *PFNVIDSOURCECALLBACK)(void *, CUVIDSOURCEDATAPACKET *);
+
+/**************************************************************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDSOURCEPARAMS
+//! Describes parameters needed in cuvidCreateVideoSource API
+//! NVDECODE API is intended for HW accelerated video decoding, so CUvideosource doesn't have an audio demuxer for all
+//! supported containers. Clients are advised to use their own or a third-party demuxer if audio support is
+//! needed.
+/**************************************************************************************************************************/
+typedef struct _CUVIDSOURCEPARAMS {
+    unsigned int ulClockRate;                 /**< IN: Time stamp units in Hz (0=default=10000000Hz)      */
+    unsigned int bAnnexb : 1;                 /**< IN: AV1 annexB stream                                  */
+    unsigned int uReserved : 31;              /**< Reserved for future use - set to zero                  */
+    unsigned int uReserved1[6];               /**< Reserved for future use - set to zero                  */
+    void *pUserData;                          /**< IN: User private data passed in to the data handlers   */
+    PFNVIDSOURCECALLBACK pfnVideoDataHandler; /**< IN: Called to deliver video packets                    */
+    PFNVIDSOURCECALLBACK pfnAudioDataHandler; /**< IN: Called to deliver audio packets.                   */
+    void *pvReserved2[8];                     /**< Reserved for future use - set to NULL                  */
+} CUVIDSOURCEPARAMS;
+
+/**********************************************/
+//! \ingroup ENUMS
+//! \enum CUvideosourceformat_flags
+//! CUvideosourceformat_flags
+//! Used in cuvidGetSourceVideoFormat API
+/**********************************************/
+typedef enum {
+    CUVID_FMT_EXTFORMATINFO = 0x100 /**< Return extended format structure (CUVIDEOFORMATEX) */
+} CUvideosourceformat_flags;
+
+#if !defined(__APPLE__)
+/***************************************************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidCreateVideoSource(CUvideosource *pObj, const char *pszFileName, CUVIDSOURCEPARAMS *pParams)
+//! Create CUvideosource object. CUvideosource spawns a demultiplexer thread that provides two callbacks:
+//! pfnVideoDataHandler() and pfnAudioDataHandler()
+//! NVDECODE API is intended for HW accelerated video decoding, so CUvideosource doesn't have an audio demuxer for all
+//! supported containers. Clients are advised to use their own or a third-party demuxer if audio support is
+//! needed.
+/***************************************************************************************************************************/
+CUresult CUDAAPI cuvidCreateVideoSource(CUvideosource *pObj, const char *pszFileName, CUVIDSOURCEPARAMS *pParams);
+
+/***************************************************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidCreateVideoSourceW(CUvideosource *pObj, const wchar_t *pwszFileName, CUVIDSOURCEPARAMS *pParams)
+//! Create video source (variant of cuvidCreateVideoSource that takes a wide-character file name)
+/***************************************************************************************************************************/
+CUresult CUDAAPI cuvidCreateVideoSourceW(CUvideosource *pObj, const wchar_t *pwszFileName, CUVIDSOURCEPARAMS *pParams);
+
+/********************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidDestroyVideoSource(CUvideosource obj)
+//! Destroy video source
+/********************************************************************/
+CUresult CUDAAPI cuvidDestroyVideoSource(CUvideosource obj);
+
+/******************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidSetVideoSourceState(CUvideosource obj, cudaVideoState state)
+//! Set video source state to:
+//! cudaVideoState_Started - to signal the source to run and deliver data
+//! cudaVideoState_Stopped - to stop the source from delivering the data
+//! cudaVideoState_Error   - invalid source
+/******************************************************************************************/
+CUresult CUDAAPI cuvidSetVideoSourceState(CUvideosource obj, cudaVideoState state);
+
+/******************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn cudaVideoState CUDAAPI cuvidGetVideoSourceState(CUvideosource obj)
+//! Get video source state
+//! Returns:
+//! cudaVideoState_Started - if Source is running and delivering data
+//! cudaVideoState_Stopped - if Source is stopped or reached end-of-stream
+//! cudaVideoState_Error   - if Source is in error state
+/******************************************************************************************/
+cudaVideoState CUDAAPI cuvidGetVideoSourceState(CUvideosource obj);
+
+/******************************************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidGetSourceVideoFormat(CUvideosource obj, CUVIDEOFORMAT *pvidfmt, unsigned int flags)
+//! Gets video source format in pvidfmt, flags is set to combination of CUvideosourceformat_flags as per requirement
+/******************************************************************************************************************/
+CUresult CUDAAPI cuvidGetSourceVideoFormat(CUvideosource obj, CUVIDEOFORMAT *pvidfmt, unsigned int flags);
+
+/**************************************************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidGetSourceAudioFormat(CUvideosource obj, CUAUDIOFORMAT *paudfmt, unsigned int flags)
+//! Get audio source format
+//! NVDECODE API is intended for HW accelerated video decoding, so CUvideosource doesn't have an audio demuxer for all
+//! supported containers. Clients are advised to use their own or a third-party demuxer if audio support is
+//! needed.
+/**************************************************************************************************************************/
+CUresult CUDAAPI cuvidGetSourceAudioFormat(CUvideosource obj, CUAUDIOFORMAT *paudfmt, unsigned int flags);
+
+#endif
+/**********************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDPARSERDISPINFO
+//! Used in cuvidParseVideoData API with PFNVIDDISPLAYCALLBACK pfnDisplayPicture
+/**********************************************************************************/
+typedef struct _CUVIDPARSERDISPINFO {
+    int picture_index; /**< OUT: Index of the current picture                                                         */
+    int progressive_frame;      /**< OUT: 1 if progressive frame; 0 otherwise      */
+    int top_field_first;        /**< OUT: 1 if top field is displayed first; 0 otherwise        */
+    int repeat_first_field;     /**< OUT: Number of additional fields (1=ivtc, 2=frame doubling, 4=frame tripling,
+                                     -1=unpaired field)     */
+    CUvideotimestamp timestamp; /**< OUT: Presentation time stamp */
+} CUVIDPARSERDISPINFO;
+
+/***********************************************************************************************************************/
+//! Parser callbacks
+//! The parser will call these synchronously from within cuvidParseVideoData(), whenever there is a sequence change or
+//! a picture is ready to be decoded and/or displayed. The first argument of each callback is the "void *pUserData"
+//! member of structure CUVIDPARSERPARAMS. Return values from these callbacks are interpreted as below. If a callback
+//! returns failure, it will be propagated by cuvidParseVideoData() to the application. Parser picks default operating
+//! point as 0 and outputAllLayers flag as 0 if PFNVIDOPPOINTCALLBACK is not set or return value is -1 or invalid.
+//! PFNVIDSEQUENCECALLBACK : 0: fail, 1: succeeded, > 1: override dpb size of parser (set by CUVIDPARSERPARAMS::ulMaxNumDecodeSurfaces while creating parser)
+//! PFNVIDDECODECALLBACK   : 0: fail, >=1: succeeded
+//! PFNVIDDISPLAYCALLBACK  : 0: fail, >=1: succeeded
+//! PFNVIDOPPOINTCALLBACK  : <0: fail, >=0: succeeded (bit 0-9: OperatingPoint, bit 10-10: outputAllLayers, bit 11-30: reserved)
+//! PFNVIDSEIMSGCALLBACK   : 0: fail, >=1: succeeded
+/***********************************************************************************************************************/
+typedef int(CUDAAPI *PFNVIDSEQUENCECALLBACK)(void *, CUVIDEOFORMAT *);
+typedef int(CUDAAPI *PFNVIDDECODECALLBACK)(void *, CUVIDPICPARAMS *);
+typedef int(CUDAAPI *PFNVIDDISPLAYCALLBACK)(void *, CUVIDPARSERDISPINFO *);
+typedef int(CUDAAPI *PFNVIDOPPOINTCALLBACK)(void *, CUVIDOPERATINGPOINTINFO *);
+typedef int(CUDAAPI *PFNVIDSEIMSGCALLBACK)(void *, CUVIDSEIMESSAGEINFO *);
+
+/**************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDPARSERPARAMS
+//! Used in cuvidCreateVideoParser API
+/**************************************/
+typedef struct _CUVIDPARSERPARAMS {
+    cudaVideoCodec CodecType;            /**< IN: cudaVideoCodec_XXX                                                  */
+    unsigned int ulMaxNumDecodeSurfaces; /**< IN: Max # of decode surfaces (parser will cycle through these)          */
+    unsigned int ulClockRate;            /**< IN: Timestamp units in Hz (0=default=10000000Hz)                        */
+    unsigned int ulErrorThreshold;       /**< IN: % Error threshold (0-100) for calling pfnDecodePicture (100=always
+                                                  call pfnDecodePicture even if picture bitstream is fully corrupted) */
+    unsigned int ulMaxDisplayDelay;      /**< IN: Max display queue delay (improves pipelining of decode with display)
+                                                  0=no delay (recommended values: 2..4)                               */
+    unsigned int bAnnexb : 1;            /**< IN: AV1 annexB stream                                                   */
+    unsigned int uReserved : 31;         /**< Reserved for future use - set to zero                                   */
+    unsigned int uReserved1[4];          /**< IN: Reserved for future use - set to 0                                  */
+    void *pUserData;                     /**< IN: User data handed to the callbacks below as their first argument     */
+    PFNVIDSEQUENCECALLBACK
+    pfnSequenceCallback; /**< IN: Called before decoding frames and/or whenever there is a format change */
+    PFNVIDDECODECALLBACK pfnDecodePicture; /**< IN: Called when a picture is ready to be decoded (decode order) */
+    PFNVIDDISPLAYCALLBACK
+    pfnDisplayPicture; /**< IN: Called whenever a picture is ready to be displayed (display order)  */
+    PFNVIDOPPOINTCALLBACK pfnGetOperatingPoint; /**< IN: Called from AV1 sequence header to get operating point of an
+                                                         AV1 scalable bitstream */
+    PFNVIDSEIMSGCALLBACK pfnGetSEIMsg; /**< IN: Called when all SEI messages are parsed for a particular frame      */
+    void *pvReserved2[5];              /**< Reserved for future use - set to NULL                                   */
+    CUVIDEOFORMATEX *pExtVideoInfo;    /**< IN: [Optional] sequence header data from system layer                   */
+} CUVIDPARSERPARAMS;
+
+/************************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams)
+//! Create video parser object and initialize
+/************************************************************************************************/
+CUresult CUDAAPI cuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams);
+
+/************************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket)
+//! Parse the video data from source data packet in pPacket
+//! Extracts parameter sets like SPS, PPS, bitstream etc. from pPacket and
+//! calls back pfnDecodePicture with CUVIDPICPARAMS data for kicking off HW decoding
+//! calls back pfnSequenceCallback with CUVIDEOFORMAT data for initial sequence header or when
+//! the decoder encounters a video format change
+//! calls back pfnDisplayPicture with CUVIDPARSERDISPINFO data to display a video frame
+/************************************************************************************************/
+CUresult CUDAAPI cuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket);
+
+/************************************************************************************************/
+//! \ingroup FUNCTS
+//! \fn CUresult CUDAAPI cuvidDestroyVideoParser(CUvideoparser obj)
+//! Destroy the video parser
+/************************************************************************************************/
+CUresult CUDAAPI cuvidDestroyVideoParser(CUvideoparser obj);
+
+/**********************************************************************************************/
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#endif // __NVCUVID_H__
diff --git a/third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/libnvcuvid.so b/third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/libnvcuvid.so
new file mode 100644
index 0000000000000000000000000000000000000000..f08a209545e076a835d11dcc24bd20d22088b1c5
GIT binary patch
literal 3528
zcmd6qPiWLf6vt;1HBqCksam5^qWA|QEDE9$7Rf&~2kXBy7D8bA%kD49*6eQh{boZF
zQhO;S^&sNGQ+g}n(MzF+P(t+(4}u2|J?z!uNvMbi8%@9OH}7pWlk5~hN*}zL+0XaQ
z%$u3tZ+G^)a~IFo#9~UUR-IM!>CFa1+F4lG%nelS>Wo^4vR>6ivFn$mRjq|h21%_{
zO9W^RnnL)(Cc~wkhekFZl~lAuts0ac)2}}2rPS3II>Sq1;;y9Mc7*MRH^M)M$KiY7
zd*Ec-;2WcsDfXeFKD*$%;k5i__*S^fXkF{zk|XV@<UsuzsDN!TRk^G<7PSMk4qglY
z489ZI1phay#msWv5<84WBX}bX2$Q@{<KT#2viLX5`z`)A^Y1PGl=+0k?=l~<_+#Wi
z^B%YOw;cbq#ousz(&Ds7G|oF-kgV%A^Zi!*L*}g(pJKk-;+@PlSiHpil*OBH@NxCm
z(#+--L@ZlBnv`dQ6~B+y+hg%#93MRZXv@0(<oKIb{6^-T7H_PQ&*3WkC!Y6^HO^HY
z|4WPCXFh6i;Ap+_97_JbaC~6Jt8{TZo9POCPy5||x{&dMN_0J&@e5bHAY23Y7K%aI
zUlnceSIqUq#X|bViVtS86{y<}bx@crv%dUjxtGc6N;a*F0nf(aEAnNwD&b0Anehc*
zoBqAp)5XwCU4}uIH@3`7zm8%iTR2|`a-J?n8zq|5QX*z#nt?O_!W%QeftOngojPfT
zj<!6C?Dn+BEXa;ypvz<C<29UCekePPU0Jaay}n+1?!J6(_3ReJ{3$;}Q>IdbVVDZF
z7ige-MvZ0%^W)Lek_Zcl6L`-0dVYNI<kA1FL}D=dej<xaT~o1%cx-P&UHu)r1N0k0
z-o3Q66ulcATjM{}bZwFMgtR2Q)u9hSk2~~f=x;w%AAc5l$)V3d|J|XNpg(Zvi_mX7
z^k(dXcIZjy{SJKqdb>lPhJM7M&q8l-=yT9tzpuW3CFp-R^hM~uIrL_H2mkEQlhAKD
z^a1E&4t*N>b%#C+J?YTrpdWPTCFu1IeG&Sr#p?UljEl|lD!SaSS}_J0-KEG{u%~2l
zNBxxrd2e2_&ev4c#{K19MR&OW#1i2MeoM4Bs1hSyIHD4n$$U7OGqny>Li-b1CH#@p
zQ1l&4x|hqQKT&eDUfLsGetF+y|LFcjCc4~BWuNKJOJj&G`zrM_7U*tDP3bTDOZQaK
z&{T3xQp>rJM#jfxKA=3nOwLvA5mMhPN1BG@E%+$+m;CSWeM9Oqlw5^L{(|Uth31wp
i`5z<yZ{|5rbm>q3C&*|$qD6p<8GaDkDk0kT`u_pVP7(wF

literal 0
HcmV?d00001

diff --git a/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.cpp b/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.cpp
new file mode 100644
index 000000000..0fd61f447
--- /dev/null
+++ b/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.cpp
@@ -0,0 +1,709 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2023 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <iostream>
+
+#include "../../../Interface/nvcuvid.h"
+#include "NvDecoder/NvDecoder.h"
+
+std::map<int, int64_t> NvDecoder::sessionOverHead = {{0, 0}, {1, 0}};
+
+/**
+ *   @brief  Returns the codec name string for eCodec; simply forwards to GetVideoCodecString()
+ */
+const char *NvDecoder::GetCodecString(cudaVideoCodec eCodec) { return GetVideoCodecString(eCodec); }
+
+/* Called when the parser encounters sequence header for AV1 SVC content
+ *  return value interpretation:
+ *      < 0 : fail, >=0: succeeded (bit 0-9: currOperatingPoint, bit 10-10: bDispAllLayer, bit 11-30: reserved, must be
+ * set 0)
+ */
+int NvDecoder::GetOperatingPoint(CUVIDOPERATINGPOINTINFO *pOPInfo) {
+    // Only AV1 clips with more than one operating point (SVC) select a point; everything else reports -1.
+    if (pOPInfo->codec != cudaVideoCodec_AV1 || pOPInfo->av1.operating_points_cnt <= 1) {
+        return -1;
+    }
+
+    // Fall back to operating point 0 when the configured one is out of range for this clip.
+    if (m_nOperatingPoint >= pOPInfo->av1.operating_points_cnt)
+        m_nOperatingPoint = 0;
+
+    printf("AV1 SVC clip: operating point count %d  ", pOPInfo->av1.operating_points_cnt);
+    printf("Selected operating point: %d, IDC 0x%x bOutputAllLayers %d\n", m_nOperatingPoint,
+           pOPInfo->av1.operating_points_idc[m_nOperatingPoint], m_bDispAllLayers);
+    return (m_nOperatingPoint | (m_bDispAllLayers << 10));
+}
+
+/* Checks decoder capabilities for the stream, then creates (or reconfigures) the decoder. Return value is read as:
+ *  0: fail, 1: succeeded, > 1: override dpb size of parser (set by CUVIDPARSERPARAMS::ulMaxNumDecodeSurfaces while
+ * creating parser)
+ */
+int NvDecoder::HandleVideoSequence(CUVIDEOFORMAT *pVideoFormat) {
+    START_TIMER
+    m_videoInfo.str("");
+    m_videoInfo.clear();
+    m_videoInfo << "Video Input Information" << std::endl
+                << "\tCodec        : " << GetVideoCodecString(pVideoFormat->codec) << std::endl
+                << "\tFrame rate   : " << pVideoFormat->frame_rate.numerator << "/"
+                << pVideoFormat->frame_rate.denominator << " = "
+                << 1.0 * pVideoFormat->frame_rate.numerator / pVideoFormat->frame_rate.denominator << " fps"
+                << std::endl
+                << "\tSequence     : " << (pVideoFormat->progressive_sequence ? "Progressive" : "Interlaced")
+                << std::endl
+                << "\tCoded size   : [" << pVideoFormat->coded_width << ", " << pVideoFormat->coded_height << "]"
+                << std::endl
+                << "\tDisplay area : [" << pVideoFormat->display_area.left << ", " << pVideoFormat->display_area.top
+                << ", " << pVideoFormat->display_area.right << ", " << pVideoFormat->display_area.bottom << "]"
+                << std::endl
+                << "\tChroma       : " << GetVideoChromaFormatString(pVideoFormat->chroma_format) << std::endl
+                << "\tBit depth    : " << pVideoFormat->bit_depth_luma_minus8 + 8;
+    m_videoInfo << std::endl;
+
+    int nDecodeSurface = pVideoFormat->min_num_decode_surfaces;
+
+    CUVIDDECODECAPS decodecaps;
+    memset(&decodecaps, 0, sizeof(decodecaps));
+
+    decodecaps.eCodecType = pVideoFormat->codec;
+    decodecaps.eChromaFormat = pVideoFormat->chroma_format;
+    decodecaps.nBitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8;
+
+    CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+    NVDEC_API_CALL(cuvidGetDecoderCaps(&decodecaps));
+    CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+
+    if (!decodecaps.bIsSupported) {
+        NVDEC_THROW_ERROR("Codec not supported on this GPU", CUDA_ERROR_NOT_SUPPORTED);
+        return nDecodeSurface;
+    }
+
+    if ((pVideoFormat->coded_width > decodecaps.nMaxWidth) || (pVideoFormat->coded_height > decodecaps.nMaxHeight)) {
+
+        std::ostringstream errorString;
+        errorString << std::endl
+                    << "Resolution          : " << pVideoFormat->coded_width << "x" << pVideoFormat->coded_height
+                    << std::endl
+                    << "Max Supported (wxh) : " << decodecaps.nMaxWidth << "x" << decodecaps.nMaxHeight << std::endl
+                    << "Resolution not supported on this GPU";
+
+        const std::string cErr = errorString.str();
+        NVDEC_THROW_ERROR(cErr, CUDA_ERROR_NOT_SUPPORTED);
+        return nDecodeSurface;
+    }
+
+    if ((pVideoFormat->coded_width >> 4) * (pVideoFormat->coded_height >> 4) > decodecaps.nMaxMBCount) {
+
+        std::ostringstream errorString;
+        errorString << std::endl
+                    << "MBCount             : " << (pVideoFormat->coded_width >> 4) * (pVideoFormat->coded_height >> 4)
+                    << std::endl
+                    << "Max Supported mbcnt : " << decodecaps.nMaxMBCount << std::endl
+                    << "MBCount not supported on this GPU";
+
+        const std::string cErr = errorString.str();
+        NVDEC_THROW_ERROR(cErr, CUDA_ERROR_NOT_SUPPORTED);
+        return nDecodeSurface;
+    }
+
+    if (m_nWidth && m_nLumaHeight && m_nChromaHeight) {
+
+        // cuvidCreateDecoder() has been called before, and now there's possible config change
+        return ReconfigureDecoder(pVideoFormat);
+    }
+
+    // eCodec has been set in the constructor (for parser). Here it's set again for potential correction
+    m_eCodec = pVideoFormat->codec;
+    m_eChromaFormat = pVideoFormat->chroma_format;
+    m_nBitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8;
+    m_nBPP = m_nBitDepthMinus8 > 0 ? 2 : 1;
+
+    // Set the output surface format same as chroma format (each constant must be compared against m_eChromaFormat)
+    if (m_eChromaFormat == cudaVideoChromaFormat_420 || m_eChromaFormat == cudaVideoChromaFormat_Monochrome)
+        m_eOutputFormat =
+            pVideoFormat->bit_depth_luma_minus8 ? cudaVideoSurfaceFormat_P016 : cudaVideoSurfaceFormat_NV12;
+    else if (m_eChromaFormat == cudaVideoChromaFormat_444)
+        m_eOutputFormat =
+            pVideoFormat->bit_depth_luma_minus8 ? cudaVideoSurfaceFormat_YUV444_16Bit : cudaVideoSurfaceFormat_YUV444;
+    else if (m_eChromaFormat == cudaVideoChromaFormat_422)
+        m_eOutputFormat = cudaVideoSurfaceFormat_NV12; // no 4:2:2 output format supported yet so make 420 default
+
+    // Check if output format supported. If not, check fallback options
+    if (!(decodecaps.nOutputFormatMask & (1 << m_eOutputFormat))) {
+        if (decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12))
+            m_eOutputFormat = cudaVideoSurfaceFormat_NV12;
+        else if (decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_P016))
+            m_eOutputFormat = cudaVideoSurfaceFormat_P016;
+        else if (decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444))
+            m_eOutputFormat = cudaVideoSurfaceFormat_YUV444;
+        else if (decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444_16Bit))
+            m_eOutputFormat = cudaVideoSurfaceFormat_YUV444_16Bit;
+        else
+            NVDEC_THROW_ERROR("No supported output format found", CUDA_ERROR_NOT_SUPPORTED);
+    }
+    m_videoFormat = *pVideoFormat;
+
+    CUVIDDECODECREATEINFO videoDecodeCreateInfo = {0};
+    videoDecodeCreateInfo.CodecType = pVideoFormat->codec;
+    videoDecodeCreateInfo.ChromaFormat = pVideoFormat->chroma_format;
+    videoDecodeCreateInfo.OutputFormat = m_eOutputFormat;
+    videoDecodeCreateInfo.bitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8;
+    if (pVideoFormat->progressive_sequence)
+        videoDecodeCreateInfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Weave;
+    else
+        videoDecodeCreateInfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Adaptive;
+    videoDecodeCreateInfo.ulNumOutputSurfaces = 2;
+    // With PreferCUVID, JPEG is still decoded by CUDA while video is decoded by NVDEC hardware
+    videoDecodeCreateInfo.ulCreationFlags = cudaVideoCreate_PreferCUVID;
+    videoDecodeCreateInfo.ulNumDecodeSurfaces = nDecodeSurface;
+    videoDecodeCreateInfo.vidLock = m_ctxLock;
+    videoDecodeCreateInfo.ulWidth = pVideoFormat->coded_width;
+    videoDecodeCreateInfo.ulHeight = pVideoFormat->coded_height;
+    // AV1 has max width/height of sequence in sequence header
+    if (pVideoFormat->codec == cudaVideoCodec_AV1 && pVideoFormat->seqhdr_data_length > 0) {
+        // dont overwrite if it is already set from cmdline or reconfig.txt
+        if (!(m_nMaxWidth > pVideoFormat->coded_width || m_nMaxHeight > pVideoFormat->coded_height)) {
+            CUVIDEOFORMATEX *vidFormatEx = (CUVIDEOFORMATEX *)pVideoFormat;
+            m_nMaxWidth = vidFormatEx->av1.max_width;
+            m_nMaxHeight = vidFormatEx->av1.max_height;
+        }
+    }
+    if (m_nMaxWidth < (int)pVideoFormat->coded_width)
+        m_nMaxWidth = pVideoFormat->coded_width;
+    if (m_nMaxHeight < (int)pVideoFormat->coded_height)
+        m_nMaxHeight = pVideoFormat->coded_height;
+    videoDecodeCreateInfo.ulMaxWidth = m_nMaxWidth;
+    videoDecodeCreateInfo.ulMaxHeight = m_nMaxHeight;
+
+    if (!(m_cropRect.r && m_cropRect.b) && !(m_resizeDim.w && m_resizeDim.h)) {
+        m_nWidth = pVideoFormat->display_area.right - pVideoFormat->display_area.left;
+        m_nLumaHeight = pVideoFormat->display_area.bottom - pVideoFormat->display_area.top;
+        videoDecodeCreateInfo.ulTargetWidth = pVideoFormat->coded_width;
+        videoDecodeCreateInfo.ulTargetHeight = pVideoFormat->coded_height;
+    } else {
+        if (m_resizeDim.w && m_resizeDim.h) {
+            videoDecodeCreateInfo.display_area.left = pVideoFormat->display_area.left;
+            videoDecodeCreateInfo.display_area.top = pVideoFormat->display_area.top;
+            videoDecodeCreateInfo.display_area.right = pVideoFormat->display_area.right;
+            videoDecodeCreateInfo.display_area.bottom = pVideoFormat->display_area.bottom;
+            m_nWidth = m_resizeDim.w;
+            m_nLumaHeight = m_resizeDim.h;
+        }
+
+        if (m_cropRect.r && m_cropRect.b) {
+            videoDecodeCreateInfo.display_area.left = m_cropRect.l;
+            videoDecodeCreateInfo.display_area.top = m_cropRect.t;
+            videoDecodeCreateInfo.display_area.right = m_cropRect.r;
+            videoDecodeCreateInfo.display_area.bottom = m_cropRect.b;
+            m_nWidth = m_cropRect.r - m_cropRect.l;
+            m_nLumaHeight = m_cropRect.b - m_cropRect.t;
+        }
+        videoDecodeCreateInfo.ulTargetWidth = m_nWidth;
+        videoDecodeCreateInfo.ulTargetHeight = m_nLumaHeight;
+    }
+
+    m_nChromaHeight = (int)(ceil(m_nLumaHeight * GetChromaHeightFactor(m_eOutputFormat)));
+    m_nNumChromaPlanes = GetChromaPlaneCount(m_eOutputFormat);
+    m_nSurfaceHeight = videoDecodeCreateInfo.ulTargetHeight;
+    m_nSurfaceWidth = videoDecodeCreateInfo.ulTargetWidth;
+    m_displayRect.b = videoDecodeCreateInfo.display_area.bottom;
+    m_displayRect.t = videoDecodeCreateInfo.display_area.top;
+    m_displayRect.l = videoDecodeCreateInfo.display_area.left;
+    m_displayRect.r = videoDecodeCreateInfo.display_area.right;
+
+    m_videoInfo << "Video Decoding Params:" << std::endl
+                << "\tNum Surfaces : " << videoDecodeCreateInfo.ulNumDecodeSurfaces << std::endl
+                << "\tCrop         : [" << videoDecodeCreateInfo.display_area.left << ", "
+                << videoDecodeCreateInfo.display_area.top << ", " << videoDecodeCreateInfo.display_area.right << ", "
+                << videoDecodeCreateInfo.display_area.bottom << "]" << std::endl
+                << "\tResize       : " << videoDecodeCreateInfo.ulTargetWidth << "x"
+                << videoDecodeCreateInfo.ulTargetHeight << std::endl
+                << "\tDeinterlace  : "
+                << std::vector<const char *>{"Weave", "Bob", "Adaptive"}[videoDecodeCreateInfo.DeinterlaceMode];
+    m_videoInfo << std::endl;
+
+    CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+    NVDEC_API_CALL(cuvidCreateDecoder(&m_hDecoder, &videoDecodeCreateInfo));
+    CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+    STOP_TIMER("Session Initialization Time: ");
+    NvDecoder::addDecoderSessionOverHead(getDecoderSessionID(), elapsedTime);
+    return nDecodeSurface;
+}
+// Handles a format change on an existing decoder. Returns 1 if no reconfigure call is made, else the surface count.
+int NvDecoder::ReconfigureDecoder(CUVIDEOFORMAT *pVideoFormat) {
+    if (pVideoFormat->bit_depth_luma_minus8 != m_videoFormat.bit_depth_luma_minus8 ||
+        pVideoFormat->bit_depth_chroma_minus8 != m_videoFormat.bit_depth_chroma_minus8) {
+
+        NVDEC_THROW_ERROR("Reconfigure Not supported for bit depth change", CUDA_ERROR_NOT_SUPPORTED);
+    }
+
+    if (pVideoFormat->chroma_format != m_videoFormat.chroma_format) {
+
+        NVDEC_THROW_ERROR("Reconfigure Not supported for chroma format change", CUDA_ERROR_NOT_SUPPORTED);
+    }
+
+    bool bDecodeResChange = !(pVideoFormat->coded_width == m_videoFormat.coded_width &&
+                              pVideoFormat->coded_height == m_videoFormat.coded_height);
+    bool bDisplayRectChange = !(pVideoFormat->display_area.bottom == m_videoFormat.display_area.bottom &&
+                                pVideoFormat->display_area.top == m_videoFormat.display_area.top &&
+                                pVideoFormat->display_area.left == m_videoFormat.display_area.left &&
+                                pVideoFormat->display_area.right == m_videoFormat.display_area.right);
+
+    int nDecodeSurface = pVideoFormat->min_num_decode_surfaces;
+
+    if ((pVideoFormat->coded_width > m_nMaxWidth) || (pVideoFormat->coded_height > m_nMaxHeight)) {
+        // For VP9, let driver  handle the change if new width/height > maxwidth/maxheight
+        if ((m_eCodec != cudaVideoCodec_VP9) || m_bReconfigExternal) {
+            NVDEC_THROW_ERROR("Reconfigure Not supported when width/height > maxwidth/maxheight",
+                              CUDA_ERROR_NOT_SUPPORTED);
+        }
+        return 1; // reached only for VP9 without an external reconfigure request
+    }
+
+    if (!bDecodeResChange && !m_bReconfigExtPPChange) {
+        // if the coded_width/coded_height hasn't changed but display resolution has changed, then need to update
+        // width/height for correct output without cropping. Example : 1920x1080 vs 1920x1088
+        if (bDisplayRectChange) {
+            m_nWidth = pVideoFormat->display_area.right - pVideoFormat->display_area.left;
+            m_nLumaHeight = pVideoFormat->display_area.bottom - pVideoFormat->display_area.top;
+            m_nChromaHeight = (int)ceil(m_nLumaHeight * GetChromaHeightFactor(m_eOutputFormat));
+            m_nNumChromaPlanes = GetChromaPlaneCount(m_eOutputFormat);
+        }
+
+        // no need for reconfigureDecoder(). Just return
+        return 1;
+    }
+
+    CUVIDRECONFIGUREDECODERINFO reconfigParams = {0};
+    // Cache the new coded size in m_videoFormat while filling in the reconfigure request
+    reconfigParams.ulWidth = m_videoFormat.coded_width = pVideoFormat->coded_width;
+    reconfigParams.ulHeight = m_videoFormat.coded_height = pVideoFormat->coded_height;
+
+    // Dont change display rect and get scaled output from decoder. This will help display app to present apps smoothly
+    reconfigParams.display_area.bottom = m_displayRect.b;
+    reconfigParams.display_area.top = m_displayRect.t;
+    reconfigParams.display_area.left = m_displayRect.l;
+    reconfigParams.display_area.right = m_displayRect.r;
+    reconfigParams.ulTargetWidth = m_nSurfaceWidth;
+    reconfigParams.ulTargetHeight = m_nSurfaceHeight;
+
+    // If external reconfigure is called along with resolution change even if post processing params is not changed,
+    // do full reconfigure params update
+    if ((m_bReconfigExternal && bDecodeResChange) || m_bReconfigExtPPChange) {
+        // update display rect and target resolution if requested explicitly
+        m_bReconfigExternal = false;
+        m_bReconfigExtPPChange = false;
+        m_videoFormat = *pVideoFormat;
+        if (!(m_cropRect.r && m_cropRect.b) && !(m_resizeDim.w && m_resizeDim.h)) {
+            m_nWidth = pVideoFormat->display_area.right - pVideoFormat->display_area.left;
+            m_nLumaHeight = pVideoFormat->display_area.bottom - pVideoFormat->display_area.top;
+            reconfigParams.ulTargetWidth = pVideoFormat->coded_width;
+            reconfigParams.ulTargetHeight = pVideoFormat->coded_height;
+        } else {
+            if (m_resizeDim.w && m_resizeDim.h) {
+                reconfigParams.display_area.left = pVideoFormat->display_area.left;
+                reconfigParams.display_area.top = pVideoFormat->display_area.top;
+                reconfigParams.display_area.right = pVideoFormat->display_area.right;
+                reconfigParams.display_area.bottom = pVideoFormat->display_area.bottom;
+                m_nWidth = m_resizeDim.w;
+                m_nLumaHeight = m_resizeDim.h;
+            }
+
+            if (m_cropRect.r && m_cropRect.b) {
+                reconfigParams.display_area.left = m_cropRect.l;
+                reconfigParams.display_area.top = m_cropRect.t;
+                reconfigParams.display_area.right = m_cropRect.r;
+                reconfigParams.display_area.bottom = m_cropRect.b;
+                m_nWidth = m_cropRect.r - m_cropRect.l;
+                m_nLumaHeight = m_cropRect.b - m_cropRect.t;
+            }
+            reconfigParams.ulTargetWidth = m_nWidth;
+            reconfigParams.ulTargetHeight = m_nLumaHeight;
+        }
+
+        m_nChromaHeight = (int)ceil(m_nLumaHeight * GetChromaHeightFactor(m_eOutputFormat));
+        m_nNumChromaPlanes = GetChromaPlaneCount(m_eOutputFormat);
+        m_nSurfaceHeight = reconfigParams.ulTargetHeight;
+        m_nSurfaceWidth = reconfigParams.ulTargetWidth;
+        m_displayRect.b = reconfigParams.display_area.bottom;
+        m_displayRect.t = reconfigParams.display_area.top;
+        m_displayRect.l = reconfigParams.display_area.left;
+        m_displayRect.r = reconfigParams.display_area.right;
+    }
+
+    reconfigParams.ulNumDecodeSurfaces = nDecodeSurface;
+
+    START_TIMER
+    CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+    NVDEC_API_CALL(cuvidReconfigureDecoder(m_hDecoder, &reconfigParams));
+    CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+    STOP_TIMER("Session Reconfigure Time: ");
+
+    return nDecodeSurface;
+}
+
+int NvDecoder::setReconfigParams(const Rect *pCropRect, const Dim *pResizeDim) {
+    // Records new crop/resize settings for the next reconfigure and releases all cached output frames.
+    // Always returns 1. m_bReconfigExtPPChange is set only when the crop rect or resize dim actually differs.
+    m_bReconfigExternal = true;
+    m_bReconfigExtPPChange = false;
+    if (pCropRect && !((pCropRect->t == m_cropRect.t) && (pCropRect->l == m_cropRect.l) &&
+                       (pCropRect->b == m_cropRect.b) && (pCropRect->r == m_cropRect.r))) {
+        m_bReconfigExtPPChange = true;
+        m_cropRect = *pCropRect;
+    }
+    if (pResizeDim && !((pResizeDim->w == m_resizeDim.w) && (pResizeDim->h == m_resizeDim.h))) {
+        m_bReconfigExtPPChange = true;
+        m_resizeDim = *pResizeDim;
+    }
+
+    // Clear existing output buffers of different size
+    // (HandlePictureDisplay() allocates new ones whenever the pool is too small)
+    uint8_t *pFrame = NULL;
+    while (!m_vpFrame.empty()) {
+        pFrame = m_vpFrame.back();
+        m_vpFrame.pop_back();
+        if (m_bUseDeviceFrame) {
+            CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+            CUDA_DRVAPI_CALL(cuMemFree((CUdeviceptr)pFrame));
+            CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+        } else {
+            // Host frames are allocated with new uint8_t[...] in HandlePictureDisplay(), so release with delete[].
+            delete[] pFrame;
+        }
+    }
+
+    return 1;
+}
+
+/* Return value from HandlePictureDecode() are interpreted as:
+ *  0: fail, >=1: succeeded
+ */
+int NvDecoder::HandlePictureDecode(CUVIDPICPARAMS *pPicParams) {
+    if (!m_hDecoder) {
+        NVDEC_THROW_ERROR("Decoder not initialized.", CUDA_ERROR_NOT_INITIALIZED);
+        return false;
+    }
+    m_nPicNumInDecodeOrder[pPicParams->CurrPicIdx] = m_nDecodePicCnt++;
+    CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+    NVDEC_API_CALL(cuvidDecodePicture(m_hDecoder, pPicParams));
+    const bool bShowNow = (!pPicParams->field_pic_flag) || (pPicParams->second_field); // frame, or second field
+    if (m_bForce_zero_latency && bShowNow) {
+        CUVIDPARSERDISPINFO dispInfo = {}; // zero-initialised, then only the fields below are filled in
+        dispInfo.top_field_first = pPicParams->bottom_field_flag ^ 1;
+        dispInfo.progressive_frame = !pPicParams->field_pic_flag;
+        dispInfo.picture_index = pPicParams->CurrPicIdx;
+        HandlePictureDisplay(&dispInfo);
+    }
+    CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+    return 1;
+}
+
+/* Copies the decoded picture into an output frame buffer (and dumps its SEI data when SEI extraction is enabled).
+ * Return value from HandlePictureDisplay() is interpreted as:  0: fail, >=1: succeeded
+ */
+int NvDecoder::HandlePictureDisplay(CUVIDPARSERDISPINFO *pDispInfo) {
+    CUVIDPROCPARAMS videoProcessingParameters = {};
+    videoProcessingParameters.progressive_frame = pDispInfo->progressive_frame;
+    videoProcessingParameters.second_field = pDispInfo->repeat_first_field + 1;
+    videoProcessingParameters.top_field_first = pDispInfo->top_field_first;
+    videoProcessingParameters.unpaired_field = pDispInfo->repeat_first_field < 0;
+    videoProcessingParameters.output_stream = m_cuvidStream;
+
+    if (m_bExtractSEIMessage) {
+        if (m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIData) {
+            // Write SEI Message
+            uint8_t *seiBuffer = (uint8_t *)(m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIData);
+            uint32_t seiNumMessages = m_SEIMessagesDisplayOrder[pDispInfo->picture_index].sei_message_count;
+            CUSEIMESSAGE *seiMessagesInfo = m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIMessage;
+            if (m_fpSEI) {
+                for (uint32_t i = 0; i < seiNumMessages; i++) {
+                    if (m_eCodec == cudaVideoCodec_H264 || m_eCodec == cudaVideoCodec_H264_SVC ||
+                        m_eCodec == cudaVideoCodec_H264_MVC || m_eCodec == cudaVideoCodec_HEVC) {
+                        switch (seiMessagesInfo[i].sei_message_type) {
+                        case SEI_TYPE_TIME_CODE: {
+                            HEVCSEITIMECODE *timecode = (HEVCSEITIMECODE *)seiBuffer;
+                            fwrite(timecode, sizeof(HEVCSEITIMECODE), 1, m_fpSEI);
+                        } break;
+                        case SEI_TYPE_USER_DATA_UNREGISTERED: {
+                            fwrite(seiBuffer, seiMessagesInfo[i].sei_message_size, 1, m_fpSEI);
+                        } break;
+                        }
+                    }
+                    if (m_eCodec == cudaVideoCodec_AV1) {
+                        fwrite(seiBuffer, seiMessagesInfo[i].sei_message_size, 1, m_fpSEI);
+                    }
+                    seiBuffer += seiMessagesInfo[i].sei_message_size;
+                }
+            }
+            free(m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIData);
+            free(m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIMessage);
+            // Clear the slot so a later display of this index without new SEI data cannot reuse freed pointers.
+            memset(&m_SEIMessagesDisplayOrder[pDispInfo->picture_index], 0, sizeof(CUVIDSEIMESSAGEINFO));
+        }
+    }
+
+    CUdeviceptr dpSrcFrame = 0;
+    unsigned int nSrcPitch = 0;
+    CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext));
+    NVDEC_API_CALL(
+        cuvidMapVideoFrame(m_hDecoder, pDispInfo->picture_index, &dpSrcFrame, &nSrcPitch, &videoProcessingParameters));
+    CUVIDGETDECODESTATUS DecodeStatus;
+    memset(&DecodeStatus, 0, sizeof(DecodeStatus));
+    CUresult result = cuvidGetDecodeStatus(m_hDecoder, pDispInfo->picture_index, &DecodeStatus);
+    if (result == CUDA_SUCCESS && (DecodeStatus.decodeStatus == cuvidDecodeStatus_Error ||
+                                   DecodeStatus.decodeStatus == cuvidDecodeStatus_Error_Concealed)) {
+        printf("Decode Error occurred for picture %d\n", m_nPicNumInDecodeOrder[pDispInfo->picture_index]);
+    }
+
+    uint8_t *pDecodedFrame = nullptr;
+    {
+        std::lock_guard<std::mutex> lock(m_mtxVPFrame);
+        if ((unsigned)++m_nDecodedFrame > m_vpFrame.size()) {
+            // Not enough frames in stock
+            m_nFrameAlloc++;
+            uint8_t *pFrame = NULL;
+            if (m_bUseDeviceFrame) {
+                if (m_bDeviceFramePitched) {
+                    CUDA_DRVAPI_CALL(cuMemAllocPitch((CUdeviceptr *)&pFrame, &m_nDeviceFramePitch, GetWidth() * m_nBPP,
+                                                     m_nLumaHeight + (m_nChromaHeight * m_nNumChromaPlanes), 16));
+                } else {
+                    CUDA_DRVAPI_CALL(cuMemAlloc((CUdeviceptr *)&pFrame, GetFrameSize()));
+                }
+            } else {
+                pFrame = new uint8_t[GetFrameSize()];
+            }
+            m_vpFrame.push_back(pFrame);
+        }
+        pDecodedFrame = m_vpFrame[m_nDecodedFrame - 1];
+    }
+
+    // Copy luma plane
+    CUDA_MEMCPY2D m = {0};
+    m.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+    m.srcDevice = dpSrcFrame;
+    m.srcPitch = nSrcPitch;
+    m.dstMemoryType = m_bUseDeviceFrame ? CU_MEMORYTYPE_DEVICE : CU_MEMORYTYPE_HOST;
+    m.dstDevice = (CUdeviceptr)(m.dstHost = pDecodedFrame);
+    m.dstPitch = m_nDeviceFramePitch ? m_nDeviceFramePitch : GetWidth() * m_nBPP;
+    m.WidthInBytes = GetWidth() * m_nBPP;
+    m.Height = m_nLumaHeight;
+    CUDA_DRVAPI_CALL(cuMemcpy2DAsync(&m, m_cuvidStream));
+
+    // Copy chroma plane
+    // NVDEC output has luma height aligned by 2. Adjust chroma offset by aligning height
+    m.srcDevice = (CUdeviceptr)((uint8_t *)dpSrcFrame + m.srcPitch * ((m_nSurfaceHeight + 1) & ~1));
+    m.dstDevice = (CUdeviceptr)(m.dstHost = pDecodedFrame + m.dstPitch * m_nLumaHeight);
+    m.Height = m_nChromaHeight;
+    CUDA_DRVAPI_CALL(cuMemcpy2DAsync(&m, m_cuvidStream));
+
+    if (m_nNumChromaPlanes == 2) {
+        m.srcDevice = (CUdeviceptr)((uint8_t *)dpSrcFrame + m.srcPitch * ((m_nSurfaceHeight + 1) & ~1) * 2);
+        m.dstDevice = (CUdeviceptr)(m.dstHost = pDecodedFrame + m.dstPitch * m_nLumaHeight * 2);
+        m.Height = m_nChromaHeight;
+        CUDA_DRVAPI_CALL(cuMemcpy2DAsync(&m, m_cuvidStream));
+    }
+    CUDA_DRVAPI_CALL(cuStreamSynchronize(m_cuvidStream));
+    CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL));
+    if ((int)m_vTimestamp.size() < m_nDecodedFrame) {
+        m_vTimestamp.resize(m_vpFrame.size());
+    }
+    m_vTimestamp[m_nDecodedFrame - 1] = pDispInfo->timestamp;
+
+    NVDEC_API_CALL(cuvidUnmapVideoFrame(m_hDecoder, dpSrcFrame));
+    return 1;
+}
+
+int NvDecoder::GetSEIMessage(CUVIDSEIMESSAGEINFO *pSEIMessageInfo) {
+    if ((pSEIMessageInfo->picIdx < 0) || (pSEIMessageInfo->picIdx >= MAX_FRM_CNT)) {
+        printf("Invalid picture index (%d)\n", pSEIMessageInfo->picIdx);
+        return 0;
+    }
+    const uint32_t nMessages = pSEIMessageInfo->sei_message_count;
+    const size_t cbDescriptors = sizeof(CUSEIMESSAGE) * nMessages; // bytes in the message descriptor array
+    size_t cbPayload = 0;                                          // sum of every sei_message_size
+    for (uint32_t iMsg = 0; iMsg < nMessages; iMsg++) {
+        cbPayload += pSEIMessageInfo->pSEIMessage[iMsg].sei_message_size;
+    }
+    if (!m_pCurrSEIMessage) {
+        printf("Out of Memory, Allocation failed for m_pCurrSEIMessage\n");
+        return 0;
+    }
+    m_pCurrSEIMessage->pSEIData = malloc(cbPayload); // private copy of the payload bytes
+    if (!m_pCurrSEIMessage->pSEIData) {
+        printf("Out of Memory, Allocation failed for SEI Buffer\n");
+        return 0;
+    }
+    memcpy(m_pCurrSEIMessage->pSEIData, pSEIMessageInfo->pSEIData, cbPayload);
+    m_pCurrSEIMessage->pSEIMessage = (CUSEIMESSAGE *)malloc(cbDescriptors); // private copy of the descriptors
+    if (!m_pCurrSEIMessage->pSEIMessage) {
+        free(m_pCurrSEIMessage->pSEIData);
+        m_pCurrSEIMessage->pSEIData = NULL;
+        return 0;
+    }
+    memcpy(m_pCurrSEIMessage->pSEIMessage, pSEIMessageInfo->pSEIMessage, cbDescriptors);
+    m_pCurrSEIMessage->sei_message_count = nMessages;
+    m_SEIMessagesDisplayOrder[pSEIMessageInfo->picIdx] = *m_pCurrSEIMessage; // stored under the picture index
+    return 1;
+}
+// Stores the decode options and creates the context lock, CUDA stream and video parser (the decoder is created later).
+NvDecoder::NvDecoder(CUcontext cuContext, bool bUseDeviceFrame, cudaVideoCodec eCodec, bool bLowLatency,
+                     bool bDeviceFramePitched, const Rect *pCropRect, const Dim *pResizeDim,
+                     bool extract_user_SEI_Message, int maxWidth, int maxHeight, unsigned int clkRate,
+                     bool force_zero_latency)
+    : m_cuContext(cuContext), m_bUseDeviceFrame(bUseDeviceFrame), m_eCodec(eCodec),
+      m_bDeviceFramePitched(bDeviceFramePitched), m_bExtractSEIMessage(extract_user_SEI_Message), m_nMaxWidth(maxWidth),
+      m_nMaxHeight(maxHeight), m_bForce_zero_latency(force_zero_latency) {
+    if (pCropRect) // optional: NULL leaves m_cropRect untouched
+        m_cropRect = *pCropRect;
+    if (pResizeDim) // optional: NULL leaves m_resizeDim untouched
+        m_resizeDim = *pResizeDim;
+
+    NVDEC_API_CALL(cuvidCtxLockCreate(&m_ctxLock, cuContext));
+
+    ck(cuStreamCreate(&m_cuvidStream, CU_STREAM_DEFAULT));
+
+    decoderSessionID = 0;
+
+    if (m_bExtractSEIMessage) {
+        m_fpSEI = fopen("sei_message.txt", "wb"); // result is not checked here; HandlePictureDisplay() tests m_fpSEI
+        m_pCurrSEIMessage = new CUVIDSEIMESSAGEINFO;
+        memset(&m_SEIMessagesDisplayOrder, 0, sizeof(m_SEIMessagesDisplayOrder));
+    }
+    CUVIDPARSERPARAMS videoParserParameters = {};
+    videoParserParameters.CodecType = eCodec;
+    videoParserParameters.ulMaxNumDecodeSurfaces = 1;
+    videoParserParameters.ulClockRate = clkRate;
+    videoParserParameters.ulMaxDisplayDelay = bLowLatency ? 0 : 1; // no display delay when low latency is requested
+    videoParserParameters.pUserData = this;
+    videoParserParameters.pfnSequenceCallback = HandleVideoSequenceProc;
+    videoParserParameters.pfnDecodePicture = HandlePictureDecodeProc;
+    videoParserParameters.pfnDisplayPicture = m_bForce_zero_latency ? NULL : HandlePictureDisplayProc;
+    videoParserParameters.pfnGetOperatingPoint = HandleOperatingPointProc;
+    videoParserParameters.pfnGetSEIMsg = m_bExtractSEIMessage ? HandleSEIMessagesProc : NULL;
+    NVDEC_API_CALL(cuvidCreateVideoParser(&m_hParser, &videoParserParameters));
+}
+
+NvDecoder::~NvDecoder() {
+    // Teardown is timed and the elapsed time is added to this session's recorded overhead at the end.
+    START_TIMER
+
+    // SEI extraction resources: the dump file first, then the scratch message record.
+    if (m_fpSEI != NULL) {
+        fclose(m_fpSEI);
+        m_fpSEI = NULL;
+    }
+    if (m_pCurrSEIMessage != NULL) {
+        delete m_pCurrSEIMessage;
+        m_pCurrSEIMessage = NULL;
+    }
+
+    if (m_hParser != NULL) {
+        cuvidDestroyVideoParser(m_hParser);
+    }
+    cuCtxPushCurrent(m_cuContext);
+    if (m_hDecoder != NULL) {
+        cuvidDestroyDecoder(m_hDecoder);
+    }
+
+    std::lock_guard<std::mutex> lock(m_mtxVPFrame);
+
+    // Frame buffers are released with cuMemFree or delete[] depending on m_bUseDeviceFrame.
+    for (size_t iFrame = 0; iFrame < m_vpFrame.size(); ++iFrame) {
+        if (!m_bUseDeviceFrame) {
+            delete[] m_vpFrame[iFrame];
+        } else {
+            cuMemFree((CUdeviceptr)m_vpFrame[iFrame]);
+        }
+    }
+    cuCtxPopCurrent(NULL);
+    cuvidCtxLockDestroy(m_ctxLock);
+
+    STOP_TIMER("Session Deinitialization Time: ");
+
+    NvDecoder::addDecoderSessionOverHead(getDecoderSessionID(), elapsedTime);
+}
+
+int NvDecoder::Decode(const uint8_t *pData, int nSize, int nFlags, int64_t nTimestamp) {
+    // Reset the per-call counters; HandlePictureDisplay() increments m_nDecodedFrame for each output frame.
+    m_nDecodedFrameReturned = 0;
+    m_nDecodedFrame = 0;
+    const bool bEndOfStream = (pData == NULL) || (nSize == 0); // a missing or empty payload ends the stream
+    CUVIDSOURCEDATAPACKET packet = {0};
+    packet.timestamp = nTimestamp;
+    packet.payload_size = nSize;
+    packet.payload = pData;
+    packet.flags = nFlags | CUVID_PKT_TIMESTAMP;
+    if (bEndOfStream)
+        packet.flags |= CUVID_PKT_ENDOFSTREAM;
+    NVDEC_API_CALL(cuvidParseVideoData(m_hParser, &packet));
+    return m_nDecodedFrame;
+}
+
+uint8_t *NvDecoder::GetFrame(int64_t *pTimestamp) {
+    // No frame is pending: report NULL without taking the lock.
+    if (m_nDecodedFrame <= 0)
+        return NULL;
+    std::lock_guard<std::mutex> lock(m_mtxVPFrame);
+    const auto iFrame = m_nDecodedFrameReturned++;
+    m_nDecodedFrame--;
+    if (pTimestamp)
+        *pTimestamp = m_vTimestamp[iFrame];
+    return m_vpFrame[iFrame];
+}
+
+uint8_t *NvDecoder::GetLockedFrame(int64_t *pTimestamp) {
+    if (m_nDecodedFrame <= 0)
+        return NULL;
+
+    std::lock_guard<std::mutex> lock(m_mtxVPFrame);
+    m_nDecodedFrame--;
+
+    // Remove the first frame from the pool; UnlockFrame() is what appends a frame pointer back.
+    uint8_t *pFrame = m_vpFrame[0];
+    m_vpFrame.erase(m_vpFrame.begin(), m_vpFrame.begin() + 1);
+
+    // Remove the first timestamp entry as well.
+    uint64_t timestamp = m_vTimestamp[0];
+    m_vTimestamp.erase(m_vTimestamp.begin(), m_vTimestamp.begin() + 1);
+
+    if (pTimestamp)
+        *pTimestamp = timestamp;
+
+    return pFrame;
+}
+
+void NvDecoder::UnlockFrame(uint8_t **pFrame) {
+    // Append the frame pointer (*pFrame) to the end of the frame pool.
+    std::lock_guard<std::mutex> lock(m_mtxVPFrame);
+    m_vpFrame.push_back(pFrame[0]);
+
+    // add a dummy (zero) timestamp entry alongside the re-added frame
+    m_vTimestamp.push_back(0);
+}
diff --git a/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.h b/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.h
new file mode 100644
index 000000000..886202bf7
--- /dev/null
+++ b/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.h
@@ -0,0 +1,528 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2023 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "../../../Interface/nvcuvid.h"
+#include "../Utils/NvCodecUtils.h"
+#include <assert.h>
+#include <iostream>
+#include <map>
+#include <mutex>
+#include <sstream>
+#include <stdint.h>
+#include <string.h>
+#include <string>
+#include <vector>
+
+#define MAX_FRM_CNT 32
+
+typedef enum { SEI_TYPE_TIME_CODE = 136, SEI_TYPE_USER_DATA_UNREGISTERED = 5 } SEI_H264_HEVC_PAYLOAD_TYPE;
+
+/**
+ * @brief Exception class for error reporting from the decode API; carries a message string and a CUresult code.
+ */
+class NVDECException : public std::exception {
+  public:
+    NVDECException(const std::string &errorStr, const CUresult errorCode)
+        : m_errorString(errorStr), m_errorCode(errorCode) {}
+
+    virtual ~NVDECException() throw() {}
+    virtual const char *what() const throw() { return m_errorString.c_str(); } // same text as getErrorString()
+    CUresult getErrorCode() const { return m_errorCode; }
+    const std::string &getErrorString() const { return m_errorString; }
+    static NVDECException makeNVDECException(const std::string &errorStr, const CUresult errorCode,
+                                             const std::string &functionName, const std::string &fileName, int lineNo);
+
+  private:
+    std::string m_errorString; // message given to the constructor; returned by what()
+    CUresult m_errorCode;      // result code given to the constructor
+};
+
+inline NVDECException NVDECException::makeNVDECException(const std::string &errorStr, const CUresult errorCode,
+                                                         const std::string &functionName, const std::string &fileName,
+                                                         int lineNo) {
+    // Message layout: "<function> : <error> at <file>:<line>" followed by a newline.
+    std::ostringstream message;
+    message << functionName << " : " << errorStr << " at " << fileName << ":" << lineNo << std::endl;
+    return NVDECException(message.str(), errorCode);
+}
+
+#define NVDEC_THROW_ERROR(errorStr, errorCode)                                                                         \
+    do { /* always throws an NVDECException tagged with the call site (function, file, line) */                        \
+        throw NVDECException::makeNVDECException(errorStr, errorCode, __FUNCTION__, __FILE__, __LINE__);               \
+    } while (0)
+
+#define NVDEC_API_CALL(cuvidAPI)                                                                                       \
+    do { /* evaluates cuvidAPI once; throws NVDECException unless it returns CUDA_SUCCESS */                           \
+        CUresult errorCode = cuvidAPI;                                                                                 \
+        if (errorCode != CUDA_SUCCESS) {                                                                               \
+            std::ostringstream errorLog;                                                                               \
+            errorLog << #cuvidAPI << " returned error " << errorCode;                                                  \
+            throw NVDECException::makeNVDECException(errorLog.str(), errorCode, __FUNCTION__, __FILE__, __LINE__);     \
+        }                                                                                                              \
+    } while (0)
+
+struct Rect {
+    int l, t, r, b; // presumably left, top, right, bottom edges -- TODO confirm against the crop-rectangle users
+};
+
+struct Dim {
+    int w, h; // presumably width and height -- TODO confirm against the resize-dimension users
+};
+
+#define START_TIMER auto start = std::chrono::high_resolution_clock::now(); // declares a local named `start`
+
+#define STOP_TIMER(print_message)                                                                                      \
+    int64_t elapsedTime =                                                                                              \
+        std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - start)       \
+            .count();                                                                                                  \
+    std::cout << print_message << elapsedTime << " ms " << std::endl; // reads local `start`; declares `elapsedTime`
+
+#define CUDA_DRVAPI_CALL(call)                                                                                         \
+    do { /* throws NVDECException naming the CUDA driver error when call does not return CUDA_SUCCESS */               \
+        CUresult err__ = call;                                                                                         \
+        if (err__ != CUDA_SUCCESS) {                                                                                   \
+            const char *szErrName = NULL;                                                                              \
+            cuGetErrorName(err__, &szErrName); /* leaves szErrName NULL for unrecognized codes */                      \
+            std::ostringstream errorLog;                                                                               \
+            errorLog << "CUDA driver API error " << (szErrName ? szErrName : "Unknown");                               \
+            throw NVDECException::makeNVDECException(errorLog.str(), err__, __FUNCTION__, __FILE__, __LINE__);         \
+        }                                                                                                              \
+    } while (0)
+
+static const char *GetVideoCodecString(cudaVideoCodec eCodec) { // returns a static display name, never NULL
+    static struct {
+        cudaVideoCodec eCodec;
+        const char *name;
+    } aCodecName[] = {
+        {cudaVideoCodec_MPEG1, "MPEG-1"}, // entries up to NumCodecs are indexed by enum value (order must match)
+        {cudaVideoCodec_MPEG2, "MPEG-2"},
+        {cudaVideoCodec_MPEG4, "MPEG-4 (ASP)"},
+        {cudaVideoCodec_VC1, "VC-1/WMV"},
+        {cudaVideoCodec_H264, "AVC/H.264"},
+        {cudaVideoCodec_JPEG, "M-JPEG"},
+        {cudaVideoCodec_H264_SVC, "H.264/SVC"},
+        {cudaVideoCodec_H264_MVC, "H.264/MVC"},
+        {cudaVideoCodec_HEVC, "H.265/HEVC"},
+        {cudaVideoCodec_VP8, "VP8"},
+        {cudaVideoCodec_VP9, "VP9"},
+        {cudaVideoCodec_AV1, "AV1"},
+        {cudaVideoCodec_NumCodecs, "Invalid"},
+        {cudaVideoCodec_YUV420, "YUV  4:2:0"}, // entries after NumCodecs are found by the search loop below
+        {cudaVideoCodec_YV12, "YV12 4:2:0"},
+        {cudaVideoCodec_NV12, "NV12 4:2:0"},
+        {cudaVideoCodec_YUYV, "YUYV 4:2:2"},
+        {cudaVideoCodec_UYVY, "UYVY 4:2:2"},
+    };
+
+    if (eCodec >= 0 && eCodec <= cudaVideoCodec_NumCodecs) {
+        return aCodecName[eCodec].name;
+    }
+    for (size_t i = cudaVideoCodec_NumCodecs + 1; i < sizeof(aCodecName) / sizeof(aCodecName[0]); i++) {
+        if (eCodec == aCodecName[i].eCodec) {
+            return aCodecName[i].name; // use the matched position; eCodec is outside the directly indexed range here
+        }
+    }
+    return "Unknown";
+}
+
+static const char *GetVideoChromaFormatString(cudaVideoChromaFormat eChromaFormat) {
+    // Lookup table indexed by the enum value (assumes the enumerators are 0..3 in this order).
+    static struct {
+        cudaVideoChromaFormat eChromaFormat;
+        const char *name;
+    } aChromaFormatName[] = {
+        {cudaVideoChromaFormat_Monochrome, "YUV 400 (Monochrome)"},
+        {cudaVideoChromaFormat_420, "YUV 420"},
+        {cudaVideoChromaFormat_422, "YUV 422"},
+        {cudaVideoChromaFormat_444, "YUV 444"},
+    };
+    if (eChromaFormat < 0 || eChromaFormat >= sizeof(aChromaFormatName) / sizeof(aChromaFormatName[0])) {
+        return "Unknown";
+    }
+    return aChromaFormatName[eChromaFormat].name;
+}
+
+static float GetChromaHeightFactor(cudaVideoSurfaceFormat eSurfaceFormat) {
+    // Chroma-height factor for a surface format (presumably chroma height / luma height).
+    switch (eSurfaceFormat) {
+    case cudaVideoSurfaceFormat_YUV444:
+    case cudaVideoSurfaceFormat_YUV444_16Bit:
+        return 1.0;
+    case cudaVideoSurfaceFormat_NV12:
+    case cudaVideoSurfaceFormat_P016:
+        return 0.5;
+    default:
+        // Any format not listed above gets the same factor as NV12/P016.
+        return 0.5;
+    }
+    // Not reached: every switch path returns.
+}
+
+static int GetChromaPlaneCount(cudaVideoSurfaceFormat eSurfaceFormat) {
+    // Number of chroma planes for the given surface format.
+    switch (eSurfaceFormat) {
+    case cudaVideoSurfaceFormat_YUV444:
+    case cudaVideoSurfaceFormat_YUV444_16Bit:
+        return 2;
+    case cudaVideoSurfaceFormat_NV12:
+    case cudaVideoSurfaceFormat_P016:
+        return 1;
+    default:
+        // Any format not listed above is counted like NV12/P016.
+        return 1;
+    }
+    // Not reached: every switch path returns.
+}
+
+/**
+ * @brief Base class for decoder interface.
+ */
+class NvDecoder {
+
+  public:
+    NvDecoder() {} // creates no decoder session; members keep their in-class defaults (handles are NULL)
+    /**
+     *  @brief This function is used to initialize the decoder session.
+     *  Application must call this function to initialize the decoder, before
+     *  starting to decode any frames.
+     */
+    NvDecoder(CUcontext cuContext, bool bUseDeviceFrame, cudaVideoCodec eCodec, bool bLowLatency = false,
+              bool bDeviceFramePitched = false, const Rect *pCropRect = NULL, const Dim *pResizeDim = NULL,
+              bool extract_user_SEI_Message = false, int maxWidth = 0, int maxHeight = 0, unsigned int clkRate = 1000,
+              bool force_zero_latency = false);
+    ~NvDecoder();
+
+    /**
+     *  @brief  This function is used to get the current CUDA context.
+     */
+    CUcontext GetContext() { return m_cuContext; }
+
+    /**
+     *  @brief  This function is used to get the output frame width.
+     *  NV12/P016 output format width is 2 byte aligned because of U and V interleave
+     */
+    int GetWidth() {
+        assert(m_nWidth);
+        return (m_eOutputFormat == cudaVideoSurfaceFormat_NV12 || m_eOutputFormat == cudaVideoSurfaceFormat_P016)
+                   ? (m_nWidth + 1) & ~1
+                   : m_nWidth;
+    }
+
+    /**
+     *  @brief  This function is used to get the actual decode width
+     */
+    int GetDecodeWidth() {
+        assert(m_nWidth);
+        return m_nWidth;
+    }
+
+    /**
+     *  @brief  This function is used to get the output frame height (Luma height).
+     */
+    int GetHeight() {
+        assert(m_nLumaHeight);
+        return m_nLumaHeight;
+    }
+
+    /**
+     *  @brief  This function is used to get the current chroma height.
+     */
+    int GetChromaHeight() {
+        assert(m_nChromaHeight);
+        return m_nChromaHeight;
+    }
+
+    /**
+     *  @brief  This function is used to get the number of chroma planes.
+     */
+    int GetNumChromaPlanes() {
+        assert(m_nNumChromaPlanes);
+        return m_nNumChromaPlanes;
+    }
+
+    /**
+     *   @brief  This function is used to get the current frame size based on pixel format.
+     */
+    int GetFrameSize() {
+        assert(m_nWidth);
+        return GetWidth() * (m_nLumaHeight + (m_nChromaHeight * m_nNumChromaPlanes)) * m_nBPP;
+    }
+
+    /**
+     *   @brief  This function is used to get the current frame Luma plane size.
+     */
+    int GetLumaPlaneSize() {
+        assert(m_nWidth);
+        return GetWidth() * m_nLumaHeight * m_nBPP;
+    }
+
+    /**
+     *   @brief  This function is used to get the current frame chroma plane size.
+     */
+    int GetChromaPlaneSize() {
+        assert(m_nWidth);
+        return GetWidth() * (m_nChromaHeight * m_nNumChromaPlanes) * m_nBPP;
+    }
+
+    /**
+     *  @brief  This function is used to get the pitch of the device buffer holding the decoded frame.
+     */
+    int GetDeviceFramePitch() {
+        assert(m_nWidth);
+        return m_nDeviceFramePitch ? (int)m_nDeviceFramePitch : GetWidth() * m_nBPP;
+    }
+
+    /**
+     *   @brief  This function is used to get the bit depth associated with the pixel format.
+     */
+    int GetBitDepth() {
+        assert(m_nWidth);
+        return m_nBitDepthMinus8 + 8;
+    }
+
+    /**
+     *   @brief  This function is used to get the bytes used per pixel.
+     */
+    int GetBPP() {
+        assert(m_nWidth);
+        return m_nBPP;
+    }
+
+    /**
+     *   @brief  This function is used to get the output surface format
+     */
+    cudaVideoSurfaceFormat GetOutputFormat() { return m_eOutputFormat; }
+
+    /**
+     *   @brief  This function is used to get information about the video stream (codec, display parameters etc)
+     */
+    CUVIDEOFORMAT GetVideoFormatInfo() {
+        assert(m_nWidth);
+        return m_videoFormat;
+    }
+
+    /**
+     *   @brief  This function is used to get codec string from codec id
+     */
+    const char *GetCodecString(cudaVideoCodec eCodec);
+
+    /**
+     *   @brief  This function is used to print information about the video stream
+     */
+    std::string GetVideoInfo() const { return m_videoInfo.str(); }
+
+    /**
+     *   @brief  This function decodes a frame and returns the number of frames that are available for
+     *   display. All frames that are available for display should be read before making a subsequent decode call.
+     *   @param  pData - pointer to the data buffer that is to be decoded
+     *   @param  nSize - size of the data buffer in bytes
+     *   @param  nFlags - CUvideopacketflags for setting decode options
+     *   @param  nTimestamp - presentation timestamp
+     */
+    int Decode(const uint8_t *pData, int nSize, int nFlags = 0, int64_t nTimestamp = 0);
+
+    /**
+     *   @brief  This function returns a decoded frame and timestamp. This function should be called in a loop for
+     *   fetching all the frames that are available for display.
+     */
+    uint8_t *GetFrame(int64_t *pTimestamp = nullptr);
+
+    /**
+     *   @brief  This function returns a decoded frame and timestamp and locks the frame buffer.
+     *   This makes the buffers available for use by the application without the buffers
+     *   getting overwritten, even if subsequent decode calls are made. The frame buffers
+     *   remain locked, until UnlockFrame() is called
+     */
+    uint8_t *GetLockedFrame(int64_t *pTimestamp = nullptr);
+
+    /**
+     *   @brief  This function unlocks the frame buffer and makes the frame buffers available for write again
+     *   @param  pFrame - pointer to the frame pointer that is to be unlocked
+     *   Only the single frame pFrame[0] is unlocked per call.
+     */
+    void UnlockFrame(uint8_t **pFrame);
+
+    /**
+     *   @brief  This function allows app to set decoder reconfig params
+     *   @param  pCropRect - cropping rectangle coordinates
+     *   @param  pResizeDim - width and height of resized output
+     */
+    int setReconfigParams(const Rect *pCropRect, const Dim *pResizeDim);
+
+    /**
+     *   @brief  This function allows app to set operating point for AV1 SVC clips
+     *   @param  opPoint - operating point of an AV1 scalable bitstream
+     *   @param  bDispAllLayers - Output all decoded frames of an AV1 scalable bitstream
+     */
+    void SetOperatingPoint(const uint32_t opPoint, const bool bDispAllLayers) {
+        m_nOperatingPoint = opPoint;
+        m_bDispAllLayers = bDispAllLayers;
+    }
+
+    // start a timer
+    void startTimer() { m_stDecode_time.Start(); }
+
+    // stop the timer
+    double stopTimer() { return m_stDecode_time.Stop(); }
+
+    void setDecoderSessionID(int sessionID) { decoderSessionID = sessionID; }
+    int getDecoderSessionID() { return decoderSessionID; }
+
+    // Session overhead refers to decoder initialization and deinitialization time
+    static void addDecoderSessionOverHead(int sessionID, int64_t duration) { sessionOverHead[sessionID] += duration; }
+    static int64_t getDecoderSessionOverHead(int sessionID) { return sessionOverHead[sessionID]; }
+
+  protected:
+    int decoderSessionID = 0;                      // Decoder session identifier. Used to gather session level stats.
+    static std::map<int, int64_t> sessionOverHead; // Records session overhead of initialization+deinitialization time.
+                                                   // Format is (session id, duration)
+
+    /**
+     *   @brief  Callback function to be registered for getting a callback when decoding of sequence starts
+     */
+    static int CUDAAPI HandleVideoSequenceProc(void *pUserData, CUVIDEOFORMAT *pVideoFormat) {
+        return ((NvDecoder *)pUserData)->HandleVideoSequence(pVideoFormat);
+    }
+
+    /**
+     *   @brief  Callback function to be registered for getting a callback when a picture is ready to be decoded
+     */
+    static int CUDAAPI HandlePictureDecodeProc(void *pUserData, CUVIDPICPARAMS *pPicParams) {
+        return ((NvDecoder *)pUserData)->HandlePictureDecode(pPicParams);
+    }
+
+    /**
+     *   @brief  Callback function to be registered for getting a callback when a decoded frame is available for display
+     */
+    static int CUDAAPI HandlePictureDisplayProc(void *pUserData, CUVIDPARSERDISPINFO *pDispInfo) {
+        return ((NvDecoder *)pUserData)->HandlePictureDisplay(pDispInfo);
+    }
+
+    /**
+     *   @brief  Callback function to be registered for getting a callback to get operating point when AV1 SVC sequence
+     * header start.
+     */
+    static int CUDAAPI HandleOperatingPointProc(void *pUserData, CUVIDOPERATINGPOINTINFO *pOPInfo) {
+        return ((NvDecoder *)pUserData)->GetOperatingPoint(pOPInfo);
+    }
+
+    /**
+     *   @brief  Callback function to be registered for getting a callback when all the unregistered user SEI Messages
+     * are parsed for a frame.
+     */
+    static int CUDAAPI HandleSEIMessagesProc(void *pUserData, CUVIDSEIMESSAGEINFO *pSEIMessageInfo) {
+        return ((NvDecoder *)pUserData)->GetSEIMessage(pSEIMessageInfo);
+    }
+
+    /**
+    *   @brief  This function gets called when a sequence is ready to be decoded. The function also gets called
+        when there is format change
+    */
+    int HandleVideoSequence(CUVIDEOFORMAT *pVideoFormat);
+
+    /**
+     *   @brief  This function gets called when a picture is ready to be decoded. cuvidDecodePicture is called from this
+     * function to decode the picture
+     */
+    int HandlePictureDecode(CUVIDPICPARAMS *pPicParams);
+
+    /**
+    *   @brief  This function gets called after a picture is decoded and available for display. Frames are fetched and
+    stored in internal buffer
+    */
+    int HandlePictureDisplay(CUVIDPARSERDISPINFO *pDispInfo);
+
+    /**
+     *   @brief  This function gets called when AV1 sequence encounter more than one operating points
+     */
+    int GetOperatingPoint(CUVIDOPERATINGPOINTINFO *pOPInfo);
+
+    /**
+     *   @brief  This function gets called when all unregistered user SEI messages are parsed for a frame
+     */
+    int GetSEIMessage(CUVIDSEIMESSAGEINFO *pSEIMessageInfo);
+
+    /**
+     *   @brief  This function reconfigure decoder if there is a change in sequence params.
+     */
+    int ReconfigureDecoder(CUVIDEOFORMAT *pVideoFormat);
+
+  public:
+    CUcontext m_cuContext = NULL;
+    CUvideoctxlock m_ctxLock = NULL; // initialized so a default-constructed object never holds a garbage handle
+    CUvideoparser m_hParser = NULL;
+    CUvideodecoder m_hDecoder = NULL;
+    bool m_bUseDeviceFrame = false; // initialized for the default constructor, which sets no members itself
+    // dimension of the output
+    unsigned int m_nWidth = 0, m_nLumaHeight = 0, m_nChromaHeight = 0;
+    unsigned int m_nNumChromaPlanes = 0;
+    // height of the mapped surface
+    int m_nSurfaceHeight = 0;
+    int m_nSurfaceWidth = 0;
+    cudaVideoCodec m_eCodec = cudaVideoCodec_NumCodecs;
+    cudaVideoChromaFormat m_eChromaFormat = cudaVideoChromaFormat_420;
+    cudaVideoSurfaceFormat m_eOutputFormat = cudaVideoSurfaceFormat_NV12;
+    int m_nBitDepthMinus8 = 0;
+    int m_nBPP = 1;
+    CUVIDEOFORMAT m_videoFormat = {};
+    Rect m_displayRect = {};
+    // stock of frames
+    std::vector<uint8_t *> m_vpFrame;
+    // timestamps of decoded frames
+    std::vector<int64_t> m_vTimestamp;
+    int m_nDecodedFrame = 0, m_nDecodedFrameReturned = 0;
+    int m_nDecodePicCnt = 0, m_nPicNumInDecodeOrder[MAX_FRM_CNT];
+    CUVIDSEIMESSAGEINFO *m_pCurrSEIMessage = NULL;
+    CUVIDSEIMESSAGEINFO m_SEIMessagesDisplayOrder[MAX_FRM_CNT];
+    FILE *m_fpSEI = NULL;
+    bool m_bEndDecodeDone = false;
+    std::mutex m_mtxVPFrame;
+    int m_nFrameAlloc = 0;
+    CUstream m_cuvidStream = 0;
+    bool m_bDeviceFramePitched = false;
+    size_t m_nDeviceFramePitch = 0;
+    Rect m_cropRect = {};
+    Dim m_resizeDim = {};
+
+    std::ostringstream m_videoInfo;
+    unsigned int m_nMaxWidth = 0, m_nMaxHeight = 0;
+    bool m_bReconfigExternal = false;
+    bool m_bReconfigExtPPChange = false;
+    StopWatch m_stDecode_time;
+
+    unsigned int m_nOperatingPoint = 0;
+    bool m_bDispAllLayers = false;
+    // In H.264, there is an inherent display latency for video contents
+    // which do not have num_reorder_frames=0 in the VUI. This applies to
+    // All-Intra and IPPP sequences as well. If the user wants zero display
+    // latency for All-Intra and IPPP sequences, the below flag will enable
+    // the display callback immediately after the decode callback.
+    bool m_bForce_zero_latency = false;
+    bool m_bExtractSEIMessage = false;
+};
diff --git a/third_party/Video_Codec_SDK/Samples/Utils/FFmpegDemuxer.h b/third_party/Video_Codec_SDK/Samples/Utils/FFmpegDemuxer.h
new file mode 100644
index 000000000..bd1881dbc
--- /dev/null
+++ b/third_party/Video_Codec_SDK/Samples/Utils/FFmpegDemuxer.h
@@ -0,0 +1,379 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2023 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#pragma once
+
+extern "C" {
+#include <libavcodec/avcodec.h>
+#include <libavformat/avformat.h>
+#include <libavformat/avio.h>
+/* Explicitly include bsf.h when building against FFmpeg 4.3 (libavcodec 58.45.100) or later for backward compatibility
+ */
+#if LIBAVCODEC_VERSION_INT >= 3824484
+#include <libavcodec/bsf.h>
+#endif
+}
+#include "NvCodecUtils.h"
+#include "nvcuvid.h"
+
+//---------------------------------------------------------------------------
+//! \file FFmpegDemuxer.h
+//! \brief Provides functionality for stream demuxing
+//!
+//! This header file is used by Decode/Transcode apps to demux input video clips before decoding frames from it.
+//---------------------------------------------------------------------------
+
+/**
+ * @brief libavformat wrapper class. Retrieves the elementary encoded stream from the container format.
+ */
+class FFmpegDemuxer {
+  private:
+    AVFormatContext *fmtc = NULL;
+    AVIOContext *avioc = NULL;
+    AVPacket *pkt = NULL; /*!< AVPacket stores compressed data typically exported by demuxers and then passed as input
+                             to decoders */
+    AVPacket *pktFiltered = NULL;
+    AVBSFContext *bsfc = NULL;
+
+    int iVideoStream;
+    bool bMp4H264, bMp4HEVC, bMp4MPEG4;
+    AVCodecID eVideoCodec;
+    AVPixelFormat eChromaFormat;
+    int nWidth, nHeight, nBitDepth, nBPP, nChromaHeight;
+    double timeBase = 0.0;
+    int64_t userTimeScale = 0;
+
+    uint8_t *pDataWithHeader = NULL;
+
+    unsigned int frameCount = 0;
+
+  public:
+    class DataProvider {
+      public:
+        virtual ~DataProvider() {}
+        virtual int GetData(uint8_t *pBuf, int nBuf) = 0;
+    };
+
+  private:
+    /**
+     *   @brief  Private constructor to initialize libavformat resources.
+     *   @param  fmtc - Pointer to AVFormatContext allocated inside avformat_open_input()
+     */
+    FFmpegDemuxer(AVFormatContext *fmtc, int64_t timeScale = 1000 /*Hz*/) : fmtc(fmtc) {
+        if (!fmtc) {
+            LOG(ERROR) << "No AVFormatContext provided.";
+            return;
+        }
+
+        // Allocate the AVPackets and initialize to default values
+        pkt = av_packet_alloc();
+        pktFiltered = av_packet_alloc();
+        if (!pkt || !pktFiltered) {
+            LOG(ERROR) << "AVPacket allocation failed";
+            return;
+        }
+
+        LOG(INFO) << "Media format: " << fmtc->iformat->long_name << " (" << fmtc->iformat->name << ")";
+
+        ck(avformat_find_stream_info(fmtc, NULL));
+        iVideoStream = av_find_best_stream(fmtc, AVMEDIA_TYPE_VIDEO, -1, -1, NULL, 0);
+        if (iVideoStream < 0) {
+            LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__ << " "
+                       << "Could not find stream in input file";
+            av_packet_free(&pkt);
+            av_packet_free(&pktFiltered);
+            return;
+        }
+
+        // fmtc->streams[iVideoStream]->need_parsing = AVSTREAM_PARSE_NONE;
+        eVideoCodec = fmtc->streams[iVideoStream]->codecpar->codec_id;
+        nWidth = fmtc->streams[iVideoStream]->codecpar->width;
+        nHeight = fmtc->streams[iVideoStream]->codecpar->height;
+        eChromaFormat = (AVPixelFormat)fmtc->streams[iVideoStream]->codecpar->format;
+        AVRational rTimeBase = fmtc->streams[iVideoStream]->time_base;
+        timeBase = av_q2d(rTimeBase);
+        userTimeScale = timeScale;
+
+        // Set bit depth, chroma height, bits per pixel based on eChromaFormat of input
+        switch (eChromaFormat) {
+        case AV_PIX_FMT_YUV420P10LE:
+        case AV_PIX_FMT_GRAY10LE: // monochrome is treated as 420 with chroma filled with 0x0
+            nBitDepth = 10;
+            nChromaHeight = (nHeight + 1) >> 1;
+            nBPP = 2;
+            break;
+        case AV_PIX_FMT_YUV420P12LE:
+            nBitDepth = 12;
+            nChromaHeight = (nHeight + 1) >> 1;
+            nBPP = 2;
+            break;
+        case AV_PIX_FMT_YUV444P10LE:
+            nBitDepth = 10;
+            nChromaHeight = nHeight << 1;
+            nBPP = 2;
+            break;
+        case AV_PIX_FMT_YUV444P12LE:
+            nBitDepth = 12;
+            nChromaHeight = nHeight << 1;
+            nBPP = 2;
+            break;
+        case AV_PIX_FMT_YUV444P:
+            nBitDepth = 8;
+            nChromaHeight = nHeight << 1;
+            nBPP = 1;
+            break;
+        case AV_PIX_FMT_YUV420P:
+        case AV_PIX_FMT_YUVJ420P:
+        case AV_PIX_FMT_YUVJ422P: // jpeg decoder output is subsampled to NV12 for 422/444 so treat it as 420
+        case AV_PIX_FMT_YUVJ444P: // jpeg decoder output is subsampled to NV12 for 422/444 so treat it as 420
+        case AV_PIX_FMT_GRAY8:    // monochrome is treated as 420 with chroma filled with 0x0
+            nBitDepth = 8;
+            nChromaHeight = (nHeight + 1) >> 1;
+            nBPP = 1;
+            break;
+        default:
+            LOG(WARNING) << "ChromaFormat not recognized. Assuming 420";
+            eChromaFormat = AV_PIX_FMT_YUV420P;
+            nBitDepth = 8;
+            nChromaHeight = (nHeight + 1) >> 1;
+            nBPP = 1;
+        }
+
+        bMp4H264 = eVideoCodec == AV_CODEC_ID_H264 && (!strcmp(fmtc->iformat->long_name, "QuickTime / MOV") ||
+                                                       !strcmp(fmtc->iformat->long_name, "FLV (Flash Video)") ||
+                                                       !strcmp(fmtc->iformat->long_name, "Matroska / WebM"));
+        bMp4HEVC = eVideoCodec == AV_CODEC_ID_HEVC && (!strcmp(fmtc->iformat->long_name, "QuickTime / MOV") ||
+                                                       !strcmp(fmtc->iformat->long_name, "FLV (Flash Video)") ||
+                                                       !strcmp(fmtc->iformat->long_name, "Matroska / WebM"));
+
+        bMp4MPEG4 = eVideoCodec == AV_CODEC_ID_MPEG4 && (!strcmp(fmtc->iformat->long_name, "QuickTime / MOV") ||
+                                                         !strcmp(fmtc->iformat->long_name, "FLV (Flash Video)") ||
+                                                         !strcmp(fmtc->iformat->long_name, "Matroska / WebM"));
+
+        // Initialize bitstream filter and its required resources
+        if (bMp4H264) {
+            const AVBitStreamFilter *bsf = av_bsf_get_by_name("h264_mp4toannexb");
+            if (!bsf) {
+                LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__ << " "
+                           << "av_bsf_get_by_name() failed";
+                av_packet_free(&pkt);
+                av_packet_free(&pktFiltered);
+                return;
+            }
+            ck(av_bsf_alloc(bsf, &bsfc));
+            avcodec_parameters_copy(bsfc->par_in, fmtc->streams[iVideoStream]->codecpar);
+            ck(av_bsf_init(bsfc));
+        }
+        if (bMp4HEVC) {
+            const AVBitStreamFilter *bsf = av_bsf_get_by_name("hevc_mp4toannexb");
+            if (!bsf) {
+                LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__ << " "
+                           << "av_bsf_get_by_name() failed";
+                av_packet_free(&pkt);
+                av_packet_free(&pktFiltered);
+                return;
+            }
+            ck(av_bsf_alloc(bsf, &bsfc));
+            avcodec_parameters_copy(bsfc->par_in, fmtc->streams[iVideoStream]->codecpar);
+            ck(av_bsf_init(bsfc));
+        }
+    }
+
+    AVFormatContext *CreateFormatContext(DataProvider *pDataProvider) {
+
+        AVFormatContext *ctx = NULL;
+        if (!(ctx = avformat_alloc_context())) {
+            LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__;
+            return NULL;
+        }
+
+        uint8_t *avioc_buffer = NULL;
+        int avioc_buffer_size = 8 * 1024 * 1024;
+        avioc_buffer = (uint8_t *)av_malloc(avioc_buffer_size);
+        if (!avioc_buffer) {
+            LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__;
+            return NULL;
+        }
+        avioc = avio_alloc_context(avioc_buffer, avioc_buffer_size, 0, pDataProvider, &ReadPacket, NULL, NULL);
+        if (!avioc) {
+            LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__;
+            return NULL;
+        }
+        ctx->pb = avioc;
+
+        ck(avformat_open_input(&ctx, NULL, NULL, NULL));
+        return ctx;
+    }
+
+    /**
+     *   @brief  Allocate and return AVFormatContext*.
+     *   @param  szFilePath - Filepath pointing to input stream.
+     *   @return Pointer to AVFormatContext
+     */
+    AVFormatContext *CreateFormatContext(const char *szFilePath) {
+        avformat_network_init();
+
+        AVFormatContext *ctx = NULL;
+        ck(avformat_open_input(&ctx, szFilePath, NULL, NULL));
+        return ctx;
+    }
+
+  public:
+    FFmpegDemuxer(const char *szFilePath, int64_t timescale = 1000 /*Hz*/)
+        : FFmpegDemuxer(CreateFormatContext(szFilePath), timescale) {}
+    FFmpegDemuxer(DataProvider *pDataProvider) : FFmpegDemuxer(CreateFormatContext(pDataProvider)) { avioc = fmtc->pb; }
+    ~FFmpegDemuxer() {
+
+        if (!fmtc) {
+            return;
+        }
+
+        if (pkt) {
+            av_packet_free(&pkt);
+        }
+        if (pktFiltered) {
+            av_packet_free(&pktFiltered);
+        }
+
+        if (bsfc) {
+            av_bsf_free(&bsfc);
+        }
+
+        avformat_close_input(&fmtc);
+
+        if (avioc) {
+            av_freep(&avioc->buffer);
+            av_freep(&avioc);
+        }
+
+        if (pDataWithHeader) {
+            av_free(pDataWithHeader);
+        }
+    }
+    AVCodecID GetVideoCodec() { return eVideoCodec; }
+    AVPixelFormat GetChromaFormat() { return eChromaFormat; }
+    int GetWidth() { return nWidth; }
+    int GetHeight() { return nHeight; }
+    int GetBitDepth() { return nBitDepth; }
+    int GetFrameSize() { return nWidth * (nHeight + nChromaHeight) * nBPP; }
+    /**
+     *   @brief  Read the next video packet, converting it to an elementary stream where needed.
+     *   @param  ppVideo - Receives a pointer to the packet data; the memory stays owned by the demuxer.
+     *   @param  pnVideoBytes - Receives the packet size in bytes (0 when no packet was read).
+     *   @param  pts - Optional; receives the presentation timestamp scaled by userTimeScale * timeBase.
+     *   @return true if a packet was produced; false without a format context, when av_read_frame fails or on OOM.
+     */
+    bool Demux(uint8_t **ppVideo, int *pnVideoBytes, int64_t *pts = NULL) {
+        if (!fmtc) {
+            return false;
+        }
+
+        *pnVideoBytes = 0;
+
+        if (pkt->data) {
+            av_packet_unref(pkt);
+        }
+
+        // Skip packets of other streams until a video packet arrives or the read fails.
+        int e = 0;
+        while ((e = av_read_frame(fmtc, pkt)) >= 0 && pkt->stream_index != iVideoStream) {
+            av_packet_unref(pkt);
+        }
+        if (e < 0) {
+            return false;
+        }
+
+        if (bMp4H264 || bMp4HEVC) {
+            if (pktFiltered->data) {
+                av_packet_unref(pktFiltered);
+            }
+            ck(av_bsf_send_packet(bsfc, pkt));
+            ck(av_bsf_receive_packet(bsfc, pktFiltered));
+            *ppVideo = pktFiltered->data;
+            *pnVideoBytes = pktFiltered->size;
+            if (pts)
+                *pts = (int64_t)(pktFiltered->pts * userTimeScale * timeBase);
+        } else {
+            const int extraDataSize = fmtc->streams[iVideoStream]->codecpar->extradata_size;
+            // Only the first MPEG-4 packet gets the header prepended, and only when there is extradata and
+            // the packet holds the 3 leading bytes that are dropped. Otherwise the packet is passed through;
+            // previously such a first packet was reported as a successful read of 0 bytes.
+            if (bMp4MPEG4 && (frameCount == 0) && extraDataSize > 0 && pkt->size >= 3) {
+                // extradata contains start codes 00 00 01. Subtract its size
+                const int nPayload = pkt->size - 3;
+                pDataWithHeader = (uint8_t *)av_malloc(extraDataSize + nPayload);
+                if (!pDataWithHeader) {
+                    LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__;
+                    return false;
+                }
+                memcpy(pDataWithHeader, fmtc->streams[iVideoStream]->codecpar->extradata, extraDataSize);
+                memcpy(pDataWithHeader + extraDataSize, pkt->data + 3, nPayload);
+                *ppVideo = pDataWithHeader;
+                *pnVideoBytes = extraDataSize + nPayload;
+            } else {
+                *ppVideo = pkt->data;
+                *pnVideoBytes = pkt->size;
+            }
+            if (pts)
+                *pts = (int64_t)(pkt->pts * userTimeScale * timeBase);
+        }
+
+        frameCount++;
+        return true;
+    }
+
+    static int ReadPacket(void *opaque, uint8_t *pBuf, int nBuf) {
+        return ((DataProvider *)opaque)->GetData(pBuf, nBuf);
+    }
+};
+
+inline cudaVideoCodec FFmpeg2NvCodecId(AVCodecID id) {
+    switch (id) {
+    case AV_CODEC_ID_MPEG1VIDEO:
+        return cudaVideoCodec_MPEG1;
+    case AV_CODEC_ID_MPEG2VIDEO:
+        return cudaVideoCodec_MPEG2;
+    case AV_CODEC_ID_MPEG4:
+        return cudaVideoCodec_MPEG4;
+    case AV_CODEC_ID_WMV3:
+    case AV_CODEC_ID_VC1:
+        return cudaVideoCodec_VC1;
+    case AV_CODEC_ID_H264:
+        return cudaVideoCodec_H264;
+    case AV_CODEC_ID_HEVC:
+        return cudaVideoCodec_HEVC;
+    case AV_CODEC_ID_VP8:
+        return cudaVideoCodec_VP8;
+    case AV_CODEC_ID_VP9:
+        return cudaVideoCodec_VP9;
+    case AV_CODEC_ID_MJPEG:
+        return cudaVideoCodec_JPEG;
+    case AV_CODEC_ID_AV1:
+        return cudaVideoCodec_AV1;
+    default:
+        return cudaVideoCodec_NumCodecs;
+    }
+}
diff --git a/third_party/Video_Codec_SDK/Samples/Utils/FFmpegStreamer.h b/third_party/Video_Codec_SDK/Samples/Utils/FFmpegStreamer.h
new file mode 100644
index 000000000..08e43e603
--- /dev/null
+++ b/third_party/Video_Codec_SDK/Samples/Utils/FFmpegStreamer.h
@@ -0,0 +1,148 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2023 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#pragma once
+
+#include <mutex>
+#include <thread>
+extern "C" {
+#include <libavformat/avformat.h>
+#include <libavutil/opt.h>
+#include <libswresample/swresample.h>
+};
+#include "Logger.h"
+
+using namespace std;
+
+extern simplelogger::Logger *logger;
+
+// Translates an FFmpeg error code into readable text via av_strerror(). When av_strerror() reports
+// failure the result is the generic message "Unknown error with code <N>".
+static string AvErrorToString(int av_error_code) {
+    // A zero-initialised stack buffer replaces the former calloc/free pair: there is no allocation
+    // failure path left (it used to return an empty string) and nothing to release on any return.
+    const unsigned buf_size = 1024U;
+    char err_string[buf_size] = {0};
+
+    // As before, the last byte is withheld from av_strerror() so the buffer always stays terminated.
+    if (0 != av_strerror(av_error_code, err_string, buf_size - 1)) {
+        stringstream ss;
+        ss << "Unknown error with code " << av_error_code;
+        return ss.str();
+    }
+
+    // Success: hand the message back as a std::string copy of the buffer.
+    return string(err_string);
+}
+
+// Muxes pre-encoded video packets into an FFmpeg output: "mpegts" for H.264/HEVC, "ivf" for AV1.
+class FFmpegStreamer {
+  private:
+    AVFormatContext *oc = NULL;
+    AVStream *vs = NULL;
+    int nFps = 0;
+    bool bReady = false; // set once the header is written; gates Stream() and the trailer
+
+  public:
+    // Opens szInFilePath for writing and emits the container header. Failures are logged and leave
+    // the object unusable (Stream() then returns false) instead of dereferencing a NULL context.
+    FFmpegStreamer(AVCodecID eCodecId, int nWidth, int nHeight, int nFps, const char *szInFilePath) : nFps(nFps) {
+        avformat_network_init();
+        int ret = 0;
+        if ((eCodecId == AV_CODEC_ID_H264) || (eCodecId == AV_CODEC_ID_HEVC))
+            ret = avformat_alloc_output_context2(&oc, NULL, "mpegts", NULL);
+        else if (eCodecId == AV_CODEC_ID_AV1)
+            ret = avformat_alloc_output_context2(&oc, NULL, "ivf", NULL);
+        // Any other codec skips both branches (ret == 0, oc == NULL), so oc is checked as well.
+        if (ret < 0 || !oc) {
+            LOG(ERROR) << "FFmpeg: failed to allocate an AVFormatContext. Error message: "
+                       << (ret < 0 ? AvErrorToString(ret) : string("unsupported codec id"));
+            return;
+        }
+        oc->url = av_strdup(szInFilePath);
+        LOG(INFO) << "Streaming destination: " << oc->url;
+        // Add video stream to oc
+        vs = avformat_new_stream(oc, NULL);
+        if (!vs) {
+            LOG(ERROR) << "FFMPEG: Could not alloc video stream";
+            return;
+        }
+        vs->id = 0;
+        // Set video parameters
+        AVCodecParameters *vpar = vs->codecpar;
+        vpar->codec_id = eCodecId;
+        vpar->codec_type = AVMEDIA_TYPE_VIDEO;
+        vpar->width = nWidth;
+        vpar->height = nHeight;
+        // Everything is ready. Now open the output stream.
+        if (avio_open(&oc->pb, oc->url, AVIO_FLAG_WRITE) < 0) {
+            LOG(ERROR) << "FFMPEG: Could not open " << oc->url;
+            return;
+        }
+        // Write the container header. Only a negative result is an error; success may be positive.
+        if (avformat_write_header(oc, NULL) < 0) {
+            LOG(ERROR) << "FFMPEG: avformat_write_header error!";
+            return;
+        }
+        bReady = true;
+    }
+    ~FFmpegStreamer() {
+        if (oc) {
+            if (bReady) // a trailer is only valid after a successfully written header
+                av_write_trailer(oc);
+            avio_close(oc->pb);
+            avformat_free_context(oc);
+        }
+    }
+
+    // Writes one packet (pts == dts == nPts in 1/nFps units); false if not initialised or on failure.
+    bool Stream(uint8_t *pData, int nBytes, int nPts) {
+        if (!bReady)
+            return false;
+        AVPacket *pkt = av_packet_alloc();
+        if (!pkt) {
+            LOG(ERROR) << "AVPacket allocation failed !";
+            return false;
+        }
+        pkt->pts = av_rescale_q(nPts, AVRational{1, nFps}, vs->time_base);
+        pkt->dts = pkt->pts; // No B-frames
+        pkt->stream_index = vs->index;
+        pkt->data = pData;
+        pkt->size = nBytes;
+        // Compare only when 5 bytes are really there; the old unconditional memcmp could overread.
+        if (pData && nBytes >= 5 && !memcmp(pData, "\x00\x00\x00\x01\x67", 5)) {
+            pkt->flags |= AV_PKT_FLAG_KEY;
+        }
+        // Write the compressed frame into the output
+        int ret = av_write_frame(oc, pkt);
+        av_write_frame(oc, NULL);
+        if (ret < 0) {
+            LOG(ERROR) << "FFMPEG: Error while writing video frame";
+        }
+        av_packet_free(&pkt);
+        return ret >= 0;
+    }
+};
diff --git a/third_party/Video_Codec_SDK/Samples/Utils/Logger.h b/third_party/Video_Codec_SDK/Samples/Utils/Logger.h
new file mode 100644
index 000000000..5d2f069cf
--- /dev/null
+++ b/third_party/Video_Codec_SDK/Samples/Utils/Logger.h
@@ -0,0 +1,235 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2023 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <fstream>
+#include <iostream>
+#include <mutex>
+#include <sstream>
+#include <string>
+#include <time.h>
+
+#ifdef _WIN32
+#include <windows.h>
+#include <winsock.h>
+
+#pragma comment(lib, "ws2_32.lib")
+#undef ERROR
+#else
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#define SOCKET int
+#define INVALID_SOCKET -1
+#endif
+
+enum LogLevel { TRACE, INFO, WARNING, ERROR, FATAL };
+
+namespace simplelogger {
+// Abstract log sink: owns the severity threshold, the line prefix and the mutex that serialises writers.
+class Logger {
+  public:
+    Logger(LogLevel level, bool bPrintTimeStamp) : level(level), bPrintTimeStamp(bPrintTimeStamp) {}
+    virtual ~Logger() {}
+    virtual std::ostream &GetStream() = 0;
+    virtual void FlushStream() {}
+    bool ShouldLogFor(LogLevel l) { return l >= level; }
+    // Formats "[LEVEL] " or "[LEVEL][hh:mm:ss] " into szLead (bounded by snprintf); szFile/nLine/szFunc are unused.
+    char *GetLead(LogLevel l, const char *szFile, int nLine, const char *szFunc) {
+        static const char *const szLevels[] = {"TRACE", "INFO", "WARN", "ERROR", "FATAL"};
+        const bool bKnown = (l >= TRACE && l <= FATAL);
+        const time_t t = time(NULL);
+        const struct tm *ptm = (bKnown && bPrintTimeStamp) ? localtime(&t) : NULL; // NULL also if it fails
+        if (ptm) {
+            snprintf(szLead, sizeof(szLead), "[%-5s][%02d:%02d:%02d] ", szLevels[l], ptm->tm_hour, ptm->tm_min,
+                     ptm->tm_sec);
+        } else {
+            snprintf(szLead, sizeof(szLead), "[%-5s] ", bKnown ? szLevels[l] : "?????");
+        }
+        return szLead;
+    }
+    void EnterCriticalSection() { mtx.lock(); }
+    void LeaveCriticalSection() { mtx.unlock(); }
+
+  private:
+    LogLevel level;
+    char szLead[80];
+    bool bPrintTimeStamp;
+    std::mutex mtx;
+};
+
+class LoggerFactory {
+  public:
+    static Logger *CreateFileLogger(std::string strFilePath, LogLevel level = INFO, bool bPrintTimeStamp = true) {
+        return new FileLogger(strFilePath, level, bPrintTimeStamp);
+    }
+    static Logger *CreateConsoleLogger(LogLevel level = INFO, bool bPrintTimeStamp = true) {
+        return new ConsoleLogger(level, bPrintTimeStamp);
+    }
+    static Logger *CreateUdpLogger(char *szHost, unsigned uPort, LogLevel level = INFO, bool bPrintTimeStamp = true) {
+        return new UdpLogger(szHost, uPort, level, bPrintTimeStamp);
+    }
+
+  private:
+    LoggerFactory() {}
+
+    // Logger that writes to a file. The stream is now a direct member, so it is closed and released
+    // with the logger; the previous heap-allocated std::ofstream was closed but never deleted.
+    class FileLogger : public Logger {
+      public:
+        FileLogger(std::string strFilePath, LogLevel level, bool bPrintTimeStamp)
+            : Logger(level, bPrintTimeStamp), fileOut(strFilePath.c_str()) {}
+        ~FileLogger() { fileOut.close(); }
+        std::ostream &GetStream() { return fileOut; }
+
+      private:
+        std::ofstream fileOut;
+    };
+
+    class ConsoleLogger : public Logger {
+      public:
+        ConsoleLogger(LogLevel level, bool bPrintTimeStamp) : Logger(level, bPrintTimeStamp) {}
+        std::ostream &GetStream() { return std::cout; }
+    };
+
+    class UdpLogger : public Logger {
+      private:
+        class UdpOstream : public std::ostream {
+          public:
+            UdpOstream(char *szHost, unsigned short uPort) : std::ostream(&sb), socket(INVALID_SOCKET) {
+#ifdef _WIN32
+                WSADATA w;
+                if (WSAStartup(0x0101, &w) != 0) {
+                    fprintf(stderr, "WSAStartup() failed.\n");
+                    return;
+                }
+#endif
+                socket = ::socket(AF_INET, SOCK_DGRAM, 0);
+                if (socket == INVALID_SOCKET) {
+#ifdef _WIN32
+                    WSACleanup();
+#endif
+                    fprintf(stderr, "socket() failed.\n");
+                    return;
+                }
+#ifdef _WIN32
+                unsigned int b1, b2, b3, b4;
+                sscanf(szHost, "%u.%u.%u.%u", &b1, &b2, &b3, &b4);
+                struct in_addr addr = {(unsigned char)b1, (unsigned char)b2, (unsigned char)b3, (unsigned char)b4};
+#else
+                struct in_addr addr = {inet_addr(szHost)};
+#endif
+                struct sockaddr_in s = {AF_INET, htons(uPort), addr};
+                server = s;
+            }
+            ~UdpOstream() throw() {
+                if (socket == INVALID_SOCKET) {
+                    return;
+                }
+#ifdef _WIN32
+                closesocket(socket);
+                WSACleanup();
+#else
+                close(socket);
+#endif
+            }
+            void Flush() {
+                if (sendto(socket, sb.str().c_str(), (int)sb.str().length() + 1, 0, (struct sockaddr *)&server,
+                           (int)sizeof(sockaddr_in)) == -1) {
+                    fprintf(stderr, "sendto() failed.\n");
+                }
+                sb.str("");
+            }
+
+          private:
+            std::stringbuf sb;
+            SOCKET socket;
+            struct sockaddr_in server;
+        };
+
+      public:
+        UdpLogger(char *szHost, unsigned uPort, LogLevel level, bool bPrintTimeStamp)
+            : Logger(level, bPrintTimeStamp), udpOut(szHost, (unsigned short)uPort) {}
+        UdpOstream &GetStream() { return udpOut; }
+        virtual void FlushStream() { udpOut.Flush(); }
+
+      private:
+        UdpOstream udpOut;
+    };
+};
+
+class LogTransaction {
+  public:
+    LogTransaction(Logger *pLogger, LogLevel level, const char *szFile, const int nLine, const char *szFunc)
+        : pLogger(pLogger), level(level) {
+        if (!pLogger) {
+            std::cout << "[-----] ";
+            return;
+        }
+        if (!pLogger->ShouldLogFor(level)) {
+            return;
+        }
+        pLogger->EnterCriticalSection();
+        pLogger->GetStream() << pLogger->GetLead(level, szFile, nLine, szFunc);
+    }
+    ~LogTransaction() {
+        if (!pLogger) {
+            std::cout << std::endl;
+            return;
+        }
+        if (!pLogger->ShouldLogFor(level)) {
+            return;
+        }
+        pLogger->GetStream() << std::endl;
+        pLogger->FlushStream();
+        pLogger->LeaveCriticalSection();
+        if (level == FATAL) {
+            exit(1);
+        }
+    }
+    std::ostream &GetStream() {
+        if (!pLogger) {
+            return std::cout;
+        }
+        if (!pLogger->ShouldLogFor(level)) {
+            return ossNull;
+        }
+        return pLogger->GetStream();
+    }
+
+  private:
+    Logger *pLogger;
+    LogLevel level;
+    std::ostringstream ossNull;
+};
+
+} // namespace simplelogger
+
+extern simplelogger::Logger *logger;
+#define LOG(level) simplelogger::LogTransaction(logger, level, __FILE__, __LINE__, __FUNCTION__).GetStream()
diff --git a/third_party/Video_Codec_SDK/Samples/Utils/NvCodecUtils.h b/third_party/Video_Codec_SDK/Samples/Utils/NvCodecUtils.h
new file mode 100644
index 000000000..065a7cd9b
--- /dev/null
+++ b/third_party/Video_Codec_SDK/Samples/Utils/NvCodecUtils.h
@@ -0,0 +1,547 @@
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2023 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+//---------------------------------------------------------------------------
+//! \file NvCodecUtils.h
+//! \brief Miscellaneous classes and error checking functions.
+//!
+//! Used by Transcode/Encode samples apps for reading input files, mutithreading, performance measurement or colorspace
+//! conversion while decoding.
+//---------------------------------------------------------------------------
+
+#pragma once
+#include "Logger.h"
+#include <assert.h>
+#include <chrono>
+#include <condition_variable>
+#include <iomanip>
+#include <ios>
+#include <list>
+#include <sstream>
+#include <stdint.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <thread>
+#include <vector>
+
+extern simplelogger::Logger *logger;
+
+#ifdef __cuda_cuda_h__
+inline bool check(CUresult e, int iLine, const char *szFile) {
+    if (e == CUDA_SUCCESS)
+        return true;
+    const char *szErrName = NULL;
+    cuGetErrorName(e, &szErrName); // leaves the name NULL for codes it does not recognise
+    LOG(FATAL) << "CUDA driver API error " << (szErrName ? szErrName : "UNKNOWN") << " at line " << iLine
+               << " in file " << szFile;
+    return false;
+}
+#endif
+
+#ifdef __CUDA_RUNTIME_H__
+inline bool check(cudaError_t e, int iLine, const char *szFile) {
+    if (e != cudaSuccess) {
+        LOG(FATAL) << "CUDA runtime API error " << cudaGetErrorName(e) << " at line " << iLine << " in file " << szFile;
+        return false;
+    }
+    return true;
+}
+#endif
+
+#ifdef _NV_ENCODEAPI_H_
+inline bool check(NVENCSTATUS e, int iLine, const char *szFile) {
+    static const char *const aszErrName[] = { // names indexed by the numeric NVENCSTATUS value
+        "NV_ENC_SUCCESS",
+        "NV_ENC_ERR_NO_ENCODE_DEVICE",
+        "NV_ENC_ERR_UNSUPPORTED_DEVICE",
+        "NV_ENC_ERR_INVALID_ENCODERDEVICE",
+        "NV_ENC_ERR_INVALID_DEVICE",
+        "NV_ENC_ERR_DEVICE_NOT_EXIST",
+        "NV_ENC_ERR_INVALID_PTR",
+        "NV_ENC_ERR_INVALID_EVENT",
+        "NV_ENC_ERR_INVALID_PARAM",
+        "NV_ENC_ERR_INVALID_CALL",
+        "NV_ENC_ERR_OUT_OF_MEMORY",
+        "NV_ENC_ERR_ENCODER_NOT_INITIALIZED",
+        "NV_ENC_ERR_UNSUPPORTED_PARAM",
+        "NV_ENC_ERR_LOCK_BUSY",
+        "NV_ENC_ERR_NOT_ENOUGH_BUFFER",
+        "NV_ENC_ERR_INVALID_VERSION",
+        "NV_ENC_ERR_MAP_FAILED",
+        "NV_ENC_ERR_NEED_MORE_INPUT",
+        "NV_ENC_ERR_ENCODER_BUSY",
+        "NV_ENC_ERR_EVENT_NOT_REGISTERED",
+        "NV_ENC_ERR_GENERIC",
+        "NV_ENC_ERR_INCOMPATIBLE_CLIENT_KEY",
+        "NV_ENC_ERR_UNIMPLEMENTED",
+        "NV_ENC_ERR_RESOURCE_REGISTER_FAILED",
+        "NV_ENC_ERR_RESOURCE_NOT_REGISTERED",
+        "NV_ENC_ERR_RESOURCE_NOT_MAPPED",
+    };
+    if (e == NV_ENC_SUCCESS) // anything else is logged at FATAL level, with the table lookup bounds-checked
+        return true;
+    const char *szErr = (size_t)e < sizeof(aszErrName) / sizeof(aszErrName[0]) ? aszErrName[e] : "UNKNOWN";
+    LOG(FATAL) << "NVENC error " << szErr << " at line " << iLine << " in file " << szFile;
+    return false;
+}
+#endif
+
+#ifdef _WINERROR_
+inline bool check(HRESULT e, int iLine, const char *szFile) {
+    if (e != S_OK) {
+        std::stringstream stream;
+        stream << std::hex << std::uppercase << e;
+        LOG(FATAL) << "HRESULT error 0x" << stream.str() << " at line " << iLine << " in file " << szFile;
+        return false;
+    }
+    return true;
+}
+#endif
+
+#if defined(__gl_h_) || defined(__GL_H__)
+inline bool check(GLenum e, int iLine, const char *szFile) {
+    if (e != 0) {
+        LOG(ERROR) << "GLenum error " << e << " at line " << iLine << " in file " << szFile;
+        return false;
+    }
+    return true;
+}
+#endif
+
+inline bool check(int e, int iLine, const char *szFile) {
+    if (e < 0) {
+        LOG(ERROR) << "General error " << e << " at line " << iLine << " in file " << szFile;
+        return false;
+    }
+    return true;
+}
+
+#define ck(call) check(call, __LINE__, __FILE__)
+#define MAKE_FOURCC(ch0, ch1, ch2, ch3)                                                                                \
+    ((uint32_t)(uint8_t)(ch0) | ((uint32_t)(uint8_t)(ch1) << 8) | ((uint32_t)(uint8_t)(ch2) << 16) |                   \
+     ((uint32_t)(uint8_t)(ch3) << 24))
+
+/**
+ * @brief Wrapper class around std::thread
+ */
+class NvThread {
+  public:
+    NvThread() = default;
+    NvThread(const NvThread &) = delete;
+    NvThread &operator=(const NvThread &other) = delete;
+    NvThread(std::thread &&thread) : t(std::move(thread)) {}
+    NvThread(NvThread &&thread) : t(std::move(thread.t)) {}
+    // std::thread::operator= calls std::terminate() if the target is still joinable, so the thread
+    // currently owned is joined before it is replaced. Self-assignment is a no-op.
+    NvThread &operator=(NvThread &&other) {
+        if (this != &other) {
+            join();
+            t = std::move(other.t);
+        }
+        return *this;
+    }
+    ~NvThread() { join(); } // never destroy a joinable std::thread
+    // Waits for the owned thread if there is one; calling it again afterwards does nothing.
+    void join() {
+        if (t.joinable())
+            t.join();
+    }
+
+  private:
+    std::thread t;
+};
+
+#ifndef _WIN32
+#define _stricmp strcasecmp
+#define _stat64 stat64
+#endif
+
+/**
+ * @brief Utility class to allocate buffer memory. Helps avoid I/O during the encode/decode loop in case of performance
+ * tests.
+ */
+class BufferedFileReader {
+  public:
+    /**
+     * @brief Constructor function to allocate appropriate memory and copy file contents into it
+     * @param szFileName - File to load.
+     * @param bPartial - If allocation fails, keep retrying with a buffer 10% smaller instead of giving up.
+     */
+    BufferedFileReader(const char *szFileName, bool bPartial = false) {
+        struct _stat64 st;
+        if (_stat64(szFileName, &st) != 0) {
+            return;
+        }
+
+        nSize = st.st_size;
+        while (nSize) {
+            try {
+                pBuf = new uint8_t[(size_t)nSize];
+                if (nSize != st.st_size) {
+                    LOG(WARNING) << "File is too large - only " << std::setprecision(4) << 100.0 * nSize / st.st_size
+                                 << "% is loaded";
+                }
+                break;
+            } catch (const std::bad_alloc &) {
+                if (!bPartial) {
+                    LOG(ERROR) << "Failed to allocate memory in BufferedReader";
+                    return;
+                }
+                // Shrink in 64 bits; the former uint32_t cast wrapped for sizes of 4 GiB and above.
+                nSize = (uint64_t)(nSize * 0.9);
+            }
+        }
+
+        std::ifstream fpIn(szFileName, std::ifstream::in | std::ifstream::binary);
+        if (!fpIn) {
+            LOG(ERROR) << "Unable to open input file: " << szFileName;
+            // Drop the buffer so GetBuffer() cannot hand out memory that was never filled.
+            delete[] pBuf;
+            pBuf = NULL;
+            return;
+        }
+        std::streamsize nRead = fpIn.read(reinterpret_cast<char *>(pBuf), nSize).gcount();
+        fpIn.close();
+        assert(nRead == nSize);
+        nSize = (uint64_t)nRead; // with asserts compiled out, still report only the bytes actually read
+    }
+    ~BufferedFileReader() { delete[] pBuf; } // delete[] on NULL is a no-op
+    /** @brief Expose the loaded bytes; returns false (outputs untouched) when nothing is loaded. */
+    bool GetBuffer(uint8_t **ppBuf, uint64_t *pnSize) {
+        if (!pBuf) {
+            return false;
+        }
+        *ppBuf = pBuf;
+        *pnSize = nSize;
+        return true;
+    }
+
+  private:
+    uint8_t *pBuf = NULL;
+    uint64_t nSize = 0;
+};
+
+/**
+ * @brief Template class to facilitate color space conversion
+ */
+template <typename T> class YuvConverter {
+  public:
+    // Allocates scratch space for one chroma plane of ceil(nWidth / 2) x ceil(nHeight / 2) samples.
+    YuvConverter(int nWidth, int nHeight) : nWidth(nWidth), nHeight(nHeight) {
+        pQuad = new T[((nWidth + 1) / 2) * ((nHeight + 1) / 2)];
+    }
+    ~YuvConverter() { delete[] pQuad; }
+    // In place: turns the separate U and V planes stored after the luma plane of pFrame into a single
+    // interleaved UV plane. nPitch is the luma row length in samples; 0 selects nWidth.
+    void PlanarToUVInterleaved(T *pFrame, int nPitch = 0) {
+        if (nPitch == 0) {
+            nPitch = nWidth;
+        }
+        const int nHalfW = (nWidth + 1) / 2, nHalfH = (nHeight + 1) / 2, nHalfPitch = (nPitch + 1) / 2;
+        const int nSizePlaneU = nHalfPitch * nHalfH;
+        // Save the U plane first: the interleaved output overwrites the memory it occupies.
+        T *puv = pFrame + nPitch * nHeight;
+        if (nPitch == nWidth) {
+            memcpy(pQuad, puv, nSizePlaneU * sizeof(T));
+        } else {
+            for (int row = 0; row < nHalfH; row++) {
+                memcpy(pQuad + nHalfW * row, puv + nHalfPitch * row, nHalfW * sizeof(T));
+            }
+        }
+        const T *pv = puv + nSizePlaneU;
+        for (int row = 0; row < nHalfH; row++) {
+            T *pDst = puv + row * nPitch;
+            for (int col = 0; col < nHalfW; col++) {
+                pDst[col * 2] = pQuad[row * nHalfW + col];
+                pDst[col * 2 + 1] = pv[row * nHalfPitch + col];
+            }
+        }
+    }
+
+    // In place: splits the interleaved UV plane stored after the luma plane of pFrame into a U plane
+    // followed by a V plane. nPitch is the luma row length in samples; 0 selects nWidth.
+    void UVInterleavedToPlanar(T *pFrame, int nPitch = 0) {
+        if (nPitch == 0) {
+            nPitch = nWidth;
+        }
+        const int nHalfW = (nWidth + 1) / 2, nHalfH = (nHeight + 1) / 2, nHalfPitch = (nPitch + 1) / 2;
+        const int nSizePlaneV = nHalfPitch * nHalfH;
+        T *puv = pFrame + nPitch * nHeight, *pu = puv, *pv = puv + nSizePlaneV;
+        // U samples go straight to their final position while the V samples are parked in pQuad.
+        for (int row = 0; row < nHalfH; row++) {
+            const T *pSrc = puv + row * nPitch;
+            for (int col = 0; col < nHalfW; col++) {
+                pu[row * nHalfPitch + col] = pSrc[col * 2];
+                pQuad[row * nHalfW + col] = pSrc[col * 2 + 1];
+            }
+        }
+        if (nPitch == nWidth) {
+            memcpy(pv, pQuad, nSizePlaneV * sizeof(T));
+        } else {
+            for (int row = 0; row < nHalfH; row++) {
+                memcpy(pv + nHalfPitch * row, pQuad + nHalfW * row, nHalfW * sizeof(T));
+            }
+        }
+    }
+
+  private:
+    T *pQuad;
+    int nWidth, nHeight;
+};
+
+/**
+ * @brief Class for writing IVF format header for AV1 codec
+ */
+class IVFUtils {
+  public:
+    // Appends the 32-byte IVF file header to vPacket. Every multi-byte field is stored little-endian;
+    // width and height occupy 16 bits each, so only their low 16 bits are written.
+    void WriteFileHeader(std::vector<uint8_t> &vPacket, uint32_t nFourCC, uint32_t nWidth, uint32_t nHeight,
+                         uint32_t nFrameRateNum, uint32_t nFrameRateDen, uint32_t nFrameCnt) {
+        char header[32] = {'D', 'K', 'I', 'F'}; // signature; the remaining bytes are filled in below
+
+        mem_put_le16(header + 4, 0);              // version
+        mem_put_le16(header + 6, 32);             // header size
+        mem_put_le32(header + 8, nFourCC);        // fourcc
+        mem_put_le16(header + 12, nWidth);        // width
+        mem_put_le16(header + 14, nHeight);       // height
+        mem_put_le32(header + 16, nFrameRateNum); // rate
+        mem_put_le32(header + 20, nFrameRateDen); // scale
+        mem_put_le32(header + 24, nFrameCnt);     // length
+        mem_put_le32(header + 28, 0);             // unused
+
+        vPacket.insert(vPacket.end(), header, header + sizeof(header));
+    }
+
+    // Appends the 12-byte IVF frame header: the frame size as 32 bits, then pts as its low and high
+    // 32-bit halves, all little-endian.
+    void WriteFrameHeader(std::vector<uint8_t> &vPacket, size_t nFrameSize, int64_t pts) {
+        char header[12];
+
+        mem_put_le32(header, (int)nFrameSize);
+        mem_put_le32(header + 4, (int)(pts & 0xFFFFFFFF));
+        mem_put_le32(header + 8, (int)(pts >> 32));
+
+        vPacket.insert(vPacket.end(), header, header + sizeof(header));
+    }
+
+  private:
+    // Stores the nBytes least significant bytes of val at vmem, lowest byte first.
+    static inline void put_le(void *vmem, int val, int nBytes) {
+        unsigned char *mem = (unsigned char *)vmem;
+        for (int i = 0; i < nBytes; i++) {
+            mem[i] = (unsigned char)((val >> (8 * i)) & 0xff);
+        }
+    }
+
+    // Fixed-width wrappers used by the two header writers.
+    static inline void mem_put_le32(void *vmem, int val) { put_le(vmem, val, 4); }
+
+    static inline void mem_put_le16(void *vmem, int val) { put_le(vmem, val, 2); }
+};
+
+/**
+ * @brief Utility class to measure elapsed time in seconds between the block of executed code
+ */
+class StopWatch {
+  public:
+    // Remembers the current time as the start of the measured interval.
+    void Start() { t0 = std::chrono::high_resolution_clock::now(); }
+    // Seconds elapsed since Start(), measured at nanosecond granularity. The stored start is not changed.
+    double Stop() {
+        const auto elapsed = std::chrono::high_resolution_clock::now() - t0;
+        return std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count() / 1.0e9;
+    }
+
+  private:
+    std::chrono::high_resolution_clock::time_point t0;
+};
+
+// Thread-safe FIFO with optional capacity: push_back() blocks while full, pop_front()/front() block
+// while empty. A capacity of 0, which is now also the default, means unbounded.
+template <typename T> class ConcurrentQueue {
+  public:
+    ConcurrentQueue() {}
+    ConcurrentQueue(size_t size) : maxSize(size) {}
+    ConcurrentQueue(const ConcurrentQueue &) = delete;
+    ConcurrentQueue &operator=(const ConcurrentQueue &) = delete;
+
+    void setSize(size_t s) { maxSize = s; }
+
+    // Appends a copy of value, waiting first for free space when a capacity is set.
+    void push_back(const T &value) {
+        // Do not use a std::lock_guard here. We will need to explicitly
+        // unlock before notifying as the other waiting threads will
+        // automatically try to acquire mutex once they wake up
+        std::unique_lock<std::mutex> lock(m_mutex);
+
+        while (full()) {
+            m_cond.wait(lock);
+        }
+
+        m_List.push_back(value);
+        lock.unlock();
+        // Producers and consumers share one condition variable, and the old "only when the queue was
+        // empty" notify_one could leave a second consumer asleep although items were queued. Waking
+        // every waiter after each change is always correct: each one re-checks its own predicate.
+        m_cond.notify_all();
+    }
+
+    // Removes and returns the oldest element, waiting until one is available.
+    T pop_front() {
+        std::unique_lock<std::mutex> lock(m_mutex);
+
+        while (m_List.empty()) {
+            m_cond.wait(lock);
+        }
+        T data = std::move(m_List.front());
+        m_List.pop_front();
+
+        lock.unlock();
+        m_cond.notify_all(); // see push_back() for why every waiter is woken
+        return data;
+    }
+
+    // Returns a copy of the oldest element without removing it, waiting until one is available.
+    T front() {
+        std::unique_lock<std::mutex> lock(m_mutex);
+
+        while (m_List.empty()) {
+            m_cond.wait(lock);
+        }
+
+        return m_List.front();
+    }
+
+    size_t size() {
+        std::unique_lock<std::mutex> lock(m_mutex);
+        return m_List.size();
+    }
+
+    bool empty() {
+        std::unique_lock<std::mutex> lock(m_mutex);
+        return m_List.empty();
+    }
+    // Drops all queued elements and wakes producers that were blocked on a full queue.
+    void clear() {
+        std::unique_lock<std::mutex> lock(m_mutex);
+        m_List.clear();
+        lock.unlock();
+        m_cond.notify_all();
+    }
+
+  private:
+    // Caller must hold m_mutex. ">=" keeps the queue bounded even if setSize() shrinks the limit.
+    bool full() { return maxSize > 0 && m_List.size() >= maxSize; }
+
+  private:
+    std::list<T> m_List;
+    std::mutex m_mutex;
+    std::condition_variable m_cond;
+    size_t maxSize = 0; // was left uninitialised by the default constructor
+};
+
+inline void CheckInputFile(const char *szInFilePath) {
+    std::ifstream fpIn(szInFilePath, std::ios::in | std::ios::binary);
+    if (fpIn.fail()) {
+        std::ostringstream err;
+        err << "Unable to open input file: " << szInFilePath << std::endl;
+        throw std::invalid_argument(err.str());
+    }
+}
+
+inline void ValidateResolution(int nWidth, int nHeight) {
+
+    if (nWidth <= 0 || nHeight <= 0) {
+        std::ostringstream err;
+        err << "Please specify positive non zero resolution as -s WxH. Current resolution is " << nWidth << "x"
+            << nHeight << std::endl;
+        throw std::invalid_argument(err.str());
+    }
+}
+
+template <class COLOR32>
+void Nv12ToColor32(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                   int iMatrix = 0);
+template <class COLOR64>
+void Nv12ToColor64(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                   int iMatrix = 0);
+
+template <class COLOR32>
+void P016ToColor32(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                   int iMatrix = 4);
+template <class COLOR64>
+void P016ToColor64(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                   int iMatrix = 4);
+
+template <class COLOR32>
+void YUV444ToColor32(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                     int iMatrix = 0);
+template <class COLOR64>
+void YUV444ToColor64(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                     int iMatrix = 0);
+
+template <class COLOR32>
+void YUV444P16ToColor32(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                        int iMatrix = 4);
+template <class COLOR64>
+void YUV444P16ToColor64(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight,
+                        int iMatrix = 4);
+
+template <class COLOR32>
+void Nv12ToColorPlanar(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight,
+                       int iMatrix = 0);
+template <class COLOR32>
+void P016ToColorPlanar(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight,
+                       int iMatrix = 4);
+
+template <class COLOR32>
+void YUV444ToColorPlanar(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight,
+                         int iMatrix = 0);
+template <class COLOR32>
+void YUV444P16ToColorPlanar(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight,
+                            int iMatrix = 4);
+
+void Bgra64ToP016(uint8_t *dpBgra, int nBgraPitch, uint8_t *dpP016, int nP016Pitch, int nWidth, int nHeight,
+                  int iMatrix = 4);
+
+void ConvertUInt8ToUInt16(uint8_t *dpUInt8, uint16_t *dpUInt16, int nSrcPitch, int nDestPitch, int nWidth, int nHeight);
+void ConvertUInt16ToUInt8(uint16_t *dpUInt16, uint8_t *dpUInt8, int nSrcPitch, int nDestPitch, int nWidth, int nHeight);
+
+void ResizeNv12(unsigned char *dpDstNv12, int nDstPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrcNv12,
+                int nSrcPitch, int nSrcWidth, int nSrcHeight, unsigned char *dpDstNv12UV = nullptr);
+void ResizeP016(unsigned char *dpDstP016, int nDstPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrcP016,
+                int nSrcPitch, int nSrcWidth, int nSrcHeight, unsigned char *dpDstP016UV = nullptr);
+
+void ScaleYUV420(unsigned char *dpDstY, unsigned char *dpDstU, unsigned char *dpDstV, int nDstPitch,
+                 int nDstChromaPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrcY, unsigned char *dpSrcU,
+                 unsigned char *dpSrcV, int nSrcPitch, int nSrcChromaPitch, int nSrcWidth, int nSrcHeight,
+                 bool bSemiplanar);
+
+#ifdef __cuda_cuda_h__
+void ComputeCRC(uint8_t *pBuffer, uint32_t *crcValue, CUstream_st *outputCUStream);
+#endif

From d246bab430adeb461072918a551b2e2b68c9bce5 Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Mon, 23 Oct 2023 11:21:17 +0800
Subject: [PATCH 29/33] Dockerfile - update mlc version into 3.10 for cuda and
 rocm dockerfiles (#562)

**Description**
Update the mlc version to 3.10 in the cuda and rocm dockerfiles to be
consistent with the cuda12 dockerfile.

Co-authored-by: yukirora <yuting.jiang@microsoft.com>
---
 dockerfile/cuda11.1.1.dockerfile | 4 ++--
 dockerfile/rocm5.0.x.dockerfile  | 4 ++--
 dockerfile/rocm5.1.x.dockerfile  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile
index d7feb2baa..6b3a2acb2 100644
--- a/dockerfile/cuda11.1.1.dockerfile
+++ b/dockerfile/cuda11.1.1.dockerfile
@@ -13,7 +13,7 @@ FROM nvcr.io/nvidia/pytorch:20.12-py3
 #   - HPC-X: v2.8.3
 #   - NCCL RDMA SHARP plugins: 7cccbc1
 # Intel:
-#   - mlc: v3.9a
+#   - mlc: v3.10
 
 LABEL maintainer="SuperBench"
 
@@ -111,7 +111,7 @@ RUN cd /tmp && \
 
 # Install Intel MLC
 RUN cd /tmp && \
-    wget -q https://downloadmirror.intel.com/736634/mlc_v3.9a.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/763324/mlc_v3.10.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz
diff --git a/dockerfile/rocm5.0.x.dockerfile b/dockerfile/rocm5.0.x.dockerfile
index 6830263ce..02b33c3f9 100644
--- a/dockerfile/rocm5.0.x.dockerfile
+++ b/dockerfile/rocm5.0.x.dockerfile
@@ -17,7 +17,7 @@ FROM ${BASE_IMAGE}
 # Mellanox:
 #   - OFED: 5.2-2.2.3.0
 # Intel:
-#   - mlc: v3.9a
+#   - mlc: v3.10
 
 LABEL maintainer="SuperBench"
 
@@ -97,7 +97,7 @@ RUN cd /tmp && \
 
 # Install Intel MLC
 RUN cd /tmp && \
-    wget -q https://downloadmirror.intel.com/736634/mlc_v3.9a.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/763324/mlc_v3.10.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz
diff --git a/dockerfile/rocm5.1.x.dockerfile b/dockerfile/rocm5.1.x.dockerfile
index 5e4b118e0..292293a3e 100644
--- a/dockerfile/rocm5.1.x.dockerfile
+++ b/dockerfile/rocm5.1.x.dockerfile
@@ -16,7 +16,7 @@ FROM ${BASE_IMAGE}
 # Mellanox:
 #   - OFED: 5.2-2.2.3.0
 # Intel:
-#   - mlc: v3.9a
+#   - mlc: v3.10
 
 LABEL maintainer="SuperBench"
 
@@ -109,7 +109,7 @@ RUN cd /tmp && \
 
 # Install Intel MLC
 RUN cd /tmp && \
-    wget -q https://downloadmirror.intel.com/736634/mlc_v3.9a.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/763324/mlc_v3.10.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz

From 07477c3baea7c8acf4f65c93c7d7d1069f4f7081 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sun, 5 Nov 2023 11:35:49 +0000
Subject: [PATCH 30/33] Bump postcss from 8.3.5 to 8.4.31 in /website (#564)

Bumps [postcss](https://github.com/postcss/postcss) from 8.3.5 to 8.4.31.
- [Release notes](https://github.com/postcss/postcss/releases)
- [Changelog](https://github.com/postcss/postcss/blob/main/CHANGELOG.md)
- [Commits](postcss/postcss@8.3.5...8.4.31)

---
updated-dependencies:
- dependency-name: postcss
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 website/package-lock.json | 39 +++++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/website/package-lock.json b/website/package-lock.json
index a2e3b219d..b7eee3fe0 100644
--- a/website/package-lock.json
+++ b/website/package-lock.json
@@ -7291,11 +7291,6 @@
       "integrity": "sha512-2ZTgtl0nJsO0KQCjEpxcIr5D+Yv90plTitZt9JBfQvVJDS5seMl3FOvsh3+9CoYWXf/1l5OaZzzF6nDm4cagaQ==",
       "optional": true
     },
-    "nanoid": {
-      "version": "3.2.0",
-      "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.2.0.tgz",
-      "integrity": "sha512-fmsZYa9lpn69Ad5eDn7FMcnnSR+8R34W9qJEijxYhTbfOWzr22n1QxCMzXLK+ODyW2973V3Fux959iQoUxzUIA=="
-    },
     "nanomatch": {
       "version": "1.2.13",
       "resolved": "https://registry.npmjs.org/nanomatch/-/nanomatch-1.2.13.tgz",
@@ -7765,6 +7760,11 @@
       "resolved": "https://registry.npmjs.org/path-type/-/path-type-4.0.0.tgz",
       "integrity": "sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw=="
     },
+    "picocolors": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.0.0.tgz",
+      "integrity": "sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ=="
+    },
     "picomatch": {
       "version": "2.3.0",
       "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.0.tgz",
@@ -7870,13 +7870,25 @@
       "integrity": "sha1-AerA/jta9xoqbAL+q7jB/vfgDqs="
     },
     "postcss": {
-      "version": "8.3.5",
-      "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.3.5.tgz",
-      "integrity": "sha512-NxTuJocUhYGsMiMFHDUkmjSKT3EdH4/WbGF6GCi1NDGk+vbcUTun4fpbOqaPtD8IIsztA2ilZm2DhYCuyN58gA==",
+      "version": "8.4.31",
+      "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.31.tgz",
+      "integrity": "sha512-PS08Iboia9mts/2ygV3eLpY5ghnUcfLV/EXTOW1E2qYxJKGGBUtNjN76FYHnMs36RmARn41bC0AZmn+rR0OVpQ==",
       "requires": {
-        "colorette": "^1.2.2",
-        "nanoid": "^3.1.23",
-        "source-map-js": "^0.6.2"
+        "nanoid": "^3.3.6",
+        "picocolors": "^1.0.0",
+        "source-map-js": "^1.0.2"
+      },
+      "dependencies": {
+        "nanoid": {
+          "version": "3.3.6",
+          "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.6.tgz",
+          "integrity": "sha512-BGcqMMJuToF7i1rt+2PWSNVnWIkGCU78jBG3RxO/bZlnZPK2Cmi2QaffxGO/2RvWi9sL+FAiRiXMgsyxQ1DIDA=="
+        },
+        "source-map-js": {
+          "version": "1.0.2",
+          "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.0.2.tgz",
+          "integrity": "sha512-R0XvVJ9WusLiqTCEiGCmICCMplcCkIwwR11mOSD9CR5u+IXYdiseeEuXCVAjS54zqwkLcPNnmU4OeJ6tUrWhDw=="
+        }
       }
     },
     "postcss-calc": {
@@ -9677,11 +9689,6 @@
       "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.5.7.tgz",
       "integrity": "sha1-igOdLRAh0i0eoUyA2OpGi6LvP8w="
     },
-    "source-map-js": {
-      "version": "0.6.2",
-      "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-0.6.2.tgz",
-      "integrity": "sha512-/3GptzWzu0+0MBQFrDKzw/DvvMTUORvgY6k6jd/VS6iCR4RDTKWH6v6WPwQoUO8667uQEf9Oe38DxAYWY5F/Ug=="
-    },
     "source-map-resolve": {
       "version": "0.5.3",
       "resolved": "https://registry.npmjs.org/source-map-resolve/-/source-map-resolve-0.5.3.tgz",

From ce3737f98b27543c5d7ceb88259685a5736bc896 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 7 Nov 2023 10:36:42 +0800
Subject: [PATCH 31/33] Bump @babel/traverse from 7.14.5 to 7.23.2 in /website
 (#566)

Bumps [@babel/traverse](https://github.com/babel/babel/tree/HEAD/packages/babel-traverse) from 7.14.5 to 7.23.2.
- [Release notes](https://github.com/babel/babel/releases)
- [Changelog](https://github.com/babel/babel/blob/main/CHANGELOG.md)
- [Commits](https://github.com/babel/babel/commits/v7.23.2/packages/babel-traverse)

---
updated-dependencies:
- dependency-name: "@babel/traverse"
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 website/package-lock.json | 139 +++++++++++++++++++++++++++++++++++---
 1 file changed, 128 insertions(+), 11 deletions(-)

diff --git a/website/package-lock.json b/website/package-lock.json
index b7eee3fe0..7bf8c5310 100644
--- a/website/package-lock.json
+++ b/website/package-lock.json
@@ -271,6 +271,11 @@
         }
       }
     },
+    "@babel/helper-environment-visitor": {
+      "version": "7.22.20",
+      "resolved": "https://registry.npmjs.org/@babel/helper-environment-visitor/-/helper-environment-visitor-7.22.20.tgz",
+      "integrity": "sha512-zfedSIzFhat/gFhWfHtgWvlec0nqB9YEIVrpuwjruLlXfUSnA8cJB0miHKwqDnQ7d32aKo2xt88/xZptwxbfhA=="
+    },
     "@babel/helper-explode-assignable-expression": {
       "version": "7.14.5",
       "resolved": "https://registry.npmjs.org/@babel/helper-explode-assignable-expression/-/helper-explode-assignable-expression-7.14.5.tgz",
@@ -394,6 +399,11 @@
         "@babel/types": "^7.14.5"
       }
     },
+    "@babel/helper-string-parser": {
+      "version": "7.22.5",
+      "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.22.5.tgz",
+      "integrity": "sha512-mM4COjgZox8U+JcXQwPijIZLElkgEpO5rsERVDJTc2qfCDfERyob6k5WegS14SX18IIjv+XD+GrqNumY5JRCDw=="
+    },
     "@babel/helper-validator-identifier": {
       "version": "7.14.5",
       "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.14.5.tgz",
@@ -1268,19 +1278,126 @@
       }
     },
     "@babel/traverse": {
-      "version": "7.14.5",
-      "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.14.5.tgz",
-      "integrity": "sha512-G3BiS15vevepdmFqmUc9X+64y0viZYygubAMO8SvBmKARuF6CPSZtH4Ng9vi/lrWlZFGe3FWdXNy835akH8Glg==",
-      "requires": {
-        "@babel/code-frame": "^7.14.5",
-        "@babel/generator": "^7.14.5",
-        "@babel/helper-function-name": "^7.14.5",
-        "@babel/helper-hoist-variables": "^7.14.5",
-        "@babel/helper-split-export-declaration": "^7.14.5",
-        "@babel/parser": "^7.14.5",
-        "@babel/types": "^7.14.5",
+      "version": "7.23.2",
+      "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.23.2.tgz",
+      "integrity": "sha512-azpe59SQ48qG6nu2CzcMLbxUudtN+dOM9kDbUqGq3HXUJRlo7i8fvPoxQUzYgLZ4cMVmuZgm8vvBpNeRhd6XSw==",
+      "requires": {
+        "@babel/code-frame": "^7.22.13",
+        "@babel/generator": "^7.23.0",
+        "@babel/helper-environment-visitor": "^7.22.20",
+        "@babel/helper-function-name": "^7.23.0",
+        "@babel/helper-hoist-variables": "^7.22.5",
+        "@babel/helper-split-export-declaration": "^7.22.6",
+        "@babel/parser": "^7.23.0",
+        "@babel/types": "^7.23.0",
         "debug": "^4.1.0",
         "globals": "^11.1.0"
+      },
+      "dependencies": {
+        "@babel/code-frame": {
+          "version": "7.22.13",
+          "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.22.13.tgz",
+          "integrity": "sha512-XktuhWlJ5g+3TJXc5upd9Ks1HutSArik6jf2eAjYFyIOf4ej3RN+184cZbzDvbPnuTJIUhPKKJE3cIsYTiAT3w==",
+          "requires": {
+            "@babel/highlight": "^7.22.13",
+            "chalk": "^2.4.2"
+          }
+        },
+        "@babel/generator": {
+          "version": "7.23.0",
+          "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.23.0.tgz",
+          "integrity": "sha512-lN85QRR+5IbYrMWM6Y4pE/noaQtg4pNiqeNGX60eqOfo6gtEj6uw/JagelB8vVztSd7R6M5n1+PQkDbHbBRU4g==",
+          "requires": {
+            "@babel/types": "^7.23.0",
+            "@jridgewell/gen-mapping": "^0.3.2",
+            "@jridgewell/trace-mapping": "^0.3.17",
+            "jsesc": "^2.5.1"
+          }
+        },
+        "@babel/helper-function-name": {
+          "version": "7.23.0",
+          "resolved": "https://registry.npmjs.org/@babel/helper-function-name/-/helper-function-name-7.23.0.tgz",
+          "integrity": "sha512-OErEqsrxjZTJciZ4Oo+eoZqeW9UIiOcuYKRJA4ZAgV9myA+pOXhhmpfNCKjEH/auVfEYVFJ6y1Tc4r0eIApqiw==",
+          "requires": {
+            "@babel/template": "^7.22.15",
+            "@babel/types": "^7.23.0"
+          }
+        },
+        "@babel/helper-hoist-variables": {
+          "version": "7.22.5",
+          "resolved": "https://registry.npmjs.org/@babel/helper-hoist-variables/-/helper-hoist-variables-7.22.5.tgz",
+          "integrity": "sha512-wGjk9QZVzvknA6yKIUURb8zY3grXCcOZt+/7Wcy8O2uctxhplmUPkOdlgoNhmdVee2c92JXbf1xpMtVNbfoxRw==",
+          "requires": {
+            "@babel/types": "^7.22.5"
+          }
+        },
+        "@babel/helper-split-export-declaration": {
+          "version": "7.22.6",
+          "resolved": "https://registry.npmjs.org/@babel/helper-split-export-declaration/-/helper-split-export-declaration-7.22.6.tgz",
+          "integrity": "sha512-AsUnxuLhRYsisFiaJwvp1QF+I3KjD5FOxut14q/GzovUe6orHLesW2C7d754kRm53h5gqrz6sFl6sxc4BVtE/g==",
+          "requires": {
+            "@babel/types": "^7.22.5"
+          }
+        },
+        "@babel/helper-validator-identifier": {
+          "version": "7.22.20",
+          "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.22.20.tgz",
+          "integrity": "sha512-Y4OZ+ytlatR8AI+8KZfKuL5urKp7qey08ha31L8b3BwewJAoJamTzyvxPR/5D+KkdJCGPq/+8TukHBlY10FX9A=="
+        },
+        "@babel/highlight": {
+          "version": "7.22.20",
+          "resolved": "https://registry.npmjs.org/@babel/highlight/-/highlight-7.22.20.tgz",
+          "integrity": "sha512-dkdMCN3py0+ksCgYmGG8jKeGA/8Tk+gJwSYYlFGxG5lmhfKNoAy004YpLxpS1W2J8m/EK2Ew+yOs9pVRwO89mg==",
+          "requires": {
+            "@babel/helper-validator-identifier": "^7.22.20",
+            "chalk": "^2.4.2",
+            "js-tokens": "^4.0.0"
+          }
+        },
+        "@babel/parser": {
+          "version": "7.23.0",
+          "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.23.0.tgz",
+          "integrity": "sha512-vvPKKdMemU85V9WE/l5wZEmImpCtLqbnTvqDS2U1fJ96KrxoW7KrXhNsNCblQlg8Ck4b85yxdTyelsMUgFUXiw=="
+        },
+        "@babel/template": {
+          "version": "7.22.15",
+          "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.22.15.tgz",
+          "integrity": "sha512-QPErUVm4uyJa60rkI73qneDacvdvzxshT3kksGqlGWYdOTIUOwJ7RDUL8sGqslY1uXWSL6xMFKEXDS3ox2uF0w==",
+          "requires": {
+            "@babel/code-frame": "^7.22.13",
+            "@babel/parser": "^7.22.15",
+            "@babel/types": "^7.22.15"
+          }
+        },
+        "@babel/types": {
+          "version": "7.23.0",
+          "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.23.0.tgz",
+          "integrity": "sha512-0oIyUfKoI3mSqMvsxBdclDwxXKXAUA8v/apZbc+iSyARYou1o8ZGDxbUYyLFoW2arqS2jDGqJuZvv1d/io1axg==",
+          "requires": {
+            "@babel/helper-string-parser": "^7.22.5",
+            "@babel/helper-validator-identifier": "^7.22.20",
+            "to-fast-properties": "^2.0.0"
+          }
+        },
+        "@jridgewell/trace-mapping": {
+          "version": "0.3.20",
+          "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.20.tgz",
+          "integrity": "sha512-R8LcPeWZol2zR8mmH3JeKQ6QRCFb7XgUhV9ZlGhHLGyg4wpPiPZNQOOWhFZhxKw8u//yTbNGI42Bx/3paXEQ+Q==",
+          "requires": {
+            "@jridgewell/resolve-uri": "^3.1.0",
+            "@jridgewell/sourcemap-codec": "^1.4.14"
+          }
+        },
+        "chalk": {
+          "version": "2.4.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.2.tgz",
+          "integrity": "sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==",
+          "requires": {
+            "ansi-styles": "^3.2.1",
+            "escape-string-regexp": "^1.0.5",
+            "supports-color": "^5.3.0"
+          }
+        }
       }
     },
     "@babel/types": {

From c7800bb8e038baa103b8f6a14572238061b410f7 Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Tue, 14 Nov 2023 11:52:56 +0800
Subject: [PATCH 32/33] Bug Fix - remove cp ptx file command in gpu burn test 
 (#567)

**Description**
Remove the cp/rm of compare.ptx from the gpu burn test, since the command
is already run inside the self._args.bin_dir directory.


https://github.com/microsoft/superbenchmark/blob/d246bab430adeb461072918a551b2e2b68c9bce5/superbench/benchmarks/micro_benchmarks/micro_base.py#L183
---
 superbench/benchmarks/micro_benchmarks/gpu_burn_test.py | 6 +-----
 tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py | 4 ----
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py b/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py
index c5ef05eae..fba4ad2b3 100644
--- a/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py
+++ b/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py
@@ -66,12 +66,8 @@ def _preprocess(self):
         if self._args.tensor_core:
             command += ' -tc'
         command += ' {} '.format(self._args.time)
-        # copy compare.ptx which needs to be in the working directory
-        compare_copy = 'cp ' + self._args.bin_dir + '/compare.ptx ./'
-        # remove compare.ptx from working directory
-        compare_rm = 'rm ' + 'compare.ptx'
 
-        self._commands.append(compare_copy + ' && ' + command + ' && ' + compare_rm)
+        self._commands.append(command)
 
         return True
 
diff --git a/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py b/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py
index eff5af202..3ec352c4d 100644
--- a/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py
+++ b/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py
@@ -46,14 +46,10 @@ def test_gpu_burn(self, results):
         assert (benchmark._args.tensor_core)
 
         # Check command
-        compare_copy = 'cp ' + benchmark._args.bin_dir + '/compare.ptx ./'
-        compare_rm = 'rm ' + 'compare.ptx'
         assert (1 == len(benchmark._commands))
-        assert (benchmark._commands[0].startswith(compare_copy))
         assert ('-d' in benchmark._commands[0])
         assert ('-tc' in benchmark._commands[0])
         assert (str(time) in benchmark._commands[0])
-        assert (compare_rm in benchmark._commands[0])
 
         # Check results
         assert (benchmark._process_raw_result(0, results))

From f53d941a22fc0746e98ef3560a6799422be8fa47 Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Mon, 20 Nov 2023 11:21:20 +0800
Subject: [PATCH 33/33] Benchmarks: micro benchmarks - add int8 support for
 cublaslt function (#574)

**Description**
Add int8 support for the cublaslt function.
---
 superbench/benchmarks/micro_benchmarks/cublaslt_function.py | 2 +-
 .../micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu         | 5 +++++
 .../micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc        | 2 ++
 tests/benchmarks/micro_benchmarks/test_cublaslt_function.py | 6 +++---
 4 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/cublaslt_function.py b/superbench/benchmarks/micro_benchmarks/cublaslt_function.py
index 59733ea10..9bf3d99f3 100644
--- a/superbench/benchmarks/micro_benchmarks/cublaslt_function.py
+++ b/superbench/benchmarks/micro_benchmarks/cublaslt_function.py
@@ -23,7 +23,7 @@ def __init__(self, name, parameters=''):
         super().__init__(name, parameters)
 
         self._bin_name = 'cublaslt_gemm'
-        self._in_types = ['fp64', 'fp32', 'fp16', 'bf16', 'fp8e4m3', 'fp8e5m2']
+        self._in_types = ['fp64', 'fp32', 'fp16', 'bf16', 'fp8e4m3', 'fp8e5m2', 'int8']
 
     def mrange(self, start, stop=-1, multiplication_factor=2):
         """Range constructor with multiplication factor.
diff --git a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu
index 788b1989d..002b06447 100644
--- a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu
+++ b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu
@@ -16,6 +16,7 @@ using fp16 = half;
 using bf16 = nv_bfloat16;
 using fp8e4m3 = __nv_fp8_e4m3;
 using fp8e5m2 = __nv_fp8_e5m2;
+using int8 = int8_t;
 
 struct Args {
     int m = 16;
@@ -84,6 +85,8 @@ template <typename T> cudaDataType_t get_datatype() {
         return CUDA_R_8F_E4M3;
     if (std::is_same<T, fp8e5m2>::value)
         return CUDA_R_8F_E5M2;
+    if (std::is_same<T, int8>::value)
+        return CUDA_R_8I;
     throw std::invalid_argument("Unknown type");
 }
 
@@ -162,6 +165,8 @@ int main(int argc, char **argv) {
         run<fp8e4m3, fp8e4m3, fp16>(&args);
     else if (args.in_type == "fp8e5m2")
         run<fp8e5m2, fp8e4m3, fp16>(&args);
+    else if (args.in_type == "int8")
+        run<int8>(&args);
     else
         throw std::invalid_argument("Unknown type " + args.in_type);
 
diff --git a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc
index 4842c22d1..6ec5a101e 100644
--- a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc
+++ b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc
@@ -62,6 +62,8 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l
         gemm_compute_type = CUBLAS_COMPUTE_32F;
     if (a_type == CUDA_R_64F || b_type == CUDA_R_64F)
         gemm_compute_type = CUBLAS_COMPUTE_64F;
+    if (a_type == CUDA_R_8I)
+        gemm_compute_type = CUBLAS_COMPUTE_32I;
 
     cublasLtMatmulDesc_t op_desc = nullptr;
     CUBLAS_CHECK(cublasLtMatmulDescCreate(&op_desc, gemm_compute_type, CUDA_R_32F));
diff --git a/tests/benchmarks/micro_benchmarks/test_cublaslt_function.py b/tests/benchmarks/micro_benchmarks/test_cublaslt_function.py
index b504062a2..a6fae8f0e 100644
--- a/tests/benchmarks/micro_benchmarks/test_cublaslt_function.py
+++ b/tests/benchmarks/micro_benchmarks/test_cublaslt_function.py
@@ -63,15 +63,15 @@ def test_cublaslt_gemm_command_generation(self):
         (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
         benchmark = benchmark_cls(
             self.benchmark_name,
-            parameters='--batch 2:16:2 --shapes 2:4,4:8,8:32 32:128:4,128,128 --in_types fp16 fp32 fp64',
+            parameters='--batch 2:16:2 --shapes 2:4,4:8,8:32 32:128:4,128,128 --in_types fp16 fp32 fp64 int8',
         )
         self.assertTrue(benchmark._preprocess())
-        self.assertEqual(4 * (2 * 2 * 3 + 2) * 3, len(benchmark._commands))
+        self.assertEqual(4 * (2 * 2 * 3 + 2) * len(benchmark._args.in_types), len(benchmark._commands))
 
         def cmd(t, b, m, n, k):
             return f'{benchmark._CublasLtBenchmark__bin_path} -m {m} -n {n} -k {k} -b {b} -w 20 -i 50 -t {t}'
 
-        for _t in ['fp16', 'fp32', 'fp64']:
+        for _t in ['fp16', 'fp32', 'fp64', 'int8']:
             for _b in [2, 4, 8, 16]:
                 for _m in [2, 4]:
                     for _n in [4, 8]: