From 51761b3af172b4fc54ce0a3abc302e203d2bf44a Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Fri, 14 Apr 2023 20:57:55 +0800 Subject: [PATCH 01/33] Release - SuperBench v0.8.0 (#517) **Description** Cherry-pick bug fixes from v0.8.0 to main. **Major Revisions** * Monitor - Fix the cgroup version checking logic (#502) * Benchmark - Fix matrix size overflow issue in cuBLASLt GEMM (#503) * Fix wrong torch usage in communication wrapper for Distributed Inference Benchmark (#505) * Analyzer: Fix bug in python3.8 due to pandas api change (#504) * Bug - Fix bug to get metric from cmd when error happens (#506) * Monitor - Collect realtime GPU power when benchmarking (#507) * Add num_workers argument in model benchmark (#511) * Remove unreachable condition when write host list (#512) * Update cuda11.8 image to cuda12.1 based on nvcr23.03 (#513) * Doc - Fix wrong unit of cpu-memory-bw-latency in doc (#515) * Docs - Upgrade version and release note (#508) Co-authored-by: guoshzhao Co-authored-by: Ziyue Yang Co-authored-by: Yuting Jiang --- .github/workflows/build-image.yml | 6 +- ...uda11.8.dockerfile => cuda12.1.dockerfile} | 50 +++++------ docs/developer-guides/using-docker.mdx | 4 +- docs/getting-started/installation.mdx | 4 +- docs/getting-started/run-superbench.md | 2 +- docs/superbench-config.mdx | 2 +- .../benchmarks/micro-benchmarks.md | 24 +++--- docs/user-tutorial/container-images.mdx | 6 ++ docs/user-tutorial/data-diagnosis.md | 2 +- docs/user-tutorial/result-summary.md | 2 +- superbench/__init__.py | 2 +- superbench/analyzer/data_analysis.py | 14 +++- superbench/analyzer/data_diagnosis.py | 2 +- superbench/analyzer/result_summary.py | 4 +- .../cublaslt_gemm/cublaslt_gemm.cu | 15 ++-- .../cublaslt_gemm/cublaslt_utils.cc | 82 +++++++++---------- .../cublaslt_gemm/cublaslt_utils.h | 14 ++-- .../micro_benchmarks/cudnn_function.py | 22 +++-- .../micro_benchmarks/dist_inference.py | 2 +- .../benchmarks/model_benchmarks/model_base.py | 7 ++ 
.../model_benchmarks/pytorch_base.py | 2 +- superbench/common/utils/device_manager.py | 16 ++++ .../utils/gen_traffic_pattern_config.py | 21 +++-- superbench/config/amd_mi100_hpe.yaml | 2 +- superbench/config/amd_mi100_z53.yaml | 2 +- .../inference/standard_nc64as_t4_v3.yaml | 2 +- .../inference/standard_nc96ads_a100_v4.yaml | 2 +- .../inference/standard_nv18ads_a10_v5.yaml | 2 +- superbench/config/azure_ndmv4.yaml | 2 +- superbench/config/azure_ndv4.yaml | 2 +- superbench/config/default.yaml | 2 +- superbench/monitor/monitor.py | 22 ++--- superbench/monitor/record.py | 16 ++++ superbench/runner/runner.py | 3 +- .../model_benchmarks/test_model_base.py | 2 + tests/monitor/test_monitor.py | 4 +- tests/monitor/test_monitor_record.py | 9 ++ website/blog/2023-04-14-release-0-8.md | 44 ++++++++++ website/docusaurus.config.js | 2 +- website/package-lock.json | 2 +- website/package.json | 2 +- 41 files changed, 265 insertions(+), 162 deletions(-) rename dockerfile/{cuda11.8.dockerfile => cuda12.1.dockerfile} (81%) create mode 100644 website/blog/2023-04-14-release-0-8.md diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index 6e599e9c7..824418a6f 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -24,9 +24,9 @@ jobs: strategy: matrix: include: - - name: cuda11.8 - dockerfile: cuda11.8 - tags: superbench/main:cuda11.8 + - name: cuda12.1 + dockerfile: cuda12.1 + tags: superbench/main:cuda12.1 - name: cuda11.1.1 dockerfile: cuda11.1.1 tags: superbench/main:cuda11.1.1,superbench/superbench:latest diff --git a/dockerfile/cuda11.8.dockerfile b/dockerfile/cuda12.1.dockerfile similarity index 81% rename from dockerfile/cuda11.8.dockerfile rename to dockerfile/cuda12.1.dockerfile index 7615b60e3..4a257bf43 100644 --- a/dockerfile/cuda11.8.dockerfile +++ b/dockerfile/cuda12.1.dockerfile @@ -1,18 +1,18 @@ -FROM nvcr.io/nvidia/pytorch:22.12-py3 +FROM nvcr.io/nvidia/pytorch:23.03-py3 # OS: # - Ubuntu: 20.04 # - 
OpenMPI: 4.1.5a1 # - Docker Client: 20.10.8 # NVIDIA: -# - CUDA: 11.8.0 -# - cuDNN: 8.7.0.84 -# - NCCL: v2.15.5-1 +# - CUDA: 12.1.0 +# - cuDNN: 8.8.1.3 +# - NCCL: v2.17.1-1 # Mellanox: -# - OFED: 5.2-2.2.3.0 -# - HPC-X: v2.8.3 +# - OFED: 5.2-2.2.3.0 # TODO +# - HPC-X: v2.14 # Intel: -# - mlc: v3.9a +# - mlc: v3.10 LABEL maintainer="SuperBench" @@ -71,37 +71,27 @@ RUN mkdir -p /root/.ssh && \ # Install OFED ENV OFED_VERSION=5.2-2.2.3.0 RUN cd /tmp && \ - wget -q http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \ + wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \ tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \ MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \ rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* # Install HPC-X +ENV HPCX_VERSION=v2.14 RUN cd /opt && \ rm -rf hpcx && \ - wget -q https://azhpcstor.blob.core.windows.net/azhpc-images-store/hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz && \ - tar xf hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz && \ - ln -s hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64 hpcx && \ - rm hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz + wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64.tbz -O hpcx.tbz && \ + tar xf hpcx.tbz && \ + mv hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64 hpcx && \ + rm hpcx.tbz # Install Intel MLC RUN cd /tmp && \ - wget -q https://downloadmirror.intel.com/736634/mlc_v3.9a.tgz -O mlc.tgz && \ + wget -q https://downloadmirror.intel.com/763324/mlc_v3.10.tgz -O mlc.tgz && \ tar xzf mlc.tgz Linux/mlc && \ cp ./Linux/mlc /usr/local/bin/ && \ rm -rf 
./Linux mlc.tgz -ENV PATH="${PATH}" \ - LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \ - SB_HOME=/opt/superbench \ - SB_MICRO_PATH=/opt/superbench \ - ANSIBLE_DEPRECATION_WARNINGS=FALSE \ - ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections - -RUN echo PATH="$PATH" > /etc/environment && \ - echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \ - echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment - # Install AOCC compiler RUN cd /tmp && \ wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \ @@ -115,6 +105,18 @@ RUN cd /tmp && \ mv amd-blis /opt/AMD && \ rm -rf aocl-blis-linux-aocc-4.0.tar.gz + +ENV PATH="${PATH}" \ + LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \ + SB_HOME=/opt/superbench \ + SB_MICRO_PATH=/opt/superbench \ + ANSIBLE_DEPRECATION_WARNINGS=FALSE \ + ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections + +RUN echo PATH="$PATH" > /etc/environment && \ + echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \ + echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment + # Add config files ADD dockerfile/etc /opt/microsoft/ diff --git a/docs/developer-guides/using-docker.mdx b/docs/developer-guides/using-docker.mdx index 621e9cffd..b73891853 100644 --- a/docs/developer-guides/using-docker.mdx +++ b/docs/developer-guides/using-docker.mdx @@ -29,7 +29,7 @@ You need to [clone the code](./development.md#set-up) first before building the export DOCKER_BUILDKIT=1 docker buildx build \ --platform linux/amd64 --cache-to type=inline,mode=max \ - --tag superbench-dev --file dockerfile/cuda11.1.1.dockerfile . + --tag superbench-dev --file dockerfile/cuda12.1.dockerfile . ``` @@ -39,7 +39,7 @@ docker buildx build \ export DOCKER_BUILDKIT=1 docker buildx build \ --platform linux/amd64 --cache-to type=inline,mode=max \ - --tag superbench-dev --file dockerfile/rocm4.2-pytorch1.7.0.dockerfile . + --tag superbench-dev --file dockerfile/rocm5.1.x.dockerfile . 
``` diff --git a/docs/getting-started/installation.mdx b/docs/getting-started/installation.mdx index 3172605cb..82c1fc9c3 100644 --- a/docs/getting-started/installation.mdx +++ b/docs/getting-started/installation.mdx @@ -45,7 +45,7 @@ but it is not strictly necessary. ```bash # create a new virtual environment -python3 -m venv --system-site-packages ./venv +python3 -m venv ./venv # activate the virtual environment source ./venv/bin/activate @@ -61,7 +61,7 @@ You can clone the source from GitHub and build it. :::note Note You should checkout corresponding tag to use release version, for example, -`git clone -b v0.7.0 https://github.com/microsoft/superbenchmark` +`git clone -b v0.8.0 https://github.com/microsoft/superbenchmark` ::: ```bash diff --git a/docs/getting-started/run-superbench.md b/docs/getting-started/run-superbench.md index e97626c56..32a8c6d80 100644 --- a/docs/getting-started/run-superbench.md +++ b/docs/getting-started/run-superbench.md @@ -27,7 +27,7 @@ sb deploy -f remote.ini --host-password [password] :::note Note You should deploy corresponding Docker image to use release version, for example, -`sb deploy -f local.ini -i superbench/superbench:v0.7.0-cuda11.1.1` +`sb deploy -f local.ini -i superbench/superbench:v0.8.0-cuda12.1` You should note that version of git repo only determines version of sb CLI, and not the sb container. You should define the container version even if you specified a release version for the git clone. 
diff --git a/docs/superbench-config.mdx b/docs/superbench-config.mdx index 8fe6aa1ea..8802830b2 100644 --- a/docs/superbench-config.mdx +++ b/docs/superbench-config.mdx @@ -70,7 +70,7 @@ superbench: ```yaml -version: v0.7 +version: v0.8 superbench: enable: benchmark_1 monitor: diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md index 2788a2815..b2e43db3f 100644 --- a/docs/user-tutorial/benchmarks/micro-benchmarks.md +++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md @@ -180,11 +180,11 @@ Performed by [High-Performance Linpack Benchmark for Distributed-Memory Computer #### Metrics -| Name | Unit | Description | -|---------------------|--------------------|----------------------------------------------------------------------------| -| cpu-hpl/tests_pass | | HPL completed running and correctness test has passed (1: pass, 0: fail). | -| cpu-hpl/throughput | bandwidth (GFlops) | Compute bandwidth. | -| cpu-hpl/time | time (s) | Time elapsed during HPL run. | +| Name | Unit | Description | +|--------------------|--------------------|---------------------------------------------------------------------------| +| cpu-hpl/tests_pass | | HPL completed running and correctness test has passed (1: pass, 0: fail). | +| cpu-hpl/throughput | bandwidth (GFlops) | Compute bandwidth. | +| cpu-hpl/time | time (s) | Time elapsed during HPL run. | ### `cpu-stream` @@ -216,13 +216,13 @@ performed by [Intel MLC Tool](https://www.intel.com/content/www/us/en/developer/ | Name | Unit | Description | |-------------------------------------------------------------------------|------------------|---------------------------------------------------------------------| -| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_bw | bandwidth (GB/s) | Former NUMA to latter NUMA memory bandwidth. 
| -| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_lat | time (us) | Former NUMA to latter NUMA memory latency. | -| cpu-memory-bw-latency/mem\_max\_bandwidth\_all\_reads\_bw | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, full read. | -| cpu-memory-bw-latency/mem\_max\_bandwidth\_3_1\_reads-writes\_bw | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 3 : 1. | -| cpu-memory-bw-latency/mem\_max\_bandwidth\_2_1\_reads-writes\_bw | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 2 : 1. | -| cpu-memory-bw-latency/mem\_max\_bandwidth\_1_1\_reads-writes\_bw | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 1 : 1. | -| cpu-memory-bw-latency/mem\_max\_bandwidth\_stream-triad\_like\_bw | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, with stream-triad like pattern. | +| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_bw | bandwidth (MB/s) | Former NUMA to latter NUMA memory bandwidth. | +| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_lat | time (ns) | Former NUMA to latter NUMA memory latency. | +| cpu-memory-bw-latency/mem\_max\_bandwidth\_all\_reads\_bw | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, full read. | +| cpu-memory-bw-latency/mem\_max\_bandwidth\_3_1\_reads-writes\_bw | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 3 : 1. | +| cpu-memory-bw-latency/mem\_max\_bandwidth\_2_1\_reads-writes\_bw | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 2 : 1. | +| cpu-memory-bw-latency/mem\_max\_bandwidth\_1_1\_reads-writes\_bw | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 1 : 1. | +| cpu-memory-bw-latency/mem\_max\_bandwidth\_stream-triad\_like\_bw | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, with stream-triad like pattern. 
| ### `mem-bw` diff --git a/docs/user-tutorial/container-images.mdx b/docs/user-tutorial/container-images.mdx index 112bbf25a..27cf8da6f 100644 --- a/docs/user-tutorial/container-images.mdx +++ b/docs/user-tutorial/container-images.mdx @@ -29,6 +29,8 @@ available tags are listed below for all stable versions. | Tag | Description | |-------------------|------------------------------------| +| v0.8.0-cuda12.1 | SuperBench v0.8.0 with CUDA 12.1 | +| v0.8.0-cuda11.1.1 | SuperBench v0.8.0 with CUDA 11.1.1 | | v0.7.0-cuda11.8 | SuperBench v0.7.0 with CUDA 11.8 | | v0.7.0-cuda11.1.1 | SuperBench v0.7.0 with CUDA 11.1.1 | | v0.6.0-cuda11.1.1 | SuperBench v0.6.0 with CUDA 11.1.1 | @@ -43,6 +45,10 @@ available tags are listed below for all stable versions. | Tag | Description | |-------------------------------|--------------------------------------------------| +| v0.8.0-rocm5.1.3 | SuperBench v0.8.0 with ROCm 5.1.3 | +| v0.8.0-rocm5.1.1 | SuperBench v0.8.0 with ROCm 5.1.1 | +| v0.8.0-rocm5.0.1 | SuperBench v0.8.0 with ROCm 5.0.1 | +| v0.8.0-rocm5.0 | SuperBench v0.8.0 with ROCm 5.0 | | v0.7.0-rocm5.1.3 | SuperBench v0.7.0 with ROCm 5.1.3 | | v0.7.0-rocm5.1.1 | SuperBench v0.7.0 with ROCm 5.1.1 | | v0.7.0-rocm5.0.1 | SuperBench v0.7.0 with ROCm 5.0.1 | diff --git a/docs/user-tutorial/data-diagnosis.md b/docs/user-tutorial/data-diagnosis.md index e4094f83a..94a2a025d 100644 --- a/docs/user-tutorial/data-diagnosis.md +++ b/docs/user-tutorial/data-diagnosis.md @@ -65,7 +65,7 @@ superbench: example: ```yaml # SuperBench rules -version: v0.7 +version: v0.8 superbench: rules: failure-rule: diff --git a/docs/user-tutorial/result-summary.md b/docs/user-tutorial/result-summary.md index d9053d3b6..e53738ff8 100644 --- a/docs/user-tutorial/result-summary.md +++ b/docs/user-tutorial/result-summary.md @@ -58,7 +58,7 @@ superbench: ```yaml title="Example" # SuperBench rules -version: v0.7 +version: v0.8 superbench: rules: kernel_launch: diff --git a/superbench/__init__.py 
b/superbench/__init__.py index a0b5f7c02..5b85c9a9a 100644 --- a/superbench/__init__.py +++ b/superbench/__init__.py @@ -6,5 +6,5 @@ Provide hardware and software benchmarks for AI systems. """ -__version__ = '0.7.0' +__version__ = '0.8.0' __author__ = 'Microsoft' diff --git a/superbench/analyzer/data_analysis.py b/superbench/analyzer/data_analysis.py index d7ac40f1b..5a7fb1ed8 100644 --- a/superbench/analyzer/data_analysis.py +++ b/superbench/analyzer/data_analysis.py @@ -31,11 +31,13 @@ def statistic(raw_data_df): logger.warning('DataAnalyzer: empty data.') return data_statistics_df try: + raw_data_df = raw_data_df.apply(pd.to_numeric, errors='coerce') + raw_data_df = raw_data_df.dropna(axis=1, how='all') data_statistics_df = raw_data_df.describe() - data_statistics_df.loc['1%'] = raw_data_df.quantile(0.01) - data_statistics_df.loc['5%'] = raw_data_df.quantile(0.05) - data_statistics_df.loc['95%'] = raw_data_df.quantile(0.95) - data_statistics_df.loc['99%'] = raw_data_df.quantile(0.99) + data_statistics_df.loc['1%'] = raw_data_df.quantile(0.01, numeric_only=True) + data_statistics_df.loc['5%'] = raw_data_df.quantile(0.05, numeric_only=True) + data_statistics_df.loc['95%'] = raw_data_df.quantile(0.95, numeric_only=True) + data_statistics_df.loc['99%'] = raw_data_df.quantile(0.99, numeric_only=True) statistics_error = [] for column in list(raw_data_df.columns): if column not in list(data_statistics_df.columns) and not raw_data_df[column].isnull().all(): @@ -122,6 +124,8 @@ def correlation(raw_data_df): logger.warning('DataAnalyzer: empty data.') return data_corr_df try: + raw_data_df = raw_data_df.apply(pd.to_numeric, errors='coerce') + raw_data_df = raw_data_df.dropna(axis=1, how='all') data_corr_df = raw_data_df.corr() statistics_error = [] for column in list(raw_data_df.columns): @@ -181,6 +185,8 @@ def generate_baseline(raw_data_df, output_dir): output_dir (str): the directory of output file """ try: + raw_data_df = raw_data_df.apply(pd.to_numeric, 
errors='coerce') + raw_data_df = raw_data_df.dropna(axis=1, how='all') if not isinstance(raw_data_df, pd.DataFrame): logger.error('DataAnalyzer: the type of raw data is not pd.DataFrame') return diff --git a/superbench/analyzer/data_diagnosis.py b/superbench/analyzer/data_diagnosis.py index f7a906560..ee5d705b6 100644 --- a/superbench/analyzer/data_diagnosis.py +++ b/superbench/analyzer/data_diagnosis.py @@ -285,7 +285,7 @@ def output_diagnosis_in_excel(self, raw_data_df, data_not_accept_df, output_path logger.log_and_raise(exception=IOError, msg='DataDiagnosis: excel_data_output - invalid file path.') file_handler.output_excel_raw_data(writer, raw_data_df, 'Raw Data') file_handler.output_excel_data_not_accept(writer, data_not_accept_df, rules) - writer.save() + writer.close() except Exception as e: logger.log_and_raise(exception=Exception, msg='DataDiagnosis: excel_data_output - {}'.format(str(e))) diff --git a/superbench/analyzer/result_summary.py b/superbench/analyzer/result_summary.py index e269d70a5..09954a8dc 100644 --- a/superbench/analyzer/result_summary.py +++ b/superbench/analyzer/result_summary.py @@ -117,7 +117,7 @@ def _merge_summary(self, summary): summary_df = pd.DataFrame() for category in summary: for i in range(len(summary[category])): - summary_df = summary_df.append([summary[category][i]], ignore_index=True) + summary_df = pd.concat([summary_df, pd.DataFrame([summary[category][i]])], ignore_index=True) return summary_df def _generate_summary(self, round): @@ -217,7 +217,7 @@ def output_summary_in_excel(self, raw_data_df, summary, output_path): file_handler.merge_column_in_excel(worksheet, row, 1) else: logger.error('ResultSummary: excel_data_output - summary is empty.') - writer.save() + writer.close() except Exception as e: logger.error('ResultSummary: excel_data_output - {}'.format(str(e))) diff --git a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu 
index bc8478274..788b1989d 100644 --- a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu +++ b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu @@ -88,20 +88,21 @@ template cudaDataType_t get_datatype() { } template -float timing_matmul_tn(int m, int n, int k, int batch, int warmup, int iter) { +float timing_matmul_tn(size_t m, size_t n, size_t k, size_t batch, int warmup, int iter) { // init matrix Ta *matrix_a = nullptr; Tb *matrix_b = nullptr; Tout *matrix_out = nullptr; - cudaMalloc(&matrix_a, m * k * std::max(batch, 1) * sizeof(Ta)); - cudaMalloc(&matrix_b, k * n * std::max(batch, 1) * sizeof(Tb)); - cudaMalloc(&matrix_out, m * n * std::max(batch, 1) * sizeof(Tout)); + batch = std::max(batch, 1); + cudaMalloc(&matrix_a, m * k * batch * sizeof(Ta)); + cudaMalloc(&matrix_b, k * n * batch * sizeof(Tb)); + cudaMalloc(&matrix_out, m * n * batch * sizeof(Tout)); - init_matrix<<<216, 1024>>>(matrix_a, 1.f, m * k * std::max(batch, 1)); - init_matrix<<<216, 1024>>>(matrix_b, 2.f, k * n * std::max(batch, 1)); + init_matrix<<<216, 1024>>>(matrix_a, 1.f, m * k * batch); + init_matrix<<<216, 1024>>>(matrix_b, 2.f, k * n * batch); // init gemm - int lda = k, ldb = k, ldd = m; + size_t lda = k, ldb = k, ldd = m; std::unique_ptr gemm = std::make_unique(); gemm->Init(); gemm->Setup(m, n, k, batch, lda, ldb, ldd, get_datatype(), get_datatype(), get_datatype(), diff --git a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc index a91304c5a..4842c22d1 100644 --- a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc +++ b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc @@ -5,12 +5,12 @@ void cublasLtGemm::Init() { cublasLtHandle_t handle; - checkCublasStatus(cublasLtCreate(&handle)); + CUBLAS_CHECK(cublasLtCreate(&handle)); handle_.reset(handle); /* preference can be initialized without arguments */ 
cublasLtMatmulPreference_t preference; - checkCublasStatus(cublasLtMatmulPreferenceCreate(&preference)); + CUBLAS_CHECK(cublasLtMatmulPreferenceCreate(&preference)); preference_.reset(preference); } @@ -24,32 +24,32 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l // force c_type cudaDataType_t c_type = d_type; // Create matrix descriptors. - checkCublasStatus( + CUBLAS_CHECK( cublasLtMatrixLayoutCreate(&a_desc, a_type, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda)); - checkCublasStatus( + CUBLAS_CHECK( cublasLtMatrixLayoutCreate(&b_desc, b_type, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb)); - checkCublasStatus(cublasLtMatrixLayoutCreate(&c_desc, c_type, m, n, ldd)); - checkCublasStatus(cublasLtMatrixLayoutCreate(&d_desc, d_type, m, n, ldd)); + CUBLAS_CHECK(cublasLtMatrixLayoutCreate(&c_desc, c_type, m, n, ldd)); + CUBLAS_CHECK(cublasLtMatrixLayoutCreate(&d_desc, d_type, m, n, ldd)); // strided batch gemm if (batch > 0) { int64_t stridea = m * k, strideb = k * n, stridec = m * n, strided = m * n; - checkCublasStatus( + CUBLAS_CHECK( cublasLtMatrixLayoutSetAttribute(a_desc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); - checkCublasStatus(cublasLtMatrixLayoutSetAttribute(a_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, - &stridea, sizeof(stridea))); - checkCublasStatus( + CUBLAS_CHECK(cublasLtMatrixLayoutSetAttribute(a_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, + sizeof(stridea))); + CUBLAS_CHECK( cublasLtMatrixLayoutSetAttribute(b_desc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); - checkCublasStatus(cublasLtMatrixLayoutSetAttribute(b_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, - &strideb, sizeof(strideb))); - checkCublasStatus( + CUBLAS_CHECK(cublasLtMatrixLayoutSetAttribute(b_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, + sizeof(strideb))); + CUBLAS_CHECK( cublasLtMatrixLayoutSetAttribute(c_desc, 
CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); - checkCublasStatus(cublasLtMatrixLayoutSetAttribute(c_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, - &stridec, sizeof(stridec))); - checkCublasStatus( + CUBLAS_CHECK(cublasLtMatrixLayoutSetAttribute(c_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, + sizeof(stridec))); + CUBLAS_CHECK( cublasLtMatrixLayoutSetAttribute(d_desc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); - checkCublasStatus(cublasLtMatrixLayoutSetAttribute(d_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, - &strided, sizeof(strided))); + CUBLAS_CHECK(cublasLtMatrixLayoutSetAttribute(d_desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strided, + sizeof(strided))); } a_desc_.reset(a_desc); b_desc_.reset(b_desc); @@ -64,7 +64,7 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l gemm_compute_type = CUBLAS_COMPUTE_64F; cublasLtMatmulDesc_t op_desc = nullptr; - checkCublasStatus(cublasLtMatmulDescCreate(&op_desc, gemm_compute_type, CUDA_R_32F)); + CUBLAS_CHECK(cublasLtMatmulDescCreate(&op_desc, gemm_compute_type, CUDA_R_32F)); op_desc_.reset(op_desc); if (a_type == CUDA_R_8F_E5M2 || b_type == CUDA_R_8F_E5M2 || a_type == CUDA_R_8F_E4M3 || b_type == CUDA_R_8F_E4M3) { @@ -73,33 +73,31 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l cublasLtMatmulDescSetAttribute(op_desc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(fastAccuMode)); } - checkCublasStatus( - cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa))); - checkCublasStatus( - cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb))); + CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa))); + CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb))); if (a_scale_inverse 
!= nullptr) { - checkCublasStatus(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, - &a_scale_inverse, sizeof(a_scale_inverse))); + CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, + &a_scale_inverse, sizeof(a_scale_inverse))); } if (b_scale_inverse != nullptr) { - checkCublasStatus(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, - &b_scale_inverse, sizeof(b_scale_inverse))); + CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, + &b_scale_inverse, sizeof(b_scale_inverse))); } - checkCublasStatus( + CUBLAS_CHECK( cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue))); } size_t cublasLtGemm::GetAlgorithm(int max_algorithm_count, size_t max_workspace_size) { - checkCublasStatus(cublasLtMatmulPreferenceSetAttribute(preference_.get(), CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, - &max_workspace_size, sizeof(max_workspace_size))); + CUBLAS_CHECK(cublasLtMatmulPreferenceSetAttribute(preference_.get(), CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + &max_workspace_size, sizeof(max_workspace_size))); int found_algorithm_count = 0; std::vector results(max_algorithm_count); // Though we query all of possible algorithm, we will use the first later - checkCublasStatus(cublasLtMatmulAlgoGetHeuristic(handle_.get(), op_desc_.get(), a_desc_.get(), b_desc_.get(), - c_desc_.get(), d_desc_.get(), preference_.get(), - max_algorithm_count, results.data(), &found_algorithm_count)); + CUBLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(handle_.get(), op_desc_.get(), a_desc_.get(), b_desc_.get(), + c_desc_.get(), d_desc_.get(), preference_.get(), max_algorithm_count, + results.data(), &found_algorithm_count)); if (found_algorithm_count == 0) { throw std::runtime_error("Unable to find any suitable algorithms"); } @@ -111,13 +109,13 @@ size_t cublasLtGemm::GetAlgorithm(int 
max_algorithm_count, size_t max_workspace_ void cublasLtGemm::Execute(void *matrix_a, void *matrix_b, void *matrix_c, void *matrix_d, float alpha, float beta, void *workspace, size_t workspace_size, cudaStream_t stream) { - checkCublasStatus(cublasLtMatmul(handle_.get(), op_desc_.get(), static_cast(&alpha), /* alpha */ - matrix_a, /* A */ - a_desc_.get(), matrix_b, /* B */ - b_desc_.get(), static_cast(&beta), /* beta */ - matrix_c, /* C */ - c_desc_.get(), matrix_d, /* D */ - d_desc_.get(), &heuristic_results_.front().algo, /* algo */ - workspace, /* workspace */ - workspace_size, stream)); /* stream */ + CUBLAS_CHECK(cublasLtMatmul(handle_.get(), op_desc_.get(), static_cast(&alpha), /* alpha */ + matrix_a, /* A */ + a_desc_.get(), matrix_b, /* B */ + b_desc_.get(), static_cast(&beta), /* beta */ + matrix_c, /* C */ + c_desc_.get(), matrix_d, /* D */ + d_desc_.get(), &heuristic_results_.front().algo, /* algo */ + workspace, /* workspace */ + workspace_size, stream)); /* stream */ } diff --git a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.h b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.h index ca1f8fcfd..f89f934e4 100644 --- a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.h +++ b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.h @@ -10,12 +10,14 @@ #include -inline void checkCublasStatus(cublasStatus_t status) { - if (status != CUBLAS_STATUS_SUCCESS) { - printf("cuBLAS API failed with status %s\n", cublasGetStatusString(status)); - throw std::logic_error("cuBLAS API failed"); - } -} +#define CUBLAS_CHECK(func) \ + do { \ + cublasStatus_t status = func; \ + if (status != CUBLAS_STATUS_SUCCESS) { \ + printf("cuBLAS call %s failed at %s:%d '%s'\n", #func, __FILE__, __LINE__, cublasGetStatusString(status)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) class cublasLtGemm { public: diff --git a/superbench/benchmarks/micro_benchmarks/cudnn_function.py 
b/superbench/benchmarks/micro_benchmarks/cudnn_function.py index 4c7f08193..82384ae8b 100644 --- a/superbench/benchmarks/micro_benchmarks/cudnn_function.py +++ b/superbench/benchmarks/micro_benchmarks/cudnn_function.py @@ -408,23 +408,21 @@ def _process_raw_result(self, cmd_idx, raw_output): True if the raw output string is valid and result can be extracted. """ self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data) - + metric = '' try: lines = raw_output.splitlines() - metric = '' + + cmd_config = json.loads(self._commands[cmd_idx].split('--config_json')[-1].replace(' ', '')[1:-1]) + for key in sorted(cmd_config.keys()): + if 'name' in key: + metric = key + '_' + str(cmd_config[key]) + metric + else: + metric = metric + '_' + key + '_' + str(cmd_config[key]) + metric = metric.replace(' ', '').replace(',', '_') + error = False raw_data = [] for line in lines: - if '[function config]' in line: - metric = '' - metric_json_str = line[line.index('[function config]: ') + - len('[function config]: '):].replace(' ', '').replace(':', '_')[1:-1] - metric_list = metric_json_str.split(',') - for key in metric_list: - if 'name' in key: - metric = key + metric - else: - metric = metric + '_' + key if '[raw_data]' in line: raw_data = line[line.index('[raw_data]: ') + len('[raw_data]: '):] raw_data = raw_data.split(',') diff --git a/superbench/benchmarks/micro_benchmarks/dist_inference.py b/superbench/benchmarks/micro_benchmarks/dist_inference.py index 4083c16ad..535c4fbf6 100644 --- a/superbench/benchmarks/micro_benchmarks/dist_inference.py +++ b/superbench/benchmarks/micro_benchmarks/dist_inference.py @@ -121,7 +121,7 @@ def __all_gather_wrapper(self, x): Return: Tensor after all-gather. 
""" - output = torch.empty_like([x.shape[0] * self.num_ranks] + list(x.shape[1:])) + output = torch.empty([x.shape[0] * self.num_ranks] + list(x.shape[1:]), dtype=x.dtype, device=x.device) dist.all_gather_into_tensor(output, x) return output diff --git a/superbench/benchmarks/model_benchmarks/model_base.py b/superbench/benchmarks/model_benchmarks/model_base.py index 133ee76f4..a51c05850 100644 --- a/superbench/benchmarks/model_benchmarks/model_base.py +++ b/superbench/benchmarks/model_benchmarks/model_base.py @@ -78,6 +78,13 @@ def add_parser_arguments(self): required=False, help='The number of batch size.', ) + self._parser.add_argument( + '--num_workers', + type=int, + default=8, + required=False, + help='Number of subprocesses to use for data loading.', + ) self._parser.add_argument( '--precision', type=Precision, diff --git a/superbench/benchmarks/model_benchmarks/pytorch_base.py b/superbench/benchmarks/model_benchmarks/pytorch_base.py index ce1cca93b..f0cb52319 100644 --- a/superbench/benchmarks/model_benchmarks/pytorch_base.py +++ b/superbench/benchmarks/model_benchmarks/pytorch_base.py @@ -181,7 +181,7 @@ def _init_dataloader(self): dataset=self._dataset, batch_size=self._args.batch_size, shuffle=False, - num_workers=8, + num_workers=self._args.num_workers, sampler=train_sampler, drop_last=True, pin_memory=self._args.pin_memory diff --git a/superbench/common/utils/device_manager.py b/superbench/common/utils/device_manager.py index aeb62b586..2a6a8a889 100644 --- a/superbench/common/utils/device_manager.py +++ b/superbench/common/utils/device_manager.py @@ -72,6 +72,22 @@ def get_device_temperature(self, idx): temp = None return temp + def get_device_power(self, idx): + """Get the realtime power of device, unit: watt. + + Args: + idx (int): device index. + + Return: + temp (float): the realtime power of device, None means failed to get the data. 
+ """ + try: + power = nvml.nvmlDeviceGetPowerUsage(self._device_handlers[idx]) + except Exception as err: + logger.error('Get device power failed: {}'.format(str(err))) + return None + return int(int(power) / 1000) + def get_device_power_limit(self, idx): """Get the power management limit of device, unit: watt. diff --git a/superbench/common/utils/gen_traffic_pattern_config.py b/superbench/common/utils/gen_traffic_pattern_config.py index 97864784c..84a2e65d0 100644 --- a/superbench/common/utils/gen_traffic_pattern_config.py +++ b/superbench/common/utils/gen_traffic_pattern_config.py @@ -182,15 +182,14 @@ def gen_traffic_pattern_host_groups(host_list, pattern, mpi_pattern_path, benchm logger.error('Unsupported traffic pattern: {}'.format(pattern.type)) host_groups = __convert_config_to_host_group(config, host_list) # write traffic pattern host groups to specified path - if pattern.mpi_pattern: - with open(mpi_pattern_path, 'a') as f: - f.write('benchmark_name: {} pattern_type: {}'.format(benchmark_name, pattern.type) + '\n') - for host_group in host_groups: - row = [] - for host_list in host_group: - group = ','.join(host_list) - row.append(group) - group = ';'.join(row) - f.write(group + '\n') - f.write('\n') + with open(mpi_pattern_path, 'a') as f: + f.write('benchmark_name: {} pattern_type: {}'.format(benchmark_name, pattern.type) + '\n') + for host_group in host_groups: + row = [] + for host_list in host_group: + group = ','.join(host_list) + row.append(group) + group = ';'.join(row) + f.write(group + '\n') + f.write('\n') return host_groups diff --git a/superbench/config/amd_mi100_hpe.yaml b/superbench/config/amd_mi100_hpe.yaml index 4f6a68a6e..150424c0f 100644 --- a/superbench/config/amd_mi100_hpe.yaml +++ b/superbench/config/amd_mi100_hpe.yaml @@ -3,7 +3,7 @@ # Server: # - Product: HPE Apollo 6500 -version: v0.7 +version: v0.8 superbench: enable: null var: diff --git a/superbench/config/amd_mi100_z53.yaml b/superbench/config/amd_mi100_z53.yaml index 
9ef423a3e..188c93547 100644 --- a/superbench/config/amd_mi100_z53.yaml +++ b/superbench/config/amd_mi100_z53.yaml @@ -4,7 +4,7 @@ # - Product: G482-Z53 # - Link: https://www.gigabyte.cn/FileUpload/Global/MicroSite/553/G482-Z53.html -version: v0.7 +version: v0.8 superbench: enable: null var: diff --git a/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml b/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml index 4ba445909..62e0d6586 100644 --- a/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml +++ b/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml @@ -1,4 +1,4 @@ -version: v0.7 +version: v0.8 superbench: enable: null monitor: diff --git a/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml b/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml index 56dc89b15..337affacf 100644 --- a/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml +++ b/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml @@ -1,4 +1,4 @@ -version: v0.7 +version: v0.8 superbench: enable: null monitor: diff --git a/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml b/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml index d980488a2..f95469cb0 100644 --- a/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml +++ b/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml @@ -1,4 +1,4 @@ -version: v0.7 +version: v0.8 superbench: enable: null monitor: diff --git a/superbench/config/azure_ndmv4.yaml b/superbench/config/azure_ndmv4.yaml index 1d23a2ebb..e482d6ed0 100644 --- a/superbench/config/azure_ndmv4.yaml +++ b/superbench/config/azure_ndmv4.yaml @@ -3,7 +3,7 @@ # Azure NDm A100 v4 # reference: https://docs.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series -version: v0.7 +version: v0.8 superbench: enable: null monitor: diff --git a/superbench/config/azure_ndv4.yaml b/superbench/config/azure_ndv4.yaml index 02317c144..cb9a93ddc 100644 --- 
a/superbench/config/azure_ndv4.yaml +++ b/superbench/config/azure_ndv4.yaml @@ -1,5 +1,5 @@ # SuperBench Config -version: v0.7 +version: v0.8 superbench: enable: null monitor: diff --git a/superbench/config/default.yaml b/superbench/config/default.yaml index bff622a51..60d6be7b0 100644 --- a/superbench/config/default.yaml +++ b/superbench/config/default.yaml @@ -1,5 +1,5 @@ # SuperBench Config -version: v0.7 +version: v0.8 superbench: enable: null monitor: diff --git a/superbench/monitor/monitor.py b/superbench/monitor/monitor.py index 0945965eb..b3d01711d 100644 --- a/superbench/monitor/monitor.py +++ b/superbench/monitor/monitor.py @@ -38,16 +38,7 @@ def __init__(self, container_name, sample_duration, sample_interval, output_file self.__unit_MiByte = 1024 * 1024 * 1.0 self.__output_handler = open(self.__output_file, 'a') - self.__cgroup = 1 - output = run_command('grep cgroup /proc/filesystems', quiet=True) - if output.returncode != 0: - logger.error('Failed to check the cgroup version, will assume using cgroup V1.') - else: - if 'cgroup2' in output.stdout: - self.__cgroup = 2 - - logger.info('cgroup version: {}.'.format(self.__cgroup)) def __preprocess(self): """Preprocess/preparation operations before the monitoring. 
@@ -77,13 +68,15 @@ def __preprocess(self): container_pid = output.stdout try: - if self.__cgroup == 1: - self._cpu_file = glob.glob('/sys/fs/cgroup/cpuacct/docker/{}*/cpuacct.stat'.format(container_id))[0] + cpu_file_cgroup_v1 = glob.glob('/sys/fs/cgroup/cpuacct/docker/{}*/cpuacct.stat'.format(container_id)) + if len(cpu_file_cgroup_v1) > 0: + self._cpu_file = cpu_file_cgroup_v1[0] self._mem_file = glob.glob( '/sys/fs/cgroup/memory/docker/{}*/memory.usage_in_bytes'.format(container_id) )[0] self._net_file = '/proc/{}/net/dev'.format(container_pid) else: + self.__cgroup = 2 self._cpu_file = glob.glob( '/sys/fs/cgroup/system.slice/docker-{}*.scope/cpu.stat'.format(container_id) )[0] @@ -99,10 +92,12 @@ def __preprocess(self): ) return False else: - if self.__cgroup == 1: - self._cpu_file = '/sys/fs/cgroup/cpuacct/cpuacct.stat' + cpu_file_cgroup_v1 = '/sys/fs/cgroup/cpuacct/cpuacct.stat' + if os.path.exists(cpu_file_cgroup_v1): + self._cpu_file = cpu_file_cgroup_v1 self._mem_file = '/sys/fs/cgroup/memory/memory.usage_in_bytes' else: + self.__cgroup = 2 self._cpu_file = '/sys/fs/cgroup/cpu.stat' self._mem_file = '/sys/fs/cgroup/memory.stat' self._net_file = '/proc/net/dev' @@ -199,6 +194,7 @@ def __sample_gpu_metrics(self, record): for i in range(device_count): record.gpu_usage.append(dm.device_manager.get_device_utilization(i)) record.gpu_temperature.append(dm.device_manager.get_device_temperature(i)) + record.gpu_power.append(dm.device_manager.get_device_power(i)) record.gpu_power_limit.append(dm.device_manager.get_device_power_limit(i)) mem_used, mem_total = dm.device_manager.get_device_memory(i) record.gpu_mem_used.append(mem_used) diff --git a/superbench/monitor/record.py b/superbench/monitor/record.py index 73ff7c3a6..3b229f108 100644 --- a/superbench/monitor/record.py +++ b/superbench/monitor/record.py @@ -14,6 +14,7 @@ class MonitorRecord: """Record class to save all monitoring data.""" reduce_ops = { 'gpu_temperature': ReduceType.MAX, + 'gpu_power': 
ReduceType.MAX, 'gpu_power_limit': ReduceType.MIN, 'gpu_corrected_ecc': ReduceType.LAST, 'gpu_uncorrected_ecc': ReduceType.LAST, @@ -28,6 +29,7 @@ def __init__(self): self.__mem_total = None self.__gpu_usage = list() self.__gpu_temperature = list() + self.__gpu_power = list() self.__gpu_power_limit = list() self.__gpu_mem_used = list() self.__gpu_mem_total = list() @@ -112,6 +114,20 @@ def gpu_temperature(self, gpu_temperature): """ self.__gpu_temperature = gpu_temperature + @property + def gpu_power(self): + """Decoration function to access __gpu_power.""" + return self.__gpu_power + + @gpu_power.setter + def gpu_power(self, gpu_power): + """Set the gpu realtime power, unit: Watt. + + Args: + gpu_power(list): list of gpu realtime power. + """ + self.__gpu_power = gpu_power + @property def gpu_power_limit(self): """Decoration function to access __gpu_power_limit.""" diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index 29d114b14..28b5c7186 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -387,8 +387,9 @@ def __merge_monitor_metrics(self, node_path): metrics_dict[metric].append(value) for metric, values in metrics_dict.items(): + prefix = metric.split(':')[0] for pattern, reduce_type in MonitorRecord.reduce_ops.items(): - if pattern in metric: + if pattern == prefix: reduce_func = Reducer.get_reduce_func(reduce_type) metric_name = 'monitor/{}'.format(metric) metrics_summary[metric_name] = reduce_func(values) diff --git a/tests/benchmarks/model_benchmarks/test_model_base.py b/tests/benchmarks/model_benchmarks/test_model_base.py index 926088aea..deba3a438 100644 --- a/tests/benchmarks/model_benchmarks/test_model_base.py +++ b/tests/benchmarks/model_benchmarks/test_model_base.py @@ -167,6 +167,7 @@ def test_arguments_related_interfaces(): --no_gpu Disable GPU training. --num_steps int The number of test step. --num_warmup int The number of warmup step. 
+ --num_workers int Number of subprocesses to use for data loading. --pin_memory Enable option to pin memory in data loader. --precision Precision [Precision ...] Model precision. E.g. fp8_hybrid fp8_e4m3 fp8_e5m2 @@ -206,6 +207,7 @@ def test_preprocess(): --no_gpu Disable GPU training. --num_steps int The number of test step. --num_warmup int The number of warmup step. + --num_workers int Number of subprocesses to use for data loading. --pin_memory Enable option to pin memory in data loader. --precision Precision [Precision ...] Model precision. E.g. fp8_hybrid fp8_e4m3 fp8_e5m2 diff --git a/tests/monitor/test_monitor.py b/tests/monitor/test_monitor.py index 0fa601e21..16ca151a7 100644 --- a/tests/monitor/test_monitor.py +++ b/tests/monitor/test_monitor.py @@ -44,8 +44,8 @@ def test_monitor(self): monitor._Monitor__sample_gpu_metrics(record) gpu_list_metrics = [ - record.gpu_usage, record.gpu_temperature, record.gpu_power_limit, record.gpu_mem_used, record.gpu_mem_total, - record.gpu_corrected_ecc, record.gpu_uncorrected_ecc + record.gpu_usage, record.gpu_temperature, record.gpu_power, record.gpu_power_limit, record.gpu_mem_used, + record.gpu_mem_total, record.gpu_corrected_ecc, record.gpu_uncorrected_ecc ] for metric in gpu_list_metrics: assert (metric) diff --git a/tests/monitor/test_monitor_record.py b/tests/monitor/test_monitor_record.py index 13b866fd9..069dbf6fd 100644 --- a/tests/monitor/test_monitor_record.py +++ b/tests/monitor/test_monitor_record.py @@ -17,6 +17,7 @@ def test_monitor_record(): mr.mem_total = 1024 mr.gpu_usage = [90, 80, 86, 72, 79, 81, 94, 85] mr.gpu_temperature = [62, 75, 69, 63, 72, 77, 80, 71] + mr.gpu_power = [257, 290, 280, 262, 291, 284, 281, 273] mr.gpu_power_limit = [400, 400, 400, 350, 400, 400, 400, 400] mr.gpu_mem_used = [2550, 2680, 2543, 2588, 2612, 2603, 2515, 2593] mr.gpu_mem_total = [16777216, 16777216, 16777216, 16777216, 16777216, 16777216, 16777216, 16777216] @@ -59,6 +60,14 @@ def test_monitor_record(): 
'gpu_temperature:5': 77, 'gpu_temperature:6': 80, 'gpu_temperature:7': 71, + 'gpu_power:0': 257, + 'gpu_power:1': 290, + 'gpu_power:2': 280, + 'gpu_power:3': 262, + 'gpu_power:4': 291, + 'gpu_power:5': 284, + 'gpu_power:6': 281, + 'gpu_power:7': 273, 'gpu_power_limit:0': 400, 'gpu_power_limit:1': 400, 'gpu_power_limit:2': 400, diff --git a/website/blog/2023-04-14-release-0-8.md b/website/blog/2023-04-14-release-0-8.md new file mode 100644 index 000000000..2124ebc04 --- /dev/null +++ b/website/blog/2023-04-14-release-0-8.md @@ -0,0 +1,44 @@ +--- +slug: release-sb-v0.8 +title: Releasing SuperBench v0.8 +author: Peng Cheng +author_title: SuperBench Team +author_url: https://github.com/cp5555 +author_image_url: https://github.com/cp5555.png +tags: [superbench, announcement, release] +--- + +We are very happy to announce that **SuperBench 0.8.0 version** is officially released today! + +You can install and try superbench by following [Getting Started Tutorial](https://microsoft.github.io/superbenchmark/docs/getting-started/installation). + +## SuperBench 0.8.0 Release Notes + +### SuperBench Improvements + +- Support SuperBench Executor running on Windows. +- Remove fixed rccl version in rocm5.1.x docker file. +- Upgrade networkx version to fix installation compatibility issue. +- Pin setuptools version to v65.7.0. +- Limit ansible_runner version for Python 3.6. +- Support cgroup V2 when read system metrics in monitor. +- Fix analyzer bug in Python 3.8 due to pandas api change. +- Collect real-time GPU power in monitor. +- Remove unreachable condition when write host list in mpi mode. +- Upgrade Docker image with cuda12.1, nccl 2.17.1-1, hpcx v2.14, and mlc 3.10. +- Fix wrong unit of cpu-memory-bw-latency in document. + +### Micro-benchmark Improvements + +- Add STREAM benchmark for sustainable memory bandwidth and the corresponding computation rate. +- Add HPL Benchmark for HPC Linpack Benchmark. 
+- Support flexible warmup and non-random data initialization in cublas-benchmark. +- Support error tolerance in micro-benchmark for CuDNN function. +- Add distributed inference benchmark. +- Support tensor core precisions (e.g., FP8) and batch/shape range in cublaslt gemm. + +### Model Benchmark Improvements + +- Fix torch.dist init issue with multiple models. +- Support TE FP8 in BERT/GPT2 model. +- Add num_workers configurations in model benchmark. diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index 7e780b1c6..cc583913d 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -101,7 +101,7 @@ module.exports = { announcementBar: { id: 'supportus', content: - '📢 v0.7.0 has been released! ' + + '📢 v0.8.0 has been released! ' + '⭐️ If you like SuperBench, give it a star on GitHub! ⭐️', }, algolia: { diff --git a/website/package-lock.json b/website/package-lock.json index 369418ed1..7526213de 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -1,6 +1,6 @@ { "name": "superbench-website", - "version": "0.7.0", + "version": "0.8.0", "lockfileVersion": 1, "requires": true, "dependencies": { diff --git a/website/package.json b/website/package.json index f4d217d67..c761f26d8 100644 --- a/website/package.json +++ b/website/package.json @@ -1,6 +1,6 @@ { "name": "superbench-website", - "version": "0.7.0", + "version": "0.8.0", "private": true, "scripts": { "docusaurus": "docusaurus", From 4cb431cab4dfc43f61e4adf7712fb3e9ebe48e25 Mon Sep 17 00:00:00 2001 From: Ziyue Yang Date: Mon, 24 Apr 2023 10:17:49 +0800 Subject: [PATCH 02/33] Benchmarks - Revise step time collection in distributed inference benchmark (#524) **Description** This commit revises distributed inference benchmark to give a unified step time result by taking maximum step times of different GPUs. 
--- superbench/benchmarks/micro_benchmarks/dist_inference.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/superbench/benchmarks/micro_benchmarks/dist_inference.py b/superbench/benchmarks/micro_benchmarks/dist_inference.py index 535c4fbf6..8e51b6bd8 100644 --- a/superbench/benchmarks/micro_benchmarks/dist_inference.py +++ b/superbench/benchmarks/micro_benchmarks/dist_inference.py @@ -14,6 +14,7 @@ from superbench.benchmarks import DistributedImpl, DistributedBackend, BenchmarkRegistry, ReturnCode, Precision from superbench.benchmarks.micro_benchmarks import MicroBenchmark from superbench.benchmarks.context import Enum +from superbench.benchmarks.reducer import ReduceType class ComputationKernelType(Enum): @@ -390,7 +391,7 @@ def _process_data(self, step_times): Return: True if _process_data succeeds. """ - if not self._process_numeric_result('step_times', step_times, cal_percentile=True): + if not self._process_numeric_result('step_times', step_times, reduce_type=ReduceType.MAX, cal_percentile=True): return False return True From 664c59a14d376510e43cf16bdd4e1eead2f0f923 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Fri, 28 Apr 2023 11:36:11 +0800 Subject: [PATCH 03/33] Docs - Update version in README (#529) Update version in README. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 22e3932af..ffcd51960 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ __SuperBench__ is a validation and profiling tool for AI infrastructure. -📢 [v0.7.0](https://github.com/microsoft/superbenchmark/releases/tag/v0.7.0) has been released! +📢 [v0.8.0](https://github.com/microsoft/superbenchmark/releases/tag/v0.8.0) has been released! 
## _Check [aka.ms/superbench](https://aka.ms/superbench) for more details._ From f38a9829d048d47012dd7aa679ea479ba3edd3c1 Mon Sep 17 00:00:00 2001 From: guoshzhao Date: Fri, 28 Apr 2023 13:15:47 +0800 Subject: [PATCH 04/33] ModelBenchmarks - Fix early stop logic due to num_steps. (#522) **Description** Model benchmarks can stop due to `num_steps` or `duration` config which will take effect when the value is set greater than 0. If both are set greater than 0, the earliest condition reached will work. --- docs/superbench-config.mdx | 4 +-- .../benchmarks/model_benchmarks/model_base.py | 7 +++- .../model_benchmarks/test_model_base.py | 35 +++++++++++++++++++ 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/docs/superbench-config.mdx b/docs/superbench-config.mdx index 8802830b2..5720a8125 100644 --- a/docs/superbench-config.mdx +++ b/docs/superbench-config.mdx @@ -344,10 +344,10 @@ There have four common parameters for all benchmarks: For Model-Benchmark, there have some parameters that can control the elapsed time. * duration: the elapsed time of benchmark in seconds. -* num_warmup: the number of warmup step. +* num_warmup: the number of warmup step, should be positive integer. * num_steps: the number of test step. -If `duration > 0` and `num_warmup + num_steps > 0`, then benchmark will take the least as the elapsed time. Otherwise only one of them will take effect. +If `duration > 0` and `num_steps > 0`, then benchmark will take the least as the elapsed time. Otherwise only one of them will take effect. 
## `Mode` Schema diff --git a/superbench/benchmarks/model_benchmarks/model_base.py b/superbench/benchmarks/model_benchmarks/model_base.py index a51c05850..6238c2b0e 100644 --- a/superbench/benchmarks/model_benchmarks/model_base.py +++ b/superbench/benchmarks/model_benchmarks/model_base.py @@ -204,6 +204,11 @@ def _preprocess(self): ) ) + if self._args.num_warmup < 0: + logger.error('num_warmup should be positive integer, while {} is set.'.format(self._args.num_warmup)) + self._result.set_return_code(ReturnCode.INVALID_ARGUMENT) + return False + if not self._init_distributed_setting(): self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE) return False @@ -374,7 +379,7 @@ def _is_finished(self, curr_step, curr_time): if ( (self._args.duration > 0 and (curr_time - self._sub_benchmark_start_time) >= self._args.duration) - or (total_steps > 0 and curr_step >= total_steps) + or (self._args.num_steps > 0 and curr_step >= total_steps) ): return True diff --git a/tests/benchmarks/model_benchmarks/test_model_base.py b/tests/benchmarks/model_benchmarks/test_model_base.py index deba3a438..1b6af1775 100644 --- a/tests/benchmarks/model_benchmarks/test_model_base.py +++ b/tests/benchmarks/model_benchmarks/test_model_base.py @@ -20,6 +20,7 @@ def __init__(self, name, parameters=''): """ super().__init__(name, parameters) self._supported_precision = [Precision.FLOAT32, Precision.FLOAT16] + self._sub_benchmark_start_time = 0 def add_parser_arguments(self): """Add the specified arguments.""" @@ -377,3 +378,37 @@ def test_check_result_format(): # Negative case for __check_raw_data() - invalid benchmark result. assert (benchmark._Benchmark__check_result_format() is False) assert (benchmark.return_code == ReturnCode.INVALID_BENCHMARK_RESULT) + + +def test_is_finished(): + """Test interface Benchmark._is_finished().""" + # Only step takes effect, benchmarking finish due to step. 
+ benchmark = create_benchmark('--num_warmup 32 --num_steps 128 --duration 0') + benchmark._preprocess() + end_time = 2 + curr_step = 50 + assert (benchmark._is_finished(curr_step, end_time) is False) + curr_step = 160 + assert (benchmark._is_finished(curr_step, end_time)) + + # Only duration takes effect, benchmarking finish due to duration. + benchmark = create_benchmark('--num_warmup 32 --num_steps 0 --duration 10') + benchmark._preprocess() + benchmark._sub_benchmark_start_time = 0 + curr_step = 50 + end_time = 1 + assert (benchmark._is_finished(curr_step, end_time) is False) + end_time = 10 + assert (benchmark._is_finished(curr_step, end_time)) + + # Both step and duration take effect. + benchmark = create_benchmark('--num_warmup 32 --num_steps 128 --duration 10') + benchmark._preprocess() + # Benchmarking finish due to step. + curr_step = 160 + end_time = 2 + assert (benchmark._is_finished(curr_step, end_time)) + # Benchmarking finish due to duration. + curr_step = 50 + end_time = 10 + assert (benchmark._is_finished(curr_step, end_time)) From 4c0d96e5d8dcd234084dfdaa02ccf647dda8f775 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=CC=B7N=CC=B7?= Date: Thu, 4 May 2023 07:55:42 +0700 Subject: [PATCH 05/33] Docs - Fix typo on kernel_parameters and kernel_modules in system-config (#528) **Description** Kernel_parameters and kernel_modules command and examples are exchanged. --- docs/user-tutorial/system-config.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/user-tutorial/system-config.md b/docs/user-tutorial/system-config.md index 1daef4c7b..dbde728d3 100644 --- a/docs/user-tutorial/system-config.md +++ b/docs/user-tutorial/system-config.md @@ -91,22 +91,22 @@ id: system-config Kernel kernel_modules - sysctl + lsmod list of active kernel modules - "abi.vsyscall32": "1",
- "debug.exception-trace": "1",
+ "Module": "binfmt_misc",
+ "Size": "24576",
+ "Used": "1"
... kernel_parameters - lsmod + sysctl kernel parameters - "Module": "binfmt_misc",
- "Size": "24576",
- "Used": "1"
+ "abi.vsyscall32": "1",
+ "debug.exception-trace": "1",
... From a1cd3c94750631a0ac6a01b93a234500a0d6838f Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Tue, 23 May 2023 17:25:35 +0800 Subject: [PATCH 06/33] Runner - Add signal handler in runner (#530) Add signal handler in runner to gracefully exit when receiving SIGINT (Ctrl+C) or SIGTERM during benchmark execution. --- setup.py | 1 + superbench/runner/ansible.py | 5 +++-- superbench/runner/runner.py | 21 ++++++++++++++++++++- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index b42639eea..30d3d1878 100644 --- a/setup.py +++ b/setup.py @@ -198,6 +198,7 @@ def run(self): 'types-pkg_resources', 'types-pyyaml', 'typing-extensions>=3.10', + 'urllib3<2.0', 'vcrpy>=4.1.1', 'yapf==0.31.0', ], diff --git a/superbench/runner/ansible.py b/superbench/runner/ansible.py index c012edc5c..fc71b7bd6 100644 --- a/superbench/runner/ansible.py +++ b/superbench/runner/ansible.py @@ -59,11 +59,12 @@ def __init__(self, config): self._config['cmdline'] += ' --ask-pass --ask-become-pass' logger.info(self._config) - def run(self, ansible_config, sudo=False): # pragma: no cover + def run(self, ansible_config, cancel_callback=None, sudo=False): # pragma: no cover """Run Ansible runner. Args: ansible_config (dict): Ansible config dict. + cancel_callback (Callable): Ansible runner cancel callback. sudo (bool): Run as sudo or not. Defaults to False. 
Returns: @@ -73,7 +74,7 @@ def run(self, ansible_config, sudo=False): # pragma: no cover logger.info('Run as sudo ...') ansible_config['cmdline'] += ' --become' with tempfile.TemporaryDirectory(prefix='ansible') as tmpdir: - r = ansible_runner.run(private_data_dir=tmpdir, **ansible_config) + r = ansible_runner.run(private_data_dir=tmpdir, cancel_callback=cancel_callback, **ansible_config) logger.debug(r.stats) if r.rc == 0: logger.info('Run succeed, return code {}.'.format(r.rc)) diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index 28b5c7186..d91020bfb 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -4,8 +4,10 @@ """SuperBench Runner.""" import os +import sys import json import random +import signal from pathlib import Path from pprint import pformat from collections import defaultdict @@ -233,6 +235,18 @@ def fetch_results(self): # pragma: no cover ) ) + def __signal_handler(self, signum, frame): + """Signal handler for runner. + + Args: + signum (int): Signal number. + frame (FrameType): Timeout frame. 
+ """ + if signum == signal.SIGINT or signum == signal.SIGTERM: + logger.info('Killed by %s, exiting ...', signal.Signals(signum).name) + self.cleanup() + sys.exit(128 + signum) + def __create_results_summary(self): # pragma: no cover """Create the result summary file of all nodes.""" all_results = list() @@ -438,12 +452,17 @@ def _run_proc(self, benchmark_name, mode, vars): # we do not expect timeout in ansible unless subprocess hangs ansible_runner_config['timeout'] = timeout + 60 - rc = self._ansible_client.run(ansible_runner_config, sudo=(not self._docker_config.skip)) + # overwrite ansible runner's default signal handler with main process's + rc = self._ansible_client.run( + ansible_runner_config, cancel_callback=lambda: None, sudo=(not self._docker_config.skip) + ) return rc def run(self): """Run the SuperBench benchmarks distributedly.""" self.check_env() + signal.signal(signal.SIGINT, self.__signal_handler) + signal.signal(signal.SIGTERM, self.__signal_handler) for benchmark_name in self._sb_benchmarks: if benchmark_name not in self._sb_enabled_benchmarks: continue From f4dab9f7baf00dffdf7d2f27e7f9e76b816ffb47 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Wed, 14 Jun 2023 10:51:45 +0800 Subject: [PATCH 07/33] Update error message in setup (#538) Update error message in setup, require wheel for pip>=23.1. 
--- setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 30d3d1878..af65fc690 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,10 @@ try: pkg_resources.require(['pip>=18', 'setuptools>=45, <66']) except (pkg_resources.VersionConflict, pkg_resources.DistributionNotFound): - print('Try update pip/setuptools versions, for example, python3 -m pip install --upgrade pip setuptools==65.7') + print( + '\033[93mTry update pip/setuptools versions, for example, ' + 'python3 -m pip install --upgrade pip wheel setuptools==65.7\033[0m' + ) raise here = pathlib.Path(__file__).parent.resolve() From e909ddd0caae9b5b5b94a1f74f7cbbe2eab59733 Mon Sep 17 00:00:00 2001 From: guoshzhao Date: Fri, 16 Jun 2023 17:50:09 +0800 Subject: [PATCH 08/33] Benchmarks - Update outdate references (#539) **Description** Update 404 outdate reference links. --- superbench/benchmarks/model_benchmarks/pytorch_bert.py | 6 +++--- superbench/benchmarks/model_benchmarks/pytorch_gpt2.py | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/superbench/benchmarks/model_benchmarks/pytorch_bert.py b/superbench/benchmarks/model_benchmarks/pytorch_bert.py index d43c188b5..d32c586b3 100644 --- a/superbench/benchmarks/model_benchmarks/pytorch_bert.py +++ b/superbench/benchmarks/model_benchmarks/pytorch_bert.py @@ -71,7 +71,7 @@ def __init__(self, name, parameters=''): def add_parser_arguments(self): """Add the BERT-specified arguments. - BERT model reference: https://huggingface.co/transformers/model_doc/bert.html + BERT model reference: https://huggingface.co/docs/transformers/model_doc/bert """ super().add_parser_arguments() @@ -227,7 +227,7 @@ def _inference_step(self, precision): # Register BERT Large benchmark. 
-# Reference: https://huggingface.co/transformers/pretrained_models.html +# Reference: https://huggingface.co/transformers/v3.3.1/pretrained_models.html BenchmarkRegistry.register_benchmark( 'pytorch-bert-large', PytorchBERT, @@ -235,7 +235,7 @@ def _inference_step(self, precision): ) # Register BERT Base benchmark. -# Reference: https://huggingface.co/transformers/pretrained_models.html +# Reference: https://huggingface.co/transformers/v3.3.1/pretrained_models.html BenchmarkRegistry.register_benchmark( 'pytorch-bert-base', PytorchBERT, diff --git a/superbench/benchmarks/model_benchmarks/pytorch_gpt2.py b/superbench/benchmarks/model_benchmarks/pytorch_gpt2.py index 77c8e4145..4ddcb7d6e 100644 --- a/superbench/benchmarks/model_benchmarks/pytorch_gpt2.py +++ b/superbench/benchmarks/model_benchmarks/pytorch_gpt2.py @@ -71,7 +71,7 @@ def __init__(self, name, parameters=''): def add_parser_arguments(self): """Add the GPT2-specified arguments. - GPT2 model reference: https://huggingface.co/transformers/model_doc/gpt2.html + GPT2 model reference: https://huggingface.co/docs/transformers/model_doc/gpt2 """ super().add_parser_arguments() @@ -221,25 +221,25 @@ def _inference_step(self, precision): # Register GPT2 benchmark with 117M parameters. -# Reference: https://huggingface.co/transformers/pretrained_models.html +# Reference: https://huggingface.co/transformers/v3.3.1/pretrained_models.html BenchmarkRegistry.register_benchmark( 'pytorch-gpt2-small', PytorchGPT2, parameters='--hidden_size=768 --num_hidden_layers=12 --num_attention_heads=12' ) # Register GPT2 benchmark with 345M parameters. -# Reference: https://huggingface.co/transformers/pretrained_models.html +# Reference: https://huggingface.co/transformers/v3.3.1/pretrained_models.html BenchmarkRegistry.register_benchmark( 'pytorch-gpt2-medium', PytorchGPT2, parameters='--hidden_size=1024 --num_hidden_layers=24 --num_attention_heads=16' ) # Register GPT2 benchmark with 774M parameters. 
-# Reference: https://huggingface.co/transformers/pretrained_models.html +# Reference: https://huggingface.co/transformers/v3.3.1/pretrained_models.html BenchmarkRegistry.register_benchmark( 'pytorch-gpt2-large', PytorchGPT2, parameters='--hidden_size=1280 --num_hidden_layers=36 --num_attention_heads=20' ) # Register GPT2 benchmark with 1558M parameters. -# Reference: https://huggingface.co/transformers/pretrained_models.html +# Reference: https://huggingface.co/transformers/v3.3.1/pretrained_models.html BenchmarkRegistry.register_benchmark( 'pytorch-gpt2-xl', PytorchGPT2, parameters='--hidden_size=1600 --num_hidden_layers=48 --num_attention_heads=25' ) From bbb0e24342a69df7ed547d8eb3ca630091a4925f Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Wed, 21 Jun 2023 09:58:13 +0800 Subject: [PATCH 09/33] Benchmarks - Add support for DirectX GPU platform (#536) **Description** Add support for DirectX GPU platform. **Major Revision** - Add DirectX platform for benchmark registry - Add gpu_vendor identify for AMD and NVIDIA with win driver --- superbench/benchmarks/context.py | 1 + superbench/common/devices/gpu.py | 4 ++++ superbench/executor/executor.py | 2 ++ 3 files changed, 7 insertions(+) diff --git a/superbench/benchmarks/context.py b/superbench/benchmarks/context.py index cb9a756ed..d1ad2237b 100644 --- a/superbench/benchmarks/context.py +++ b/superbench/benchmarks/context.py @@ -24,6 +24,7 @@ class Platform(Enum): CPU = 'CPU' CUDA = 'CUDA' ROCM = 'ROCm' + DIRECTX = 'DirectX' class Framework(Enum): diff --git a/superbench/common/devices/gpu.py b/superbench/common/devices/gpu.py index 9cbb06a9c..e12889e10 100644 --- a/superbench/common/devices/gpu.py +++ b/superbench/common/devices/gpu.py @@ -29,6 +29,10 @@ def get_vendor(self): if not list(Path('/dev/dri').glob('card*')): logger.warning('Cannot find AMD GPU device.') return 'amd' + if list(Path(r'C:\Windows\System32').glob('*DriverStore/FileRepository/nv*.inf_amd64_*/nvapi64.dll')): + return 'nvidia-graphics' + if 
list(Path(r'C:\Windows\System32').glob('*DriverStore/FileRepository/u*.inf_amd64_*/*/aticfx64.dll')): + return 'amd-graphics' return None @property diff --git a/superbench/executor/executor.py b/superbench/executor/executor.py index f78806981..ca2b78093 100644 --- a/superbench/executor/executor.py +++ b/superbench/executor/executor.py @@ -87,6 +87,8 @@ def __get_platform(self): return Platform.CUDA elif gpu.vendor == 'amd': return Platform.ROCM + elif gpu.vendor == 'amd-graphics' or gpu.vendor == 'nvidia-graphics': + return Platform.DIRECTX except Exception as e: logger.error(e) return Platform.CPU From 44ef531465d555af6a4e72d82b77baf53e77d39b Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Wed, 28 Jun 2023 05:35:11 +0000 Subject: [PATCH 10/33] Dockerfile - Add SuperBench Windows Dockerfile (#534) **Description** Add dockerfile for win10 and building script for directx_benchmarks. **Major Revision** - Add docker file for win10 and required scripts to install the dependency - Add building script to build all directx vs benchmarks - Add call of building script in Makefile --------- Co-authored-by: yukirora Co-authored-by: Yifan Xiong --- .github/workflows/build-win.yml | 46 +++++++++++++++ Makefile | 3 + dockerfile/directx/enable-graphics-apis.py | 69 ++++++++++++++++++++++ dockerfile/directx/install-components.bat | 9 +++ dockerfile/directx/mini_vsconfig.json | 14 +++++ dockerfile/directx12.dockerfile | 65 ++++++++++++++++++++ superbench/benchmarks/build.bat | 18 ++++++ 7 files changed, 224 insertions(+) create mode 100644 .github/workflows/build-win.yml create mode 100644 dockerfile/directx/enable-graphics-apis.py create mode 100644 dockerfile/directx/install-components.bat create mode 100644 dockerfile/directx/mini_vsconfig.json create mode 100644 dockerfile/directx12.dockerfile create mode 100644 superbench/benchmarks/build.bat diff --git a/.github/workflows/build-win.yml b/.github/workflows/build-win.yml new file mode 100644 index 000000000..7226af8c7 --- 
/dev/null +++ b/.github/workflows/build-win.yml @@ -0,0 +1,46 @@ +name: Build on Windows + +on: + push: + branches: + - main + - release/* + pull_request: + branches: + - main + - release/* + +jobs: + docker: + name: Docker build win2004 + runs-on: [self-hosted, windows, x64, win2004] + steps: + - name: Checkout + uses: actions/checkout@v2 + with: + submodules: true + - name: Build Docker image + working-directory: . + shell: pwsh + run: | + docker build ` + --file dockerfile/directx12.dockerfile ` + --label org.opencontainers.image.source=${{ github.event.repository.html_url }} ` + --label org.opencontainers.image.created=${{ github.event.repository.pushed_at }} ` + --label org.opencontainers.image.revision=${{ github.sha }} ` + --platform windows/amd64 ` + --isolation=process ` + --tag $env:TAG . + env: + TAG: superbench/main:win2004 + - name: Push Docker image + if: ${{ github.event_name != 'pull_request' }} + shell: pwsh + run: | + docker login -u $env:USER -p $env:PASS + docker push $env:TAG + docker logout + env: + TAG: superbench/main:win2004 + USER: ${{ secrets.DOCKERHUB_USERNAME }} + PASS: ${{ secrets.DOCKERHUB_TOKEN }} diff --git a/Makefile b/Makefile index 8e43caadd..a7b8f05d1 100644 --- a/Makefile +++ b/Makefile @@ -14,6 +14,9 @@ cppformat: cppbuild: cd ./superbench/benchmarks/ && bash build.sh +directxbuild: + cd ./superbench/benchmarks/ && build.bat + thirdparty: cd ./third_party/ && make all diff --git a/dockerfile/directx/enable-graphics-apis.py b/dockerfile/directx/enable-graphics-apis.py new file mode 100644 index 000000000..7f6d0e3cd --- /dev/null +++ b/dockerfile/directx/enable-graphics-apis.py @@ -0,0 +1,69 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +"""Enables graphics APIs in the Windows container.""" +# Reference to +# https://github.com/EpicGames/UnrealEngine/blob/release/Engine/Extras/Containers/Dockerfiles/windows/runtime/enable-graphics-apis.ps1 + +import os +import shutil +import glob + + +def copy_to_system32(source_directory, filenames, rename=None): + """Copies the specified files from the source directory to the system32 directory.""" + for filename in filenames: + source = os.path.join(source_directory, filename) + destination = os.path.join('C:\\Windows\\System32', filename) + if rename and filename in rename: + renamed = rename[filename] + destination = os.path.join('C:\\Windows\\System32', renamed) + try: + print(f'Copying {source} to {destination}') + shutil.copy2(source, destination) + except Exception as e: + print(f'Warning: failed to copy file {filename}. Reason: {str(e)}') + + +# Attempt to locate the NVIDIA Display Driver directory in the host system's driver store +nvidia_sentinel_file = glob.glob('C:\\Windows\\System32\\HostDriverStore\\FileRepository\\nv*.inf_amd64_*\\nvapi64.dll') +if nvidia_sentinel_file: + nvidia_directory = os.path.dirname(nvidia_sentinel_file[0]) + print(f'Found NVIDIA Display Driver directory: {nvidia_directory}') + + print('\nEnabling NVIDIA NVAPI support:') + copy_to_system32(nvidia_directory, ['nvapi64.dll']) + + print('\nEnabling NVIDIA NVENC support:') + copy_to_system32(nvidia_directory, ['nvEncodeAPI64.dll', 'nvEncMFTH264x.dll', 'nvEncMFThevcx.dll']) + + print('\nEnabling NVIDIA CUVID/NVDEC support:') + copy_to_system32( + nvidia_directory, ['nvcuvid64.dll', 'nvDecMFTMjpeg.dll', 'nvDecMFTMjpegx.dll'], + {'nvcuvid64.dll': 'nvcuvid.dll'} + ) + + print('\nEnabling NVIDIA CUDA support:') + copy_to_system32( + nvidia_directory, ['nvcuda64.dll', 'nvcuda_loader64.dll', 'nvptxJitCompiler64.dll'], + {'nvcuda_loader64.dll': 'nvcuda.dll'} + ) + + print('\n') + +# Attempt to locate the AMD Display Driver directory in the host system's driver store 
+amd_sentinel_file = glob.glob('C:\\Windows\\System32\\HostDriverStore\\FileRepository\\u*.inf_amd64_*\\*\\aticfx64.dll') +if amd_sentinel_file: + amd_directory = os.path.dirname(amd_sentinel_file[0]) + print(f'Found AMD Display Driver directory: {amd_directory}') + + print('\nCopying AMD DirectX driver files:') + copy_to_system32(amd_directory, ['aticfx64.dll', 'atidxx64.dll']) + + print('\nEnabling AMD Display Library (ADL) support:') + copy_to_system32(amd_directory, ['atiadlxx.dll', 'atiadlxy.dll']) + + print('\nEnabling AMD Advanced Media Framework (AMF) support:') + copy_to_system32(amd_directory, ['amfrt64.dll', 'amfrtdrv64.dll', 'amdihk64.dll']) + + print('\n') diff --git a/dockerfile/directx/install-components.bat b/dockerfile/directx/install-components.bat new file mode 100644 index 000000000..95c42380a --- /dev/null +++ b/dockerfile/directx/install-components.bat @@ -0,0 +1,9 @@ +REM Copyright (c) Microsoft Corporation - All rights reserved +REM Licensed under the MIT License + +curl -s -L https://aka.ms/vs/17/release/vs_BuildTools.exe -o "vs_BuildTools.exe" +start /b /wait vs_BuildTools.exe --config %SB_HOME%\dockerfile\directx\mini_vsconfig.json --wait --quiet --norestart --nocache +if %errorlevel% neq 0 ( + exit /b %errorlevel% +) +del "vs_BuildTools.exe" diff --git a/dockerfile/directx/mini_vsconfig.json b/dockerfile/directx/mini_vsconfig.json new file mode 100644 index 000000000..f22b84143 --- /dev/null +++ b/dockerfile/directx/mini_vsconfig.json @@ -0,0 +1,14 @@ +{ + "version": "1.0", + "components": [ + "Microsoft.VisualStudio.Component.Windows10SDK.19041", + "Microsoft.VisualStudio.Workload.VCTools", + "Microsoft.Component.MSBuild", + "Microsoft.VisualStudio.Component.CoreBuildTools", + "Microsoft.VisualStudio.Workload.MSBuildTools", + "Microsoft.VisualStudio.Component.VC.CoreBuildTools", + "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", + "Microsoft.VisualStudio.Component.VC.14.35.17.5.ATL.Spectre", + 
"Microsoft.VisualStudio.Component.VC.14.35.17.5.MFC.Spectre" + ] +} diff --git a/dockerfile/directx12.dockerfile b/dockerfile/directx12.dockerfile new file mode 100644 index 000000000..1a958d69a --- /dev/null +++ b/dockerfile/directx12.dockerfile @@ -0,0 +1,65 @@ +FROM mcr.microsoft.com/windows:2004 + + +# Install Python and additional packages +# Download Python +ADD https://www.python.org/ftp/python/3.9.7/python-3.9.7-amd64.exe python-installer.exe +# Install Python +RUN python-installer.exe /quiet InstallAllUsers=1 PrependPath=1 && DEL python-installer.exe +# Verify Python Was Successfully Installed +RUN python --version && \ + python -m ensurepip --upgrade + +# Install choco and install some necessary packages +RUN powershell -Command "Set-ExecutionPolicy Bypass -Scope Process -Force; \ + [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; \ + iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1'))" +RUN choco install -y vcredist-all vim git make + +# Retrieve the DirectX runtime files required by the Unreal Engine, since even the full Windows base image does not include them +RUN mkdir C:\GatheredDlls +RUN curl -s -L "https://download.microsoft.com/download/8/4/A/84A35BF1-DAFE-4AE8-82AF-AD2AE20B6B14/directx_Jun2010_redist.exe" --output %TEMP%\directx_redist.exe && \ + start /wait %TEMP%\directx_redist.exe /Q /T:%TEMP%\DirectX && \ + expand %TEMP%\DirectX\APR2007_xinput_x64.cab -F:xinput1_3.dll C:\GatheredDlls\ && \ + expand %TEMP%\DirectX\Feb2010_X3DAudio_x64.cab -F:X3DAudio1_7.dll C:\GatheredDlls\ && \ + expand %TEMP%\DirectX\Jun2010_D3DCompiler_43_x64.cab -F:D3DCompiler_43.dll C:\GatheredDlls\ && \ + expand %TEMP%\DirectX\Jun2010_XAudio_x64.cab -F:XAudio2_7.dll C:\GatheredDlls\ && \ + expand %TEMP%\DirectX\Jun2010_XAudio_x64.cab -F:XAPOFX1_5.dll C:\GatheredDlls\ && \ + break + +# Retrieve the DirectX shader compiler files needed for DirectX Raytracing (DXR) 
+RUN curl -s -L "https://github.com/microsoft/DirectXShaderCompiler/releases/download/v1.6.2104/dxc_2021_04-20.zip" --output %TEMP%\dxc.zip && \ + powershell -Command "Expand-Archive -Path \"$env:TEMP\dxc.zip\" -DestinationPath $env:TEMP" && \ + xcopy /y %TEMP%\bin\x64\dxcompiler.dll C:\GatheredDlls\ && \ + xcopy /y %TEMP%\bin\x64\dxil.dll C:\GatheredDlls\ && \ + break + +# Copy the required DLLs to System32 dir +RUN xcopy C:\GatheredDlls\* C:\windows\System32\ /i + +ENV SB_HOME="C:/superbench" \ + SB_MICRO_PATH="C:/superbench" \ + WindowsSDKDir="\\Program Files (x86)\\Windows Kits\\10\\" + +RUN setx INCLUDE "%include%;%WindowsSDKDir%\\Include" /M && \ + setx LIB "%lib%;%WindowsSDKDir%\\Lib" /M && \ + setx PATH "%path%;%SB_MICRO_PATH%\\bin" /M + +WORKDIR ${SB_HOME} +COPY ./ ${SB_HOME} + +# Download vs_BuildTools.exe if not already present +RUN mkdir "%SB_MICRO_PATH%/bin" +RUN curl -s -L https://dist.nuget.org/win-x86-commandline/latest/nuget.exe -o "%SB_MICRO_PATH%/bin/nuget.exe" +# Run the setup script to install the visual studio components +RUN "%SB_HOME%\\dockerfile\\directx\\install-components.bat" + +# Install Superbench +RUN python -m pip install setuptools==65.0.0 && \ + python -m pip install --no-cache-dir .[amdworker] && \ + make directxbuild + +# Run the entrypoint script for enabling vendor-specific graphics APIs +RUN powershell -Command "Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope LocalMachine -Force" +CMD [ "python", "dockerfile/directx/enable-graphics-apis.py" ] +ENTRYPOINT [ "cmd.exe" ] diff --git a/superbench/benchmarks/build.bat b/superbench/benchmarks/build.bat new file mode 100644 index 000000000..8639e1771 --- /dev/null +++ b/superbench/benchmarks/build.bat @@ -0,0 +1,18 @@ +@echo off +REM Copyright (c) Microsoft Corporation - All rights reserved +REM Licensed under the MIT License + + +SETLOCAL EnableDelayedExpansion + +for /r %%F in (*.vcxproj) do ( + echo Found .vcxproj file: %%~dpF%%~nxF + SET "PROJ_PATH=%%~dpF%%~nxF" + SET 
"MSBUILD=C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Current\Bin\MSBuild.exe" + REM Download dependencies + "!MSBUILD!" "!PROJ_PATH!" -t:restore -p:RestorePackagesConfig=true + REM Build project + "!MSBUILD!" "!PROJ_PATH!" /p:Configuration=Release /p:AdditionalLibraryDirectories="%WindowsSDKDir%\Lib" /p:AdditionalIncludeDirectories="%WindowsSDKDir%\Include" /p:OutDir="%SB_MICRO_PATH%\bin" +) + +endlocal From 3a6622f7d3cf09530523fde077fc7154c8510bc7 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Thu, 29 Jun 2023 02:06:14 +0000 Subject: [PATCH 11/33] Benchmarks: Add benchmark - Add source code of DirectXGPUCoreFLOPs microbenchmark (#488) **Description** Add source code of DirectXGPUCoreFLOPs microbenchmark. --------- Co-authored-by: v-junlinlv --- .gitignore | 73 + .../BenchmarkOptions.h | 68 + .../GPUCore.cpp | 507 +++ .../directx_gemm_flops_performance/GPUCore.h | 151 + .../GPUCore.vcxproj | 110 + .../directx_gemm_flops_performance/Main.cpp | 11 + .../packages.config | 4 + .../directx_third_party/DXSampleHelper.h | 275 ++ .../directx_third_party/d3dx12.h | 3258 +++++++++++++++++ .../directx_utils/D3D12Timer.cpp | 80 + .../directx_utils/D3D12Timer.h | 54 + .../micro_benchmarks/directx_utils/Options.h | 113 + 12 files changed, 4704 insertions(+) create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.h create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/Main.cpp create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/packages.config create mode 100644 
superbench/benchmarks/micro_benchmarks/directx_third_party/DXSampleHelper.h create mode 100644 superbench/benchmarks/micro_benchmarks/directx_third_party/d3dx12.h create mode 100644 superbench/benchmarks/micro_benchmarks/directx_utils/D3D12Timer.cpp create mode 100644 superbench/benchmarks/micro_benchmarks/directx_utils/D3D12Timer.h create mode 100644 superbench/benchmarks/micro_benchmarks/directx_utils/Options.h diff --git a/.gitignore b/.gitignore index 8872e5df4..e1ab18ca4 100644 --- a/.gitignore +++ b/.gitignore @@ -141,3 +141,76 @@ dmypy.json # Cython debug symbols cython_debug/ + +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. +## +## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore + +# User-specific files +*.rsuser +*.suo +*.user +*.userosscache +*.sln.docstates + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +[Ww][Ii][Nn]32/ +[Aa][Rr][Mm]/ +[Aa][Rr][Mm]64/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ +[Ll]ogs/ + +# Visual Studio 2015/2017 cache/options directory +.vs/ + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NuGet Packages +*.nupkg +# NuGet Symbol Packages +*.snupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. 
+!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!?*.[Cc]ache/ diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h new file mode 100644 index 000000000..c5207bb4f --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h @@ -0,0 +1,68 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#pragma once + +#include "../directx_utils/Options.h" + +namespace Option { +enum Precision { + F16, + F32, +}; +using PrecisionType = Option::Precision; +} // namespace Option + +class BenchmarkOptions : public Options { + public: + // Number of warm up rounds to run. + int num_warm_up = 0; + // The number of benchmark runs. + int num_loops = 0; + // Dimension m of GEMM. + int m = 0; + // Dimension n of GEMM. + int n = 0; + // Dimension k of GEMM. + int k = 0; + // The precision of calculate. + Option::PrecisionType mode_precision = Option::F32; + + /** + * @brief Construct a new GPUCoreOptions object. + */ + BenchmarkOptions(int argc, char *argv[]) : Options(argc, argv) {} + + /** + * @brief Parse the arguments. 
+ */ + virtual void parse_arguments() { + + num_loops = get_cmd_line_argument_int("--num_loops", 10); + num_warm_up = get_cmd_line_argument_int("--num_loops", 0); + m = get_cmd_line_argument_int("--m", 16 * 256); + n = get_cmd_line_argument_int("--n", 16 * 256); + k = get_cmd_line_argument_int("--k", 16 * 256); + if (get_cmd_line_argument_bool("--f16")) { + mode_precision = Option::F16; + } + if (get_cmd_line_argument_bool("--f32")) { + mode_precision = Option::F32; + } + } + + /** + * @brief Get the option usage. + */ + void get_option_usage() override { + std::cout << "Usage: " << std::endl; + std::cout << " --help: Print help message." << std::endl; + std::cout << " --num_loops: The number of benchmark runs." << std::endl; + std::cout << " --num_warm_up: The number of warmup runs." << std::endl; + std::cout << " --m: m dimension of GEMM." << std::endl; + std::cout << " --n: n dimension of GEMM." << std::endl; + std::cout << " --k: l dimension of GEMM." << std::endl; + std::cout << " --fp16: half precision to compute." << std::endl; + std::cout << " --fp32: float precision to compute." << std::endl; + } +}; diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp new file mode 100644 index 000000000..206c49f90 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp @@ -0,0 +1,507 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// GPUCore.cpp : This file contains the 'main' function. Program execution begins and ends there. +#include +#include +#include +#include +#include + +#include + +#include "GPUCore.h" + +/** + * @brief Setup GPU and start benchmark. + */ +void GPUCore::Run() { + int m = opts->m; + int n = opts->n; + int k = opts->k; + + // Setup GPU objects like device and command list. 
+ CreatePipeline(); + + int loops = opts->num_loops; + std::cout << "GPUCoreFLOPs" << std::endl; + + switch (opts->mode_precision) { + case Option::F32: { + // Prepare input and output data and buffers. + PrepareData(opts->m, opts->n, opts->k); + // Setup pipeline and compile operator. + SetupAndCompileOp(opts->m, opts->n, opts->k, DML_TENSOR_DATA_TYPE_FLOAT32); + InitializeOp(opts->m, opts->n, opts->k); + for (int i = 0; i < opts->num_warm_up; ++i) { + ExecuteComputeOp(); + } + for (int i = 0; i < loops; ++i) { + gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute); + // Do FLOPs job. + double timeInMs = ExecuteComputeOp(); + auto flops = (int64_t(m) * n * k + m * n) * 2 * 1e-9 / timeInMs; + std::cout << flops << " TFLOPs" << std::endl; +#if defined _PRINT_RESULT + PrintResultForDebug(m, n); +#endif + } + } break; + case Option::F16: { + PrepareData(opts->m, opts->n, opts->k); + SetupAndCompileOp(opts->m, opts->n, opts->k, DML_TENSOR_DATA_TYPE_FLOAT16); + InitializeOp(opts->m, opts->n, opts->k); + for (int i = 0; i < opts->num_warm_up; ++i) { + ExecuteComputeOp(); + } + for (int i = 0; i < loops; ++i) { + gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute); + // Do FLOPs job. + double timeInMs = ExecuteComputeOp(); + auto flops = (int64_t(m) * n * k + m * n) * 2 * 1e-9 / timeInMs; + std::cout << flops << " TFLOPs" << std::endl; +#if defined _PRINT_RESULT + PrintResultForDebug(m, n); +#endif + } + } break; + default: + std::cout << "Error: Unsupported precision mode." << std::endl; + break; + } +} + +/** + * @brief Create pipeline including + * create device object, command list, command queue + * and synchronization objects. + */ +void GPUCore::CreatePipeline() { + UINT dxgiFactoryFlags = 0; + +#if defined(_DEBUG) + // Enable the debug layer (requires the Graphics Tools "optional feature"). + // NOTE: Enabling the debug layer after device creation will invalidate the active device. 
+ { + ComPtr debugController; + if (SUCCEEDED(D3D12GetDebugInterface(IID_PPV_ARGS(&debugController)))) { + debugController->EnableDebugLayer(); + + // Enable additional debug layers. + dxgiFactoryFlags |= DXGI_CREATE_FACTORY_DEBUG; + } + } +#endif + + ComPtr factory; + ThrowIfFailed(CreateDXGIFactory2(dxgiFactoryFlags, IID_PPV_ARGS(&factory))); + + ComPtr hardwareAdapter; + GetHardwareAdapter(factory.Get(), &hardwareAdapter); + + // Create GPU device object. + ThrowIfFailed(D3D12CreateDevice(hardwareAdapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&m_device))); + + DML_CREATE_DEVICE_FLAGS dmlCreateDeviceFlags = DML_CREATE_DEVICE_FLAG_NONE; + +#if defined(_DEBUG) + // If the project is in a debug build, then enable the Direct3D 12 debug layer. + // This is optional (starting in DML_FEATURE_LEVEL_5_2) but strongly recommended! + + // If the project is in a debug build, then enable debugging via DirectML debug layers with this flag. + dmlCreateDeviceFlags |= DML_CREATE_DEVICE_FLAG_DEBUG; +#endif + + ThrowIfFailed(DMLCreateDevice(m_device.Get(), dmlCreateDeviceFlags, IID_PPV_ARGS(m_dmlDevice.GetAddressOf()))); + + D3D12_COMMAND_QUEUE_DESC queueDesc; + // Initialize command queue. + ZeroMemory(&queueDesc, sizeof(queueDesc)); + + // Describe and create the command queue. + queueDesc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE; + queueDesc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; + + D3D12_COMMAND_QUEUE_DESC cqd3 = {}; + cqd3.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; + m_device->CreateCommandQueue(&cqd3, IID_PPV_ARGS(&m_commandQueue)); + + m_device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&m_commandAllocator)); + + m_device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, m_commandAllocator.Get(), nullptr, + IID_PPV_ARGS(&m_commandList)); + + // Create fence. + ThrowIfFailed(m_device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&m_fence))); + m_currentFence = 1; + // Create an event handle to use for GPU synchronization. 
+ m_eventHandle = CreateEvent(0, false, false, 0); +} + +/** + * @brief Calculates the minimum number of bytes required to store a buffer tensor with the specified type, sizes, and + strides. The formula can be expressed as the following: + + IndexOfLastElement = dot(Sizes - 1, Strides); + MinimumImpliedSizeInBytes = roundup((IndexOfLastElement + 1) * ElementSizeInBytes, 4) + + In other words, the minimum size of a tensor is the index of the one-past-the-end element, multiplied by the + element size (e.g. 2 bytes for a FLOAT16 tensor). Additionally DirectML requires that all buffers bound must have + a total size which is DWORD-aligned, and hence the minimum implied size in bytes must be rounded up to the nearest + 4-byte boundary. + + Refer to DirectMLX.h (https://github.com/microsoft/DirectML/blob/master/Libraries/DirectMLX.h). + */ +inline UINT64 DMLCalcBufferTensorSize(DML_TENSOR_DATA_TYPE dataType, UINT tensorElementCount) { + UINT elementSizeInBytes = 0; + switch (dataType) { + case DML_TENSOR_DATA_TYPE_FLOAT32: + elementSizeInBytes = 4; + break; + case DML_TENSOR_DATA_TYPE_FLOAT16: + elementSizeInBytes = 2; + break; + default: + return 0; // Invalid data type + } + UINT64 minimumImpliedSizeInBytes = 0; + // Widen to 64-bit before multiplying so that large tensors do not wrap around in 32-bit arithmetic. + // Align size to 4 bytes in memory: round up to the nearest multiple of 4 bytes. + minimumImpliedSizeInBytes = (UINT64(tensorElementCount) * elementSizeInBytes + 3) & ~3ull; + return minimumImpliedSizeInBytes; +} + +/** + * @brief Create and initialize DML_TENSOR_DESC.
+ */ +std::unique_ptr GPUCore::CreateTensorDesc(DML_TENSOR_DATA_TYPE dataType, UINT *tensorSizes, + int dimensionCount) { + std::unique_ptr tensorDesc = std::make_unique(); + std::unique_ptr bufferDesc = std::make_unique(); + + // Initialize tensorDesc + tensorDesc->Type = DML_TENSOR_TYPE_BUFFER; + + // Initialize bufferDesc + UINT tensorElementCount = tensorSizes[0] * tensorSizes[1] * tensorSizes[2] * tensorSizes[3]; + bufferDesc->DataType = dataType; + bufferDesc->Flags = DML_TENSOR_FLAG_NONE; + bufferDesc->DimensionCount = dimensionCount; + bufferDesc->Sizes = tensorSizes; + bufferDesc->Strides = nullptr; + bufferDesc->TotalTensorSizeInBytes = DMLCalcBufferTensorSize(dataType, tensorElementCount); + + // Assign bufferDesc to tensorDesc + tensorDesc->Desc = bufferDesc.release(); + + return tensorDesc; +} + +/** + * @brief Setup and compile DirectML operator. + */ +void GPUCore::SetupAndCompileOp(int m, int n, int k, DML_TENSOR_DATA_TYPE dataType) { + // Create DirectML operator(s). Operators represent abstract functions such as "multiply", "reduce", + // "convolution", or even compound operations such as recurrent neural nets. This example creates an instance of + // the Identity operator, which applies the function f(x) = x for all elements in a tensor. 
+ std::unique_ptr dmlGEMMOperatorDesc = std::make_unique(); + + UINT tensorSizesA[4] = {1, 1, static_cast(m), static_cast(k)}; + std::unique_ptr dmlTensorDescA = CreateTensorDesc(dataType, tensorSizesA, ARRAYSIZE(tensorSizesA)); + dmlGEMMOperatorDesc->ATensor = dmlTensorDescA.release(); + UINT tensorSizesB[4] = {1, 1, static_cast(k), static_cast(n)}; + std::unique_ptr dmlTensorDescB = CreateTensorDesc(dataType, tensorSizesB, ARRAYSIZE(tensorSizesB)); + dmlGEMMOperatorDesc->BTensor = dmlTensorDescB.release(); + UINT tensorSizes[4] = {1, 1, static_cast(m), static_cast(n)}; + std::unique_ptr dmlTensorDescC = CreateTensorDesc(dataType, tensorSizes, ARRAYSIZE(tensorSizes)); + dmlGEMMOperatorDesc->OutputTensor = dmlTensorDescC.release(); + + dmlGEMMOperatorDesc->CTensor = nullptr; + dmlGEMMOperatorDesc->TransA = DML_MATRIX_TRANSFORM_NONE; + dmlGEMMOperatorDesc->TransB = DML_MATRIX_TRANSFORM_NONE; + dmlGEMMOperatorDesc->Alpha = 1.0f; + dmlGEMMOperatorDesc->Beta = 0.0f; + + std::unique_ptr dmlOperatorDesc = std::make_unique(); + + dmlOperatorDesc->Type = DML_OPERATOR_GEMM; + dmlOperatorDesc->Desc = dmlGEMMOperatorDesc.release(); + ComPtr dmlOperator; + ThrowIfFailed(m_dmlDevice->CreateOperator(dmlOperatorDesc.release(), IID_PPV_ARGS(dmlOperator.GetAddressOf()))); + + ThrowIfFailed(m_dmlDevice->CompileOperator(dmlOperator.Get(), DML_EXECUTION_FLAG_NONE, + IID_PPV_ARGS(m_dmlCompiledOperator.GetAddressOf()))); +} + +/** + * @brief Prepare input and output data and buffers of the tensor elements.. + */ +template void GPUCore::PrepareData(const int m, const int n, const int k) { + // Define the tensors. + std::vector dataA(m * k); + std::vector dataB(n * k); + + // Prepare input data. + std::fill(dataA.begin(), dataA.end(), 1); + std::fill(dataB.begin(), dataB.end(), 1); + + UINT64 byteSize = m * k * sizeof(T); + + // Setup input buffer A and upload input data. 
+ m_inputBufferA = + CreateDefaultBuffer(m_device.Get(), m_commandList.Get(), dataA.data(), byteSize, m_inputUploadBufferA); + + byteSize = n * k * sizeof(T); + // Setup input buffer B and upload input data. + m_inputBufferB = + CreateDefaultBuffer(m_device.Get(), m_commandList.Get(), dataB.data(), byteSize, m_inputUploadBufferB); + + byteSize = m * n * sizeof(T); + // Create output buffer. + ThrowIfFailed(m_device->CreateCommittedResource( + get_rvalue_ptr(CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT)), D3D12_HEAP_FLAG_NONE, + get_rvalue_ptr(CD3DX12_RESOURCE_DESC::Buffer(byteSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS)), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, nullptr, IID_PPV_ARGS(&m_outputBuffer))); + + // Create readback buffer. + ThrowIfFailed( + m_device->CreateCommittedResource(get_rvalue_ptr(CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_READBACK)), + D3D12_HEAP_FLAG_NONE, get_rvalue_ptr(CD3DX12_RESOURCE_DESC::Buffer(byteSize)), + D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&m_readBackBuffer))); + CloseExecuteResetWait(); +} + +/** + * @brief Initialize DirectML operator. + */ +template void GPUCore::InitializeOp(int m, int n, int k) { + ComPtr dmlOperatorInitializer; + + IDMLCompiledOperator *dmlCompiledOperators[] = {m_dmlCompiledOperator.Get()}; + ThrowIfFailed(m_dmlDevice->CreateOperatorInitializer(ARRAYSIZE(dmlCompiledOperators), dmlCompiledOperators, + IID_PPV_ARGS(dmlOperatorInitializer.GetAddressOf()))); + + // Query the operator for the required size (in descriptors) of its binding table. + // You need to initialize an operator exactly once before it can be executed, and + // the two stages require different numbers of descriptors for binding. For simplicity, + // we create a single descriptor heap that's large enough to satisfy them both. 
+ DML_BINDING_PROPERTIES initializeBindingProperties = dmlOperatorInitializer->GetBindingProperties(); + DML_BINDING_PROPERTIES executeBindingProperties = m_dmlCompiledOperator->GetBindingProperties(); + UINT descriptorCount = + initializeBindingProperties.RequiredDescriptorCount > executeBindingProperties.RequiredDescriptorCount + ? initializeBindingProperties.RequiredDescriptorCount + : executeBindingProperties.RequiredDescriptorCount; + + // Create descriptor heaps. + std::unique_ptr descriptorHeapDesc = std::make_unique(); + descriptorHeapDesc->Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; + descriptorHeapDesc->NumDescriptors = descriptorCount; + descriptorHeapDesc->Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE; + ThrowIfFailed(m_device->CreateDescriptorHeap(descriptorHeapDesc.release(), _uuidof(m_descriptorHeap), + (void **)m_descriptorHeap.GetAddressOf())); + + // Set the descriptor heap(s). + ID3D12DescriptorHeap *d3D12DescriptorHeaps[] = {m_descriptorHeap.Get()}; + m_commandList->SetDescriptorHeaps(ARRAYSIZE(d3D12DescriptorHeaps), d3D12DescriptorHeaps); + + // Create a binding table over the descriptor heap we just created + std::unique_ptr dmlBindingTableDesc = std::make_unique(); + dmlBindingTableDesc->CPUDescriptorHandle = m_descriptorHeap->GetCPUDescriptorHandleForHeapStart(); + dmlBindingTableDesc->GPUDescriptorHandle = m_descriptorHeap->GetGPUDescriptorHandleForHeapStart(); + dmlBindingTableDesc->Dispatchable = dmlOperatorInitializer.Get(); + dmlBindingTableDesc->SizeInDescriptors = descriptorCount; + ThrowIfFailed( + m_dmlDevice->CreateBindingTable(dmlBindingTableDesc.get(), IID_PPV_ARGS(m_bindingTable.GetAddressOf()))); + + // Create the temporary and persistent resources that are necessary for executing an operator. + // The temporary resource is scratch memory (used internally by DirectML), whose contents you don't need to define. + // The persistent resource is long-lived, and you need to initialize it using the IDMLOperatorInitializer. 
+ UINT64 temporaryResourceSize = + max(initializeBindingProperties.TemporaryResourceSize, executeBindingProperties.TemporaryResourceSize); + UINT64 persistentResourceSize = executeBindingProperties.PersistentResourceSize; + + // Bind and initialize the operator on the GPU. + ComPtr temporaryBuffer; + if (temporaryResourceSize != 0) { + ThrowIfFailed(m_device->CreateCommittedResource( + get_rvalue_ptr(CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT)), D3D12_HEAP_FLAG_NONE, + get_rvalue_ptr( + CD3DX12_RESOURCE_DESC::Buffer(temporaryResourceSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS)), + D3D12_RESOURCE_STATE_COMMON, nullptr, IID_PPV_ARGS(temporaryBuffer.GetAddressOf()))); + + if (initializeBindingProperties.TemporaryResourceSize != 0) { + DML_BUFFER_BINDING bufferBinding{temporaryBuffer.Get(), 0, temporaryResourceSize}; + DML_BINDING_DESC bindingDesc{DML_BINDING_TYPE_BUFFER, &bufferBinding}; + m_bindingTable->BindTemporaryResource(&bindingDesc); + } + } + + ComPtr persistentBuffer; + if (persistentResourceSize != 0) { + ThrowIfFailed(m_device->CreateCommittedResource( + get_rvalue_ptr(CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT)), D3D12_HEAP_FLAG_NONE, + get_rvalue_ptr(CD3DX12_RESOURCE_DESC::Buffer(persistentResourceSize)), D3D12_RESOURCE_STATE_COMMON, nullptr, + IID_PPV_ARGS(persistentBuffer.GetAddressOf()))); + + // The persistent resource should be bound as the output to the IDMLOperatorInitializer. + DML_BUFFER_BINDING bufferBinding{persistentBuffer.Get(), 0, persistentResourceSize}; + DML_BINDING_DESC bindingDesc{DML_BINDING_TYPE_BUFFER, &bufferBinding}; + m_bindingTable->BindOutputs(1, &bindingDesc); + } + + ThrowIfFailed(m_dmlDevice->CreateCommandRecorder(IID_PPV_ARGS(&m_dmlCommandRecorder))); + + // Record execution of the operator initializer. + m_dmlCommandRecorder->RecordDispatch(m_commandList.Get(), dmlOperatorInitializer.Get(), m_bindingTable.Get()); + CloseExecuteResetWait(); + + // Bind and execute the operator on the GPU. 
+ m_commandList->SetDescriptorHeaps(ARRAYSIZE(d3D12DescriptorHeaps), d3D12DescriptorHeaps); + + // Reset the binding table to bind for the operator we want to execute (it was previously used to bind for the + // initializer). + dmlBindingTableDesc->Dispatchable = m_dmlCompiledOperator.Get(); + + ThrowIfFailed(m_bindingTable->Reset(dmlBindingTableDesc.get())); + + if (temporaryResourceSize != 0) { + DML_BUFFER_BINDING bufferBinding{temporaryBuffer.Get(), 0, temporaryResourceSize}; + DML_BINDING_DESC bindingDesc{DML_BINDING_TYPE_BUFFER, &bufferBinding}; + m_bindingTable->BindTemporaryResource(&bindingDesc); + } + + if (persistentResourceSize != 0) { + DML_BUFFER_BINDING bufferBinding{persistentBuffer.Get(), 0, persistentResourceSize}; + DML_BINDING_DESC bindingDesc{DML_BINDING_TYPE_BUFFER, &bufferBinding}; + m_bindingTable->BindPersistentResource(&bindingDesc); + } + + CloseExecuteResetWait(); + + DML_BUFFER_BINDING inputBufferBindingA{m_inputBufferA.Get(), 0, sizeof(T) * m * k}; + DML_BINDING_DESC inputBindingDescA{DML_BINDING_TYPE_BUFFER, &inputBufferBindingA}; + + DML_BUFFER_BINDING inputBufferBindingB{m_inputBufferB.Get(), 0, sizeof(T) * n * k}; + DML_BINDING_DESC inputBindingDescB{DML_BINDING_TYPE_BUFFER, &inputBufferBindingB}; + + DML_BUFFER_BINDING bufferBinding = {nullptr, 0, 0}; + DML_BINDING_DESC inputBindingDesc{DML_BINDING_TYPE_NONE, &bufferBinding}; + + std::array inputBindings = {inputBindingDescA, inputBindingDescB, inputBindingDesc}; + m_bindingTable->BindInputs(3, inputBindings.data()); + + DML_BUFFER_BINDING outputBufferBinding{m_outputBuffer.Get(), 0, sizeof(T) * n * m}; + DML_BINDING_DESC outputBindingDesc{DML_BINDING_TYPE_BUFFER, &outputBufferBinding}; + + m_bindingTable->BindOutputs(1, &outputBindingDesc); +} + +#if defined _PRINT_RESULT +/** + * @brief Print the result of the benchmark for debug. 
+ */ +template void GPUCore::PrintResultForDebug(int m, int n) { + // The output buffer now contains the result of the identity operator, + // so read it back if you want the CPU to access it. + m_commandList->ResourceBarrier( + 1, get_rvalue_ptr(CD3DX12_RESOURCE_BARRIER::Transition( + m_outputBuffer.Get(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE))); + + m_commandList->CopyResource(m_readBackBuffer.Get(), m_outputBuffer.Get()); + + CloseExecuteResetWait(); + D3D12_RANGE tensorBufferRange{0, static_cast(sizeof(T) * n * m)}; + T *outputBufferData{}; + ThrowIfFailed(m_readBackBuffer->Map(0, &tensorBufferRange, reinterpret_cast(&outputBufferData))); + std::string outputString = "output tensor: "; + for (size_t tensorElementIndex{0}; tensorElementIndex < static_cast(m * n); + ++tensorElementIndex, ++outputBufferData) { + outputString += std::to_string(*outputBufferData) + ' '; + } + + std::cout << outputString << std::endl; + D3D12_RANGE emptyRange{0, 0}; + m_readBackBuffer->Unmap(0, &emptyRange); +} +#endif + +/** + * @brief Execute the computation GEMM op. + * @return the elapsed time in ms. + */ +double GPUCore::ExecuteComputeOp() { + + // Execute the compiled GEMM operator and record the GPU time. + this->gpuTimer.start(m_commandList.Get(), 0); + m_dmlCommandRecorder->RecordDispatch(m_commandList.Get(), m_dmlCompiledOperator.Get(), m_bindingTable.Get()); + this->gpuTimer.stop(m_commandList.Get(), 0); + this->gpuTimer.resolveQueryToCPU(m_commandList.Get(), 0); + CloseExecuteResetWait(); + double timeInMs = this->gpuTimer.getElapsedMsByTimestampPair(0); + return timeInMs; +} + +/** + * @brief Close and execute command list, wait until command completed. + */ +void GPUCore::CloseExecuteResetWait(DWORD dwMilliseconds) { + m_commandList->Close(); + ID3D12CommandList *commandLists[] = {m_commandList.Get()}; + m_commandQueue->ExecuteCommandLists(ARRAYSIZE(commandLists), commandLists); + // Wait until command completed. 
+ // Signal and increment the fence value. + const UINT64 fenceL = m_currentFence; + m_commandQueue->Signal(m_fence.Get(), fenceL); + m_currentFence++; + + // Wait until command queue is done. + if (m_fence->GetCompletedValue() < fenceL) { + m_fence->SetEventOnCompletion(fenceL, m_eventHandle); + WaitForSingleObject(m_eventHandle, dwMilliseconds); + } + ThrowIfFailed(m_commandAllocator->Reset()); + ThrowIfFailed(m_commandList->Reset(m_commandAllocator.Get(), nullptr)); +} + +/** + * @brief Create a default buffer and upload data with the upload buffer. + * @param device the GPU device object. + * @param cmdList the GPU command list object. + * @param initData the data that need to upload. + * @param byteSize the size of data that need to upload. + * @param uploadBuffer the upload that use for upload data. + * @return a default buffer object. + */ +Microsoft::WRL::ComPtr +GPUCore::CreateDefaultBuffer(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, const void *initData, + UINT64 byteSize, Microsoft::WRL::ComPtr &uploadBuffer) { + ComPtr defaultBuffer; + + // Create the default buffer on GPU side. + CD3DX12_HEAP_PROPERTIES DefaultHeap(D3D12_HEAP_TYPE_DEFAULT); + CD3DX12_RESOURCE_DESC defaultResourceDesc = + CD3DX12_RESOURCE_DESC::Buffer(byteSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + ThrowIfFailed(device->CreateCommittedResource(&DefaultHeap, D3D12_HEAP_FLAG_NONE, &defaultResourceDesc, + D3D12_RESOURCE_STATE_COPY_DEST, nullptr, + IID_PPV_ARGS(defaultBuffer.GetAddressOf()))); + + // Create upload buffer to upload data. + CD3DX12_HEAP_PROPERTIES UploadHeap(D3D12_HEAP_TYPE_UPLOAD); + CD3DX12_RESOURCE_DESC UploadResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(byteSize); + ThrowIfFailed(device->CreateCommittedResource(&UploadHeap, D3D12_HEAP_FLAG_NONE, &UploadResourceDesc, + D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, + IID_PPV_ARGS(uploadBuffer.GetAddressOf()))); + + // Upload data to GPU side. 
+ D3D12_SUBRESOURCE_DATA subResourceData = {}; + subResourceData.pData = initData; + subResourceData.RowPitch = byteSize; + subResourceData.SlicePitch = subResourceData.RowPitch; + + UpdateSubresources<1>(cmdList, defaultBuffer.Get(), uploadBuffer.Get(), 0, 0, 1, &subResourceData); + CD3DX12_RESOURCE_BARRIER ReadBarrier = CD3DX12_RESOURCE_BARRIER::Transition( + defaultBuffer.Get(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_GENERIC_READ); + cmdList->ResourceBarrier(1, &ReadBarrier); + + return defaultBuffer; +} diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.h b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.h new file mode 100644 index 000000000..e619a75ea --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.h @@ -0,0 +1,151 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#pragma once + +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers. +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// linker +#pragma comment(lib, "dxguid.lib") +#pragma comment(lib, "dxgi.lib") +#pragma comment(lib, "D3D12.lib") +#pragma comment(lib, "d3dcompiler.lib") + +#if defined(_DEBUG) +#include +#endif + +#include "../directx_third_party/DXSampleHelper.h" +#include "../directx_third_party/d3dx12.h" +#include "../directx_utils/D3D12Timer.h" +#include "BenchmarkOptions.h" + +using namespace std; +using namespace DirectX; +// Note that while ComPtr is used to manage the lifetime of resources on the CPU, +// it has no understanding of the lifetime of resources on the GPU. Apps must account +// for the GPU lifetime of resources to avoid destroying objects that may still be +// referenced by the GPU. +// An example of this can be found in the class method: OnDestroy(). 
+using Microsoft::WRL::ComPtr; + +template T *get_rvalue_ptr(T &&v) { return &v; } + +class GPUCore { + public: + GPUCore(BenchmarkOptions *opts) : opts(opts) {} + ~GPUCore() {} + + /** + * @brief Setup GPU and start benchmark. + */ + void Run(); + + /** + * @brief Create pipeline including + * create device object, command list, command queue + * and synchronization objects. + */ + void CreatePipeline(); + + /** + * @brief Prepare input and output data and buffers of the tensor elements.. + */ + template void PrepareData(const int m, const int n, const int k); + + /** + * @brief Create and initialize DML_TENSOR_DESC. + */ + std::unique_ptr CreateTensorDesc(DML_TENSOR_DATA_TYPE dataType, UINT *tensorSizes, + int dimensionCount); + + /** + * @brief Setup and compile DirectML operator. + */ + void SetupAndCompileOp(int m, int n, int k, DML_TENSOR_DATA_TYPE dataType); + + /** + * @brief Initialize DirectML operator. + */ + template void InitializeOp(int m, int n, int k); + + /** + * @brief Execute the computation GEMM op. + * @return the elapsed time in ms. + */ + double ExecuteComputeOp(); + + /** + * @brief Close and execute command list, wait until command completed. + */ + void CloseExecuteResetWait(DWORD dwMilliseconds = 300000); + +#if defined _PRINT_RESULT + /** + * @brief Print the result of the benchmark for debug. + */ + template void PrintResultForDebug(int m, int n); +#endif + + /** + * @brief Create a default buffer and upload data with the upload buffer. + * @param device the GPU device object. + * @param cmdList the GPU command list object. + * @param initData the data that need to upload. + * @param byteSize the size of data that need to upload. + * @param UploadBuffer the upload that use for upload data. + * @return a default buffer object. 
+ */ + Microsoft::WRL::ComPtr CreateDefaultBuffer(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, + const void *initData, UINT64 byteSize, + Microsoft::WRL::ComPtr &UploadBuffer); + + private: + // Pipeline objects. + ComPtr m_device = nullptr; + ComPtr m_commandAllocator = nullptr; + ComPtr m_commandQueue = nullptr; + ComPtr m_commandList = nullptr; + ComPtr m_dmlDevice = nullptr; + ComPtr m_dmlCommandRecorder = nullptr; + ComPtr m_dmlCompiledOperator = nullptr; + ComPtr m_bindingTable = nullptr; + ComPtr m_descriptorHeap = nullptr; + + // Input buffer to pass data into GPU. + ComPtr m_inputBufferA = nullptr; + ComPtr m_inputUploadBufferA = nullptr; + ComPtr m_inputBufferB = nullptr; + ComPtr m_inputUploadBufferB = nullptr; + + // Output buffer that result output on GPU. + ComPtr m_outputBuffer = nullptr; + + // Readback buffer to copy data from GPU side to CPU side. + ComPtr m_readBackBuffer = nullptr; + + // Synchronization objects. + ComPtr m_fence = nullptr; + UINT64 m_currentFence = 0; + HANDLE m_eventHandle = nullptr; + + // GPU timer. + D3D12::D3D12Timer gpuTimer; + + // Options. 
+ BenchmarkOptions *opts; +}; diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj new file mode 100644 index 000000000..109d39305 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj @@ -0,0 +1,110 @@ + + + + + + Debug + x64 + + + Release + x64 + + + + 16.0 + Win32Proj + {8407ef34-a93c-473a-8fac-2598b2695b61} + GPUCore + 10.0 + + + + Application + true + v143 + Unicode + + + Application + false + v143 + true + Unicode + + + + + + + + + + + + + + + false + true + + + + Level3 + true + _DEBUG;_CONSOLE;%(PreprocessorDefinitions);_PRINT_RESULT + true + true + + + Console + true + + + + + Level3 + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions); + true + + + Console + true + true + true + + + + + + + + + + + + + + + + + + + + true + + + + + + + + This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. + + + + + \ No newline at end of file diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/Main.cpp b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/Main.cpp new file mode 100644 index 000000000..9e403f5de --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/Main.cpp @@ -0,0 +1,11 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +#include "GPUCore.h" + +int main(int argc, char *argv[]) { + std::unique_ptr opts = std::make_unique(argc, argv); + opts->init(); + std::unique_ptr gpucopy = std::make_unique(opts.get()); + gpucopy->Run(); +} diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/packages.config b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/packages.config new file mode 100644 index 000000000..0bf9cc34c --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/packages.config @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/superbench/benchmarks/micro_benchmarks/directx_third_party/DXSampleHelper.h b/superbench/benchmarks/micro_benchmarks/directx_third_party/DXSampleHelper.h new file mode 100644 index 000000000..780bb0896 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_third_party/DXSampleHelper.h @@ -0,0 +1,275 @@ +//********************************************************* +// +// Copyright (c) Microsoft. All rights reserved. +// This code is licensed under the MIT License (MIT). +// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY +// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR +// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT. +// +//********************************************************* + +#pragma once +#include +#include +#include +#include + +// Note that while ComPtr is used to manage the lifetime of resources on the CPU, +// it has no understanding of the lifetime of resources on the GPU. Apps must account +// for the GPU lifetime of resources to avoid destroying objects that may still be +// referenced by the GPU. 
+using Microsoft::WRL::ComPtr; + +inline std::string HrToString(HRESULT hr) { + char s_str[64] = {}; + sprintf_s(s_str, "HRESULT of 0x%08X", static_cast(hr)); + return std::string(s_str); +} + +class HrException : public std::runtime_error { + public: + HrException(HRESULT hr) : std::runtime_error(HrToString(hr)), m_hr(hr) {} + HRESULT Error() const { return m_hr; } + + private: + const HRESULT m_hr; +}; + +#define SAFE_RELEASE(p) \ + if (p) \ + (p)->Release() + +inline void ThrowIfFailed(HRESULT hr) { + if (FAILED(hr)) { + throw HrException(hr); + } +} + +inline void GetAssetsPath(_Out_writes_(pathSize) WCHAR *path, UINT pathSize) { + if (path == nullptr) { + throw std::exception(); + } + + DWORD size = GetModuleFileName(nullptr, path, pathSize); + if (size == 0 || size == pathSize) { + // Method failed or path was truncated. + throw std::exception(); + } + + WCHAR *lastSlash = wcsrchr(path, L'\\'); + if (lastSlash) { + *(lastSlash + 1) = L'\0'; + } +} + +inline HRESULT ReadDataFromFile(LPCWSTR filename, byte **data, UINT *size) { + using namespace Microsoft::WRL; + + CREATEFILE2_EXTENDED_PARAMETERS extendedParams = {}; + extendedParams.dwSize = sizeof(CREATEFILE2_EXTENDED_PARAMETERS); + extendedParams.dwFileAttributes = FILE_ATTRIBUTE_NORMAL; + extendedParams.dwFileFlags = FILE_FLAG_SEQUENTIAL_SCAN; + extendedParams.dwSecurityQosFlags = SECURITY_ANONYMOUS; + extendedParams.lpSecurityAttributes = nullptr; + extendedParams.hTemplateFile = nullptr; + + Wrappers::FileHandle file(CreateFile2(filename, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, &extendedParams)); + if (file.Get() == INVALID_HANDLE_VALUE) { + throw std::exception(); + } + + FILE_STANDARD_INFO fileInfo = {}; + if (!GetFileInformationByHandleEx(file.Get(), FileStandardInfo, &fileInfo, sizeof(fileInfo))) { + throw std::exception(); + } + + if (fileInfo.EndOfFile.HighPart != 0) { + throw std::exception(); + } + + *data = reinterpret_cast(malloc(fileInfo.EndOfFile.LowPart)); + *size = 
fileInfo.EndOfFile.LowPart; + + if (!ReadFile(file.Get(), *data, fileInfo.EndOfFile.LowPart, nullptr, nullptr)) { + throw std::exception(); + } + + return S_OK; +} + +inline HRESULT ReadDataFromDDSFile(LPCWSTR filename, byte **data, UINT *offset, UINT *size) { + if (FAILED(ReadDataFromFile(filename, data, size))) { + return E_FAIL; + } + + // DDS files always start with the same magic number. + static const UINT DDS_MAGIC = 0x20534444; + UINT magicNumber = *reinterpret_cast(*data); + if (magicNumber != DDS_MAGIC) { + return E_FAIL; + } + + struct DDS_PIXELFORMAT { + UINT size; + UINT flags; + UINT fourCC; + UINT rgbBitCount; + UINT rBitMask; + UINT gBitMask; + UINT bBitMask; + UINT aBitMask; + }; + + struct DDS_HEADER { + UINT size; + UINT flags; + UINT height; + UINT width; + UINT pitchOrLinearSize; + UINT depth; + UINT mipMapCount; + UINT reserved1[11]; + DDS_PIXELFORMAT ddsPixelFormat; + UINT caps; + UINT caps2; + UINT caps3; + UINT caps4; + UINT reserved2; + }; + + auto ddsHeader = reinterpret_cast(*data + sizeof(UINT)); + if (ddsHeader->size != sizeof(DDS_HEADER) || ddsHeader->ddsPixelFormat.size != sizeof(DDS_PIXELFORMAT)) { + return E_FAIL; + } + + const ptrdiff_t ddsDataOffset = sizeof(UINT) + sizeof(DDS_HEADER); + *offset = ddsDataOffset; + *size = *size - ddsDataOffset; + + return S_OK; +} + +// Assign a name to the object to aid with debugging. +#if defined(_DEBUG) || defined(DBG) +inline void SetName(ID3D12Object *pObject, LPCWSTR name) { pObject->SetName(name); } +inline void SetNameIndexed(ID3D12Object *pObject, LPCWSTR name, UINT index) { + WCHAR fullName[50]; + if (swprintf_s(fullName, L"%s[%u]", name, index) > 0) { + pObject->SetName(fullName); + } +} +#else +inline void SetName(ID3D12Object *, LPCWSTR) {} +inline void SetNameIndexed(ID3D12Object *, LPCWSTR, UINT) {} +#endif + +// Naming helper for ComPtr. +// Assigns the name of the variable as the name of the object. +// The indexed variant will include the index in the name of the object. 
+#define NAME_D3D12_OBJECT(x) SetName((x).Get(), L#x) +#define NAME_D3D12_OBJECT_INDEXED(x, n) SetNameIndexed((x)[n].Get(), L#x, n) + +inline UINT CalculateConstantBufferByteSize(UINT byteSize) { + // Constant buffer size is required to be aligned. + return (byteSize + (D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT - 1)) & + ~(D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT - 1); +} + +#ifdef D3D_COMPILE_STANDARD_FILE_INCLUDE +inline Microsoft::WRL::ComPtr CompileShader(const std::wstring &filename, const D3D_SHADER_MACRO *defines, + const std::string &entrypoint, const std::string &target) { + UINT compileFlags = 0; +#if defined(_DEBUG) || defined(DBG) + compileFlags = D3DCOMPILE_DEBUG | D3DCOMPILE_SKIP_OPTIMIZATION; +#endif + + HRESULT hr; + + Microsoft::WRL::ComPtr byteCode = nullptr; + Microsoft::WRL::ComPtr errors; + hr = D3DCompileFromFile(filename.c_str(), defines, D3D_COMPILE_STANDARD_FILE_INCLUDE, entrypoint.c_str(), + target.c_str(), compileFlags, 0, &byteCode, &errors); + + if (errors != nullptr) { + OutputDebugStringA((char *)errors->GetBufferPointer()); + } + ThrowIfFailed(hr); + + return byteCode; +} +#endif + +// Resets all elements in a ComPtr array. +template void ResetComPtrArray(T *comPtrArray) { + for (auto &i : *comPtrArray) { + i.Reset(); + } +} + +// Resets all elements in a unique_ptr array. +template void ResetUniquePtrArray(T *uniquePtrArray) { + for (auto &i : *uniquePtrArray) { + i.reset(); + } +} + +/** + * @brief Helper function for acquiring the first available hardware adapter that supports Direct3D 12. + * If no such adapter can be found, *ppAdapter will be set to nullptr. + * @param pFactory a pointer to factory object. + * @param[out] ppAdapter a pointer of pointer to a adapter. + * @param requestHighPerformanceAdapter the option of adapter. 
+ */ +inline void GetHardwareAdapter(IDXGIFactory1 *pFactory, IDXGIAdapter1 **ppAdapter, + bool requestHighPerformanceAdapter = FALSE) { + *ppAdapter = nullptr; + + ComPtr adapter; + + ComPtr factory6; + if (SUCCEEDED(pFactory->QueryInterface(IID_PPV_ARGS(&factory6)))) { + for (UINT adapterIndex = 0; SUCCEEDED(factory6->EnumAdapterByGpuPreference( + adapterIndex, + requestHighPerformanceAdapter == true ? DXGI_GPU_PREFERENCE_HIGH_PERFORMANCE + : DXGI_GPU_PREFERENCE_UNSPECIFIED, + IID_PPV_ARGS(&adapter))); + ++adapterIndex) { + DXGI_ADAPTER_DESC1 desc; + adapter->GetDesc1(&desc); + + if (desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE) { + // Don't select the Basic Render Driver adapter. + // If you want a software adapter, pass in "/warp" on the command line. + continue; + } + + // Check to see whether the adapter supports Direct3D 12, but don't create the + // actual device yet. + if (SUCCEEDED(D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_11_0, _uuidof(ID3D12Device), nullptr))) { + break; + } + } + } + + if (adapter.Get() == nullptr) { + for (UINT adapterIndex = 0; SUCCEEDED(pFactory->EnumAdapters1(adapterIndex, &adapter)); ++adapterIndex) { + DXGI_ADAPTER_DESC1 desc; + adapter->GetDesc1(&desc); + + if (desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE) { + // Don't select the Basic Render Driver adapter. + // If you want a software adapter, pass in "/warp" on the command line. + continue; + } + + // Check to see whether the adapter supports Direct3D 12, but don't create the + // actual device yet. 
+ if (SUCCEEDED(D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_11_0, _uuidof(ID3D12Device), nullptr))) { + break; + } + } + } + + *ppAdapter = adapter.Detach(); +} diff --git a/superbench/benchmarks/micro_benchmarks/directx_third_party/d3dx12.h b/superbench/benchmarks/micro_benchmarks/directx_third_party/d3dx12.h new file mode 100644 index 000000000..17b2b79a7 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_third_party/d3dx12.h @@ -0,0 +1,3258 @@ +//********************************************************* +// +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License (MIT). +// +//********************************************************* + +#ifndef __D3DX12_H__ +#define __D3DX12_H__ + +#include "d3d12.h" + +#if defined(__cplusplus) + +struct CD3DX12_DEFAULT {}; +extern const DECLSPEC_SELECTANY CD3DX12_DEFAULT D3D12_DEFAULT; + +//------------------------------------------------------------------------------------------------ +inline bool operator==(const D3D12_VIEWPORT &l, const D3D12_VIEWPORT &r) noexcept { + return l.TopLeftX == r.TopLeftX && l.TopLeftY == r.TopLeftY && l.Width == r.Width && l.Height == r.Height && + l.MinDepth == r.MinDepth && l.MaxDepth == r.MaxDepth; +} + +//------------------------------------------------------------------------------------------------ +inline bool operator!=(const D3D12_VIEWPORT &l, const D3D12_VIEWPORT &r) noexcept { return !(l == r); } + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_RECT : public D3D12_RECT { + CD3DX12_RECT() = default; + explicit CD3DX12_RECT(const D3D12_RECT &o) noexcept : D3D12_RECT(o) {} + explicit CD3DX12_RECT(LONG Left, LONG Top, LONG Right, LONG Bottom) noexcept { + left = Left; + top = Top; + right = Right; + bottom = Bottom; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_VIEWPORT : public D3D12_VIEWPORT { + 
CD3DX12_VIEWPORT() = default; + explicit CD3DX12_VIEWPORT(const D3D12_VIEWPORT &o) noexcept : D3D12_VIEWPORT(o) {} + explicit CD3DX12_VIEWPORT(FLOAT topLeftX, FLOAT topLeftY, FLOAT width, FLOAT height, + FLOAT minDepth = D3D12_MIN_DEPTH, FLOAT maxDepth = D3D12_MAX_DEPTH) noexcept { + TopLeftX = topLeftX; + TopLeftY = topLeftY; + Width = width; + Height = height; + MinDepth = minDepth; + MaxDepth = maxDepth; + } + explicit CD3DX12_VIEWPORT(_In_ ID3D12Resource *pResource, UINT mipSlice = 0, FLOAT topLeftX = 0.0f, + FLOAT topLeftY = 0.0f, FLOAT minDepth = D3D12_MIN_DEPTH, + FLOAT maxDepth = D3D12_MAX_DEPTH) noexcept { + auto Desc = pResource->GetDesc(); + const UINT64 SubresourceWidth = Desc.Width >> mipSlice; + const UINT64 SubresourceHeight = Desc.Height >> mipSlice; + switch (Desc.Dimension) { + case D3D12_RESOURCE_DIMENSION_BUFFER: + TopLeftX = topLeftX; + TopLeftY = 0.0f; + Width = float(Desc.Width) - topLeftX; + Height = 1.0f; + break; + case D3D12_RESOURCE_DIMENSION_TEXTURE1D: + TopLeftX = topLeftX; + TopLeftY = 0.0f; + Width = (SubresourceWidth ? float(SubresourceWidth) : 1.0f) - topLeftX; + Height = 1.0f; + break; + case D3D12_RESOURCE_DIMENSION_TEXTURE2D: + case D3D12_RESOURCE_DIMENSION_TEXTURE3D: + TopLeftX = topLeftX; + TopLeftY = topLeftY; + Width = (SubresourceWidth ? float(SubresourceWidth) : 1.0f) - topLeftX; + Height = (SubresourceHeight ? 
float(SubresourceHeight) : 1.0f) - topLeftY; + break; + default: + break; + } + + MinDepth = minDepth; + MaxDepth = maxDepth; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_BOX : public D3D12_BOX { + CD3DX12_BOX() = default; + explicit CD3DX12_BOX(const D3D12_BOX &o) noexcept : D3D12_BOX(o) {} + explicit CD3DX12_BOX(LONG Left, LONG Right) noexcept { + left = static_cast(Left); + top = 0; + front = 0; + right = static_cast(Right); + bottom = 1; + back = 1; + } + explicit CD3DX12_BOX(LONG Left, LONG Top, LONG Right, LONG Bottom) noexcept { + left = static_cast(Left); + top = static_cast(Top); + front = 0; + right = static_cast(Right); + bottom = static_cast(Bottom); + back = 1; + } + explicit CD3DX12_BOX(LONG Left, LONG Top, LONG Front, LONG Right, LONG Bottom, LONG Back) noexcept { + left = static_cast(Left); + top = static_cast(Top); + front = static_cast(Front); + right = static_cast(Right); + bottom = static_cast(Bottom); + back = static_cast(Back); + } +}; +inline bool operator==(const D3D12_BOX &l, const D3D12_BOX &r) noexcept { + return l.left == r.left && l.top == r.top && l.front == r.front && l.right == r.right && l.bottom == r.bottom && + l.back == r.back; +} +inline bool operator!=(const D3D12_BOX &l, const D3D12_BOX &r) noexcept { return !(l == r); } + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_DEPTH_STENCIL_DESC : public D3D12_DEPTH_STENCIL_DESC { + CD3DX12_DEPTH_STENCIL_DESC() = default; + explicit CD3DX12_DEPTH_STENCIL_DESC(const D3D12_DEPTH_STENCIL_DESC &o) noexcept : D3D12_DEPTH_STENCIL_DESC(o) {} + explicit CD3DX12_DEPTH_STENCIL_DESC(CD3DX12_DEFAULT) noexcept { + DepthEnable = TRUE; + DepthWriteMask = D3D12_DEPTH_WRITE_MASK_ALL; + DepthFunc = D3D12_COMPARISON_FUNC_LESS; + StencilEnable = FALSE; + StencilReadMask = D3D12_DEFAULT_STENCIL_READ_MASK; + StencilWriteMask = 
D3D12_DEFAULT_STENCIL_WRITE_MASK; + const D3D12_DEPTH_STENCILOP_DESC defaultStencilOp = {D3D12_STENCIL_OP_KEEP, D3D12_STENCIL_OP_KEEP, + D3D12_STENCIL_OP_KEEP, D3D12_COMPARISON_FUNC_ALWAYS}; + FrontFace = defaultStencilOp; + BackFace = defaultStencilOp; + } + explicit CD3DX12_DEPTH_STENCIL_DESC(BOOL depthEnable, D3D12_DEPTH_WRITE_MASK depthWriteMask, + D3D12_COMPARISON_FUNC depthFunc, BOOL stencilEnable, UINT8 stencilReadMask, + UINT8 stencilWriteMask, D3D12_STENCIL_OP frontStencilFailOp, + D3D12_STENCIL_OP frontStencilDepthFailOp, D3D12_STENCIL_OP frontStencilPassOp, + D3D12_COMPARISON_FUNC frontStencilFunc, D3D12_STENCIL_OP backStencilFailOp, + D3D12_STENCIL_OP backStencilDepthFailOp, D3D12_STENCIL_OP backStencilPassOp, + D3D12_COMPARISON_FUNC backStencilFunc) noexcept { + DepthEnable = depthEnable; + DepthWriteMask = depthWriteMask; + DepthFunc = depthFunc; + StencilEnable = stencilEnable; + StencilReadMask = stencilReadMask; + StencilWriteMask = stencilWriteMask; + FrontFace.StencilFailOp = frontStencilFailOp; + FrontFace.StencilDepthFailOp = frontStencilDepthFailOp; + FrontFace.StencilPassOp = frontStencilPassOp; + FrontFace.StencilFunc = frontStencilFunc; + BackFace.StencilFailOp = backStencilFailOp; + BackFace.StencilDepthFailOp = backStencilDepthFailOp; + BackFace.StencilPassOp = backStencilPassOp; + BackFace.StencilFunc = backStencilFunc; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_DEPTH_STENCIL_DESC1 : public D3D12_DEPTH_STENCIL_DESC1 { + CD3DX12_DEPTH_STENCIL_DESC1() = default; + explicit CD3DX12_DEPTH_STENCIL_DESC1(const D3D12_DEPTH_STENCIL_DESC1 &o) noexcept : D3D12_DEPTH_STENCIL_DESC1(o) {} + explicit CD3DX12_DEPTH_STENCIL_DESC1(const D3D12_DEPTH_STENCIL_DESC &o) noexcept { + DepthEnable = o.DepthEnable; + DepthWriteMask = o.DepthWriteMask; + DepthFunc = o.DepthFunc; + StencilEnable = o.StencilEnable; + StencilReadMask = o.StencilReadMask; + StencilWriteMask = 
o.StencilWriteMask; + FrontFace.StencilFailOp = o.FrontFace.StencilFailOp; + FrontFace.StencilDepthFailOp = o.FrontFace.StencilDepthFailOp; + FrontFace.StencilPassOp = o.FrontFace.StencilPassOp; + FrontFace.StencilFunc = o.FrontFace.StencilFunc; + BackFace.StencilFailOp = o.BackFace.StencilFailOp; + BackFace.StencilDepthFailOp = o.BackFace.StencilDepthFailOp; + BackFace.StencilPassOp = o.BackFace.StencilPassOp; + BackFace.StencilFunc = o.BackFace.StencilFunc; + DepthBoundsTestEnable = FALSE; + } + explicit CD3DX12_DEPTH_STENCIL_DESC1(CD3DX12_DEFAULT) noexcept { + DepthEnable = TRUE; + DepthWriteMask = D3D12_DEPTH_WRITE_MASK_ALL; + DepthFunc = D3D12_COMPARISON_FUNC_LESS; + StencilEnable = FALSE; + StencilReadMask = D3D12_DEFAULT_STENCIL_READ_MASK; + StencilWriteMask = D3D12_DEFAULT_STENCIL_WRITE_MASK; + const D3D12_DEPTH_STENCILOP_DESC defaultStencilOp = {D3D12_STENCIL_OP_KEEP, D3D12_STENCIL_OP_KEEP, + D3D12_STENCIL_OP_KEEP, D3D12_COMPARISON_FUNC_ALWAYS}; + FrontFace = defaultStencilOp; + BackFace = defaultStencilOp; + DepthBoundsTestEnable = FALSE; + } + explicit CD3DX12_DEPTH_STENCIL_DESC1(BOOL depthEnable, D3D12_DEPTH_WRITE_MASK depthWriteMask, + D3D12_COMPARISON_FUNC depthFunc, BOOL stencilEnable, UINT8 stencilReadMask, + UINT8 stencilWriteMask, D3D12_STENCIL_OP frontStencilFailOp, + D3D12_STENCIL_OP frontStencilDepthFailOp, D3D12_STENCIL_OP frontStencilPassOp, + D3D12_COMPARISON_FUNC frontStencilFunc, D3D12_STENCIL_OP backStencilFailOp, + D3D12_STENCIL_OP backStencilDepthFailOp, D3D12_STENCIL_OP backStencilPassOp, + D3D12_COMPARISON_FUNC backStencilFunc, BOOL depthBoundsTestEnable) noexcept { + DepthEnable = depthEnable; + DepthWriteMask = depthWriteMask; + DepthFunc = depthFunc; + StencilEnable = stencilEnable; + StencilReadMask = stencilReadMask; + StencilWriteMask = stencilWriteMask; + FrontFace.StencilFailOp = frontStencilFailOp; + FrontFace.StencilDepthFailOp = frontStencilDepthFailOp; + FrontFace.StencilPassOp = frontStencilPassOp; + FrontFace.StencilFunc 
= frontStencilFunc; + BackFace.StencilFailOp = backStencilFailOp; + BackFace.StencilDepthFailOp = backStencilDepthFailOp; + BackFace.StencilPassOp = backStencilPassOp; + BackFace.StencilFunc = backStencilFunc; + DepthBoundsTestEnable = depthBoundsTestEnable; + } + operator D3D12_DEPTH_STENCIL_DESC() const noexcept { + D3D12_DEPTH_STENCIL_DESC D; + D.DepthEnable = DepthEnable; + D.DepthWriteMask = DepthWriteMask; + D.DepthFunc = DepthFunc; + D.StencilEnable = StencilEnable; + D.StencilReadMask = StencilReadMask; + D.StencilWriteMask = StencilWriteMask; + D.FrontFace.StencilFailOp = FrontFace.StencilFailOp; + D.FrontFace.StencilDepthFailOp = FrontFace.StencilDepthFailOp; + D.FrontFace.StencilPassOp = FrontFace.StencilPassOp; + D.FrontFace.StencilFunc = FrontFace.StencilFunc; + D.BackFace.StencilFailOp = BackFace.StencilFailOp; + D.BackFace.StencilDepthFailOp = BackFace.StencilDepthFailOp; + D.BackFace.StencilPassOp = BackFace.StencilPassOp; + D.BackFace.StencilFunc = BackFace.StencilFunc; + return D; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_BLEND_DESC : public D3D12_BLEND_DESC { + CD3DX12_BLEND_DESC() = default; + explicit CD3DX12_BLEND_DESC(const D3D12_BLEND_DESC &o) noexcept : D3D12_BLEND_DESC(o) {} + explicit CD3DX12_BLEND_DESC(CD3DX12_DEFAULT) noexcept { + AlphaToCoverageEnable = FALSE; + IndependentBlendEnable = FALSE; + const D3D12_RENDER_TARGET_BLEND_DESC defaultRenderTargetBlendDesc = { + FALSE, + FALSE, + D3D12_BLEND_ONE, + D3D12_BLEND_ZERO, + D3D12_BLEND_OP_ADD, + D3D12_BLEND_ONE, + D3D12_BLEND_ZERO, + D3D12_BLEND_OP_ADD, + D3D12_LOGIC_OP_NOOP, + D3D12_COLOR_WRITE_ENABLE_ALL, + }; + for (UINT i = 0; i < D3D12_SIMULTANEOUS_RENDER_TARGET_COUNT; ++i) + RenderTarget[i] = defaultRenderTargetBlendDesc; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_RASTERIZER_DESC : public D3D12_RASTERIZER_DESC { + 
CD3DX12_RASTERIZER_DESC() = default; + explicit CD3DX12_RASTERIZER_DESC(const D3D12_RASTERIZER_DESC &o) noexcept : D3D12_RASTERIZER_DESC(o) {} + explicit CD3DX12_RASTERIZER_DESC(CD3DX12_DEFAULT) noexcept { + FillMode = D3D12_FILL_MODE_SOLID; + CullMode = D3D12_CULL_MODE_BACK; + FrontCounterClockwise = FALSE; + DepthBias = D3D12_DEFAULT_DEPTH_BIAS; + DepthBiasClamp = D3D12_DEFAULT_DEPTH_BIAS_CLAMP; + SlopeScaledDepthBias = D3D12_DEFAULT_SLOPE_SCALED_DEPTH_BIAS; + DepthClipEnable = TRUE; + MultisampleEnable = FALSE; + AntialiasedLineEnable = FALSE; + ForcedSampleCount = 0; + ConservativeRaster = D3D12_CONSERVATIVE_RASTERIZATION_MODE_OFF; + } + explicit CD3DX12_RASTERIZER_DESC(D3D12_FILL_MODE fillMode, D3D12_CULL_MODE cullMode, BOOL frontCounterClockwise, + INT depthBias, FLOAT depthBiasClamp, FLOAT slopeScaledDepthBias, + BOOL depthClipEnable, BOOL multisampleEnable, BOOL antialiasedLineEnable, + UINT forcedSampleCount, + D3D12_CONSERVATIVE_RASTERIZATION_MODE conservativeRaster) noexcept { + FillMode = fillMode; + CullMode = cullMode; + FrontCounterClockwise = frontCounterClockwise; + DepthBias = depthBias; + DepthBiasClamp = depthBiasClamp; + SlopeScaledDepthBias = slopeScaledDepthBias; + DepthClipEnable = depthClipEnable; + MultisampleEnable = multisampleEnable; + AntialiasedLineEnable = antialiasedLineEnable; + ForcedSampleCount = forcedSampleCount; + ConservativeRaster = conservativeRaster; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_RESOURCE_ALLOCATION_INFO : public D3D12_RESOURCE_ALLOCATION_INFO { + CD3DX12_RESOURCE_ALLOCATION_INFO() = default; + explicit CD3DX12_RESOURCE_ALLOCATION_INFO(const D3D12_RESOURCE_ALLOCATION_INFO &o) noexcept + : D3D12_RESOURCE_ALLOCATION_INFO(o) {} + CD3DX12_RESOURCE_ALLOCATION_INFO(UINT64 size, UINT64 alignment) noexcept { + SizeInBytes = size; + Alignment = alignment; + } +}; + 
+//------------------------------------------------------------------------------------------------ +struct CD3DX12_HEAP_PROPERTIES : public D3D12_HEAP_PROPERTIES { + CD3DX12_HEAP_PROPERTIES() = default; + explicit CD3DX12_HEAP_PROPERTIES(const D3D12_HEAP_PROPERTIES &o) noexcept : D3D12_HEAP_PROPERTIES(o) {} + CD3DX12_HEAP_PROPERTIES(D3D12_CPU_PAGE_PROPERTY cpuPageProperty, D3D12_MEMORY_POOL memoryPoolPreference, + UINT creationNodeMask = 1, UINT nodeMask = 1) + noexcept { + Type = D3D12_HEAP_TYPE_CUSTOM; + CPUPageProperty = cpuPageProperty; + MemoryPoolPreference = memoryPoolPreference; + CreationNodeMask = creationNodeMask; + VisibleNodeMask = nodeMask; + } + explicit CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE type, UINT creationNodeMask = 1, UINT nodeMask = 1) noexcept { + Type = type; + CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN; + MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN; + CreationNodeMask = creationNodeMask; + VisibleNodeMask = nodeMask; + } + bool IsCPUAccessible() const noexcept { + return Type == D3D12_HEAP_TYPE_UPLOAD || Type == D3D12_HEAP_TYPE_READBACK || + (Type == D3D12_HEAP_TYPE_CUSTOM && (CPUPageProperty == D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE || + CPUPageProperty == D3D12_CPU_PAGE_PROPERTY_WRITE_BACK)); + } +}; +inline bool operator==(const D3D12_HEAP_PROPERTIES &l, const D3D12_HEAP_PROPERTIES &r) noexcept { + return l.Type == r.Type && l.CPUPageProperty == r.CPUPageProperty && + l.MemoryPoolPreference == r.MemoryPoolPreference && l.CreationNodeMask == r.CreationNodeMask && + l.VisibleNodeMask == r.VisibleNodeMask; +} +inline bool operator!=(const D3D12_HEAP_PROPERTIES &l, const D3D12_HEAP_PROPERTIES &r) noexcept { return !(l == r); } + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_HEAP_DESC : public D3D12_HEAP_DESC { + CD3DX12_HEAP_DESC() = default; + explicit CD3DX12_HEAP_DESC(const D3D12_HEAP_DESC &o) noexcept : D3D12_HEAP_DESC(o) {} + CD3DX12_HEAP_DESC(UINT64 
size, D3D12_HEAP_PROPERTIES properties, UINT64 alignment = 0, + D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE) + noexcept { + SizeInBytes = size; + Properties = properties; + Alignment = alignment; + Flags = flags; + } + CD3DX12_HEAP_DESC(UINT64 size, D3D12_HEAP_TYPE type, UINT64 alignment = 0, + D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE) + noexcept { + SizeInBytes = size; + Properties = CD3DX12_HEAP_PROPERTIES(type); + Alignment = alignment; + Flags = flags; + } + CD3DX12_HEAP_DESC(UINT64 size, D3D12_CPU_PAGE_PROPERTY cpuPageProperty, D3D12_MEMORY_POOL memoryPoolPreference, + UINT64 alignment = 0, D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE) + noexcept { + SizeInBytes = size; + Properties = CD3DX12_HEAP_PROPERTIES(cpuPageProperty, memoryPoolPreference); + Alignment = alignment; + Flags = flags; + } + CD3DX12_HEAP_DESC(const D3D12_RESOURCE_ALLOCATION_INFO &resAllocInfo, D3D12_HEAP_PROPERTIES properties, + D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE) + noexcept { + SizeInBytes = resAllocInfo.SizeInBytes; + Properties = properties; + Alignment = resAllocInfo.Alignment; + Flags = flags; + } + CD3DX12_HEAP_DESC(const D3D12_RESOURCE_ALLOCATION_INFO &resAllocInfo, D3D12_HEAP_TYPE type, + D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE) + noexcept { + SizeInBytes = resAllocInfo.SizeInBytes; + Properties = CD3DX12_HEAP_PROPERTIES(type); + Alignment = resAllocInfo.Alignment; + Flags = flags; + } + CD3DX12_HEAP_DESC(const D3D12_RESOURCE_ALLOCATION_INFO &resAllocInfo, D3D12_CPU_PAGE_PROPERTY cpuPageProperty, + D3D12_MEMORY_POOL memoryPoolPreference, D3D12_HEAP_FLAGS flags = D3D12_HEAP_FLAG_NONE) + noexcept { + SizeInBytes = resAllocInfo.SizeInBytes; + Properties = CD3DX12_HEAP_PROPERTIES(cpuPageProperty, memoryPoolPreference); + Alignment = resAllocInfo.Alignment; + Flags = flags; + } + bool IsCPUAccessible() const noexcept { + return static_cast(&Properties)->IsCPUAccessible(); + } +}; +inline bool operator==(const D3D12_HEAP_DESC &l, const D3D12_HEAP_DESC &r) noexcept 
{ + return l.SizeInBytes == r.SizeInBytes && l.Properties == r.Properties && l.Alignment == r.Alignment && + l.Flags == r.Flags; +} +inline bool operator!=(const D3D12_HEAP_DESC &l, const D3D12_HEAP_DESC &r) noexcept { return !(l == r); } + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_CLEAR_VALUE : public D3D12_CLEAR_VALUE { + CD3DX12_CLEAR_VALUE() = default; + explicit CD3DX12_CLEAR_VALUE(const D3D12_CLEAR_VALUE &o) noexcept : D3D12_CLEAR_VALUE(o) {} + CD3DX12_CLEAR_VALUE(DXGI_FORMAT format, const FLOAT color[4]) noexcept { + Format = format; + memcpy(Color, color, sizeof(Color)); + } + CD3DX12_CLEAR_VALUE(DXGI_FORMAT format, FLOAT depth, UINT8 stencil) noexcept { + Format = format; + memset(&Color, 0, sizeof(Color)); + /* Use memcpy to preserve NAN values */ + memcpy(&DepthStencil.Depth, &depth, sizeof(depth)); + DepthStencil.Stencil = stencil; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_RANGE : public D3D12_RANGE { + CD3DX12_RANGE() = default; + explicit CD3DX12_RANGE(const D3D12_RANGE &o) noexcept : D3D12_RANGE(o) {} + CD3DX12_RANGE(SIZE_T begin, SIZE_T end) noexcept { + Begin = begin; + End = end; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_RANGE_UINT64 : public D3D12_RANGE_UINT64 { + CD3DX12_RANGE_UINT64() = default; + explicit CD3DX12_RANGE_UINT64(const D3D12_RANGE_UINT64 &o) noexcept : D3D12_RANGE_UINT64(o) {} + CD3DX12_RANGE_UINT64(UINT64 begin, UINT64 end) noexcept { + Begin = begin; + End = end; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_SUBRESOURCE_RANGE_UINT64 : public D3D12_SUBRESOURCE_RANGE_UINT64 { + CD3DX12_SUBRESOURCE_RANGE_UINT64() = default; + explicit CD3DX12_SUBRESOURCE_RANGE_UINT64(const D3D12_SUBRESOURCE_RANGE_UINT64 &o) noexcept 
+ : D3D12_SUBRESOURCE_RANGE_UINT64(o) {} + CD3DX12_SUBRESOURCE_RANGE_UINT64(UINT subresource, const D3D12_RANGE_UINT64 &range) noexcept { + Subresource = subresource; + Range = range; + } + CD3DX12_SUBRESOURCE_RANGE_UINT64(UINT subresource, UINT64 begin, UINT64 end) noexcept { + Subresource = subresource; + Range.Begin = begin; + Range.End = end; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_SHADER_BYTECODE : public D3D12_SHADER_BYTECODE { + CD3DX12_SHADER_BYTECODE() = default; + explicit CD3DX12_SHADER_BYTECODE(const D3D12_SHADER_BYTECODE &o) noexcept : D3D12_SHADER_BYTECODE(o) {} + CD3DX12_SHADER_BYTECODE(_In_ ID3DBlob *pShaderBlob) noexcept { + pShaderBytecode = pShaderBlob->GetBufferPointer(); + BytecodeLength = pShaderBlob->GetBufferSize(); + } + CD3DX12_SHADER_BYTECODE(const void *_pShaderBytecode, SIZE_T bytecodeLength) noexcept { + pShaderBytecode = _pShaderBytecode; + BytecodeLength = bytecodeLength; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_TILED_RESOURCE_COORDINATE : public D3D12_TILED_RESOURCE_COORDINATE { + CD3DX12_TILED_RESOURCE_COORDINATE() = default; + explicit CD3DX12_TILED_RESOURCE_COORDINATE(const D3D12_TILED_RESOURCE_COORDINATE &o) noexcept + : D3D12_TILED_RESOURCE_COORDINATE(o) {} + CD3DX12_TILED_RESOURCE_COORDINATE(UINT x, UINT y, UINT z, UINT subresource) noexcept { + X = x; + Y = y; + Z = z; + Subresource = subresource; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_TILE_REGION_SIZE : public D3D12_TILE_REGION_SIZE { + CD3DX12_TILE_REGION_SIZE() = default; + explicit CD3DX12_TILE_REGION_SIZE(const D3D12_TILE_REGION_SIZE &o) noexcept : D3D12_TILE_REGION_SIZE(o) {} + CD3DX12_TILE_REGION_SIZE(UINT numTiles, BOOL useBox, UINT width, UINT16 height, UINT16 depth) noexcept { + NumTiles = numTiles; + UseBox = 
useBox; + Width = width; + Height = height; + Depth = depth; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_SUBRESOURCE_TILING : public D3D12_SUBRESOURCE_TILING { + CD3DX12_SUBRESOURCE_TILING() = default; + explicit CD3DX12_SUBRESOURCE_TILING(const D3D12_SUBRESOURCE_TILING &o) noexcept : D3D12_SUBRESOURCE_TILING(o) {} + CD3DX12_SUBRESOURCE_TILING(UINT widthInTiles, UINT16 heightInTiles, UINT16 depthInTiles, + UINT startTileIndexInOverallResource) + noexcept { + WidthInTiles = widthInTiles; + HeightInTiles = heightInTiles; + DepthInTiles = depthInTiles; + StartTileIndexInOverallResource = startTileIndexInOverallResource; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_TILE_SHAPE : public D3D12_TILE_SHAPE { + CD3DX12_TILE_SHAPE() = default; + explicit CD3DX12_TILE_SHAPE(const D3D12_TILE_SHAPE &o) noexcept : D3D12_TILE_SHAPE(o) {} + CD3DX12_TILE_SHAPE(UINT widthInTexels, UINT heightInTexels, UINT depthInTexels) noexcept { + WidthInTexels = widthInTexels; + HeightInTexels = heightInTexels; + DepthInTexels = depthInTexels; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_RESOURCE_BARRIER : public D3D12_RESOURCE_BARRIER { + CD3DX12_RESOURCE_BARRIER() = default; + explicit CD3DX12_RESOURCE_BARRIER(const D3D12_RESOURCE_BARRIER &o) noexcept : D3D12_RESOURCE_BARRIER(o) {} + static inline CD3DX12_RESOURCE_BARRIER + Transition(_In_ ID3D12Resource *pResource, D3D12_RESOURCE_STATES stateBefore, D3D12_RESOURCE_STATES stateAfter, + UINT subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES, + D3D12_RESOURCE_BARRIER_FLAGS flags = D3D12_RESOURCE_BARRIER_FLAG_NONE) noexcept { + CD3DX12_RESOURCE_BARRIER result = {}; + D3D12_RESOURCE_BARRIER &barrier = result; + result.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION; + result.Flags = flags; + 
barrier.Transition.pResource = pResource; + barrier.Transition.StateBefore = stateBefore; + barrier.Transition.StateAfter = stateAfter; + barrier.Transition.Subresource = subresource; + return result; + } + static inline CD3DX12_RESOURCE_BARRIER Aliasing(_In_ ID3D12Resource *pResourceBefore, + _In_ ID3D12Resource *pResourceAfter) noexcept { + CD3DX12_RESOURCE_BARRIER result = {}; + D3D12_RESOURCE_BARRIER &barrier = result; + result.Type = D3D12_RESOURCE_BARRIER_TYPE_ALIASING; + barrier.Aliasing.pResourceBefore = pResourceBefore; + barrier.Aliasing.pResourceAfter = pResourceAfter; + return result; + } + static inline CD3DX12_RESOURCE_BARRIER UAV(_In_ ID3D12Resource *pResource) noexcept { + CD3DX12_RESOURCE_BARRIER result = {}; + D3D12_RESOURCE_BARRIER &barrier = result; + result.Type = D3D12_RESOURCE_BARRIER_TYPE_UAV; + barrier.UAV.pResource = pResource; + return result; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_PACKED_MIP_INFO : public D3D12_PACKED_MIP_INFO { + CD3DX12_PACKED_MIP_INFO() = default; + explicit CD3DX12_PACKED_MIP_INFO(const D3D12_PACKED_MIP_INFO &o) noexcept : D3D12_PACKED_MIP_INFO(o) {} + CD3DX12_PACKED_MIP_INFO(UINT8 numStandardMips, UINT8 numPackedMips, UINT numTilesForPackedMips, + UINT startTileIndexInOverallResource) + noexcept { + NumStandardMips = numStandardMips; + NumPackedMips = numPackedMips; + NumTilesForPackedMips = numTilesForPackedMips; + StartTileIndexInOverallResource = startTileIndexInOverallResource; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_SUBRESOURCE_FOOTPRINT : public D3D12_SUBRESOURCE_FOOTPRINT { + CD3DX12_SUBRESOURCE_FOOTPRINT() = default; + explicit CD3DX12_SUBRESOURCE_FOOTPRINT(const D3D12_SUBRESOURCE_FOOTPRINT &o) noexcept + : D3D12_SUBRESOURCE_FOOTPRINT(o) {} + CD3DX12_SUBRESOURCE_FOOTPRINT(DXGI_FORMAT format, UINT width, UINT height, UINT depth, UINT 
rowPitch) noexcept { + Format = format; + Width = width; + Height = height; + Depth = depth; + RowPitch = rowPitch; + } + explicit CD3DX12_SUBRESOURCE_FOOTPRINT(const D3D12_RESOURCE_DESC &resDesc, UINT rowPitch) noexcept { + Format = resDesc.Format; + Width = UINT(resDesc.Width); + Height = resDesc.Height; + Depth = (resDesc.Dimension == D3D12_RESOURCE_DIMENSION_TEXTURE3D ? resDesc.DepthOrArraySize : 1); + RowPitch = rowPitch; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_TEXTURE_COPY_LOCATION : public D3D12_TEXTURE_COPY_LOCATION { + CD3DX12_TEXTURE_COPY_LOCATION() = default; + explicit CD3DX12_TEXTURE_COPY_LOCATION(const D3D12_TEXTURE_COPY_LOCATION &o) noexcept + : D3D12_TEXTURE_COPY_LOCATION(o) {} + CD3DX12_TEXTURE_COPY_LOCATION(_In_ ID3D12Resource *pRes) noexcept { + pResource = pRes; + Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX; + PlacedFootprint = {}; + } + CD3DX12_TEXTURE_COPY_LOCATION(_In_ ID3D12Resource *pRes, D3D12_PLACED_SUBRESOURCE_FOOTPRINT const &Footprint) + noexcept { + pResource = pRes; + Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT; + PlacedFootprint = Footprint; + } + CD3DX12_TEXTURE_COPY_LOCATION(_In_ ID3D12Resource *pRes, UINT Sub) noexcept { + pResource = pRes; + Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX; + PlacedFootprint = {}; + SubresourceIndex = Sub; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_DESCRIPTOR_RANGE : public D3D12_DESCRIPTOR_RANGE { + CD3DX12_DESCRIPTOR_RANGE() = default; + explicit CD3DX12_DESCRIPTOR_RANGE(const D3D12_DESCRIPTOR_RANGE &o) noexcept : D3D12_DESCRIPTOR_RANGE(o) {} + CD3DX12_DESCRIPTOR_RANGE(D3D12_DESCRIPTOR_RANGE_TYPE rangeType, UINT numDescriptors, UINT baseShaderRegister, + UINT registerSpace = 0, + UINT offsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND) + noexcept { + Init(rangeType, numDescriptors, 
baseShaderRegister, registerSpace, offsetInDescriptorsFromTableStart); + } + + inline void Init(D3D12_DESCRIPTOR_RANGE_TYPE rangeType, UINT numDescriptors, UINT baseShaderRegister, + UINT registerSpace = 0, + UINT offsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND) noexcept { + Init(*this, rangeType, numDescriptors, baseShaderRegister, registerSpace, offsetInDescriptorsFromTableStart); + } + + static inline void Init(_Out_ D3D12_DESCRIPTOR_RANGE &range, D3D12_DESCRIPTOR_RANGE_TYPE rangeType, + UINT numDescriptors, UINT baseShaderRegister, UINT registerSpace = 0, + UINT offsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND) noexcept { + range.RangeType = rangeType; + range.NumDescriptors = numDescriptors; + range.BaseShaderRegister = baseShaderRegister; + range.RegisterSpace = registerSpace; + range.OffsetInDescriptorsFromTableStart = offsetInDescriptorsFromTableStart; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_ROOT_DESCRIPTOR_TABLE : public D3D12_ROOT_DESCRIPTOR_TABLE { + CD3DX12_ROOT_DESCRIPTOR_TABLE() = default; + explicit CD3DX12_ROOT_DESCRIPTOR_TABLE(const D3D12_ROOT_DESCRIPTOR_TABLE &o) noexcept + : D3D12_ROOT_DESCRIPTOR_TABLE(o) {} + CD3DX12_ROOT_DESCRIPTOR_TABLE(UINT numDescriptorRanges, + _In_reads_opt_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE *_pDescriptorRanges) + noexcept { + Init(numDescriptorRanges, _pDescriptorRanges); + } + + inline void Init(UINT numDescriptorRanges, + _In_reads_opt_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE *_pDescriptorRanges) noexcept { + Init(*this, numDescriptorRanges, _pDescriptorRanges); + } + + static inline void Init(_Out_ D3D12_ROOT_DESCRIPTOR_TABLE &rootDescriptorTable, UINT numDescriptorRanges, + _In_reads_opt_(numDescriptorRanges) + const D3D12_DESCRIPTOR_RANGE *_pDescriptorRanges) noexcept { + rootDescriptorTable.NumDescriptorRanges = numDescriptorRanges; + 
rootDescriptorTable.pDescriptorRanges = _pDescriptorRanges; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_ROOT_CONSTANTS : public D3D12_ROOT_CONSTANTS { + CD3DX12_ROOT_CONSTANTS() = default; + explicit CD3DX12_ROOT_CONSTANTS(const D3D12_ROOT_CONSTANTS &o) noexcept : D3D12_ROOT_CONSTANTS(o) {} + CD3DX12_ROOT_CONSTANTS(UINT num32BitValues, UINT shaderRegister, UINT registerSpace = 0) noexcept { + Init(num32BitValues, shaderRegister, registerSpace); + } + + inline void Init(UINT num32BitValues, UINT shaderRegister, UINT registerSpace = 0) noexcept { + Init(*this, num32BitValues, shaderRegister, registerSpace); + } + + static inline void Init(_Out_ D3D12_ROOT_CONSTANTS &rootConstants, UINT num32BitValues, UINT shaderRegister, + UINT registerSpace = 0) noexcept { + rootConstants.Num32BitValues = num32BitValues; + rootConstants.ShaderRegister = shaderRegister; + rootConstants.RegisterSpace = registerSpace; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_ROOT_DESCRIPTOR : public D3D12_ROOT_DESCRIPTOR { + CD3DX12_ROOT_DESCRIPTOR() = default; + explicit CD3DX12_ROOT_DESCRIPTOR(const D3D12_ROOT_DESCRIPTOR &o) noexcept : D3D12_ROOT_DESCRIPTOR(o) {} + CD3DX12_ROOT_DESCRIPTOR(UINT shaderRegister, UINT registerSpace = 0) noexcept { + Init(shaderRegister, registerSpace); + } + + inline void Init(UINT shaderRegister, UINT registerSpace = 0) noexcept { + Init(*this, shaderRegister, registerSpace); + } + + static inline void Init(_Out_ D3D12_ROOT_DESCRIPTOR &table, UINT shaderRegister, UINT registerSpace = 0) noexcept { + table.ShaderRegister = shaderRegister; + table.RegisterSpace = registerSpace; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_ROOT_PARAMETER : public D3D12_ROOT_PARAMETER { + CD3DX12_ROOT_PARAMETER() = default; + explicit 
CD3DX12_ROOT_PARAMETER(const D3D12_ROOT_PARAMETER &o) noexcept : D3D12_ROOT_PARAMETER(o) {} + + static inline void + InitAsDescriptorTable(_Out_ D3D12_ROOT_PARAMETER &rootParam, UINT numDescriptorRanges, + _In_reads_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE *pDescriptorRanges, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept { + rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; + rootParam.ShaderVisibility = visibility; + CD3DX12_ROOT_DESCRIPTOR_TABLE::Init(rootParam.DescriptorTable, numDescriptorRanges, pDescriptorRanges); + } + + static inline void InitAsConstants(_Out_ D3D12_ROOT_PARAMETER &rootParam, UINT num32BitValues, UINT shaderRegister, + UINT registerSpace = 0, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept { + rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS; + rootParam.ShaderVisibility = visibility; + CD3DX12_ROOT_CONSTANTS::Init(rootParam.Constants, num32BitValues, shaderRegister, registerSpace); + } + + static inline void + InitAsConstantBufferView(_Out_ D3D12_ROOT_PARAMETER &rootParam, UINT shaderRegister, UINT registerSpace = 0, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept { + rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV; + rootParam.ShaderVisibility = visibility; + CD3DX12_ROOT_DESCRIPTOR::Init(rootParam.Descriptor, shaderRegister, registerSpace); + } + + static inline void + InitAsShaderResourceView(_Out_ D3D12_ROOT_PARAMETER &rootParam, UINT shaderRegister, UINT registerSpace = 0, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept { + rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_SRV; + rootParam.ShaderVisibility = visibility; + CD3DX12_ROOT_DESCRIPTOR::Init(rootParam.Descriptor, shaderRegister, registerSpace); + } + + static inline void + InitAsUnorderedAccessView(_Out_ D3D12_ROOT_PARAMETER &rootParam, UINT shaderRegister, UINT registerSpace = 0, + 
D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept { + rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_UAV; + rootParam.ShaderVisibility = visibility; + CD3DX12_ROOT_DESCRIPTOR::Init(rootParam.Descriptor, shaderRegister, registerSpace); + } + + inline void InitAsDescriptorTable(UINT numDescriptorRanges, + _In_reads_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE *pDescriptorRanges, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept { + InitAsDescriptorTable(*this, numDescriptorRanges, pDescriptorRanges, visibility); + } + + inline void InitAsConstants(UINT num32BitValues, UINT shaderRegister, UINT registerSpace = 0, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept { + InitAsConstants(*this, num32BitValues, shaderRegister, registerSpace, visibility); + } + + inline void InitAsConstantBufferView(UINT shaderRegister, UINT registerSpace = 0, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept { + InitAsConstantBufferView(*this, shaderRegister, registerSpace, visibility); + } + + inline void InitAsShaderResourceView(UINT shaderRegister, UINT registerSpace = 0, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept { + InitAsShaderResourceView(*this, shaderRegister, registerSpace, visibility); + } + + inline void InitAsUnorderedAccessView(UINT shaderRegister, UINT registerSpace = 0, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept { + InitAsUnorderedAccessView(*this, shaderRegister, registerSpace, visibility); + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_STATIC_SAMPLER_DESC : public D3D12_STATIC_SAMPLER_DESC { + CD3DX12_STATIC_SAMPLER_DESC() = default; + explicit CD3DX12_STATIC_SAMPLER_DESC(const D3D12_STATIC_SAMPLER_DESC &o) noexcept : D3D12_STATIC_SAMPLER_DESC(o) {} + CD3DX12_STATIC_SAMPLER_DESC(UINT shaderRegister, D3D12_FILTER 
filter = D3D12_FILTER_ANISOTROPIC, + D3D12_TEXTURE_ADDRESS_MODE addressU = D3D12_TEXTURE_ADDRESS_MODE_WRAP, + D3D12_TEXTURE_ADDRESS_MODE addressV = D3D12_TEXTURE_ADDRESS_MODE_WRAP, + D3D12_TEXTURE_ADDRESS_MODE addressW = D3D12_TEXTURE_ADDRESS_MODE_WRAP, + FLOAT mipLODBias = 0, UINT maxAnisotropy = 16, + D3D12_COMPARISON_FUNC comparisonFunc = D3D12_COMPARISON_FUNC_LESS_EQUAL, + D3D12_STATIC_BORDER_COLOR borderColor = D3D12_STATIC_BORDER_COLOR_OPAQUE_WHITE, + FLOAT minLOD = 0.f, FLOAT maxLOD = D3D12_FLOAT32_MAX, + D3D12_SHADER_VISIBILITY shaderVisibility = D3D12_SHADER_VISIBILITY_ALL, + UINT registerSpace = 0) + noexcept { + Init(shaderRegister, filter, addressU, addressV, addressW, mipLODBias, maxAnisotropy, comparisonFunc, + borderColor, minLOD, maxLOD, shaderVisibility, registerSpace); + } + + static inline void Init(_Out_ D3D12_STATIC_SAMPLER_DESC &samplerDesc, UINT shaderRegister, + D3D12_FILTER filter = D3D12_FILTER_ANISOTROPIC, + D3D12_TEXTURE_ADDRESS_MODE addressU = D3D12_TEXTURE_ADDRESS_MODE_WRAP, + D3D12_TEXTURE_ADDRESS_MODE addressV = D3D12_TEXTURE_ADDRESS_MODE_WRAP, + D3D12_TEXTURE_ADDRESS_MODE addressW = D3D12_TEXTURE_ADDRESS_MODE_WRAP, FLOAT mipLODBias = 0, + UINT maxAnisotropy = 16, + D3D12_COMPARISON_FUNC comparisonFunc = D3D12_COMPARISON_FUNC_LESS_EQUAL, + D3D12_STATIC_BORDER_COLOR borderColor = D3D12_STATIC_BORDER_COLOR_OPAQUE_WHITE, + FLOAT minLOD = 0.f, FLOAT maxLOD = D3D12_FLOAT32_MAX, + D3D12_SHADER_VISIBILITY shaderVisibility = D3D12_SHADER_VISIBILITY_ALL, + UINT registerSpace = 0) noexcept { + samplerDesc.ShaderRegister = shaderRegister; + samplerDesc.Filter = filter; + samplerDesc.AddressU = addressU; + samplerDesc.AddressV = addressV; + samplerDesc.AddressW = addressW; + samplerDesc.MipLODBias = mipLODBias; + samplerDesc.MaxAnisotropy = maxAnisotropy; + samplerDesc.ComparisonFunc = comparisonFunc; + samplerDesc.BorderColor = borderColor; + samplerDesc.MinLOD = minLOD; + samplerDesc.MaxLOD = maxLOD; + samplerDesc.ShaderVisibility = 
shaderVisibility; + samplerDesc.RegisterSpace = registerSpace; + } + inline void Init(UINT shaderRegister, D3D12_FILTER filter = D3D12_FILTER_ANISOTROPIC, + D3D12_TEXTURE_ADDRESS_MODE addressU = D3D12_TEXTURE_ADDRESS_MODE_WRAP, + D3D12_TEXTURE_ADDRESS_MODE addressV = D3D12_TEXTURE_ADDRESS_MODE_WRAP, + D3D12_TEXTURE_ADDRESS_MODE addressW = D3D12_TEXTURE_ADDRESS_MODE_WRAP, FLOAT mipLODBias = 0, + UINT maxAnisotropy = 16, D3D12_COMPARISON_FUNC comparisonFunc = D3D12_COMPARISON_FUNC_LESS_EQUAL, + D3D12_STATIC_BORDER_COLOR borderColor = D3D12_STATIC_BORDER_COLOR_OPAQUE_WHITE, FLOAT minLOD = 0.f, + FLOAT maxLOD = D3D12_FLOAT32_MAX, + D3D12_SHADER_VISIBILITY shaderVisibility = D3D12_SHADER_VISIBILITY_ALL, + UINT registerSpace = 0) noexcept { + Init(*this, shaderRegister, filter, addressU, addressV, addressW, mipLODBias, maxAnisotropy, comparisonFunc, + borderColor, minLOD, maxLOD, shaderVisibility, registerSpace); + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_ROOT_SIGNATURE_DESC : public D3D12_ROOT_SIGNATURE_DESC { + CD3DX12_ROOT_SIGNATURE_DESC() = default; + explicit CD3DX12_ROOT_SIGNATURE_DESC(const D3D12_ROOT_SIGNATURE_DESC &o) noexcept : D3D12_ROOT_SIGNATURE_DESC(o) {} + CD3DX12_ROOT_SIGNATURE_DESC(UINT numParameters, + _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER *_pParameters, + UINT numStaticSamplers = 0, + _In_reads_opt_(numStaticSamplers) + const D3D12_STATIC_SAMPLER_DESC *_pStaticSamplers = nullptr, + D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) + noexcept { + Init(numParameters, _pParameters, numStaticSamplers, _pStaticSamplers, flags); + } + CD3DX12_ROOT_SIGNATURE_DESC(CD3DX12_DEFAULT) noexcept { + Init(0, nullptr, 0, nullptr, D3D12_ROOT_SIGNATURE_FLAG_NONE); + } + + inline void Init(UINT numParameters, _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER *_pParameters, + UINT numStaticSamplers = 0, + _In_reads_opt_(numStaticSamplers) const 
D3D12_STATIC_SAMPLER_DESC *_pStaticSamplers = nullptr, + D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) noexcept { + Init(*this, numParameters, _pParameters, numStaticSamplers, _pStaticSamplers, flags); + } + + static inline void + Init(_Out_ D3D12_ROOT_SIGNATURE_DESC &desc, UINT numParameters, + _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER *_pParameters, UINT numStaticSamplers = 0, + _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC *_pStaticSamplers = nullptr, + D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) noexcept { + desc.NumParameters = numParameters; + desc.pParameters = _pParameters; + desc.NumStaticSamplers = numStaticSamplers; + desc.pStaticSamplers = _pStaticSamplers; + desc.Flags = flags; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_DESCRIPTOR_RANGE1 : public D3D12_DESCRIPTOR_RANGE1 { + CD3DX12_DESCRIPTOR_RANGE1() = default; + explicit CD3DX12_DESCRIPTOR_RANGE1(const D3D12_DESCRIPTOR_RANGE1 &o) noexcept : D3D12_DESCRIPTOR_RANGE1(o) {} + CD3DX12_DESCRIPTOR_RANGE1(D3D12_DESCRIPTOR_RANGE_TYPE rangeType, UINT numDescriptors, UINT baseShaderRegister, + UINT registerSpace = 0, + D3D12_DESCRIPTOR_RANGE_FLAGS flags = D3D12_DESCRIPTOR_RANGE_FLAG_NONE, + UINT offsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND) + noexcept { + Init(rangeType, numDescriptors, baseShaderRegister, registerSpace, flags, offsetInDescriptorsFromTableStart); + } + + inline void Init(D3D12_DESCRIPTOR_RANGE_TYPE rangeType, UINT numDescriptors, UINT baseShaderRegister, + UINT registerSpace = 0, D3D12_DESCRIPTOR_RANGE_FLAGS flags = D3D12_DESCRIPTOR_RANGE_FLAG_NONE, + UINT offsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND) noexcept { + Init(*this, rangeType, numDescriptors, baseShaderRegister, registerSpace, flags, + offsetInDescriptorsFromTableStart); + } + + static inline void Init(_Out_ 
D3D12_DESCRIPTOR_RANGE1 &range, D3D12_DESCRIPTOR_RANGE_TYPE rangeType, + UINT numDescriptors, UINT baseShaderRegister, UINT registerSpace = 0, + D3D12_DESCRIPTOR_RANGE_FLAGS flags = D3D12_DESCRIPTOR_RANGE_FLAG_NONE, + UINT offsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND) noexcept { + range.RangeType = rangeType; + range.NumDescriptors = numDescriptors; + range.BaseShaderRegister = baseShaderRegister; + range.RegisterSpace = registerSpace; + range.Flags = flags; + range.OffsetInDescriptorsFromTableStart = offsetInDescriptorsFromTableStart; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_ROOT_DESCRIPTOR_TABLE1 : public D3D12_ROOT_DESCRIPTOR_TABLE1 { + CD3DX12_ROOT_DESCRIPTOR_TABLE1() = default; + explicit CD3DX12_ROOT_DESCRIPTOR_TABLE1(const D3D12_ROOT_DESCRIPTOR_TABLE1 &o) noexcept + : D3D12_ROOT_DESCRIPTOR_TABLE1(o) {} + CD3DX12_ROOT_DESCRIPTOR_TABLE1(UINT numDescriptorRanges, _In_reads_opt_(numDescriptorRanges) + const D3D12_DESCRIPTOR_RANGE1 *_pDescriptorRanges) + noexcept { + Init(numDescriptorRanges, _pDescriptorRanges); + } + + inline void Init(UINT numDescriptorRanges, + _In_reads_opt_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE1 *_pDescriptorRanges) noexcept { + Init(*this, numDescriptorRanges, _pDescriptorRanges); + } + + static inline void Init(_Out_ D3D12_ROOT_DESCRIPTOR_TABLE1 &rootDescriptorTable, UINT numDescriptorRanges, + _In_reads_opt_(numDescriptorRanges) + const D3D12_DESCRIPTOR_RANGE1 *_pDescriptorRanges) noexcept { + rootDescriptorTable.NumDescriptorRanges = numDescriptorRanges; + rootDescriptorTable.pDescriptorRanges = _pDescriptorRanges; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_ROOT_DESCRIPTOR1 : public D3D12_ROOT_DESCRIPTOR1 { + CD3DX12_ROOT_DESCRIPTOR1() = default; + explicit CD3DX12_ROOT_DESCRIPTOR1(const D3D12_ROOT_DESCRIPTOR1 &o) noexcept : 
D3D12_ROOT_DESCRIPTOR1(o) {} + CD3DX12_ROOT_DESCRIPTOR1(UINT shaderRegister, UINT registerSpace = 0, + D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE) + noexcept { + Init(shaderRegister, registerSpace, flags); + } + + inline void Init(UINT shaderRegister, UINT registerSpace = 0, + D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE) noexcept { + Init(*this, shaderRegister, registerSpace, flags); + } + + static inline void Init(_Out_ D3D12_ROOT_DESCRIPTOR1 &table, UINT shaderRegister, UINT registerSpace = 0, + D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE) noexcept { + table.ShaderRegister = shaderRegister; + table.RegisterSpace = registerSpace; + table.Flags = flags; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_ROOT_PARAMETER1 : public D3D12_ROOT_PARAMETER1 { + CD3DX12_ROOT_PARAMETER1() = default; + explicit CD3DX12_ROOT_PARAMETER1(const D3D12_ROOT_PARAMETER1 &o) noexcept : D3D12_ROOT_PARAMETER1(o) {} + + static inline void + InitAsDescriptorTable(_Out_ D3D12_ROOT_PARAMETER1 &rootParam, UINT numDescriptorRanges, + _In_reads_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE1 *pDescriptorRanges, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept { + rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; + rootParam.ShaderVisibility = visibility; + CD3DX12_ROOT_DESCRIPTOR_TABLE1::Init(rootParam.DescriptorTable, numDescriptorRanges, pDescriptorRanges); + } + + static inline void InitAsConstants(_Out_ D3D12_ROOT_PARAMETER1 &rootParam, UINT num32BitValues, UINT shaderRegister, + UINT registerSpace = 0, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept { + rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS; + rootParam.ShaderVisibility = visibility; + CD3DX12_ROOT_CONSTANTS::Init(rootParam.Constants, num32BitValues, shaderRegister, registerSpace); + 
} + + static inline void + InitAsConstantBufferView(_Out_ D3D12_ROOT_PARAMETER1 &rootParam, UINT shaderRegister, UINT registerSpace = 0, + D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept { + rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV; + rootParam.ShaderVisibility = visibility; + CD3DX12_ROOT_DESCRIPTOR1::Init(rootParam.Descriptor, shaderRegister, registerSpace, flags); + } + + static inline void + InitAsShaderResourceView(_Out_ D3D12_ROOT_PARAMETER1 &rootParam, UINT shaderRegister, UINT registerSpace = 0, + D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept { + rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_SRV; + rootParam.ShaderVisibility = visibility; + CD3DX12_ROOT_DESCRIPTOR1::Init(rootParam.Descriptor, shaderRegister, registerSpace, flags); + } + + static inline void + InitAsUnorderedAccessView(_Out_ D3D12_ROOT_PARAMETER1 &rootParam, UINT shaderRegister, UINT registerSpace = 0, + D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept { + rootParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_UAV; + rootParam.ShaderVisibility = visibility; + CD3DX12_ROOT_DESCRIPTOR1::Init(rootParam.Descriptor, shaderRegister, registerSpace, flags); + } + + inline void InitAsDescriptorTable(UINT numDescriptorRanges, + _In_reads_(numDescriptorRanges) const D3D12_DESCRIPTOR_RANGE1 *pDescriptorRanges, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept { + InitAsDescriptorTable(*this, numDescriptorRanges, pDescriptorRanges, visibility); + } + + inline void InitAsConstants(UINT num32BitValues, UINT shaderRegister, UINT registerSpace = 0, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept { + InitAsConstants(*this, num32BitValues, 
shaderRegister, registerSpace, visibility); + } + + inline void InitAsConstantBufferView(UINT shaderRegister, UINT registerSpace = 0, + D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept { + InitAsConstantBufferView(*this, shaderRegister, registerSpace, flags, visibility); + } + + inline void InitAsShaderResourceView(UINT shaderRegister, UINT registerSpace = 0, + D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept { + InitAsShaderResourceView(*this, shaderRegister, registerSpace, flags, visibility); + } + + inline void InitAsUnorderedAccessView(UINT shaderRegister, UINT registerSpace = 0, + D3D12_ROOT_DESCRIPTOR_FLAGS flags = D3D12_ROOT_DESCRIPTOR_FLAG_NONE, + D3D12_SHADER_VISIBILITY visibility = D3D12_SHADER_VISIBILITY_ALL) noexcept { + InitAsUnorderedAccessView(*this, shaderRegister, registerSpace, flags, visibility); + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC : public D3D12_VERSIONED_ROOT_SIGNATURE_DESC { + CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC() = default; + explicit CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC(const D3D12_VERSIONED_ROOT_SIGNATURE_DESC &o) noexcept + : D3D12_VERSIONED_ROOT_SIGNATURE_DESC(o) {} + explicit CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC(const D3D12_ROOT_SIGNATURE_DESC &o) noexcept { + Version = D3D_ROOT_SIGNATURE_VERSION_1_0; + Desc_1_0 = o; + } + explicit CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC(const D3D12_ROOT_SIGNATURE_DESC1 &o) noexcept { + Version = D3D_ROOT_SIGNATURE_VERSION_1_1; + Desc_1_1 = o; + } + CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC(UINT numParameters, + _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER *_pParameters, + UINT numStaticSamplers = 0, + _In_reads_opt_(numStaticSamplers) + const D3D12_STATIC_SAMPLER_DESC *_pStaticSamplers 
= nullptr, + D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) + noexcept { + Init_1_0(numParameters, _pParameters, numStaticSamplers, _pStaticSamplers, flags); + } + CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC(UINT numParameters, + _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER1 *_pParameters, + UINT numStaticSamplers = 0, + _In_reads_opt_(numStaticSamplers) + const D3D12_STATIC_SAMPLER_DESC *_pStaticSamplers = nullptr, + D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) + noexcept { + Init_1_1(numParameters, _pParameters, numStaticSamplers, _pStaticSamplers, flags); + } + CD3DX12_VERSIONED_ROOT_SIGNATURE_DESC(CD3DX12_DEFAULT) noexcept { + Init_1_1(0, nullptr, 0, nullptr, D3D12_ROOT_SIGNATURE_FLAG_NONE); + } + + inline void Init_1_0(UINT numParameters, _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER *_pParameters, + UINT numStaticSamplers = 0, + _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC *_pStaticSamplers = nullptr, + D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) noexcept { + Init_1_0(*this, numParameters, _pParameters, numStaticSamplers, _pStaticSamplers, flags); + } + + static inline void + Init_1_0(_Out_ D3D12_VERSIONED_ROOT_SIGNATURE_DESC &desc, UINT numParameters, + _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER *_pParameters, UINT numStaticSamplers = 0, + _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC *_pStaticSamplers = nullptr, + D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) noexcept { + desc.Version = D3D_ROOT_SIGNATURE_VERSION_1_0; + desc.Desc_1_0.NumParameters = numParameters; + desc.Desc_1_0.pParameters = _pParameters; + desc.Desc_1_0.NumStaticSamplers = numStaticSamplers; + desc.Desc_1_0.pStaticSamplers = _pStaticSamplers; + desc.Desc_1_0.Flags = flags; + } + + inline void Init_1_1(UINT numParameters, _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER1 *_pParameters, + UINT numStaticSamplers = 0, + 
_In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC *_pStaticSamplers = nullptr, + D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) noexcept { + Init_1_1(*this, numParameters, _pParameters, numStaticSamplers, _pStaticSamplers, flags); + } + + static inline void + Init_1_1(_Out_ D3D12_VERSIONED_ROOT_SIGNATURE_DESC &desc, UINT numParameters, + _In_reads_opt_(numParameters) const D3D12_ROOT_PARAMETER1 *_pParameters, UINT numStaticSamplers = 0, + _In_reads_opt_(numStaticSamplers) const D3D12_STATIC_SAMPLER_DESC *_pStaticSamplers = nullptr, + D3D12_ROOT_SIGNATURE_FLAGS flags = D3D12_ROOT_SIGNATURE_FLAG_NONE) noexcept { + desc.Version = D3D_ROOT_SIGNATURE_VERSION_1_1; + desc.Desc_1_1.NumParameters = numParameters; + desc.Desc_1_1.pParameters = _pParameters; + desc.Desc_1_1.NumStaticSamplers = numStaticSamplers; + desc.Desc_1_1.pStaticSamplers = _pStaticSamplers; + desc.Desc_1_1.Flags = flags; + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_CPU_DESCRIPTOR_HANDLE : public D3D12_CPU_DESCRIPTOR_HANDLE { + CD3DX12_CPU_DESCRIPTOR_HANDLE() = default; + explicit CD3DX12_CPU_DESCRIPTOR_HANDLE(const D3D12_CPU_DESCRIPTOR_HANDLE &o) noexcept + : D3D12_CPU_DESCRIPTOR_HANDLE(o) {} + CD3DX12_CPU_DESCRIPTOR_HANDLE(CD3DX12_DEFAULT) noexcept { ptr = 0; } + CD3DX12_CPU_DESCRIPTOR_HANDLE(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &other, INT offsetScaledByIncrementSize) + noexcept { + InitOffsetted(other, offsetScaledByIncrementSize); + } + CD3DX12_CPU_DESCRIPTOR_HANDLE(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &other, INT offsetInDescriptors, + UINT descriptorIncrementSize) + noexcept { + InitOffsetted(other, offsetInDescriptors, descriptorIncrementSize); + } + CD3DX12_CPU_DESCRIPTOR_HANDLE &Offset(INT offsetInDescriptors, UINT descriptorIncrementSize) noexcept { + ptr = SIZE_T(INT64(ptr) + INT64(offsetInDescriptors) * INT64(descriptorIncrementSize)); + return *this; + } + 
CD3DX12_CPU_DESCRIPTOR_HANDLE &Offset(INT offsetScaledByIncrementSize) noexcept { + ptr = SIZE_T(INT64(ptr) + INT64(offsetScaledByIncrementSize)); + return *this; + } + bool operator==(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &other) const noexcept { return (ptr == other.ptr); } + bool operator!=(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &other) const noexcept { return (ptr != other.ptr); } + CD3DX12_CPU_DESCRIPTOR_HANDLE &operator=(const D3D12_CPU_DESCRIPTOR_HANDLE &other) noexcept { + ptr = other.ptr; + return *this; + } + + inline void InitOffsetted(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &base, INT offsetScaledByIncrementSize) noexcept { + InitOffsetted(*this, base, offsetScaledByIncrementSize); + } + + inline void InitOffsetted(_In_ const D3D12_CPU_DESCRIPTOR_HANDLE &base, INT offsetInDescriptors, + UINT descriptorIncrementSize) noexcept { + InitOffsetted(*this, base, offsetInDescriptors, descriptorIncrementSize); + } + + static inline void InitOffsetted(_Out_ D3D12_CPU_DESCRIPTOR_HANDLE &handle, + _In_ const D3D12_CPU_DESCRIPTOR_HANDLE &base, + INT offsetScaledByIncrementSize) noexcept { + handle.ptr = SIZE_T(INT64(base.ptr) + INT64(offsetScaledByIncrementSize)); + } + + static inline void InitOffsetted(_Out_ D3D12_CPU_DESCRIPTOR_HANDLE &handle, + _In_ const D3D12_CPU_DESCRIPTOR_HANDLE &base, INT offsetInDescriptors, + UINT descriptorIncrementSize) noexcept { + handle.ptr = SIZE_T(INT64(base.ptr) + INT64(offsetInDescriptors) * INT64(descriptorIncrementSize)); + } +}; + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_GPU_DESCRIPTOR_HANDLE : public D3D12_GPU_DESCRIPTOR_HANDLE { + CD3DX12_GPU_DESCRIPTOR_HANDLE() = default; + explicit CD3DX12_GPU_DESCRIPTOR_HANDLE(const D3D12_GPU_DESCRIPTOR_HANDLE &o) noexcept + : D3D12_GPU_DESCRIPTOR_HANDLE(o) {} + CD3DX12_GPU_DESCRIPTOR_HANDLE(CD3DX12_DEFAULT) noexcept { ptr = 0; } + CD3DX12_GPU_DESCRIPTOR_HANDLE(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &other, INT 
offsetScaledByIncrementSize) + noexcept { + InitOffsetted(other, offsetScaledByIncrementSize); + } + CD3DX12_GPU_DESCRIPTOR_HANDLE(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &other, INT offsetInDescriptors, + UINT descriptorIncrementSize) + noexcept { + InitOffsetted(other, offsetInDescriptors, descriptorIncrementSize); + } + CD3DX12_GPU_DESCRIPTOR_HANDLE &Offset(INT offsetInDescriptors, UINT descriptorIncrementSize) noexcept { + ptr = UINT64(INT64(ptr) + INT64(offsetInDescriptors) * INT64(descriptorIncrementSize)); + return *this; + } + CD3DX12_GPU_DESCRIPTOR_HANDLE &Offset(INT offsetScaledByIncrementSize) noexcept { + ptr = UINT64(INT64(ptr) + INT64(offsetScaledByIncrementSize)); + return *this; + } + inline bool operator==(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &other) const noexcept { return (ptr == other.ptr); } + inline bool operator!=(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &other) const noexcept { return (ptr != other.ptr); } + CD3DX12_GPU_DESCRIPTOR_HANDLE &operator=(const D3D12_GPU_DESCRIPTOR_HANDLE &other) noexcept { + ptr = other.ptr; + return *this; + } + + inline void InitOffsetted(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &base, INT offsetScaledByIncrementSize) noexcept { + InitOffsetted(*this, base, offsetScaledByIncrementSize); + } + + inline void InitOffsetted(_In_ const D3D12_GPU_DESCRIPTOR_HANDLE &base, INT offsetInDescriptors, + UINT descriptorIncrementSize) noexcept { + InitOffsetted(*this, base, offsetInDescriptors, descriptorIncrementSize); + } + + static inline void InitOffsetted(_Out_ D3D12_GPU_DESCRIPTOR_HANDLE &handle, + _In_ const D3D12_GPU_DESCRIPTOR_HANDLE &base, + INT offsetScaledByIncrementSize) noexcept { + handle.ptr = UINT64(INT64(base.ptr) + INT64(offsetScaledByIncrementSize)); + } + + static inline void InitOffsetted(_Out_ D3D12_GPU_DESCRIPTOR_HANDLE &handle, + _In_ const D3D12_GPU_DESCRIPTOR_HANDLE &base, INT offsetInDescriptors, + UINT descriptorIncrementSize) noexcept { + handle.ptr = UINT64(INT64(base.ptr) + 
INT64(offsetInDescriptors) * INT64(descriptorIncrementSize)); + } +}; + +//------------------------------------------------------------------------------------------------ +inline constexpr UINT D3D12CalcSubresource(UINT MipSlice, UINT ArraySlice, UINT PlaneSlice, UINT MipLevels, + UINT ArraySize) noexcept { + return MipSlice + ArraySlice * MipLevels + PlaneSlice * MipLevels * ArraySize; +} + +//------------------------------------------------------------------------------------------------ +template +inline void D3D12DecomposeSubresource(UINT Subresource, UINT MipLevels, UINT ArraySize, _Out_ T &MipSlice, + _Out_ U &ArraySlice, _Out_ V &PlaneSlice) noexcept { + MipSlice = static_cast(Subresource % MipLevels); + ArraySlice = static_cast((Subresource / MipLevels) % ArraySize); + PlaneSlice = static_cast(Subresource / (MipLevels * ArraySize)); +} + +//------------------------------------------------------------------------------------------------ +inline UINT8 D3D12GetFormatPlaneCount(_In_ ID3D12Device *pDevice, DXGI_FORMAT Format) noexcept { + D3D12_FEATURE_DATA_FORMAT_INFO formatInfo = {Format, 0}; + if (FAILED(pDevice->CheckFeatureSupport(D3D12_FEATURE_FORMAT_INFO, &formatInfo, sizeof(formatInfo)))) { + return 0; + } + return formatInfo.PlaneCount; +} + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_RESOURCE_DESC : public D3D12_RESOURCE_DESC { + CD3DX12_RESOURCE_DESC() = default; + explicit CD3DX12_RESOURCE_DESC(const D3D12_RESOURCE_DESC &o) noexcept : D3D12_RESOURCE_DESC(o) {} + CD3DX12_RESOURCE_DESC(D3D12_RESOURCE_DIMENSION dimension, UINT64 alignment, UINT64 width, UINT height, + UINT16 depthOrArraySize, UINT16 mipLevels, DXGI_FORMAT format, UINT sampleCount, + UINT sampleQuality, D3D12_TEXTURE_LAYOUT layout, D3D12_RESOURCE_FLAGS flags) + noexcept { + Dimension = dimension; + Alignment = alignment; + Width = width; + Height = height; + DepthOrArraySize = depthOrArraySize; + MipLevels = 
mipLevels; + Format = format; + SampleDesc.Count = sampleCount; + SampleDesc.Quality = sampleQuality; + Layout = layout; + Flags = flags; + } + static inline CD3DX12_RESOURCE_DESC Buffer(const D3D12_RESOURCE_ALLOCATION_INFO &resAllocInfo, + D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE) noexcept { + return CD3DX12_RESOURCE_DESC(D3D12_RESOURCE_DIMENSION_BUFFER, resAllocInfo.Alignment, resAllocInfo.SizeInBytes, + 1, 1, 1, DXGI_FORMAT_UNKNOWN, 1, 0, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, flags); + } + static inline CD3DX12_RESOURCE_DESC Buffer(UINT64 width, D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE, + UINT64 alignment = 0) noexcept { + return CD3DX12_RESOURCE_DESC(D3D12_RESOURCE_DIMENSION_BUFFER, alignment, width, 1, 1, 1, DXGI_FORMAT_UNKNOWN, 1, + 0, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, flags); + } + static inline CD3DX12_RESOURCE_DESC Tex1D(DXGI_FORMAT format, UINT64 width, UINT16 arraySize = 1, + UINT16 mipLevels = 0, + D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE, + D3D12_TEXTURE_LAYOUT layout = D3D12_TEXTURE_LAYOUT_UNKNOWN, + UINT64 alignment = 0) noexcept { + return CD3DX12_RESOURCE_DESC(D3D12_RESOURCE_DIMENSION_TEXTURE1D, alignment, width, 1, arraySize, mipLevels, + format, 1, 0, layout, flags); + } + static inline CD3DX12_RESOURCE_DESC Tex2D(DXGI_FORMAT format, UINT64 width, UINT height, UINT16 arraySize = 1, + UINT16 mipLevels = 0, UINT sampleCount = 1, UINT sampleQuality = 0, + D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE, + D3D12_TEXTURE_LAYOUT layout = D3D12_TEXTURE_LAYOUT_UNKNOWN, + UINT64 alignment = 0) noexcept { + return CD3DX12_RESOURCE_DESC(D3D12_RESOURCE_DIMENSION_TEXTURE2D, alignment, width, height, arraySize, mipLevels, + format, sampleCount, sampleQuality, layout, flags); + } + static inline CD3DX12_RESOURCE_DESC Tex3D(DXGI_FORMAT format, UINT64 width, UINT height, UINT16 depth, + UINT16 mipLevels = 0, + D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE, + D3D12_TEXTURE_LAYOUT layout = D3D12_TEXTURE_LAYOUT_UNKNOWN, + 
UINT64 alignment = 0) noexcept { + return CD3DX12_RESOURCE_DESC(D3D12_RESOURCE_DIMENSION_TEXTURE3D, alignment, width, height, depth, mipLevels, + format, 1, 0, layout, flags); + } + inline UINT16 Depth() const noexcept { + return (Dimension == D3D12_RESOURCE_DIMENSION_TEXTURE3D ? DepthOrArraySize : 1); + } + inline UINT16 ArraySize() const noexcept { + return (Dimension != D3D12_RESOURCE_DIMENSION_TEXTURE3D ? DepthOrArraySize : 1); + } + inline UINT8 PlaneCount(_In_ ID3D12Device *pDevice) const noexcept { + return D3D12GetFormatPlaneCount(pDevice, Format); + } + inline UINT Subresources(_In_ ID3D12Device *pDevice) const noexcept { + return MipLevels * ArraySize() * PlaneCount(pDevice); + } + inline UINT CalcSubresource(UINT MipSlice, UINT ArraySlice, UINT PlaneSlice) noexcept { + return D3D12CalcSubresource(MipSlice, ArraySlice, PlaneSlice, MipLevels, ArraySize()); + } +}; +inline bool operator==(const D3D12_RESOURCE_DESC &l, const D3D12_RESOURCE_DESC &r) noexcept { + return l.Dimension == r.Dimension && l.Alignment == r.Alignment && l.Width == r.Width && l.Height == r.Height && + l.DepthOrArraySize == r.DepthOrArraySize && l.MipLevels == r.MipLevels && l.Format == r.Format && + l.SampleDesc.Count == r.SampleDesc.Count && l.SampleDesc.Quality == r.SampleDesc.Quality && + l.Layout == r.Layout && l.Flags == r.Flags; +} +inline bool operator!=(const D3D12_RESOURCE_DESC &l, const D3D12_RESOURCE_DESC &r) noexcept { return !(l == r); } + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_RESOURCE_DESC1 : public D3D12_RESOURCE_DESC1 { + CD3DX12_RESOURCE_DESC1() = default; + explicit CD3DX12_RESOURCE_DESC1(const D3D12_RESOURCE_DESC1 &o) noexcept : D3D12_RESOURCE_DESC1(o) {} + CD3DX12_RESOURCE_DESC1(D3D12_RESOURCE_DIMENSION dimension, UINT64 alignment, UINT64 width, UINT height, + UINT16 depthOrArraySize, UINT16 mipLevels, DXGI_FORMAT format, UINT sampleCount, + UINT sampleQuality, D3D12_TEXTURE_LAYOUT layout, 
D3D12_RESOURCE_FLAGS flags, + UINT samplerFeedbackMipRegionWidth = 0, UINT samplerFeedbackMipRegionHeight = 0, + UINT samplerFeedbackMipRegionDepth = 0) + noexcept { + Dimension = dimension; + Alignment = alignment; + Width = width; + Height = height; + DepthOrArraySize = depthOrArraySize; + MipLevels = mipLevels; + Format = format; + SampleDesc.Count = sampleCount; + SampleDesc.Quality = sampleQuality; + Layout = layout; + Flags = flags; + SamplerFeedbackMipRegion.Width = samplerFeedbackMipRegionWidth; + SamplerFeedbackMipRegion.Height = samplerFeedbackMipRegionHeight; + SamplerFeedbackMipRegion.Depth = samplerFeedbackMipRegionDepth; + } + static inline CD3DX12_RESOURCE_DESC1 Buffer(const D3D12_RESOURCE_ALLOCATION_INFO &resAllocInfo, + D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE) noexcept { + return CD3DX12_RESOURCE_DESC1(D3D12_RESOURCE_DIMENSION_BUFFER, resAllocInfo.Alignment, resAllocInfo.SizeInBytes, + 1, 1, 1, DXGI_FORMAT_UNKNOWN, 1, 0, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, flags, 0, 0, + 0); + } + static inline CD3DX12_RESOURCE_DESC1 Buffer(UINT64 width, D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE, + UINT64 alignment = 0) noexcept { + return CD3DX12_RESOURCE_DESC1(D3D12_RESOURCE_DIMENSION_BUFFER, alignment, width, 1, 1, 1, DXGI_FORMAT_UNKNOWN, + 1, 0, D3D12_TEXTURE_LAYOUT_ROW_MAJOR, flags, 0, 0, 0); + } + static inline CD3DX12_RESOURCE_DESC1 Tex1D(DXGI_FORMAT format, UINT64 width, UINT16 arraySize = 1, + UINT16 mipLevels = 0, + D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE, + D3D12_TEXTURE_LAYOUT layout = D3D12_TEXTURE_LAYOUT_UNKNOWN, + UINT64 alignment = 0) noexcept { + return CD3DX12_RESOURCE_DESC1(D3D12_RESOURCE_DIMENSION_TEXTURE1D, alignment, width, 1, arraySize, mipLevels, + format, 1, 0, layout, flags, 0, 0, 0); + } + static inline CD3DX12_RESOURCE_DESC1 Tex2D(DXGI_FORMAT format, UINT64 width, UINT height, UINT16 arraySize = 1, + UINT16 mipLevels = 0, UINT sampleCount = 1, UINT sampleQuality = 0, + D3D12_RESOURCE_FLAGS flags = 
D3D12_RESOURCE_FLAG_NONE, + D3D12_TEXTURE_LAYOUT layout = D3D12_TEXTURE_LAYOUT_UNKNOWN, + UINT64 alignment = 0, UINT samplerFeedbackMipRegionWidth = 0, + UINT samplerFeedbackMipRegionHeight = 0, + UINT samplerFeedbackMipRegionDepth = 0) noexcept { + return CD3DX12_RESOURCE_DESC1(D3D12_RESOURCE_DIMENSION_TEXTURE2D, alignment, width, height, arraySize, + mipLevels, format, sampleCount, sampleQuality, layout, flags, + samplerFeedbackMipRegionWidth, samplerFeedbackMipRegionHeight, + samplerFeedbackMipRegionDepth); + } + static inline CD3DX12_RESOURCE_DESC1 Tex3D(DXGI_FORMAT format, UINT64 width, UINT height, UINT16 depth, + UINT16 mipLevels = 0, + D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE, + D3D12_TEXTURE_LAYOUT layout = D3D12_TEXTURE_LAYOUT_UNKNOWN, + UINT64 alignment = 0) noexcept { + return CD3DX12_RESOURCE_DESC1(D3D12_RESOURCE_DIMENSION_TEXTURE3D, alignment, width, height, depth, mipLevels, + format, 1, 0, layout, flags, 0, 0, 0); + } + inline UINT16 Depth() const noexcept { + return (Dimension == D3D12_RESOURCE_DIMENSION_TEXTURE3D ? DepthOrArraySize : 1); + } + inline UINT16 ArraySize() const noexcept { + return (Dimension != D3D12_RESOURCE_DIMENSION_TEXTURE3D ? 
DepthOrArraySize : 1); + } + inline UINT8 PlaneCount(_In_ ID3D12Device *pDevice) const noexcept { + return D3D12GetFormatPlaneCount(pDevice, Format); + } + inline UINT Subresources(_In_ ID3D12Device *pDevice) const noexcept { + return MipLevels * ArraySize() * PlaneCount(pDevice); + } + inline UINT CalcSubresource(UINT MipSlice, UINT ArraySlice, UINT PlaneSlice) noexcept { + return D3D12CalcSubresource(MipSlice, ArraySlice, PlaneSlice, MipLevels, ArraySize()); + } +}; +inline bool operator==(const D3D12_RESOURCE_DESC1 &l, const D3D12_RESOURCE_DESC1 &r) noexcept { + return l.Dimension == r.Dimension && l.Alignment == r.Alignment && l.Width == r.Width && l.Height == r.Height && + l.DepthOrArraySize == r.DepthOrArraySize && l.MipLevels == r.MipLevels && l.Format == r.Format && + l.SampleDesc.Count == r.SampleDesc.Count && l.SampleDesc.Quality == r.SampleDesc.Quality && + l.Layout == r.Layout && l.Flags == r.Flags && + l.SamplerFeedbackMipRegion.Width == r.SamplerFeedbackMipRegion.Width && + l.SamplerFeedbackMipRegion.Height == r.SamplerFeedbackMipRegion.Height && + l.SamplerFeedbackMipRegion.Depth == r.SamplerFeedbackMipRegion.Depth; +} +inline bool operator!=(const D3D12_RESOURCE_DESC1 &l, const D3D12_RESOURCE_DESC1 &r) noexcept { return !(l == r); } + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_VIEW_INSTANCING_DESC : public D3D12_VIEW_INSTANCING_DESC { + CD3DX12_VIEW_INSTANCING_DESC() = default; + explicit CD3DX12_VIEW_INSTANCING_DESC(const D3D12_VIEW_INSTANCING_DESC &o) noexcept + : D3D12_VIEW_INSTANCING_DESC(o) {} + explicit CD3DX12_VIEW_INSTANCING_DESC(CD3DX12_DEFAULT) noexcept { + ViewInstanceCount = 0; + pViewInstanceLocations = nullptr; + Flags = D3D12_VIEW_INSTANCING_FLAG_NONE; + } + explicit CD3DX12_VIEW_INSTANCING_DESC(UINT InViewInstanceCount, + const D3D12_VIEW_INSTANCE_LOCATION *InViewInstanceLocations, + D3D12_VIEW_INSTANCING_FLAGS InFlags) noexcept { + ViewInstanceCount = 
InViewInstanceCount; + pViewInstanceLocations = InViewInstanceLocations; + Flags = InFlags; + } +}; + +//------------------------------------------------------------------------------------------------ +// Row-by-row memcpy +inline void MemcpySubresource(_In_ const D3D12_MEMCPY_DEST *pDest, _In_ const D3D12_SUBRESOURCE_DATA *pSrc, + SIZE_T RowSizeInBytes, UINT NumRows, UINT NumSlices) noexcept { + for (UINT z = 0; z < NumSlices; ++z) { + auto pDestSlice = static_cast(pDest->pData) + pDest->SlicePitch * z; + auto pSrcSlice = static_cast(pSrc->pData) + pSrc->SlicePitch * LONG_PTR(z); + for (UINT y = 0; y < NumRows; ++y) { + memcpy(pDestSlice + pDest->RowPitch * y, pSrcSlice + pSrc->RowPitch * LONG_PTR(y), RowSizeInBytes); + } + } +} + +//------------------------------------------------------------------------------------------------ +// Row-by-row memcpy +inline void MemcpySubresource(_In_ const D3D12_MEMCPY_DEST *pDest, _In_ const void *pResourceData, + _In_ const D3D12_SUBRESOURCE_INFO *pSrc, SIZE_T RowSizeInBytes, UINT NumRows, + UINT NumSlices) noexcept { + for (UINT z = 0; z < NumSlices; ++z) { + auto pDestSlice = static_cast(pDest->pData) + pDest->SlicePitch * z; + auto pSrcSlice = (static_cast(pResourceData) + pSrc->Offset) + pSrc->DepthPitch * ULONG_PTR(z); + for (UINT y = 0; y < NumRows; ++y) { + memcpy(pDestSlice + pDest->RowPitch * y, pSrcSlice + pSrc->RowPitch * ULONG_PTR(y), RowSizeInBytes); + } + } +} + +//------------------------------------------------------------------------------------------------ +// Returns required size of a buffer to be used for data upload +inline UINT64 GetRequiredIntermediateSize(_In_ ID3D12Resource *pDestinationResource, + _In_range_(0, D3D12_REQ_SUBRESOURCES) UINT FirstSubresource, + _In_range_(0, D3D12_REQ_SUBRESOURCES - FirstSubresource) + UINT NumSubresources) noexcept { + auto Desc = pDestinationResource->GetDesc(); + UINT64 RequiredSize = 0; + + ID3D12Device *pDevice = nullptr; + 
pDestinationResource->GetDevice(IID_ID3D12Device, reinterpret_cast(&pDevice)); + pDevice->GetCopyableFootprints(&Desc, FirstSubresource, NumSubresources, 0, nullptr, nullptr, nullptr, + &RequiredSize); + pDevice->Release(); + + return RequiredSize; +} + +//------------------------------------------------------------------------------------------------ +// All arrays must be populated (e.g. by calling GetCopyableFootprints) +inline UINT64 UpdateSubresources(_In_ ID3D12GraphicsCommandList *pCmdList, _In_ ID3D12Resource *pDestinationResource, + _In_ ID3D12Resource *pIntermediate, + _In_range_(0, D3D12_REQ_SUBRESOURCES) UINT FirstSubresource, + _In_range_(0, D3D12_REQ_SUBRESOURCES - FirstSubresource) UINT NumSubresources, + UINT64 RequiredSize, + _In_reads_(NumSubresources) const D3D12_PLACED_SUBRESOURCE_FOOTPRINT *pLayouts, + _In_reads_(NumSubresources) const UINT *pNumRows, + _In_reads_(NumSubresources) const UINT64 *pRowSizesInBytes, + _In_reads_(NumSubresources) const D3D12_SUBRESOURCE_DATA *pSrcData) noexcept { + // Minor validation + auto IntermediateDesc = pIntermediate->GetDesc(); + auto DestinationDesc = pDestinationResource->GetDesc(); + if (IntermediateDesc.Dimension != D3D12_RESOURCE_DIMENSION_BUFFER || + IntermediateDesc.Width < RequiredSize + pLayouts[0].Offset || RequiredSize > SIZE_T(-1) || + (DestinationDesc.Dimension == D3D12_RESOURCE_DIMENSION_BUFFER && + (FirstSubresource != 0 || NumSubresources != 1))) { + return 0; + } + + BYTE *pData; + HRESULT hr = pIntermediate->Map(0, nullptr, reinterpret_cast(&pData)); + if (FAILED(hr)) { + return 0; + } + + for (UINT i = 0; i < NumSubresources; ++i) { + if (pRowSizesInBytes[i] > SIZE_T(-1)) + return 0; + D3D12_MEMCPY_DEST DestData = {pData + pLayouts[i].Offset, pLayouts[i].Footprint.RowPitch, + SIZE_T(pLayouts[i].Footprint.RowPitch) * SIZE_T(pNumRows[i])}; + MemcpySubresource(&DestData, &pSrcData[i], static_cast(pRowSizesInBytes[i]), pNumRows[i], + pLayouts[i].Footprint.Depth); + } + pIntermediate->Unmap(0, 
nullptr); + + if (DestinationDesc.Dimension == D3D12_RESOURCE_DIMENSION_BUFFER) { + pCmdList->CopyBufferRegion(pDestinationResource, 0, pIntermediate, pLayouts[0].Offset, + pLayouts[0].Footprint.Width); + } else { + for (UINT i = 0; i < NumSubresources; ++i) { + CD3DX12_TEXTURE_COPY_LOCATION Dst(pDestinationResource, i + FirstSubresource); + CD3DX12_TEXTURE_COPY_LOCATION Src(pIntermediate, pLayouts[i]); + pCmdList->CopyTextureRegion(&Dst, 0, 0, 0, &Src, nullptr); + } + } + return RequiredSize; +} + +//------------------------------------------------------------------------------------------------ +// All arrays must be populated (e.g. by calling GetCopyableFootprints) +inline UINT64 +UpdateSubresources(_In_ ID3D12GraphicsCommandList *pCmdList, _In_ ID3D12Resource *pDestinationResource, + _In_ ID3D12Resource *pIntermediate, _In_range_(0, D3D12_REQ_SUBRESOURCES) UINT FirstSubresource, + _In_range_(0, D3D12_REQ_SUBRESOURCES - FirstSubresource) UINT NumSubresources, UINT64 RequiredSize, + _In_reads_(NumSubresources) const D3D12_PLACED_SUBRESOURCE_FOOTPRINT *pLayouts, + _In_reads_(NumSubresources) const UINT *pNumRows, + _In_reads_(NumSubresources) const UINT64 *pRowSizesInBytes, _In_ const void *pResourceData, + _In_reads_(NumSubresources) const D3D12_SUBRESOURCE_INFO *pSrcData) noexcept { + // Minor validation + auto IntermediateDesc = pIntermediate->GetDesc(); + auto DestinationDesc = pDestinationResource->GetDesc(); + if (IntermediateDesc.Dimension != D3D12_RESOURCE_DIMENSION_BUFFER || + IntermediateDesc.Width < RequiredSize + pLayouts[0].Offset || RequiredSize > SIZE_T(-1) || + (DestinationDesc.Dimension == D3D12_RESOURCE_DIMENSION_BUFFER && + (FirstSubresource != 0 || NumSubresources != 1))) { + return 0; + } + + BYTE *pData; + HRESULT hr = pIntermediate->Map(0, nullptr, reinterpret_cast(&pData)); + if (FAILED(hr)) { + return 0; + } + + for (UINT i = 0; i < NumSubresources; ++i) { + if (pRowSizesInBytes[i] > SIZE_T(-1)) + return 0; + D3D12_MEMCPY_DEST DestData = 
{pData + pLayouts[i].Offset, pLayouts[i].Footprint.RowPitch, + SIZE_T(pLayouts[i].Footprint.RowPitch) * SIZE_T(pNumRows[i])}; + MemcpySubresource(&DestData, pResourceData, &pSrcData[i], static_cast(pRowSizesInBytes[i]), pNumRows[i], + pLayouts[i].Footprint.Depth); + } + pIntermediate->Unmap(0, nullptr); + + if (DestinationDesc.Dimension == D3D12_RESOURCE_DIMENSION_BUFFER) { + pCmdList->CopyBufferRegion(pDestinationResource, 0, pIntermediate, pLayouts[0].Offset, + pLayouts[0].Footprint.Width); + } else { + for (UINT i = 0; i < NumSubresources; ++i) { + CD3DX12_TEXTURE_COPY_LOCATION Dst(pDestinationResource, i + FirstSubresource); + CD3DX12_TEXTURE_COPY_LOCATION Src(pIntermediate, pLayouts[i]); + pCmdList->CopyTextureRegion(&Dst, 0, 0, 0, &Src, nullptr); + } + } + return RequiredSize; +} + +//------------------------------------------------------------------------------------------------ +// Heap-allocating UpdateSubresources implementation +inline UINT64 UpdateSubresources(_In_ ID3D12GraphicsCommandList *pCmdList, _In_ ID3D12Resource *pDestinationResource, + _In_ ID3D12Resource *pIntermediate, UINT64 IntermediateOffset, + _In_range_(0, D3D12_REQ_SUBRESOURCES) UINT FirstSubresource, + _In_range_(0, D3D12_REQ_SUBRESOURCES - FirstSubresource) UINT NumSubresources, + _In_reads_(NumSubresources) const D3D12_SUBRESOURCE_DATA *pSrcData) noexcept { + UINT64 RequiredSize = 0; + auto MemToAlloc = static_cast(sizeof(D3D12_PLACED_SUBRESOURCE_FOOTPRINT) + sizeof(UINT) + sizeof(UINT64)) * + NumSubresources; + if (MemToAlloc > SIZE_MAX) { + return 0; + } + void *pMem = HeapAlloc(GetProcessHeap(), 0, static_cast(MemToAlloc)); + if (pMem == nullptr) { + return 0; + } + auto pLayouts = static_cast(pMem); + auto pRowSizesInBytes = reinterpret_cast(pLayouts + NumSubresources); + auto pNumRows = reinterpret_cast(pRowSizesInBytes + NumSubresources); + + auto Desc = pDestinationResource->GetDesc(); + ID3D12Device *pDevice = nullptr; + pDestinationResource->GetDevice(IID_ID3D12Device, 
reinterpret_cast(&pDevice)); + pDevice->GetCopyableFootprints(&Desc, FirstSubresource, NumSubresources, IntermediateOffset, pLayouts, pNumRows, + pRowSizesInBytes, &RequiredSize); + pDevice->Release(); + + UINT64 Result = UpdateSubresources(pCmdList, pDestinationResource, pIntermediate, FirstSubresource, NumSubresources, + RequiredSize, pLayouts, pNumRows, pRowSizesInBytes, pSrcData); + HeapFree(GetProcessHeap(), 0, pMem); + return Result; +} + +//------------------------------------------------------------------------------------------------ +// Heap-allocating UpdateSubresources implementation +inline UINT64 UpdateSubresources(_In_ ID3D12GraphicsCommandList *pCmdList, _In_ ID3D12Resource *pDestinationResource, + _In_ ID3D12Resource *pIntermediate, UINT64 IntermediateOffset, + _In_range_(0, D3D12_REQ_SUBRESOURCES) UINT FirstSubresource, + _In_range_(0, D3D12_REQ_SUBRESOURCES - FirstSubresource) UINT NumSubresources, + _In_ const void *pResourceData, + _In_reads_(NumSubresources) D3D12_SUBRESOURCE_INFO *pSrcData) noexcept { + UINT64 RequiredSize = 0; + auto MemToAlloc = static_cast(sizeof(D3D12_PLACED_SUBRESOURCE_FOOTPRINT) + sizeof(UINT) + sizeof(UINT64)) * + NumSubresources; + if (MemToAlloc > SIZE_MAX) { + return 0; + } + void *pMem = HeapAlloc(GetProcessHeap(), 0, static_cast(MemToAlloc)); + if (pMem == nullptr) { + return 0; + } + auto pLayouts = reinterpret_cast(pMem); + auto pRowSizesInBytes = reinterpret_cast(pLayouts + NumSubresources); + auto pNumRows = reinterpret_cast(pRowSizesInBytes + NumSubresources); + + auto Desc = pDestinationResource->GetDesc(); + ID3D12Device *pDevice = nullptr; + pDestinationResource->GetDevice(IID_ID3D12Device, reinterpret_cast(&pDevice)); + pDevice->GetCopyableFootprints(&Desc, FirstSubresource, NumSubresources, IntermediateOffset, pLayouts, pNumRows, + pRowSizesInBytes, &RequiredSize); + pDevice->Release(); + + UINT64 Result = UpdateSubresources(pCmdList, pDestinationResource, pIntermediate, FirstSubresource, 
NumSubresources, + RequiredSize, pLayouts, pNumRows, pRowSizesInBytes, pResourceData, pSrcData); + HeapFree(GetProcessHeap(), 0, pMem); + return Result; +} + +//------------------------------------------------------------------------------------------------ +// Stack-allocating UpdateSubresources implementation +template +inline UINT64 UpdateSubresources(_In_ ID3D12GraphicsCommandList *pCmdList, _In_ ID3D12Resource *pDestinationResource, + _In_ ID3D12Resource *pIntermediate, UINT64 IntermediateOffset, + _In_range_(0, MaxSubresources) UINT FirstSubresource, + _In_range_(1, MaxSubresources - FirstSubresource) UINT NumSubresources, + _In_reads_(NumSubresources) const D3D12_SUBRESOURCE_DATA *pSrcData) noexcept { + UINT64 RequiredSize = 0; + D3D12_PLACED_SUBRESOURCE_FOOTPRINT Layouts[MaxSubresources]; + UINT NumRows[MaxSubresources]; + UINT64 RowSizesInBytes[MaxSubresources]; + + auto Desc = pDestinationResource->GetDesc(); + ID3D12Device *pDevice = nullptr; + pDestinationResource->GetDevice(IID_ID3D12Device, reinterpret_cast(&pDevice)); + pDevice->GetCopyableFootprints(&Desc, FirstSubresource, NumSubresources, IntermediateOffset, Layouts, NumRows, + RowSizesInBytes, &RequiredSize); + pDevice->Release(); + + return UpdateSubresources(pCmdList, pDestinationResource, pIntermediate, FirstSubresource, NumSubresources, + RequiredSize, Layouts, NumRows, RowSizesInBytes, pSrcData); +} + +//------------------------------------------------------------------------------------------------ +// Stack-allocating UpdateSubresources implementation +template +inline UINT64 UpdateSubresources(_In_ ID3D12GraphicsCommandList *pCmdList, _In_ ID3D12Resource *pDestinationResource, + _In_ ID3D12Resource *pIntermediate, UINT64 IntermediateOffset, + _In_range_(0, MaxSubresources) UINT FirstSubresource, + _In_range_(1, MaxSubresources - FirstSubresource) UINT NumSubresources, + _In_ const void *pResourceData, + _In_reads_(NumSubresources) D3D12_SUBRESOURCE_INFO *pSrcData) noexcept { + UINT64 
RequiredSize = 0; + D3D12_PLACED_SUBRESOURCE_FOOTPRINT Layouts[MaxSubresources]; + UINT NumRows[MaxSubresources]; + UINT64 RowSizesInBytes[MaxSubresources]; + + auto Desc = pDestinationResource->GetDesc(); + ID3D12Device *pDevice = nullptr; + pDestinationResource->GetDevice(IID_ID3D12Device, reinterpret_cast(&pDevice)); + pDevice->GetCopyableFootprints(&Desc, FirstSubresource, NumSubresources, IntermediateOffset, Layouts, NumRows, + RowSizesInBytes, &RequiredSize); + pDevice->Release(); + + return UpdateSubresources(pCmdList, pDestinationResource, pIntermediate, FirstSubresource, NumSubresources, + RequiredSize, Layouts, NumRows, RowSizesInBytes, pResourceData, pSrcData); +} + +//------------------------------------------------------------------------------------------------ +inline constexpr bool D3D12IsLayoutOpaque(D3D12_TEXTURE_LAYOUT Layout) noexcept { + return Layout == D3D12_TEXTURE_LAYOUT_UNKNOWN || Layout == D3D12_TEXTURE_LAYOUT_64KB_UNDEFINED_SWIZZLE; +} + +//------------------------------------------------------------------------------------------------ +template +inline ID3D12CommandList *const *CommandListCast(t_CommandListType *const *pp) noexcept { + // This cast is useful for passing strongly typed command list pointers into + // ExecuteCommandLists. + // This cast is valid as long as the const-ness is respected. D3D12 APIs do + // respect the const-ness of their arguments. + return reinterpret_cast(pp); +} + +//------------------------------------------------------------------------------------------------ +// D3D12 exports a new method for serializing root signatures in the Windows 10 Anniversary Update. +// To help enable root signature 1.1 features when they are available and not require maintaining +// two code paths for building root signatures, this helper method reconstructs a 1.0 signature when +// 1.1 is not supported. 
+inline HRESULT D3DX12SerializeVersionedRootSignature(_In_ const D3D12_VERSIONED_ROOT_SIGNATURE_DESC *pRootSignatureDesc, + D3D_ROOT_SIGNATURE_VERSION MaxVersion, _Outptr_ ID3DBlob **ppBlob, + _Always_(_Outptr_opt_result_maybenull_) + ID3DBlob **ppErrorBlob) noexcept { + if (ppErrorBlob != nullptr) { + *ppErrorBlob = nullptr; + } + + switch (MaxVersion) { + case D3D_ROOT_SIGNATURE_VERSION_1_0: + switch (pRootSignatureDesc->Version) { + case D3D_ROOT_SIGNATURE_VERSION_1_0: + return D3D12SerializeRootSignature(&pRootSignatureDesc->Desc_1_0, D3D_ROOT_SIGNATURE_VERSION_1, ppBlob, + ppErrorBlob); + + case D3D_ROOT_SIGNATURE_VERSION_1_1: { + HRESULT hr = S_OK; + const D3D12_ROOT_SIGNATURE_DESC1 &desc_1_1 = pRootSignatureDesc->Desc_1_1; + + const SIZE_T ParametersSize = sizeof(D3D12_ROOT_PARAMETER) * desc_1_1.NumParameters; + void *pParameters = (ParametersSize > 0) ? HeapAlloc(GetProcessHeap(), 0, ParametersSize) : nullptr; + if (ParametersSize > 0 && pParameters == nullptr) { + hr = E_OUTOFMEMORY; + } + auto pParameters_1_0 = static_cast(pParameters); + + if (SUCCEEDED(hr)) { + for (UINT n = 0; n < desc_1_1.NumParameters; n++) { + __analysis_assume(ParametersSize == sizeof(D3D12_ROOT_PARAMETER) * desc_1_1.NumParameters); + pParameters_1_0[n].ParameterType = desc_1_1.pParameters[n].ParameterType; + pParameters_1_0[n].ShaderVisibility = desc_1_1.pParameters[n].ShaderVisibility; + + switch (desc_1_1.pParameters[n].ParameterType) { + case D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS: + pParameters_1_0[n].Constants.Num32BitValues = desc_1_1.pParameters[n].Constants.Num32BitValues; + pParameters_1_0[n].Constants.RegisterSpace = desc_1_1.pParameters[n].Constants.RegisterSpace; + pParameters_1_0[n].Constants.ShaderRegister = desc_1_1.pParameters[n].Constants.ShaderRegister; + break; + + case D3D12_ROOT_PARAMETER_TYPE_CBV: + case D3D12_ROOT_PARAMETER_TYPE_SRV: + case D3D12_ROOT_PARAMETER_TYPE_UAV: + pParameters_1_0[n].Descriptor.RegisterSpace = 
desc_1_1.pParameters[n].Descriptor.RegisterSpace; + pParameters_1_0[n].Descriptor.ShaderRegister = + desc_1_1.pParameters[n].Descriptor.ShaderRegister; + break; + + case D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE: + const D3D12_ROOT_DESCRIPTOR_TABLE1 &table_1_1 = desc_1_1.pParameters[n].DescriptorTable; + + const SIZE_T DescriptorRangesSize = + sizeof(D3D12_DESCRIPTOR_RANGE) * table_1_1.NumDescriptorRanges; + void *pDescriptorRanges = (DescriptorRangesSize > 0 && SUCCEEDED(hr)) + ? HeapAlloc(GetProcessHeap(), 0, DescriptorRangesSize) + : nullptr; + if (DescriptorRangesSize > 0 && pDescriptorRanges == nullptr) { + hr = E_OUTOFMEMORY; + } + auto pDescriptorRanges_1_0 = static_cast(pDescriptorRanges); + + if (SUCCEEDED(hr)) { + for (UINT x = 0; x < table_1_1.NumDescriptorRanges; x++) { + __analysis_assume(DescriptorRangesSize == + sizeof(D3D12_DESCRIPTOR_RANGE) * table_1_1.NumDescriptorRanges); + pDescriptorRanges_1_0[x].BaseShaderRegister = + table_1_1.pDescriptorRanges[x].BaseShaderRegister; + pDescriptorRanges_1_0[x].NumDescriptors = table_1_1.pDescriptorRanges[x].NumDescriptors; + pDescriptorRanges_1_0[x].OffsetInDescriptorsFromTableStart = + table_1_1.pDescriptorRanges[x].OffsetInDescriptorsFromTableStart; + pDescriptorRanges_1_0[x].RangeType = table_1_1.pDescriptorRanges[x].RangeType; + pDescriptorRanges_1_0[x].RegisterSpace = table_1_1.pDescriptorRanges[x].RegisterSpace; + } + } + + D3D12_ROOT_DESCRIPTOR_TABLE &table_1_0 = pParameters_1_0[n].DescriptorTable; + table_1_0.NumDescriptorRanges = table_1_1.NumDescriptorRanges; + table_1_0.pDescriptorRanges = pDescriptorRanges_1_0; + } + } + } + + if (SUCCEEDED(hr)) { + CD3DX12_ROOT_SIGNATURE_DESC desc_1_0(desc_1_1.NumParameters, pParameters_1_0, + desc_1_1.NumStaticSamplers, desc_1_1.pStaticSamplers, + desc_1_1.Flags); + hr = D3D12SerializeRootSignature(&desc_1_0, D3D_ROOT_SIGNATURE_VERSION_1, ppBlob, ppErrorBlob); + } + + if (pParameters) { + for (UINT n = 0; n < desc_1_1.NumParameters; n++) { + if 
(desc_1_1.pParameters[n].ParameterType == D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE) { + auto pDescriptorRanges_1_0 = pParameters_1_0[n].DescriptorTable.pDescriptorRanges; + HeapFree(GetProcessHeap(), 0, + reinterpret_cast(const_cast(pDescriptorRanges_1_0))); + } + } + HeapFree(GetProcessHeap(), 0, pParameters); + } + return hr; + } + } + break; + + case D3D_ROOT_SIGNATURE_VERSION_1_1: + return D3D12SerializeVersionedRootSignature(pRootSignatureDesc, ppBlob, ppErrorBlob); + } + + return E_INVALIDARG; +} + +//------------------------------------------------------------------------------------------------ +struct CD3DX12_RT_FORMAT_ARRAY : public D3D12_RT_FORMAT_ARRAY { + CD3DX12_RT_FORMAT_ARRAY() = default; + explicit CD3DX12_RT_FORMAT_ARRAY(const D3D12_RT_FORMAT_ARRAY &o) noexcept : D3D12_RT_FORMAT_ARRAY(o) {} + explicit CD3DX12_RT_FORMAT_ARRAY(_In_reads_(NumFormats) const DXGI_FORMAT *pFormats, UINT NumFormats) noexcept { + NumRenderTargets = NumFormats; + memcpy(RTFormats, pFormats, sizeof(RTFormats)); + // assumes ARRAY_SIZE(pFormats) == ARRAY_SIZE(RTFormats) + } +}; + +//------------------------------------------------------------------------------------------------ +// Pipeline State Stream Helpers +//------------------------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------------------------ +// Stream Subobjects, i.e. 
elements of a stream + +struct DefaultSampleMask { + operator UINT() noexcept { return UINT_MAX; } +}; +struct DefaultSampleDesc { + operator DXGI_SAMPLE_DESC() noexcept { return DXGI_SAMPLE_DESC{1, 0}; } +}; + +#pragma warning(push) +#pragma warning(disable : 4324) +template +class alignas(void *) CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT { + private: + D3D12_PIPELINE_STATE_SUBOBJECT_TYPE _Type; + InnerStructType _Inner; + + public: + CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT() noexcept : _Type(Type), _Inner(DefaultArg()) {} + CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT(InnerStructType const &i) noexcept : _Type(Type), _Inner(i) {} + CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT &operator=(InnerStructType const &i) noexcept { + _Type = Type; + _Inner = i; + return *this; + } + operator InnerStructType const &() const noexcept { return _Inner; } + operator InnerStructType &() noexcept { return _Inner; } + InnerStructType *operator&() noexcept { return &_Inner; } + InnerStructType const *operator&() const noexcept { return &_Inner; } +}; +#pragma warning(pop) +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_FLAGS; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_NODE_MASK; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_ROOT_SIGNATURE; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_INPUT_LAYOUT; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_IB_STRIP_CUT_VALUE; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_PRIMITIVE_TOPOLOGY; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_VS; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_GS; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_STREAM_OUTPUT; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_HS; +typedef 
CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_DS; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_PS; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_AS; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_MS; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_CS; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_BLEND_DESC; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL1; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL_FORMAT; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_RASTERIZER; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_RENDER_TARGET_FORMATS; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_SAMPLE_DESC; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_SAMPLE_MASK; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_CACHED_PSO; +typedef CD3DX12_PIPELINE_STATE_STREAM_SUBOBJECT + CD3DX12_PIPELINE_STATE_STREAM_VIEW_INSTANCING; + +//------------------------------------------------------------------------------------------------ +// Stream Parser Helpers + +struct ID3DX12PipelineParserCallbacks { + // Subobject Callbacks + virtual void FlagsCb(D3D12_PIPELINE_STATE_FLAGS) {} + virtual void NodeMaskCb(UINT) {} + virtual void RootSignatureCb(ID3D12RootSignature *) {} + virtual void InputLayoutCb(const D3D12_INPUT_LAYOUT_DESC &) {} + virtual void IBStripCutValueCb(D3D12_INDEX_BUFFER_STRIP_CUT_VALUE) {} + virtual void PrimitiveTopologyTypeCb(D3D12_PRIMITIVE_TOPOLOGY_TYPE) {} + virtual void VSCb(const D3D12_SHADER_BYTECODE &) {} + virtual 
void GSCb(const D3D12_SHADER_BYTECODE &) {} + virtual void StreamOutputCb(const D3D12_STREAM_OUTPUT_DESC &) {} + virtual void HSCb(const D3D12_SHADER_BYTECODE &) {} + virtual void DSCb(const D3D12_SHADER_BYTECODE &) {} + virtual void PSCb(const D3D12_SHADER_BYTECODE &) {} + virtual void CSCb(const D3D12_SHADER_BYTECODE &) {} + virtual void ASCb(const D3D12_SHADER_BYTECODE &) {} + virtual void MSCb(const D3D12_SHADER_BYTECODE &) {} + virtual void BlendStateCb(const D3D12_BLEND_DESC &) {} + virtual void DepthStencilStateCb(const D3D12_DEPTH_STENCIL_DESC &) {} + virtual void DepthStencilState1Cb(const D3D12_DEPTH_STENCIL_DESC1 &) {} + virtual void DSVFormatCb(DXGI_FORMAT) {} + virtual void RasterizerStateCb(const D3D12_RASTERIZER_DESC &) {} + virtual void RTVFormatsCb(const D3D12_RT_FORMAT_ARRAY &) {} + virtual void SampleDescCb(const DXGI_SAMPLE_DESC &) {} + virtual void SampleMaskCb(UINT) {} + virtual void ViewInstancingCb(const D3D12_VIEW_INSTANCING_DESC &) {} + virtual void CachedPSOCb(const D3D12_CACHED_PIPELINE_STATE &) {} + + // Error Callbacks + virtual void ErrorBadInputParameter(UINT /*ParameterIndex*/) {} + virtual void ErrorDuplicateSubobject(D3D12_PIPELINE_STATE_SUBOBJECT_TYPE /*DuplicateType*/) {} + virtual void ErrorUnknownSubobject(UINT /*UnknownTypeValue*/) {} + + virtual ~ID3DX12PipelineParserCallbacks() = default; +}; + +struct D3DX12_MESH_SHADER_PIPELINE_STATE_DESC { + ID3D12RootSignature *pRootSignature; + D3D12_SHADER_BYTECODE AS; + D3D12_SHADER_BYTECODE MS; + D3D12_SHADER_BYTECODE PS; + D3D12_BLEND_DESC BlendState; + UINT SampleMask; + D3D12_RASTERIZER_DESC RasterizerState; + D3D12_DEPTH_STENCIL_DESC DepthStencilState; + D3D12_PRIMITIVE_TOPOLOGY_TYPE PrimitiveTopologyType; + UINT NumRenderTargets; + DXGI_FORMAT RTVFormats[D3D12_SIMULTANEOUS_RENDER_TARGET_COUNT]; + DXGI_FORMAT DSVFormat; + DXGI_SAMPLE_DESC SampleDesc; + UINT NodeMask; + D3D12_CACHED_PIPELINE_STATE CachedPSO; + D3D12_PIPELINE_STATE_FLAGS Flags; +}; + +// 
CD3DX12_PIPELINE_STATE_STREAM2 Works on OS Build 19041+ (where there is a new mesh shader pipeline). +// Use CD3DX12_PIPELINE_STATE_STREAM1 for OS Build 16299+ (where there is a new view instancing subobject). +// Use CD3DX12_PIPELINE_STATE_STREAM for OS Build 15063+ support. +struct CD3DX12_PIPELINE_STATE_STREAM2 { + CD3DX12_PIPELINE_STATE_STREAM2() = default; + // Mesh and amplification shaders must be set manually, since they do not have representation in + // D3D12_GRAPHICS_PIPELINE_STATE_DESC + CD3DX12_PIPELINE_STATE_STREAM2(const D3D12_GRAPHICS_PIPELINE_STATE_DESC &Desc) + noexcept + : Flags(Desc.Flags), NodeMask(Desc.NodeMask), pRootSignature(Desc.pRootSignature), + InputLayout(Desc.InputLayout), IBStripCutValue(Desc.IBStripCutValue), + PrimitiveTopologyType(Desc.PrimitiveTopologyType), VS(Desc.VS), GS(Desc.GS), StreamOutput(Desc.StreamOutput), + HS(Desc.HS), DS(Desc.DS), PS(Desc.PS), BlendState(CD3DX12_BLEND_DESC(Desc.BlendState)), + DepthStencilState(CD3DX12_DEPTH_STENCIL_DESC1(Desc.DepthStencilState)), DSVFormat(Desc.DSVFormat), + RasterizerState(CD3DX12_RASTERIZER_DESC(Desc.RasterizerState)), + RTVFormats(CD3DX12_RT_FORMAT_ARRAY(Desc.RTVFormats, Desc.NumRenderTargets)), SampleDesc(Desc.SampleDesc), + SampleMask(Desc.SampleMask), CachedPSO(Desc.CachedPSO), + ViewInstancingDesc(CD3DX12_VIEW_INSTANCING_DESC(CD3DX12_DEFAULT())) {} + CD3DX12_PIPELINE_STATE_STREAM2(const D3DX12_MESH_SHADER_PIPELINE_STATE_DESC &Desc) + noexcept + : Flags(Desc.Flags), NodeMask(Desc.NodeMask), pRootSignature(Desc.pRootSignature), + PrimitiveTopologyType(Desc.PrimitiveTopologyType), PS(Desc.PS), AS(Desc.AS), MS(Desc.MS), + BlendState(CD3DX12_BLEND_DESC(Desc.BlendState)), + DepthStencilState(CD3DX12_DEPTH_STENCIL_DESC1(Desc.DepthStencilState)), DSVFormat(Desc.DSVFormat), + RasterizerState(CD3DX12_RASTERIZER_DESC(Desc.RasterizerState)), + RTVFormats(CD3DX12_RT_FORMAT_ARRAY(Desc.RTVFormats, Desc.NumRenderTargets)), SampleDesc(Desc.SampleDesc), + SampleMask(Desc.SampleMask), 
CachedPSO(Desc.CachedPSO), + ViewInstancingDesc(CD3DX12_VIEW_INSTANCING_DESC(CD3DX12_DEFAULT())) {} + CD3DX12_PIPELINE_STATE_STREAM2(const D3D12_COMPUTE_PIPELINE_STATE_DESC &Desc) + noexcept + : Flags(Desc.Flags), NodeMask(Desc.NodeMask), pRootSignature(Desc.pRootSignature), + CS(CD3DX12_SHADER_BYTECODE(Desc.CS)), CachedPSO(Desc.CachedPSO) { + static_cast(DepthStencilState).DepthEnable = false; + } + CD3DX12_PIPELINE_STATE_STREAM_FLAGS Flags; + CD3DX12_PIPELINE_STATE_STREAM_NODE_MASK NodeMask; + CD3DX12_PIPELINE_STATE_STREAM_ROOT_SIGNATURE pRootSignature; + CD3DX12_PIPELINE_STATE_STREAM_INPUT_LAYOUT InputLayout; + CD3DX12_PIPELINE_STATE_STREAM_IB_STRIP_CUT_VALUE IBStripCutValue; + CD3DX12_PIPELINE_STATE_STREAM_PRIMITIVE_TOPOLOGY PrimitiveTopologyType; + CD3DX12_PIPELINE_STATE_STREAM_VS VS; + CD3DX12_PIPELINE_STATE_STREAM_GS GS; + CD3DX12_PIPELINE_STATE_STREAM_STREAM_OUTPUT StreamOutput; + CD3DX12_PIPELINE_STATE_STREAM_HS HS; + CD3DX12_PIPELINE_STATE_STREAM_DS DS; + CD3DX12_PIPELINE_STATE_STREAM_PS PS; + CD3DX12_PIPELINE_STATE_STREAM_AS AS; + CD3DX12_PIPELINE_STATE_STREAM_MS MS; + CD3DX12_PIPELINE_STATE_STREAM_CS CS; + CD3DX12_PIPELINE_STATE_STREAM_BLEND_DESC BlendState; + CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL1 DepthStencilState; + CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL_FORMAT DSVFormat; + CD3DX12_PIPELINE_STATE_STREAM_RASTERIZER RasterizerState; + CD3DX12_PIPELINE_STATE_STREAM_RENDER_TARGET_FORMATS RTVFormats; + CD3DX12_PIPELINE_STATE_STREAM_SAMPLE_DESC SampleDesc; + CD3DX12_PIPELINE_STATE_STREAM_SAMPLE_MASK SampleMask; + CD3DX12_PIPELINE_STATE_STREAM_CACHED_PSO CachedPSO; + CD3DX12_PIPELINE_STATE_STREAM_VIEW_INSTANCING ViewInstancingDesc; + D3D12_GRAPHICS_PIPELINE_STATE_DESC GraphicsDescV0() const noexcept { + D3D12_GRAPHICS_PIPELINE_STATE_DESC D; + D.Flags = this->Flags; + D.NodeMask = this->NodeMask; + D.pRootSignature = this->pRootSignature; + D.InputLayout = this->InputLayout; + D.IBStripCutValue = this->IBStripCutValue; + D.PrimitiveTopologyType = 
this->PrimitiveTopologyType; + D.VS = this->VS; + D.GS = this->GS; + D.StreamOutput = this->StreamOutput; + D.HS = this->HS; + D.DS = this->DS; + D.PS = this->PS; + D.BlendState = this->BlendState; + D.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC1(D3D12_DEPTH_STENCIL_DESC1(this->DepthStencilState)); + D.DSVFormat = this->DSVFormat; + D.RasterizerState = this->RasterizerState; + D.NumRenderTargets = D3D12_RT_FORMAT_ARRAY(this->RTVFormats).NumRenderTargets; + memcpy(D.RTVFormats, D3D12_RT_FORMAT_ARRAY(this->RTVFormats).RTFormats, sizeof(D.RTVFormats)); + D.SampleDesc = this->SampleDesc; + D.SampleMask = this->SampleMask; + D.CachedPSO = this->CachedPSO; + return D; + } + D3D12_COMPUTE_PIPELINE_STATE_DESC ComputeDescV0() const noexcept { + D3D12_COMPUTE_PIPELINE_STATE_DESC D; + D.Flags = this->Flags; + D.NodeMask = this->NodeMask; + D.pRootSignature = this->pRootSignature; + D.CS = this->CS; + D.CachedPSO = this->CachedPSO; + return D; + } +}; + +// CD3DX12_PIPELINE_STATE_STREAM1 Works on OS Build 16299+ (where there is a new view instancing subobject). +// Use CD3DX12_PIPELINE_STATE_STREAM for OS Build 15063+ support. 
+struct CD3DX12_PIPELINE_STATE_STREAM1 { + CD3DX12_PIPELINE_STATE_STREAM1() = default; + // Mesh and amplification shaders must be set manually, since they do not have representation in + // D3D12_GRAPHICS_PIPELINE_STATE_DESC + CD3DX12_PIPELINE_STATE_STREAM1(const D3D12_GRAPHICS_PIPELINE_STATE_DESC &Desc) + noexcept + : Flags(Desc.Flags), NodeMask(Desc.NodeMask), pRootSignature(Desc.pRootSignature), + InputLayout(Desc.InputLayout), IBStripCutValue(Desc.IBStripCutValue), + PrimitiveTopologyType(Desc.PrimitiveTopologyType), VS(Desc.VS), GS(Desc.GS), StreamOutput(Desc.StreamOutput), + HS(Desc.HS), DS(Desc.DS), PS(Desc.PS), BlendState(CD3DX12_BLEND_DESC(Desc.BlendState)), + DepthStencilState(CD3DX12_DEPTH_STENCIL_DESC1(Desc.DepthStencilState)), DSVFormat(Desc.DSVFormat), + RasterizerState(CD3DX12_RASTERIZER_DESC(Desc.RasterizerState)), + RTVFormats(CD3DX12_RT_FORMAT_ARRAY(Desc.RTVFormats, Desc.NumRenderTargets)), SampleDesc(Desc.SampleDesc), + SampleMask(Desc.SampleMask), CachedPSO(Desc.CachedPSO), + ViewInstancingDesc(CD3DX12_VIEW_INSTANCING_DESC(CD3DX12_DEFAULT())) {} + CD3DX12_PIPELINE_STATE_STREAM1(const D3DX12_MESH_SHADER_PIPELINE_STATE_DESC &Desc) + noexcept + : Flags(Desc.Flags), NodeMask(Desc.NodeMask), pRootSignature(Desc.pRootSignature), + PrimitiveTopologyType(Desc.PrimitiveTopologyType), PS(Desc.PS), + BlendState(CD3DX12_BLEND_DESC(Desc.BlendState)), + DepthStencilState(CD3DX12_DEPTH_STENCIL_DESC1(Desc.DepthStencilState)), DSVFormat(Desc.DSVFormat), + RasterizerState(CD3DX12_RASTERIZER_DESC(Desc.RasterizerState)), + RTVFormats(CD3DX12_RT_FORMAT_ARRAY(Desc.RTVFormats, Desc.NumRenderTargets)), SampleDesc(Desc.SampleDesc), + SampleMask(Desc.SampleMask), CachedPSO(Desc.CachedPSO), + ViewInstancingDesc(CD3DX12_VIEW_INSTANCING_DESC(CD3DX12_DEFAULT())) {} + CD3DX12_PIPELINE_STATE_STREAM1(const D3D12_COMPUTE_PIPELINE_STATE_DESC &Desc) + noexcept + : Flags(Desc.Flags), NodeMask(Desc.NodeMask), pRootSignature(Desc.pRootSignature), + 
CS(CD3DX12_SHADER_BYTECODE(Desc.CS)), CachedPSO(Desc.CachedPSO) { + static_cast(DepthStencilState).DepthEnable = false; + } + CD3DX12_PIPELINE_STATE_STREAM_FLAGS Flags; + CD3DX12_PIPELINE_STATE_STREAM_NODE_MASK NodeMask; + CD3DX12_PIPELINE_STATE_STREAM_ROOT_SIGNATURE pRootSignature; + CD3DX12_PIPELINE_STATE_STREAM_INPUT_LAYOUT InputLayout; + CD3DX12_PIPELINE_STATE_STREAM_IB_STRIP_CUT_VALUE IBStripCutValue; + CD3DX12_PIPELINE_STATE_STREAM_PRIMITIVE_TOPOLOGY PrimitiveTopologyType; + CD3DX12_PIPELINE_STATE_STREAM_VS VS; + CD3DX12_PIPELINE_STATE_STREAM_GS GS; + CD3DX12_PIPELINE_STATE_STREAM_STREAM_OUTPUT StreamOutput; + CD3DX12_PIPELINE_STATE_STREAM_HS HS; + CD3DX12_PIPELINE_STATE_STREAM_DS DS; + CD3DX12_PIPELINE_STATE_STREAM_PS PS; + CD3DX12_PIPELINE_STATE_STREAM_CS CS; + CD3DX12_PIPELINE_STATE_STREAM_BLEND_DESC BlendState; + CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL1 DepthStencilState; + CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL_FORMAT DSVFormat; + CD3DX12_PIPELINE_STATE_STREAM_RASTERIZER RasterizerState; + CD3DX12_PIPELINE_STATE_STREAM_RENDER_TARGET_FORMATS RTVFormats; + CD3DX12_PIPELINE_STATE_STREAM_SAMPLE_DESC SampleDesc; + CD3DX12_PIPELINE_STATE_STREAM_SAMPLE_MASK SampleMask; + CD3DX12_PIPELINE_STATE_STREAM_CACHED_PSO CachedPSO; + CD3DX12_PIPELINE_STATE_STREAM_VIEW_INSTANCING ViewInstancingDesc; + D3D12_GRAPHICS_PIPELINE_STATE_DESC GraphicsDescV0() const noexcept { + D3D12_GRAPHICS_PIPELINE_STATE_DESC D; + D.Flags = this->Flags; + D.NodeMask = this->NodeMask; + D.pRootSignature = this->pRootSignature; + D.InputLayout = this->InputLayout; + D.IBStripCutValue = this->IBStripCutValue; + D.PrimitiveTopologyType = this->PrimitiveTopologyType; + D.VS = this->VS; + D.GS = this->GS; + D.StreamOutput = this->StreamOutput; + D.HS = this->HS; + D.DS = this->DS; + D.PS = this->PS; + D.BlendState = this->BlendState; + D.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC1(D3D12_DEPTH_STENCIL_DESC1(this->DepthStencilState)); + D.DSVFormat = this->DSVFormat; + D.RasterizerState 
= this->RasterizerState; + D.NumRenderTargets = D3D12_RT_FORMAT_ARRAY(this->RTVFormats).NumRenderTargets; + memcpy(D.RTVFormats, D3D12_RT_FORMAT_ARRAY(this->RTVFormats).RTFormats, sizeof(D.RTVFormats)); + D.SampleDesc = this->SampleDesc; + D.SampleMask = this->SampleMask; + D.CachedPSO = this->CachedPSO; + return D; + } + D3D12_COMPUTE_PIPELINE_STATE_DESC ComputeDescV0() const noexcept { + D3D12_COMPUTE_PIPELINE_STATE_DESC D; + D.Flags = this->Flags; + D.NodeMask = this->NodeMask; + D.pRootSignature = this->pRootSignature; + D.CS = this->CS; + D.CachedPSO = this->CachedPSO; + return D; + } +}; + +struct CD3DX12_PIPELINE_MESH_STATE_STREAM { + CD3DX12_PIPELINE_MESH_STATE_STREAM() = default; + CD3DX12_PIPELINE_MESH_STATE_STREAM(const D3DX12_MESH_SHADER_PIPELINE_STATE_DESC &Desc) + noexcept + : Flags(Desc.Flags), NodeMask(Desc.NodeMask), pRootSignature(Desc.pRootSignature), PS(Desc.PS), AS(Desc.AS), + MS(Desc.MS), BlendState(CD3DX12_BLEND_DESC(Desc.BlendState)), + DepthStencilState(CD3DX12_DEPTH_STENCIL_DESC1(Desc.DepthStencilState)), DSVFormat(Desc.DSVFormat), + RasterizerState(CD3DX12_RASTERIZER_DESC(Desc.RasterizerState)), + RTVFormats(CD3DX12_RT_FORMAT_ARRAY(Desc.RTVFormats, Desc.NumRenderTargets)), SampleDesc(Desc.SampleDesc), + SampleMask(Desc.SampleMask), CachedPSO(Desc.CachedPSO), + ViewInstancingDesc(CD3DX12_VIEW_INSTANCING_DESC(CD3DX12_DEFAULT())) {} + CD3DX12_PIPELINE_STATE_STREAM_FLAGS Flags; + CD3DX12_PIPELINE_STATE_STREAM_NODE_MASK NodeMask; + CD3DX12_PIPELINE_STATE_STREAM_ROOT_SIGNATURE pRootSignature; + CD3DX12_PIPELINE_STATE_STREAM_PS PS; + CD3DX12_PIPELINE_STATE_STREAM_AS AS; + CD3DX12_PIPELINE_STATE_STREAM_MS MS; + CD3DX12_PIPELINE_STATE_STREAM_BLEND_DESC BlendState; + CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL1 DepthStencilState; + CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL_FORMAT DSVFormat; + CD3DX12_PIPELINE_STATE_STREAM_RASTERIZER RasterizerState; + CD3DX12_PIPELINE_STATE_STREAM_RENDER_TARGET_FORMATS RTVFormats; + 
CD3DX12_PIPELINE_STATE_STREAM_SAMPLE_DESC SampleDesc; + CD3DX12_PIPELINE_STATE_STREAM_SAMPLE_MASK SampleMask; + CD3DX12_PIPELINE_STATE_STREAM_CACHED_PSO CachedPSO; + CD3DX12_PIPELINE_STATE_STREAM_VIEW_INSTANCING ViewInstancingDesc; + D3DX12_MESH_SHADER_PIPELINE_STATE_DESC MeshShaderDescV0() const noexcept { + D3DX12_MESH_SHADER_PIPELINE_STATE_DESC D; + D.Flags = this->Flags; + D.NodeMask = this->NodeMask; + D.pRootSignature = this->pRootSignature; + D.PS = this->PS; + D.AS = this->AS; + D.MS = this->MS; + D.BlendState = this->BlendState; + D.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC1(D3D12_DEPTH_STENCIL_DESC1(this->DepthStencilState)); + D.DSVFormat = this->DSVFormat; + D.RasterizerState = this->RasterizerState; + D.NumRenderTargets = D3D12_RT_FORMAT_ARRAY(this->RTVFormats).NumRenderTargets; + memcpy(D.RTVFormats, D3D12_RT_FORMAT_ARRAY(this->RTVFormats).RTFormats, sizeof(D.RTVFormats)); + D.SampleDesc = this->SampleDesc; + D.SampleMask = this->SampleMask; + D.CachedPSO = this->CachedPSO; + return D; + } +}; + +// CD3DX12_PIPELINE_STATE_STREAM works on OS Build 15063+ but does not support new subobject(s) added in OS Build +// 16299+. See CD3DX12_PIPELINE_STATE_STREAM1 for instance. 
+struct CD3DX12_PIPELINE_STATE_STREAM { + CD3DX12_PIPELINE_STATE_STREAM() = default; + CD3DX12_PIPELINE_STATE_STREAM(const D3D12_GRAPHICS_PIPELINE_STATE_DESC &Desc) + noexcept + : Flags(Desc.Flags), NodeMask(Desc.NodeMask), pRootSignature(Desc.pRootSignature), + InputLayout(Desc.InputLayout), IBStripCutValue(Desc.IBStripCutValue), + PrimitiveTopologyType(Desc.PrimitiveTopologyType), VS(Desc.VS), GS(Desc.GS), StreamOutput(Desc.StreamOutput), + HS(Desc.HS), DS(Desc.DS), PS(Desc.PS), BlendState(CD3DX12_BLEND_DESC(Desc.BlendState)), + DepthStencilState(CD3DX12_DEPTH_STENCIL_DESC1(Desc.DepthStencilState)), DSVFormat(Desc.DSVFormat), + RasterizerState(CD3DX12_RASTERIZER_DESC(Desc.RasterizerState)), + RTVFormats(CD3DX12_RT_FORMAT_ARRAY(Desc.RTVFormats, Desc.NumRenderTargets)), SampleDesc(Desc.SampleDesc), + SampleMask(Desc.SampleMask), CachedPSO(Desc.CachedPSO) {} + CD3DX12_PIPELINE_STATE_STREAM(const D3D12_COMPUTE_PIPELINE_STATE_DESC &Desc) + noexcept + : Flags(Desc.Flags), NodeMask(Desc.NodeMask), pRootSignature(Desc.pRootSignature), + CS(CD3DX12_SHADER_BYTECODE(Desc.CS)), CachedPSO(Desc.CachedPSO) {} + CD3DX12_PIPELINE_STATE_STREAM_FLAGS Flags; + CD3DX12_PIPELINE_STATE_STREAM_NODE_MASK NodeMask; + CD3DX12_PIPELINE_STATE_STREAM_ROOT_SIGNATURE pRootSignature; + CD3DX12_PIPELINE_STATE_STREAM_INPUT_LAYOUT InputLayout; + CD3DX12_PIPELINE_STATE_STREAM_IB_STRIP_CUT_VALUE IBStripCutValue; + CD3DX12_PIPELINE_STATE_STREAM_PRIMITIVE_TOPOLOGY PrimitiveTopologyType; + CD3DX12_PIPELINE_STATE_STREAM_VS VS; + CD3DX12_PIPELINE_STATE_STREAM_GS GS; + CD3DX12_PIPELINE_STATE_STREAM_STREAM_OUTPUT StreamOutput; + CD3DX12_PIPELINE_STATE_STREAM_HS HS; + CD3DX12_PIPELINE_STATE_STREAM_DS DS; + CD3DX12_PIPELINE_STATE_STREAM_PS PS; + CD3DX12_PIPELINE_STATE_STREAM_CS CS; + CD3DX12_PIPELINE_STATE_STREAM_BLEND_DESC BlendState; + CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL1 DepthStencilState; + CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL_FORMAT DSVFormat; + CD3DX12_PIPELINE_STATE_STREAM_RASTERIZER 
RasterizerState; + CD3DX12_PIPELINE_STATE_STREAM_RENDER_TARGET_FORMATS RTVFormats; + CD3DX12_PIPELINE_STATE_STREAM_SAMPLE_DESC SampleDesc; + CD3DX12_PIPELINE_STATE_STREAM_SAMPLE_MASK SampleMask; + CD3DX12_PIPELINE_STATE_STREAM_CACHED_PSO CachedPSO; + D3D12_GRAPHICS_PIPELINE_STATE_DESC GraphicsDescV0() const noexcept { + D3D12_GRAPHICS_PIPELINE_STATE_DESC D; + D.Flags = this->Flags; + D.NodeMask = this->NodeMask; + D.pRootSignature = this->pRootSignature; + D.InputLayout = this->InputLayout; + D.IBStripCutValue = this->IBStripCutValue; + D.PrimitiveTopologyType = this->PrimitiveTopologyType; + D.VS = this->VS; + D.GS = this->GS; + D.StreamOutput = this->StreamOutput; + D.HS = this->HS; + D.DS = this->DS; + D.PS = this->PS; + D.BlendState = this->BlendState; + D.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC1(D3D12_DEPTH_STENCIL_DESC1(this->DepthStencilState)); + D.DSVFormat = this->DSVFormat; + D.RasterizerState = this->RasterizerState; + D.NumRenderTargets = D3D12_RT_FORMAT_ARRAY(this->RTVFormats).NumRenderTargets; + memcpy(D.RTVFormats, D3D12_RT_FORMAT_ARRAY(this->RTVFormats).RTFormats, sizeof(D.RTVFormats)); + D.SampleDesc = this->SampleDesc; + D.SampleMask = this->SampleMask; + D.CachedPSO = this->CachedPSO; + return D; + } + D3D12_COMPUTE_PIPELINE_STATE_DESC ComputeDescV0() const noexcept { + D3D12_COMPUTE_PIPELINE_STATE_DESC D; + D.Flags = this->Flags; + D.NodeMask = this->NodeMask; + D.pRootSignature = this->pRootSignature; + D.CS = this->CS; + D.CachedPSO = this->CachedPSO; + return D; + } +}; + +struct CD3DX12_PIPELINE_STATE_STREAM2_PARSE_HELPER : public ID3DX12PipelineParserCallbacks { + CD3DX12_PIPELINE_STATE_STREAM2 PipelineStream; + CD3DX12_PIPELINE_STATE_STREAM2_PARSE_HELPER() noexcept : SeenDSS(false) { + // Adjust defaults to account for absent members. + PipelineStream.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE; + + // Depth disabled if no DSV format specified. 
+ static_cast(PipelineStream.DepthStencilState).DepthEnable = false; + } + + // ID3DX12PipelineParserCallbacks + void FlagsCb(D3D12_PIPELINE_STATE_FLAGS Flags) override { PipelineStream.Flags = Flags; } + void NodeMaskCb(UINT NodeMask) override { PipelineStream.NodeMask = NodeMask; } + void RootSignatureCb(ID3D12RootSignature *pRootSignature) override { + PipelineStream.pRootSignature = pRootSignature; + } + void InputLayoutCb(const D3D12_INPUT_LAYOUT_DESC &InputLayout) override { + PipelineStream.InputLayout = InputLayout; + } + void IBStripCutValueCb(D3D12_INDEX_BUFFER_STRIP_CUT_VALUE IBStripCutValue) override { + PipelineStream.IBStripCutValue = IBStripCutValue; + } + void PrimitiveTopologyTypeCb(D3D12_PRIMITIVE_TOPOLOGY_TYPE PrimitiveTopologyType) override { + PipelineStream.PrimitiveTopologyType = PrimitiveTopologyType; + } + void VSCb(const D3D12_SHADER_BYTECODE &VS) override { PipelineStream.VS = VS; } + void GSCb(const D3D12_SHADER_BYTECODE &GS) override { PipelineStream.GS = GS; } + void StreamOutputCb(const D3D12_STREAM_OUTPUT_DESC &StreamOutput) override { + PipelineStream.StreamOutput = StreamOutput; + } + void HSCb(const D3D12_SHADER_BYTECODE &HS) override { PipelineStream.HS = HS; } + void DSCb(const D3D12_SHADER_BYTECODE &DS) override { PipelineStream.DS = DS; } + void PSCb(const D3D12_SHADER_BYTECODE &PS) override { PipelineStream.PS = PS; } + void CSCb(const D3D12_SHADER_BYTECODE &CS) override { PipelineStream.CS = CS; } + void ASCb(const D3D12_SHADER_BYTECODE &AS) override { PipelineStream.AS = AS; } + void MSCb(const D3D12_SHADER_BYTECODE &MS) override { PipelineStream.MS = MS; } + void BlendStateCb(const D3D12_BLEND_DESC &BlendState) override { + PipelineStream.BlendState = CD3DX12_BLEND_DESC(BlendState); + } + void DepthStencilStateCb(const D3D12_DEPTH_STENCIL_DESC &DepthStencilState) override { + PipelineStream.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC1(DepthStencilState); + SeenDSS = true; + } + void DepthStencilState1Cb(const 
D3D12_DEPTH_STENCIL_DESC1 &DepthStencilState) override { + PipelineStream.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC1(DepthStencilState); + SeenDSS = true; + } + void DSVFormatCb(DXGI_FORMAT DSVFormat) override { + PipelineStream.DSVFormat = DSVFormat; + if (!SeenDSS && DSVFormat != DXGI_FORMAT_UNKNOWN) { + // Re-enable depth for the default state. + static_cast(PipelineStream.DepthStencilState).DepthEnable = true; + } + } + void RasterizerStateCb(const D3D12_RASTERIZER_DESC &RasterizerState) override { + PipelineStream.RasterizerState = CD3DX12_RASTERIZER_DESC(RasterizerState); + } + void RTVFormatsCb(const D3D12_RT_FORMAT_ARRAY &RTVFormats) override { PipelineStream.RTVFormats = RTVFormats; } + void SampleDescCb(const DXGI_SAMPLE_DESC &SampleDesc) override { PipelineStream.SampleDesc = SampleDesc; } + void SampleMaskCb(UINT SampleMask) override { PipelineStream.SampleMask = SampleMask; } + void ViewInstancingCb(const D3D12_VIEW_INSTANCING_DESC &ViewInstancingDesc) override { + PipelineStream.ViewInstancingDesc = CD3DX12_VIEW_INSTANCING_DESC(ViewInstancingDesc); + } + void CachedPSOCb(const D3D12_CACHED_PIPELINE_STATE &CachedPSO) override { PipelineStream.CachedPSO = CachedPSO; } + + private: + bool SeenDSS; +}; + +struct CD3DX12_PIPELINE_STATE_STREAM_PARSE_HELPER : public ID3DX12PipelineParserCallbacks { + CD3DX12_PIPELINE_STATE_STREAM1 PipelineStream; + CD3DX12_PIPELINE_STATE_STREAM_PARSE_HELPER() noexcept : SeenDSS(false) { + // Adjust defaults to account for absent members. + PipelineStream.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE; + + // Depth disabled if no DSV format specified. 
+ static_cast(PipelineStream.DepthStencilState).DepthEnable = false; + } + + // ID3DX12PipelineParserCallbacks + void FlagsCb(D3D12_PIPELINE_STATE_FLAGS Flags) override { PipelineStream.Flags = Flags; } + void NodeMaskCb(UINT NodeMask) override { PipelineStream.NodeMask = NodeMask; } + void RootSignatureCb(ID3D12RootSignature *pRootSignature) override { + PipelineStream.pRootSignature = pRootSignature; + } + void InputLayoutCb(const D3D12_INPUT_LAYOUT_DESC &InputLayout) override { + PipelineStream.InputLayout = InputLayout; + } + void IBStripCutValueCb(D3D12_INDEX_BUFFER_STRIP_CUT_VALUE IBStripCutValue) override { + PipelineStream.IBStripCutValue = IBStripCutValue; + } + void PrimitiveTopologyTypeCb(D3D12_PRIMITIVE_TOPOLOGY_TYPE PrimitiveTopologyType) override { + PipelineStream.PrimitiveTopologyType = PrimitiveTopologyType; + } + void VSCb(const D3D12_SHADER_BYTECODE &VS) override { PipelineStream.VS = VS; } + void GSCb(const D3D12_SHADER_BYTECODE &GS) override { PipelineStream.GS = GS; } + void StreamOutputCb(const D3D12_STREAM_OUTPUT_DESC &StreamOutput) override { + PipelineStream.StreamOutput = StreamOutput; + } + void HSCb(const D3D12_SHADER_BYTECODE &HS) override { PipelineStream.HS = HS; } + void DSCb(const D3D12_SHADER_BYTECODE &DS) override { PipelineStream.DS = DS; } + void PSCb(const D3D12_SHADER_BYTECODE &PS) override { PipelineStream.PS = PS; } + void CSCb(const D3D12_SHADER_BYTECODE &CS) override { PipelineStream.CS = CS; } + void BlendStateCb(const D3D12_BLEND_DESC &BlendState) override { + PipelineStream.BlendState = CD3DX12_BLEND_DESC(BlendState); + } + void DepthStencilStateCb(const D3D12_DEPTH_STENCIL_DESC &DepthStencilState) override { + PipelineStream.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC1(DepthStencilState); + SeenDSS = true; + } + void DepthStencilState1Cb(const D3D12_DEPTH_STENCIL_DESC1 &DepthStencilState) override { + PipelineStream.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC1(DepthStencilState); + SeenDSS = true; + } + void 
DSVFormatCb(DXGI_FORMAT DSVFormat) override { + PipelineStream.DSVFormat = DSVFormat; + if (!SeenDSS && DSVFormat != DXGI_FORMAT_UNKNOWN) { + // Re-enable depth for the default state. + static_cast(PipelineStream.DepthStencilState).DepthEnable = true; + } + } + void RasterizerStateCb(const D3D12_RASTERIZER_DESC &RasterizerState) override { + PipelineStream.RasterizerState = CD3DX12_RASTERIZER_DESC(RasterizerState); + } + void RTVFormatsCb(const D3D12_RT_FORMAT_ARRAY &RTVFormats) override { PipelineStream.RTVFormats = RTVFormats; } + void SampleDescCb(const DXGI_SAMPLE_DESC &SampleDesc) override { PipelineStream.SampleDesc = SampleDesc; } + void SampleMaskCb(UINT SampleMask) override { PipelineStream.SampleMask = SampleMask; } + void ViewInstancingCb(const D3D12_VIEW_INSTANCING_DESC &ViewInstancingDesc) override { + PipelineStream.ViewInstancingDesc = CD3DX12_VIEW_INSTANCING_DESC(ViewInstancingDesc); + } + void CachedPSOCb(const D3D12_CACHED_PIPELINE_STATE &CachedPSO) override { PipelineStream.CachedPSO = CachedPSO; } + + private: + bool SeenDSS; +}; + +inline D3D12_PIPELINE_STATE_SUBOBJECT_TYPE +D3DX12GetBaseSubobjectType(D3D12_PIPELINE_STATE_SUBOBJECT_TYPE SubobjectType) noexcept { + switch (SubobjectType) { + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_DEPTH_STENCIL1: + return D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_DEPTH_STENCIL; + default: + return SubobjectType; + } +} + +inline HRESULT D3DX12ParsePipelineStream(const D3D12_PIPELINE_STATE_STREAM_DESC &Desc, + ID3DX12PipelineParserCallbacks *pCallbacks) { + if (pCallbacks == nullptr) { + return E_INVALIDARG; + } + + if (Desc.SizeInBytes == 0 || Desc.pPipelineStateSubobjectStream == nullptr) { + pCallbacks->ErrorBadInputParameter(1); // first parameter issue + return E_INVALIDARG; + } + + bool SubobjectSeen[D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_MAX_VALID] = {}; + for (SIZE_T CurOffset = 0, SizeOfSubobject = 0; CurOffset < Desc.SizeInBytes; CurOffset += SizeOfSubobject) { + BYTE *pStream = 
static_cast(Desc.pPipelineStateSubobjectStream) + CurOffset; + auto SubobjectType = *reinterpret_cast(pStream); + if (SubobjectType < 0 || SubobjectType >= D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_MAX_VALID) { + pCallbacks->ErrorUnknownSubobject(SubobjectType); + return E_INVALIDARG; + } + if (SubobjectSeen[D3DX12GetBaseSubobjectType(SubobjectType)]) { + pCallbacks->ErrorDuplicateSubobject(SubobjectType); + return E_INVALIDARG; // disallow subobject duplicates in a stream + } + SubobjectSeen[SubobjectType] = true; + switch (SubobjectType) { + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_ROOT_SIGNATURE: + pCallbacks->RootSignatureCb( + *reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::pRootSignature); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_VS: + pCallbacks->VSCb(*reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::VS); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_PS: + pCallbacks->PSCb(*reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::PS); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_DS: + pCallbacks->DSCb(*reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::DS); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_HS: + pCallbacks->HSCb(*reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::HS); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_GS: + pCallbacks->GSCb(*reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::GS); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_CS: + pCallbacks->CSCb(*reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::CS); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_AS: + pCallbacks->ASCb(*reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM2::AS); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_MS: + 
pCallbacks->MSCb(*reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM2::MS); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_STREAM_OUTPUT: + pCallbacks->StreamOutputCb( + *reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::StreamOutput); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_BLEND: + pCallbacks->BlendStateCb(*reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::BlendState); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_SAMPLE_MASK: + pCallbacks->SampleMaskCb(*reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::SampleMask); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_RASTERIZER: + pCallbacks->RasterizerStateCb( + *reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::RasterizerState); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_DEPTH_STENCIL: + pCallbacks->DepthStencilStateCb(*reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM_DEPTH_STENCIL); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_DEPTH_STENCIL1: + pCallbacks->DepthStencilState1Cb( + *reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::DepthStencilState); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_INPUT_LAYOUT: + pCallbacks->InputLayoutCb( + *reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::InputLayout); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_IB_STRIP_CUT_VALUE: + pCallbacks->IBStripCutValueCb( + *reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::IBStripCutValue); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_PRIMITIVE_TOPOLOGY: + pCallbacks->PrimitiveTopologyTypeCb( + *reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::PrimitiveTopologyType); + break; + case 
D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_RENDER_TARGET_FORMATS: + pCallbacks->RTVFormatsCb(*reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::RTVFormats); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_DEPTH_STENCIL_FORMAT: + pCallbacks->DSVFormatCb(*reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::DSVFormat); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_SAMPLE_DESC: + pCallbacks->SampleDescCb(*reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::SampleDesc); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_NODE_MASK: + pCallbacks->NodeMaskCb(*reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::NodeMask); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_CACHED_PSO: + pCallbacks->CachedPSOCb(*reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::CachedPSO); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_FLAGS: + pCallbacks->FlagsCb(*reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM::Flags); + break; + case D3D12_PIPELINE_STATE_SUBOBJECT_TYPE_VIEW_INSTANCING: + pCallbacks->ViewInstancingCb( + *reinterpret_cast(pStream)); + SizeOfSubobject = sizeof(CD3DX12_PIPELINE_STATE_STREAM1::ViewInstancingDesc); + break; + default: + pCallbacks->ErrorUnknownSubobject(SubobjectType); + return E_INVALIDARG; + } + } + + return S_OK; +} + +//------------------------------------------------------------------------------------------------ +inline bool operator==(const D3D12_CLEAR_VALUE &a, const D3D12_CLEAR_VALUE &b) noexcept { + if (a.Format != b.Format) + return false; + if (a.Format == DXGI_FORMAT_D24_UNORM_S8_UINT || a.Format == DXGI_FORMAT_D16_UNORM || + a.Format == DXGI_FORMAT_D32_FLOAT || a.Format == DXGI_FORMAT_D32_FLOAT_S8X24_UINT) { + return (a.DepthStencil.Depth == b.DepthStencil.Depth) && (a.DepthStencil.Stencil == b.DepthStencil.Stencil); + } else { + 
return (a.Color[0] == b.Color[0]) && (a.Color[1] == b.Color[1]) && (a.Color[2] == b.Color[2]) && + (a.Color[3] == b.Color[3]); + } +} +inline bool operator==(const D3D12_RENDER_PASS_BEGINNING_ACCESS_CLEAR_PARAMETERS &a, + const D3D12_RENDER_PASS_BEGINNING_ACCESS_CLEAR_PARAMETERS &b) noexcept { + return a.ClearValue == b.ClearValue; +} +inline bool operator==(const D3D12_RENDER_PASS_ENDING_ACCESS_RESOLVE_PARAMETERS &a, + const D3D12_RENDER_PASS_ENDING_ACCESS_RESOLVE_PARAMETERS &b) noexcept { + if (a.pSrcResource != b.pSrcResource) + return false; + if (a.pDstResource != b.pDstResource) + return false; + if (a.SubresourceCount != b.SubresourceCount) + return false; + if (a.Format != b.Format) + return false; + if (a.ResolveMode != b.ResolveMode) + return false; + if (a.PreserveResolveSource != b.PreserveResolveSource) + return false; + return true; +} +inline bool operator==(const D3D12_RENDER_PASS_BEGINNING_ACCESS &a, + const D3D12_RENDER_PASS_BEGINNING_ACCESS &b) noexcept { + if (a.Type != b.Type) + return false; + if (a.Type == D3D12_RENDER_PASS_BEGINNING_ACCESS_TYPE_CLEAR && !(a.Clear == b.Clear)) + return false; + return true; +} +inline bool operator==(const D3D12_RENDER_PASS_ENDING_ACCESS &a, const D3D12_RENDER_PASS_ENDING_ACCESS &b) noexcept { + if (a.Type != b.Type) + return false; + if (a.Type == D3D12_RENDER_PASS_ENDING_ACCESS_TYPE_RESOLVE && !(a.Resolve == b.Resolve)) + return false; + return true; +} +inline bool operator==(const D3D12_RENDER_PASS_RENDER_TARGET_DESC &a, + const D3D12_RENDER_PASS_RENDER_TARGET_DESC &b) noexcept { + if (a.cpuDescriptor.ptr != b.cpuDescriptor.ptr) + return false; + if (!(a.BeginningAccess == b.BeginningAccess)) + return false; + if (!(a.EndingAccess == b.EndingAccess)) + return false; + return true; +} +inline bool operator==(const D3D12_RENDER_PASS_DEPTH_STENCIL_DESC &a, + const D3D12_RENDER_PASS_DEPTH_STENCIL_DESC &b) noexcept { + if (a.cpuDescriptor.ptr != b.cpuDescriptor.ptr) + return false; + if 
(!(a.DepthBeginningAccess == b.DepthBeginningAccess)) + return false; + if (!(a.StencilBeginningAccess == b.StencilBeginningAccess)) + return false; + if (!(a.DepthEndingAccess == b.DepthEndingAccess)) + return false; + if (!(a.StencilEndingAccess == b.StencilEndingAccess)) + return false; + return true; +} + +#ifndef D3DX12_NO_STATE_OBJECT_HELPERS + +//================================================================================================ +// D3DX12 State Object Creation Helpers +// +// Helper classes for creating new style state objects out of an arbitrary set of subobjects. +// Uses STL +// +// Start by instantiating CD3DX12_STATE_OBJECT_DESC (see it's public methods). +// One of its methods is CreateSubobject(), which has a comment showing a couple of options for +// defining subobjects using the helper classes for each subobject (CD3DX12_DXIL_LIBRARY_SUBOBJECT +// etc.). The subobject helpers each have methods specific to the subobject for configuring it's +// contents. +// +//================================================================================================ +#include +#include +#include +#include +#ifndef D3DX12_USE_ATL +#include +#define D3DX12_COM_PTR Microsoft::WRL::ComPtr +#define D3DX12_COM_PTR_GET(x) x.Get() +#define D3DX12_COM_PTR_ADDRESSOF(x) x.GetAddressOf() +#else +#include +#define D3DX12_COM_PTR ATL::CComPtr +#define D3DX12_COM_PTR_GET(x) x.p +#define D3DX12_COM_PTR_ADDRESSOF(x) &x.p +#endif + +//------------------------------------------------------------------------------------------------ +class CD3DX12_STATE_OBJECT_DESC { + public: + CD3DX12_STATE_OBJECT_DESC() noexcept { Init(D3D12_STATE_OBJECT_TYPE_COLLECTION); } + CD3DX12_STATE_OBJECT_DESC(D3D12_STATE_OBJECT_TYPE Type) noexcept { Init(Type); } + void SetStateObjectType(D3D12_STATE_OBJECT_TYPE Type) noexcept { m_Desc.Type = Type; } + operator const D3D12_STATE_OBJECT_DESC &() { + // Do final preparation work + m_RepointedAssociations.clear(); + 
m_SubobjectArray.clear(); + m_SubobjectArray.reserve(m_Desc.NumSubobjects); + // Flatten subobjects into an array (each flattened subobject still has a + // member that's a pointer to it's desc that's not flattened) + for (auto Iter = m_SubobjectList.begin(); Iter != m_SubobjectList.end(); Iter++) { + m_SubobjectArray.push_back(*Iter); + // Store new location in array so we can redirect pointers contained in subobjects + Iter->pSubobjectArrayLocation = &m_SubobjectArray.back(); + } + // For subobjects with pointer fields, create a new copy of those subobject definitions + // with fixed pointers + for (UINT i = 0; i < m_Desc.NumSubobjects; i++) { + if (m_SubobjectArray[i].Type == D3D12_STATE_SUBOBJECT_TYPE_SUBOBJECT_TO_EXPORTS_ASSOCIATION) { + auto pOriginalSubobjectAssociation = + static_cast(m_SubobjectArray[i].pDesc); + D3D12_SUBOBJECT_TO_EXPORTS_ASSOCIATION Repointed = *pOriginalSubobjectAssociation; + auto pWrapper = + static_cast(pOriginalSubobjectAssociation->pSubobjectToAssociate); + Repointed.pSubobjectToAssociate = pWrapper->pSubobjectArrayLocation; + m_RepointedAssociations.push_back(Repointed); + m_SubobjectArray[i].pDesc = &m_RepointedAssociations.back(); + } + } + // Below: using ugly way to get pointer in case .data() is not defined + m_Desc.pSubobjects = m_Desc.NumSubobjects ? &m_SubobjectArray[0] : nullptr; + return m_Desc; + } + operator const D3D12_STATE_OBJECT_DESC *() { + // Cast calls the above final preparation work + return &static_cast(*this); + } + + // CreateSubobject creates a sububject helper (e.g. CD3DX12_HIT_GROUP_SUBOBJECT) + // whose lifetime is owned by this class. + // e.g. + // + // CD3DX12_STATE_OBJECT_DESC Collection1(D3D12_STATE_OBJECT_TYPE_COLLECTION); + // auto Lib0 = Collection1.CreateSubobject(); + // Lib0->SetDXILLibrary(&pMyAppDxilLibs[0]); + // Lib0->DefineExport(L"rayGenShader0"); // in practice these export listings might be + // // data/engine driven + // etc. 
+ // + // Alternatively, users can instantiate sububject helpers explicitly, such as via local + // variables instead, passing the state object desc that should point to it into the helper + // constructor (or call mySubobjectHelper.AddToStateObject(Collection1)). + // In this alternative scenario, the user must keep the subobject alive as long as the state + // object it is associated with is alive, else it's pointer references will be stale. + // e.g. + // + // CD3DX12_STATE_OBJECT_DESC RaytracingState2(D3D12_STATE_OBJECT_TYPE_RAYTRACING_PIPELINE); + // CD3DX12_DXIL_LIBRARY_SUBOBJECT LibA(RaytracingState2); + // LibA.SetDXILLibrary(&pMyAppDxilLibs[4]); // not manually specifying exports + // // - meaning all exports in the libraries + // // are exported + // etc. + + template T *CreateSubobject() { + T *pSubobject = new T(*this); + m_OwnedSubobjectHelpers.emplace_back(pSubobject); + return pSubobject; + } + + private: + D3D12_STATE_SUBOBJECT *TrackSubobject(D3D12_STATE_SUBOBJECT_TYPE Type, void *pDesc) { + SUBOBJECT_WRAPPER Subobject; + Subobject.pSubobjectArrayLocation = nullptr; + Subobject.Type = Type; + Subobject.pDesc = pDesc; + m_SubobjectList.push_back(Subobject); + m_Desc.NumSubobjects++; + return &m_SubobjectList.back(); + } + void Init(D3D12_STATE_OBJECT_TYPE Type) noexcept { + SetStateObjectType(Type); + m_Desc.pSubobjects = nullptr; + m_Desc.NumSubobjects = 0; + m_SubobjectList.clear(); + m_SubobjectArray.clear(); + m_RepointedAssociations.clear(); + } + typedef struct SUBOBJECT_WRAPPER : public D3D12_STATE_SUBOBJECT { + D3D12_STATE_SUBOBJECT *pSubobjectArrayLocation; // new location when flattened into array + // for repointing pointers in subobjects + } SUBOBJECT_WRAPPER; + D3D12_STATE_OBJECT_DESC m_Desc; + std::list m_SubobjectList; // Pointers to list nodes handed out so + // these can be edited live + std::vector m_SubobjectArray; // Built at the end, copying list contents + + std::list + m_RepointedAssociations; // subobject type that contains 
pointers to other subobjects, + // repointed to flattened array + + class StringContainer { + public: + LPCWSTR LocalCopy(LPCWSTR string, bool bSingleString = false) { + if (string) { + if (bSingleString) { + m_Strings.clear(); + m_Strings.push_back(string); + } else { + m_Strings.push_back(string); + } + return m_Strings.back().c_str(); + } else { + return nullptr; + } + } + void clear() noexcept { m_Strings.clear(); } + + private: + std::list m_Strings; + }; + + class SUBOBJECT_HELPER_BASE { + public: + SUBOBJECT_HELPER_BASE() noexcept { Init(); } + virtual ~SUBOBJECT_HELPER_BASE() = default; + virtual D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept = 0; + void AddToStateObject(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) { + m_pSubobject = ContainingStateObject.TrackSubobject(Type(), Data()); + } + + protected: + virtual void *Data() noexcept = 0; + void Init() noexcept { m_pSubobject = nullptr; } + D3D12_STATE_SUBOBJECT *m_pSubobject; + }; + +#if (__cplusplus >= 201103L) + std::list> m_OwnedSubobjectHelpers; +#else + class OWNED_HELPER { + public: + OWNED_HELPER(const SUBOBJECT_HELPER_BASE *pHelper) noexcept { m_pHelper = pHelper; } + ~OWNED_HELPER() { delete m_pHelper; } + const SUBOBJECT_HELPER_BASE *m_pHelper; + }; + + std::list m_OwnedSubobjectHelpers; +#endif + + friend class CD3DX12_DXIL_LIBRARY_SUBOBJECT; + friend class CD3DX12_EXISTING_COLLECTION_SUBOBJECT; + friend class CD3DX12_SUBOBJECT_TO_EXPORTS_ASSOCIATION_SUBOBJECT; + friend class CD3DX12_DXIL_SUBOBJECT_TO_EXPORTS_ASSOCIATION; + friend class CD3DX12_HIT_GROUP_SUBOBJECT; + friend class CD3DX12_RAYTRACING_SHADER_CONFIG_SUBOBJECT; + friend class CD3DX12_RAYTRACING_PIPELINE_CONFIG_SUBOBJECT; + friend class CD3DX12_RAYTRACING_PIPELINE_CONFIG1_SUBOBJECT; + friend class CD3DX12_GLOBAL_ROOT_SIGNATURE_SUBOBJECT; + friend class CD3DX12_LOCAL_ROOT_SIGNATURE_SUBOBJECT; + friend class CD3DX12_STATE_OBJECT_CONFIG_SUBOBJECT; + friend class CD3DX12_NODE_MASK_SUBOBJECT; +}; + 
+//------------------------------------------------------------------------------------------------ +class CD3DX12_DXIL_LIBRARY_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE { + public: + CD3DX12_DXIL_LIBRARY_SUBOBJECT() noexcept { Init(); } + CD3DX12_DXIL_LIBRARY_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) { + Init(); + AddToStateObject(ContainingStateObject); + } + void SetDXILLibrary(const D3D12_SHADER_BYTECODE *pCode) noexcept { + static const D3D12_SHADER_BYTECODE Default = {}; + m_Desc.DXILLibrary = pCode ? *pCode : Default; + } + void DefineExport(LPCWSTR Name, LPCWSTR ExportToRename = nullptr, + D3D12_EXPORT_FLAGS Flags = D3D12_EXPORT_FLAG_NONE) { + D3D12_EXPORT_DESC Export; + Export.Name = m_Strings.LocalCopy(Name); + Export.ExportToRename = m_Strings.LocalCopy(ExportToRename); + Export.Flags = Flags; + m_Exports.push_back(Export); + m_Desc.pExports = &m_Exports[0]; // using ugly way to get pointer in case .data() is not defined + m_Desc.NumExports = static_cast(m_Exports.size()); + } + template void DefineExports(LPCWSTR (&Exports)[N]) { + for (UINT i = 0; i < N; i++) { + DefineExport(Exports[i]); + } + } + void DefineExports(const LPCWSTR *Exports, UINT N) { + for (UINT i = 0; i < N; i++) { + DefineExport(Exports[i]); + } + } + D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override { return D3D12_STATE_SUBOBJECT_TYPE_DXIL_LIBRARY; } + operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; } + operator const D3D12_DXIL_LIBRARY_DESC &() const noexcept { return m_Desc; } + + private: + void Init() noexcept { + SUBOBJECT_HELPER_BASE::Init(); + m_Desc = {}; + m_Strings.clear(); + m_Exports.clear(); + } + void *Data() noexcept override { return &m_Desc; } + D3D12_DXIL_LIBRARY_DESC m_Desc; + CD3DX12_STATE_OBJECT_DESC::StringContainer m_Strings; + std::vector m_Exports; +}; + +//------------------------------------------------------------------------------------------------ +class 
CD3DX12_EXISTING_COLLECTION_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE { + public: + CD3DX12_EXISTING_COLLECTION_SUBOBJECT() noexcept { Init(); } + CD3DX12_EXISTING_COLLECTION_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) { + Init(); + AddToStateObject(ContainingStateObject); + } + void SetExistingCollection(ID3D12StateObject *pExistingCollection) noexcept { + m_Desc.pExistingCollection = pExistingCollection; + m_CollectionRef = pExistingCollection; + } + void DefineExport(LPCWSTR Name, LPCWSTR ExportToRename = nullptr, + D3D12_EXPORT_FLAGS Flags = D3D12_EXPORT_FLAG_NONE) { + D3D12_EXPORT_DESC Export; + Export.Name = m_Strings.LocalCopy(Name); + Export.ExportToRename = m_Strings.LocalCopy(ExportToRename); + Export.Flags = Flags; + m_Exports.push_back(Export); + m_Desc.pExports = &m_Exports[0]; // using ugly way to get pointer in case .data() is not defined + m_Desc.NumExports = static_cast(m_Exports.size()); + } + template void DefineExports(LPCWSTR (&Exports)[N]) { + for (UINT i = 0; i < N; i++) { + DefineExport(Exports[i]); + } + } + void DefineExports(const LPCWSTR *Exports, UINT N) { + for (UINT i = 0; i < N; i++) { + DefineExport(Exports[i]); + } + } + D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override { return D3D12_STATE_SUBOBJECT_TYPE_EXISTING_COLLECTION; } + operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; } + operator const D3D12_EXISTING_COLLECTION_DESC &() const noexcept { return m_Desc; } + + private: + void Init() noexcept { + SUBOBJECT_HELPER_BASE::Init(); + m_Desc = {}; + m_CollectionRef = nullptr; + m_Strings.clear(); + m_Exports.clear(); + } + void *Data() noexcept override { return &m_Desc; } + D3D12_EXISTING_COLLECTION_DESC m_Desc; + D3DX12_COM_PTR m_CollectionRef; + CD3DX12_STATE_OBJECT_DESC::StringContainer m_Strings; + std::vector m_Exports; +}; + +//------------------------------------------------------------------------------------------------ +class 
CD3DX12_SUBOBJECT_TO_EXPORTS_ASSOCIATION_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE { + public: + CD3DX12_SUBOBJECT_TO_EXPORTS_ASSOCIATION_SUBOBJECT() noexcept { Init(); } + CD3DX12_SUBOBJECT_TO_EXPORTS_ASSOCIATION_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) { + Init(); + AddToStateObject(ContainingStateObject); + } + void SetSubobjectToAssociate(const D3D12_STATE_SUBOBJECT &SubobjectToAssociate) noexcept { + m_Desc.pSubobjectToAssociate = &SubobjectToAssociate; + } + void AddExport(LPCWSTR Export) { + m_Desc.NumExports++; + m_Exports.push_back(m_Strings.LocalCopy(Export)); + m_Desc.pExports = &m_Exports[0]; // using ugly way to get pointer in case .data() is not defined + } + template void AddExports(LPCWSTR (&Exports)[N]) { + for (UINT i = 0; i < N; i++) { + AddExport(Exports[i]); + } + } + void AddExports(const LPCWSTR *Exports, UINT N) { + for (UINT i = 0; i < N; i++) { + AddExport(Exports[i]); + } + } + D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override { + return D3D12_STATE_SUBOBJECT_TYPE_SUBOBJECT_TO_EXPORTS_ASSOCIATION; + } + operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; } + operator const D3D12_SUBOBJECT_TO_EXPORTS_ASSOCIATION &() const noexcept { return m_Desc; } + + private: + void Init() noexcept { + SUBOBJECT_HELPER_BASE::Init(); + m_Desc = {}; + m_Strings.clear(); + m_Exports.clear(); + } + void *Data() noexcept override { return &m_Desc; } + D3D12_SUBOBJECT_TO_EXPORTS_ASSOCIATION m_Desc; + CD3DX12_STATE_OBJECT_DESC::StringContainer m_Strings; + std::vector m_Exports; +}; + +//------------------------------------------------------------------------------------------------ +class CD3DX12_DXIL_SUBOBJECT_TO_EXPORTS_ASSOCIATION : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE { + public: + CD3DX12_DXIL_SUBOBJECT_TO_EXPORTS_ASSOCIATION() noexcept { Init(); } + CD3DX12_DXIL_SUBOBJECT_TO_EXPORTS_ASSOCIATION(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) { + Init(); 
+ AddToStateObject(ContainingStateObject); + } + void SetSubobjectNameToAssociate(LPCWSTR SubobjectToAssociate) { + m_Desc.SubobjectToAssociate = m_SubobjectName.LocalCopy(SubobjectToAssociate, true); + } + void AddExport(LPCWSTR Export) { + m_Desc.NumExports++; + m_Exports.push_back(m_Strings.LocalCopy(Export)); + m_Desc.pExports = &m_Exports[0]; // using ugly way to get pointer in case .data() is not defined + } + template void AddExports(LPCWSTR (&Exports)[N]) { + for (UINT i = 0; i < N; i++) { + AddExport(Exports[i]); + } + } + void AddExports(const LPCWSTR *Exports, UINT N) { + for (UINT i = 0; i < N; i++) { + AddExport(Exports[i]); + } + } + D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override { + return D3D12_STATE_SUBOBJECT_TYPE_DXIL_SUBOBJECT_TO_EXPORTS_ASSOCIATION; + } + operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; } + operator const D3D12_DXIL_SUBOBJECT_TO_EXPORTS_ASSOCIATION &() const noexcept { return m_Desc; } + + private: + void Init() noexcept { + SUBOBJECT_HELPER_BASE::Init(); + m_Desc = {}; + m_Strings.clear(); + m_SubobjectName.clear(); + m_Exports.clear(); + } + void *Data() noexcept override { return &m_Desc; } + D3D12_DXIL_SUBOBJECT_TO_EXPORTS_ASSOCIATION m_Desc; + CD3DX12_STATE_OBJECT_DESC::StringContainer m_Strings; + CD3DX12_STATE_OBJECT_DESC::StringContainer m_SubobjectName; + std::vector m_Exports; +}; + +//------------------------------------------------------------------------------------------------ +class CD3DX12_HIT_GROUP_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE { + public: + CD3DX12_HIT_GROUP_SUBOBJECT() noexcept { Init(); } + CD3DX12_HIT_GROUP_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) { + Init(); + AddToStateObject(ContainingStateObject); + } + void SetHitGroupExport(LPCWSTR exportName) { m_Desc.HitGroupExport = m_Strings[0].LocalCopy(exportName, true); } + void SetHitGroupType(D3D12_HIT_GROUP_TYPE Type) noexcept { m_Desc.Type = Type; } + void 
SetAnyHitShaderImport(LPCWSTR importName) { + m_Desc.AnyHitShaderImport = m_Strings[1].LocalCopy(importName, true); + } + void SetClosestHitShaderImport(LPCWSTR importName) { + m_Desc.ClosestHitShaderImport = m_Strings[2].LocalCopy(importName, true); + } + void SetIntersectionShaderImport(LPCWSTR importName) { + m_Desc.IntersectionShaderImport = m_Strings[3].LocalCopy(importName, true); + } + D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override { return D3D12_STATE_SUBOBJECT_TYPE_HIT_GROUP; } + operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; } + operator const D3D12_HIT_GROUP_DESC &() const noexcept { return m_Desc; } + + private: + void Init() noexcept { + SUBOBJECT_HELPER_BASE::Init(); + m_Desc = {}; + for (UINT i = 0; i < m_NumStrings; i++) { + m_Strings[i].clear(); + } + } + void *Data() noexcept override { return &m_Desc; } + D3D12_HIT_GROUP_DESC m_Desc; + static const UINT m_NumStrings = 4; + CD3DX12_STATE_OBJECT_DESC::StringContainer m_Strings[m_NumStrings]; // one string for every entrypoint name +}; + +//------------------------------------------------------------------------------------------------ +class CD3DX12_RAYTRACING_SHADER_CONFIG_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE { + public: + CD3DX12_RAYTRACING_SHADER_CONFIG_SUBOBJECT() noexcept { Init(); } + CD3DX12_RAYTRACING_SHADER_CONFIG_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) { + Init(); + AddToStateObject(ContainingStateObject); + } + void Config(UINT MaxPayloadSizeInBytes, UINT MaxAttributeSizeInBytes) noexcept { + m_Desc.MaxPayloadSizeInBytes = MaxPayloadSizeInBytes; + m_Desc.MaxAttributeSizeInBytes = MaxAttributeSizeInBytes; + } + D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override { + return D3D12_STATE_SUBOBJECT_TYPE_RAYTRACING_SHADER_CONFIG; + } + operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; } + operator const D3D12_RAYTRACING_SHADER_CONFIG &() const noexcept { return 
m_Desc; } + + private: + void Init() noexcept { + SUBOBJECT_HELPER_BASE::Init(); + m_Desc = {}; + } + void *Data() noexcept override { return &m_Desc; } + D3D12_RAYTRACING_SHADER_CONFIG m_Desc; +}; + +//------------------------------------------------------------------------------------------------ +class CD3DX12_RAYTRACING_PIPELINE_CONFIG_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE { + public: + CD3DX12_RAYTRACING_PIPELINE_CONFIG_SUBOBJECT() noexcept { Init(); } + CD3DX12_RAYTRACING_PIPELINE_CONFIG_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) { + Init(); + AddToStateObject(ContainingStateObject); + } + void Config(UINT MaxTraceRecursionDepth) noexcept { m_Desc.MaxTraceRecursionDepth = MaxTraceRecursionDepth; } + D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override { + return D3D12_STATE_SUBOBJECT_TYPE_RAYTRACING_PIPELINE_CONFIG; + } + operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; } + operator const D3D12_RAYTRACING_PIPELINE_CONFIG &() const noexcept { return m_Desc; } + + private: + void Init() noexcept { + SUBOBJECT_HELPER_BASE::Init(); + m_Desc = {}; + } + void *Data() noexcept override { return &m_Desc; } + D3D12_RAYTRACING_PIPELINE_CONFIG m_Desc; +}; + +//------------------------------------------------------------------------------------------------ +class CD3DX12_RAYTRACING_PIPELINE_CONFIG1_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE { + public: + CD3DX12_RAYTRACING_PIPELINE_CONFIG1_SUBOBJECT() noexcept { Init(); } + CD3DX12_RAYTRACING_PIPELINE_CONFIG1_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) { + Init(); + AddToStateObject(ContainingStateObject); + } + void Config(UINT MaxTraceRecursionDepth, D3D12_RAYTRACING_PIPELINE_FLAGS Flags) noexcept { + m_Desc.MaxTraceRecursionDepth = MaxTraceRecursionDepth; + m_Desc.Flags = Flags; + } + D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override { + return 
D3D12_STATE_SUBOBJECT_TYPE_RAYTRACING_PIPELINE_CONFIG1; + } + operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; } + operator const D3D12_RAYTRACING_PIPELINE_CONFIG1 &() const noexcept { return m_Desc; } + + private: + void Init() noexcept { + SUBOBJECT_HELPER_BASE::Init(); + m_Desc = {}; + } + void *Data() noexcept override { return &m_Desc; } + D3D12_RAYTRACING_PIPELINE_CONFIG1 m_Desc; +}; + +//------------------------------------------------------------------------------------------------ +class CD3DX12_GLOBAL_ROOT_SIGNATURE_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE { + public: + CD3DX12_GLOBAL_ROOT_SIGNATURE_SUBOBJECT() noexcept { Init(); } + CD3DX12_GLOBAL_ROOT_SIGNATURE_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) { + Init(); + AddToStateObject(ContainingStateObject); + } + void SetRootSignature(ID3D12RootSignature *pRootSig) noexcept { m_pRootSig = pRootSig; } + D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override { + return D3D12_STATE_SUBOBJECT_TYPE_GLOBAL_ROOT_SIGNATURE; + } + operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; } + operator ID3D12RootSignature *() const noexcept { return D3DX12_COM_PTR_GET(m_pRootSig); } + + private: + void Init() noexcept { + SUBOBJECT_HELPER_BASE::Init(); + m_pRootSig = nullptr; + } + void *Data() noexcept override { return D3DX12_COM_PTR_ADDRESSOF(m_pRootSig); } + D3DX12_COM_PTR m_pRootSig; +}; + +//------------------------------------------------------------------------------------------------ +class CD3DX12_LOCAL_ROOT_SIGNATURE_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE { + public: + CD3DX12_LOCAL_ROOT_SIGNATURE_SUBOBJECT() noexcept { Init(); } + CD3DX12_LOCAL_ROOT_SIGNATURE_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) { + Init(); + AddToStateObject(ContainingStateObject); + } + void SetRootSignature(ID3D12RootSignature *pRootSig) noexcept { m_pRootSig = pRootSig; } + 
D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override { + return D3D12_STATE_SUBOBJECT_TYPE_LOCAL_ROOT_SIGNATURE; + } + operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; } + operator ID3D12RootSignature *() const noexcept { return D3DX12_COM_PTR_GET(m_pRootSig); } + + private: + void Init() noexcept { + SUBOBJECT_HELPER_BASE::Init(); + m_pRootSig = nullptr; + } + void *Data() noexcept override { return D3DX12_COM_PTR_ADDRESSOF(m_pRootSig); } + D3DX12_COM_PTR m_pRootSig; +}; + +//------------------------------------------------------------------------------------------------ +class CD3DX12_STATE_OBJECT_CONFIG_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE { + public: + CD3DX12_STATE_OBJECT_CONFIG_SUBOBJECT() noexcept { Init(); } + CD3DX12_STATE_OBJECT_CONFIG_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) { + Init(); + AddToStateObject(ContainingStateObject); + } + void SetFlags(D3D12_STATE_OBJECT_FLAGS Flags) noexcept { m_Desc.Flags = Flags; } + D3D12_STATE_SUBOBJECT_TYPE Type() const noexcept override { return D3D12_STATE_SUBOBJECT_TYPE_STATE_OBJECT_CONFIG; } + operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; } + operator const D3D12_STATE_OBJECT_CONFIG &() const noexcept { return m_Desc; } + + private: + void Init() noexcept { + SUBOBJECT_HELPER_BASE::Init(); + m_Desc = {}; + } + void *Data() noexcept override { return &m_Desc; } + D3D12_STATE_OBJECT_CONFIG m_Desc; +}; + +//------------------------------------------------------------------------------------------------ +class CD3DX12_NODE_MASK_SUBOBJECT : public CD3DX12_STATE_OBJECT_DESC::SUBOBJECT_HELPER_BASE { + public: + CD3DX12_NODE_MASK_SUBOBJECT() noexcept { Init(); } + CD3DX12_NODE_MASK_SUBOBJECT(CD3DX12_STATE_OBJECT_DESC &ContainingStateObject) { + Init(); + AddToStateObject(ContainingStateObject); + } + void SetNodeMask(UINT NodeMask) noexcept { m_Desc.NodeMask = NodeMask; } + D3D12_STATE_SUBOBJECT_TYPE 
Type() const noexcept override { return D3D12_STATE_SUBOBJECT_TYPE_NODE_MASK; } + operator const D3D12_STATE_SUBOBJECT &() const noexcept { return *m_pSubobject; } + operator const D3D12_NODE_MASK &() const noexcept { return m_Desc; } + + private: + void Init() noexcept { + SUBOBJECT_HELPER_BASE::Init(); + m_Desc = {}; + } + void *Data() noexcept override { return &m_Desc; } + D3D12_NODE_MASK m_Desc; +}; + +#undef D3DX12_COM_PTR +#undef D3DX12_COM_PTR_GET +#undef D3DX12_COM_PTR_ADDRESSOF +#endif // #ifndef D3DX12_NO_STATE_OBJECT_HELPERS + +#endif // defined( __cplusplus ) + +#endif //__D3DX12_H__ diff --git a/superbench/benchmarks/micro_benchmarks/directx_utils/D3D12Timer.cpp b/superbench/benchmarks/micro_benchmarks/directx_utils/D3D12Timer.cpp new file mode 100644 index 000000000..af3472f85 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_utils/D3D12Timer.cpp @@ -0,0 +1,80 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#include "D3D12Timer.h" +#include "../directx_third_party/DXSampleHelper.h" +#include "../directx_third_party/d3dx12.h" +#include + +namespace D3D12 { +D3D12Timer::D3D12Timer() {} + +// Destructor. 
+D3D12Timer::~D3D12Timer() {
+    // The query heap and readback buffer are held as raw COM pointers, so release them explicitly.
+    if (m_queryHeap) {
+        m_queryHeap->Release();
+        m_queryHeap = nullptr;
+    }
+    if (m_queryResourceCPU) {
+        m_queryResourceCPU->Release();
+        m_queryResourceCPU = nullptr;
+    }
+}
+
+// Create the timestamp query heap and the readback buffer that timestamps are resolved into.
+//   pDevice       - device used to create the resources; must not be null.
+//   pCommandQueue - queue whose timestamp frequency converts GPU ticks to milliseconds; must not be null.
+//   numTimers     - number of start/stop timestamp pairs to allocate.
+//   type          - QueueType::copy selects a copy-queue timestamp heap; every other value selects a
+//                   regular timestamp heap.
+// Failing D3D12 calls are reported through ThrowIfFailed (declared outside this file).
+// Calling init() again releases the resources created by the previous call.
+void D3D12Timer::init(ID3D12Device *pDevice, ID3D12CommandQueue *pCommandQueue, UINT numTimers, QueueType type) {
+    assert(pDevice != nullptr);
+    assert(pCommandQueue != nullptr);
+
+    // Drop resources left over from an earlier init(); otherwise re-initialization leaks them.
+    if (m_queryHeap) {
+        m_queryHeap->Release();
+        m_queryHeap = nullptr;
+    }
+    if (m_queryResourceCPU) {
+        m_queryResourceCPU->Release();
+        m_queryResourceCPU = nullptr;
+    }
+
+    m_device = pDevice;
+    m_timerCount = numTimers;
+
+    UINT64 gpuFreq = 0;
+    ThrowIfFailed(pCommandQueue->GetTimestampFrequency(&gpuFreq));
+    // Stored as milliseconds per GPU tick so elapsed ticks can simply be multiplied by it.
+    m_gpuFreqInv = 1000.0 / double(gpuFreq);
+
+    // Value-initialize the descriptor so no field is ever passed to the driver uninitialized.
+    D3D12_QUERY_HEAP_DESC queryHeapDesc = {};
+    queryHeapDesc.Count = m_timerCount * 2; // one start and one stop query per timer
+    queryHeapDesc.NodeMask = 0;
+    // Always pick a heap type: the original left Type unset when neither branch matched.
+    if (type == QueueType::copy) {
+        queryHeapDesc.Type = D3D12_QUERY_HEAP_TYPE_COPY_QUEUE_TIMESTAMP;
+    } else {
+        queryHeapDesc.Type = D3D12_QUERY_HEAP_TYPE_TIMESTAMP;
+    }
+    ThrowIfFailed(m_device->CreateQueryHeap(&queryHeapDesc, IID_PPV_ARGS(&m_queryHeap)));
+
+    // Readback buffer with room for one GPUTimestampPair per timer.
+    D3D12_HEAP_PROPERTIES heapProp = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_READBACK);
+    D3D12_RESOURCE_DESC resourceDesc = CD3DX12_RESOURCE_DESC::Buffer(m_timerCount * sizeof(GPUTimestampPair));
+    ThrowIfFailed(m_device->CreateCommittedResource(&heapProp, D3D12_HEAP_FLAG_NONE, &resourceDesc,
+                                                    D3D12_RESOURCE_STATE_COPY_DEST, nullptr,
+                                                    IID_PPV_ARGS(&m_queryResourceCPU)));
+}
+
+// Start timestamp.
+// Records the start timestamp of pair `timestampPairIndex` on pCommandList.
+// Returns false, recording nothing, if the command list is null, the timer has no query heap,
+// or the index is out of range.
+bool D3D12Timer::start(ID3D12GraphicsCommandList *pCommandList, UINT timestampPairIndex) {
+    if (pCommandList == nullptr || m_queryHeap == nullptr || timestampPairIndex >= m_timerCount)
+        return false;
+    pCommandList->EndQuery(m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, getStartIndex(timestampPairIndex));
+    return true;
+}
+
+// Stop timestamp.
+// Records the stop timestamp of pair `timestampPairIndex` on pCommandList.
+// Returns false under the same conditions as start().
+bool D3D12Timer::stop(ID3D12GraphicsCommandList *pCommandList, UINT timestampPairIndex) {
+    if (pCommandList == nullptr || m_queryHeap == nullptr || timestampPairIndex >= m_timerCount)
+        return false;
+    pCommandList->EndQuery(m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, getEndIndex(timestampPairIndex));
+    return true;
+}
+
+// Resolve query data. Write query to device memory. Make sure to wait for query to finish before resolving data.
+void D3D12Timer::resolveQueryToCPU(ID3D12GraphicsCommandList *pCommandList, UINT timestampPairIndex) { + pCommandList->ResolveQueryData(m_queryHeap, D3D12_QUERY_TYPE_TIMESTAMP, getStartIndex(timestampPairIndex), 2, + m_queryResourceCPU, sizeof(GPUTimestampPair) * timestampPairIndex); +} + +// Get start and end timestamp pair. +double D3D12Timer::getElapsedMsByTimestampPair(UINT timestampPairIndex) { + GPUTimestampPair *timingData = nullptr; + D3D12_RANGE readRange{sizeof(GPUTimestampPair) * timestampPairIndex, + sizeof(GPUTimestampPair) * (timestampPairIndex + 1)}; + D3D12_RANGE writeRange{0, 0}; + if (SUCCEEDED(m_queryResourceCPU->Map(0, &readRange, (void **)&timingData))) { + m_queryResourceCPU->Unmap(0, &writeRange); + return (timingData->Stop - timingData->Start) * m_gpuFreqInv; + } + return -1; +} +} // namespace D3D12 diff --git a/superbench/benchmarks/micro_benchmarks/directx_utils/D3D12Timer.h b/superbench/benchmarks/micro_benchmarks/directx_utils/D3D12Timer.h new file mode 100644 index 000000000..e7308a5fe --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_utils/D3D12Timer.h @@ -0,0 +1,54 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#pragma once +#include + +namespace D3D12 { +struct GPUTimestampPair { + UINT64 Start; + UINT64 Stop; +}; + +enum QueueType { compute = 0, copy = 1 }; + +// D3D12 timer. +class D3D12Timer { + public: + // Constructor. + D3D12Timer(); + + // Destructor. + ~D3D12Timer(); + + void init(ID3D12Device *pDevice, ID3D12CommandQueue *pCommandQueue, UINT numTimers, QueueType type); + + // Start timestamp. + bool start(ID3D12GraphicsCommandList *pCommandList, UINT timestampPairIndex); + + // Stop timestamp. + bool stop(ID3D12GraphicsCommandList *pCommandList, UINT timestampPairIndex); + + // Resolve query data. Write query to device memory. Make sure to wait for query to finsih before resolving data. 
+ void resolveQueryToCPU(ID3D12GraphicsCommandList *pCommandList, UINT timestampPairIndex); + + // Get start and end timestamp pair. + double getElapsedMsByTimestampPair(UINT timestampPairIndex); + + // Get the GPU frequency. + double getGPUFrequecy() { return m_gpuFreqInv; } + + // Get start index of the selected timestamp pair + UINT getStartIndex(UINT timestampPairIndex) { return timestampPairIndex * 2; } + + // Get end index of the selected timestamp pair + UINT getEndIndex(UINT timestampPairIndex) { return timestampPairIndex * 2 + 1; } + + private: + ID3D12Device *m_device = nullptr; + ID3D12QueryHeap *m_queryHeap = nullptr; + ID3D12Resource *m_queryResourceCPU = nullptr; + UINT m_timerCount = 0; + double m_gpuFreqInv; +}; +} // namespace D3D12 diff --git a/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h b/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h new file mode 100644 index 000000000..ce384272a --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h @@ -0,0 +1,113 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include + +class Options { + protected: + char **begin; + char **end; + + /** + * @brief Get the char* value of the cmd line argument. + * @param option the argument in cmd. + * @return char* + */ + char *get_cmd_option(const std::string &option) { + char **itr = std::find(begin, end, option); + if (itr != end && ++itr != end) { + return *itr; + } + return 0; + } + + /** + * @brief Get the int type value of cmd line argument. + * @param option the cmd line argument. + * @param defaults the default value. + * @return int the int type value of cmd line argument 'option'. 
+ */ + int get_cmd_line_argument_int(const std::string &option, int defaults) { + if (char *value = get_cmd_option(option)) { + try { + return std::stoi(value); + } catch (const std::exception &e) { + std::cerr << "Error: Invalid argument - " << option << " should be INT " << e.what() << '\n'; + exit(1); + } + } + return defaults; + } + + /** + * @brief Get the string type value of cmd line argument. + * @param option the cmd line argument. + * @return std::string the int type value of cmd line argument 'option'. + */ + std::string get_cmd_line_argument_string(const std::string &option) { + if (char *value = get_cmd_option(option)) { + return std::string(value); + } + return ""; + } + + /** + * @brief Get the boolean type value of cmd line argument. + * @param option the cmd line argument. + * @return bool the boolean value. + */ + bool get_cmd_line_argument_bool(const std::string &option) { + if (cmd_option_exists(option)) { + return true; + } + return false; + } + + /** + * @brief Check if a argument exists. + * @param option the cmd line argument. + * @return bool if a argument exists. + */ + bool cmd_option_exists(const std::string &option) { return std::find(begin, end, option) != end; } + + /** + * @brief Get the option usage. + */ + virtual void get_option_usage(){}; + + /** + * @brief Parse the arguments. + */ + virtual void parse_arguments(){}; + + public: + /** + * @brief Construct a new Command Line object. + * @param argc the number of command line arguments. + * @param argv the string array of comamnd line arguments. + */ + Options(int argc, char *argv[]) { + begin = argv; + end = argv + argc; + } + + /** + * @brief Init and parse the arguments. 
+ */ + virtual void init() { + if (cmd_option_exists("--help")) { + get_option_usage(); + exit(0); + } + try { + parse_arguments(); + } catch (const std::exception &e) { + std::cerr << "Error: Invalid argument - " << e.what() << '\n'; + exit(1); + } + }; +}; From ed027e4c8ef8d15a1238342c2e5f165510ad91b6 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Thu, 29 Jun 2023 06:09:44 +0000 Subject: [PATCH 12/33] Tools - Add runner for sys info and update docs (#532) **Description** Add runner for sys info to automatically collect on multiple nodes and update related docs. **Major Revision** - add runner for sys info which will check docker status and run `sb node info` on all nodes' docker and fetch results from all nodes **Minor Revision** - update cli and system-info doc - update sb node info to save output info output-dir/sys-info.json --- docs/cli.md | 32 +++++++++++++++++++++++++++++ docs/user-tutorial/system-config.md | 30 ++++++++++++++++++++++++++- superbench/cli/_commands.py | 3 +++ superbench/cli/_handler.py | 8 +++++++- superbench/cli/_help.py | 4 ++++ superbench/cli/_node_handler.py | 13 +++++++++++- superbench/runner/runner.py | 18 ++++++++++++++++ 7 files changed, 105 insertions(+), 3 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index df1c1ca4d..1f6b13a7a 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -165,6 +165,26 @@ Execute GPT2 model benchmark in default configuration: sb exec --config-override superbench.enable="['gpt2_models']" ``` +### `sb node info` +Get system info on the local node. + +```bash title="SB CLI" +sb node info [--output-dir] +``` + +#### Optional arguments + +| Name | Default | Description | +|----------------|---------|-----------------------------------------------------------------------------| +| `--output-dir` | `None` | Path to output directory, outputs/{datetime} will be used if not specified. 
| + +#### Examples + +Get system info on the local node and save it into the `outputs` dir: +```bash title="SB CLI" +sb node info --output-dir outputs +``` + ### `sb result diagnosis` Filter the defective machines automatically from benchmarking results according to rules defined in rule file. @@ -284,6 +304,7 @@ sb run [--config-file] [--docker-image] [--docker-password] [--docker-username] + [--get-info] [--host-file] [--host-list] [--host-password] @@ -302,6 +323,7 @@ sb run [--config-file] | `--docker-image` `-i` | `superbench/superbench` | Docker image URI. | | `--docker-password` | `None` | Docker registry password if authentication is needed. | | `--docker-username` | `None` | Docker registry username if authentication is needed. | +| `--get-info` `-g` | `False` | Collect system info. | | `--host-file` `-f` | `None` | Path to Ansible inventory host file. | | `--host-list` `-l` | `None` | Comma separated host list. | | `--host-password` | `None` | Host password or key passphase if needed. | @@ -335,6 +357,16 @@ sb run --no-docker --host-list localhost --config-override \ superbench.enable=kernel-launch superbench.env.SB_MICRO_PATH=/path/to/superbenchmark ``` +Collect system info on all nodes in `./host.ini` without running benchmarks: +```bash title="SB CLI" +sb run --get-info --host-file ./host.ini -C superbench.enable=none +``` + +Collect system info on all nodes in `./host.ini` while running benchmarks: +```bash title="SB CLI" +sb run --get-info --host-file ./host.ini +``` + ### `sb version` Print the current SuperBench CLI version.
diff --git a/docs/user-tutorial/system-config.md b/docs/user-tutorial/system-config.md index dbde728d3..2a749ba52 100644 --- a/docs/user-tutorial/system-config.md +++ b/docs/user-tutorial/system-config.md @@ -4,6 +4,8 @@ id: system-config # System Config Info +This tool automatically collects system information on the tested GPU nodes, covering the following hardware categories: + - [System](#system) - [Memory](#memory) - [CPU](#cpu) @@ -12,7 +14,33 @@ id: system-config - [Accelerator](#accelerator) - [PCIe](#pcie) -## Parameter amd Details +## Usage + +### Usage on local machine + +1. [Install SuperBench](../getting-started/installation.mdx) on the local machine using root privilege. + +2. Collect the system info with the `sb node info --output-dir ${output-dir}` command, using root privilege. + +3. After the command finishes, you can find the output system info json file `sys_info.json` of the local node under \${output_dir}. + +### Usage on multiple remote machines + +1. [Install SuperBench](../getting-started/installation.mdx) on the local machine. + +2. [Deploy SuperBench](../getting-started/run-superbench.md#deploy) onto the remote machines. + +3. Prepare the host file of the tested GPU nodes using [Ansible Inventory](../getting-started/configuration.md#ansible-inventory) on the local machine. + +4. After SuperBench is installed and the host file is ready, you can start to collect the system info automatically using the `sb run --get-info` command. The detailed command can be found in [SuperBench CLI](../cli.md). + + ``` + sb run --get-info -f host.ini --output-dir ${output-dir} -C superbench.enable=none + ``` + +5. After the command finishes, you can find the output system info json file `sys_info.json` of each node under \${output_dir}/nodes/${node_name}.
+ +## Parameter and Details ### System diff --git a/superbench/cli/_commands.py b/superbench/cli/_commands.py index f37bc0f33..2122034a3 100644 --- a/superbench/cli/_commands.py +++ b/superbench/cli/_commands.py @@ -67,6 +67,9 @@ def load_arguments(self, command): nargs='+', help='Extra arguments to override config_file.' ) + ac.argument( + 'get_info', options_list=('--get-info', '-g'), action='store_true', help='Collect node system info.' + ) with ArgumentsContext(self, 'benchmark') as ac: ac.argument('name', options_list=('--name', '-n'), type=str, help='Benchmark name or regular expression.') diff --git a/superbench/cli/_handler.py b/superbench/cli/_handler.py index 3c2d1cbaa..41c9f3741 100644 --- a/superbench/cli/_handler.py +++ b/superbench/cli/_handler.py @@ -275,7 +275,8 @@ def run_command_handler( output_dir=None, private_key=None, config_file=None, - config_override=None + config_override=None, + get_info=False, ): """Run the SuperBench benchmarks distributedly. @@ -295,6 +296,7 @@ def run_command_handler( config_file (str, optional): Path to SuperBench config file. Defaults to None. config_override (str, optional): Extra arguments to override config_file, following [Hydra syntax](https://hydra.cc/docs/advanced/override_grammar/basic). Defaults to None. + get_info (bool, optional): Collect node system info. Defaults to False. Raises: CLIError: If input arguments are invalid. 
@@ -316,6 +318,10 @@ def run_command_handler( ) runner = SuperBenchRunner(sb_config, docker_config, ansible_config, sb_output_dir) + runner.run() + if get_info: + runner.run_sys_info() + if runner.get_failure_count() != 0: sys.exit(runner.get_failure_count()) diff --git a/superbench/cli/_help.py b/superbench/cli/_help.py index 2c7f507b2..fb7f87973 100644 --- a/superbench/cli/_help.py +++ b/superbench/cli/_help.py @@ -63,6 +63,10 @@ text: > {cli_name} run --no-docker --host-list localhost --config-override superbench.enable=kernel-launch superbench.env.SB_MICRO_PATH=/path/to/superbenchmark + - name: Collect system info on all nodes in ./host.ini" without running benchmarks + text: {cli_name} run --get-info --host-file ./host.ini -C superbench.enable=none + - name: Collect system info on all nodes in ./host.ini" while running benchmarks + text: {cli_name} run --get-info --host-file ./host.ini """.format(cli_name=CLI_NAME) helps['benchmark'] = """ diff --git a/superbench/cli/_node_handler.py b/superbench/cli/_node_handler.py index 4a57b5b20..d59ed8b85 100644 --- a/superbench/cli/_node_handler.py +++ b/superbench/cli/_node_handler.py @@ -3,17 +3,28 @@ """SuperBench CLI node subgroup command handler.""" +from pathlib import Path +import json + from superbench.tools import SystemInfo +from superbench.common.utils import create_sb_output_dir -def info_command_handler(): +def info_command_handler(output_dir=None): """Get node hardware info. + Args: + output_dir (str): Output directory. + Returns: dict: node info. 
""" try: info = SystemInfo().get_all() + output_dir = create_sb_output_dir(output_dir) + output_dir_path = Path(output_dir) + with open(output_dir_path / 'sys_info.json', 'w') as f: + json.dump(info, f) except Exception as ex: raise RuntimeError('Failed to get node info.') from ex return info diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index d91020bfb..bd8cc9c83 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -199,6 +199,24 @@ def deploy(self): # pragma: no cover ) self._ansible_client.run(self._ansible_client.get_playbook_config('deploy.yaml', extravars=extravars)) + def run_sys_info(self): + """Run the system info on all nodes.""" + self.check_env() + + logger.info('Runner is going to get node system info.') + + fcmd = "docker exec sb-workspace bash -c '{command}'" + if self._docker_config.skip: + fcmd = "bash -c 'cd $SB_WORKSPACE && {command}'" + ansible_runner_config = self._ansible_client.get_shell_config( + fcmd.format(command='sb node info --output-dir {output_dir}'.format(output_dir=self._sb_output_dir)) + ) + ansible_rc = self._ansible_client.run(ansible_runner_config, sudo=(not self._docker_config.skip)) + + if ansible_rc != 0: + self.cleanup() + self.fetch_results() + def check_env(self): # pragma: no cover """Check SuperBench environment.""" logger.info('Checking SuperBench environment.') From af4d18dedf8c158d3f96c034cf82464fb63c90d8 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Thu, 29 Jun 2023 07:03:40 +0000 Subject: [PATCH 13/33] Benchmarks: Add benchmark - Add source code of DirectxGPUMemBw microbenchmark (#487) **Description** Add source code of DirectxGPUMemBw microbenchmark. 
--------- Co-authored-by: v-junlinlv --- .../BenchmarkOptions.h | 83 +++++ .../directx_mem_bw_performance/GPUMemRwBw.cpp | 317 ++++++++++++++++++ .../directx_mem_bw_performance/GPUMemRwBw.h | 174 ++++++++++ .../GPUMemRwBw.vcxproj | 105 ++++++ .../directx_mem_bw_performance/Main.cpp | 24 ++ .../directx_mem_bw_performance/ReadWrite.hlsl | 62 ++++ .../micro_benchmarks/directx_utils/Options.h | 51 +++ 7 files changed, 816 insertions(+) create mode 100644 superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h create mode 100644 superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.cpp create mode 100644 superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.h create mode 100644 superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj create mode 100644 superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/Main.cpp create mode 100644 superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/ReadWrite.hlsl diff --git a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h new file mode 100644 index 000000000..7893fe8af --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h @@ -0,0 +1,83 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include + +#include "../directx_utils/Options.h" +#include "GPUMemRwBw.h" + +enum Memtype { + Read, + Write, + ReadWrite, +}; +const std::string MemtypeString[] = {"Read", "Write", "ReadWrite"}; + +class BenchmarkOptions : public Options { + public: + // Number of warm up rounds. + int num_warm_up = 0; + // Number of loop rounds of dispatch to measure the performance. + int num_loop = 0; + // Size of data for GPU mem access. 
+ unsigned long long size; + // Run size from min_size to max_size for GPU mem access. + unsigned long long min_size = 0; + // Run size from min_size to max_size for GPU mem access. + unsigned long long max_size = 0; + // Whether check data correctness. + bool check_data = false; + // Memory operation type. + Memtype mem_type = Memtype::Write; + // Number of threads to launch. + UInt3 num_threads; + + /** + * @brief Construct a new BenchmarkOptions object. + */ + BenchmarkOptions(int argc, char *argv[]) : Options(argc, argv) {} + + /** + * @brief Get the option usage. + */ + void get_option_usage() override { + std::cout << "Usage: " << std::endl; + std::cout << " --num_warm_up : Number of warm up rounds." << std::endl; + std::cout << " --num_loop : Number of loop times to measure the performance." << std::endl; + std::cout << " --minbytes : Lower data size bound to test." << std::endl; + std::cout << " --maxbytes : Upper data size bound to test." << std::endl; + std::cout << " --check_data : Whether check data correctness." << std::endl; + std::cout << " --read : Memory operation type is read." << std::endl; + std::cout << " --write : Memory operation type is write." << std::endl; + std::cout << " --readwrite : Memory operation type is readwrite." << std::endl; + std::cout << " --numthreads ,, : Number of threads in 3 dimenstions to launch." << std::endl; + std::cout << " --help : Print help message." << std::endl; + } + + /** + * @brief Parse the arguments. 
+ */ + virtual void parse_arguments() override { + num_warm_up = get_cmd_line_argument_int("--num_warm_up", 0); + num_loop = get_cmd_line_argument_int("--num_loop", 1); + size = get_cmd_line_argument_ulonglong("--size", -1); + min_size = get_cmd_line_argument_int("--minbytes", 4 * 1024); + max_size = + get_cmd_line_argument_ulonglong("--maxbytes", static_cast(1LL * 1024 * 1024 * 1024)); + check_data = get_cmd_line_argument_bool("--check"); + if (get_cmd_line_argument_bool("--read")) { + mem_type = Memtype::Read; + } + if (get_cmd_line_argument_bool("--write")) { + mem_type = Memtype::Write; + } + if (get_cmd_line_argument_bool("--readwrite")) { + mem_type = Memtype::ReadWrite; + } + num_threads = get_cmd_line_argument_uint3("--numthreads", {256, 1, 1}); + } +}; diff --git a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.cpp b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.cpp new file mode 100644 index 000000000..75a7f7141 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.cpp @@ -0,0 +1,317 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#include +#include +#include +#include + +#include "GPUMemRwBw.h" + +/* + * @brief Start benchmark. + */ +void GPUMemRwBw::Run() { + // Create GPU pipeline and device objects. + CreatePipeline(); + // Prepare data and buffers. + PrepareDataAndBuffer(this->m_num_elements); + // Load shaders and root signatures. + LoadAssets(); + // Start benchmark. + double time_ms = MemReadWriteBench(this->m_num_elements, opts->num_loop, opts->num_warm_up); + double bw = this->m_num_elements * sizeof(float) * opts->num_loop / time_ms / 1e6; + // Output benchmark result. 
+ std::string mode = MemtypeString[static_cast(opts->mem_type)]; + cout << "GPUMemBw: " << mode << " " << opts->size << " " << bw << " GB/s" << endl; +} + +/** + * @brief Allocate resouce on both CPU side and GPU side and construct a array of buffers with given length. + * @param numElement the length of data array. + + */ +void GPUMemRwBw::PrepareDataAndBuffer(SIZE_T numElement) { + // Prepare CPU side data. + std::vector dataA(numElement); + for (SIZE_T i = 0; i < numElement; i++) { + dataA[i] = i % 256; + } + // Allocate resources on GPU side to take those data. + UINT64 byteSize = dataA.size() * sizeof(float); + if (opts->mem_type == Memtype::Write || opts->mem_type == Memtype::ReadWrite) { + m_inputBuffer = + CreateDefaultBuffer(m_device.Get(), m_commandList.Get(), dataA.data(), byteSize, m_uploadBuffer); + } + // Allocate upload buffer to upload data from CPU to GPU. + ThrowIfFailed(m_device->CreateCommittedResource( + get_rvalue_ptr(CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT)), D3D12_HEAP_FLAG_NONE, + get_rvalue_ptr(CD3DX12_RESOURCE_DESC::Buffer(byteSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS)), + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, nullptr, IID_PPV_ARGS(&m_outputBuffer))); + // Allocate readback buffer if needed. + if (opts->check_data && opts->mem_type != Memtype::Read) { + // Allocate readback buffer to check result correctness + ThrowIfFailed(m_device->CreateCommittedResource( + get_rvalue_ptr(CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_READBACK)), D3D12_HEAP_FLAG_NONE, + get_rvalue_ptr(CD3DX12_RESOURCE_DESC::Buffer(byteSize)), D3D12_RESOURCE_STATE_COPY_DEST, nullptr, + IID_PPV_ARGS(&m_readbackBuffer))); + } + // Prepare the parameter buffer of shader. 
+ UINT8 *pCBDataBegin; + CD3DX12_HEAP_PROPERTIES heapProperties(D3D12_HEAP_TYPE_UPLOAD); + CD3DX12_RESOURCE_DESC bufferDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeof(ParameterBuffer)); + ThrowIfFailed(m_device->CreateCommittedResource(&heapProperties, D3D12_HEAP_FLAG_NONE, &bufferDesc, + D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, + IID_PPV_ARGS(&m_constantBuffer))); + // Fill the constant buffer to pass parameters to GPU. + ParameterBuffer param; + // Calculate total number of threads. + SIZE_T totalThreadNum = 1LL * (m_num_dispatch.x * m_num_dispatch.y * m_num_dispatch.z) * + (m_num_thread.x * m_num_thread.y * m_num_thread.z); + param.numLoop = numElement / totalThreadNum; + param.numThread = m_num_thread; + // Upload constant buffer. + param.numDispatch = m_num_dispatch; + ThrowIfFailed(m_constantBuffer->Map(0, nullptr, reinterpret_cast(&pCBDataBegin))); + memcpy(pCBDataBegin, ¶m, sizeof(param)); + m_constantBuffer->Unmap(0, nullptr); + // Commit resource allocation command list. + ExecuteWaitForCommandQueue(); +} + +/** + * @brief Check result correctness. + * @param numElement the length of data array. + * @return true if result is correct. + */ +bool GPUMemRwBw::CheckData(SIZE_T numElement) { + // Readback result to check correctness. + m_commandList->ResourceBarrier( + 1, get_rvalue_ptr(CD3DX12_RESOURCE_BARRIER::Transition(m_outputBuffer.Get(), D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_COPY_SOURCE))); + m_commandList->CopyResource(m_readbackBuffer.Get(), m_outputBuffer.Get()); + m_commandList->ResourceBarrier( + 1, get_rvalue_ptr(CD3DX12_RESOURCE_BARRIER::Transition(m_outputBuffer.Get(), D3D12_RESOURCE_STATE_COPY_SOURCE, + D3D12_RESOURCE_STATE_COMMON))); + // Execute copy back and sync. + ExecuteWaitForCommandQueue(); + // Access from CPU. 
+ float *mappedData = nullptr; + ThrowIfFailed(m_readbackBuffer->Map(0, nullptr, reinterpret_cast(&mappedData))); + for (int i = 0; i < numElement; ++i) { + if ((int)mappedData[i] != i % 256) { + cout << "Error: check data failed - index " << i << " should be " << i % 256 << " but got " + << (int)mappedData[i] << endl; + break; + } + } + m_readbackBuffer->Unmap(0, nullptr); + return true; +} + +/** + * @brief Memory read write benchmark. + * @param numElem the length of data array. + * @return double the time elapsed in ms. + */ +double GPUMemRwBw::MemReadWriteBench(SIZE_T numElem, int loops, int numWarmUp) { + // Start test. + m_gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute); + for (int i = 0; i < loops + numWarmUp; i++) { + if (i == numWarmUp) { + // Start timestamp. + m_gpuTimer.start(m_commandList.Get(), 0); + } + UInt3 dispatch = m_num_dispatch; + m_commandList->Dispatch(dispatch.x, dispatch.y, dispatch.z); + } + // Stop timestamp. + m_gpuTimer.stop(m_commandList.Get(), 0); + m_gpuTimer.resolveQueryToCPU(m_commandList.Get(), 0); + + // Close, execute (and optionally reset) the command list, and also to use a fence to wait for the command queue. + ExecuteWaitForCommandQueue(); + + // Get time in ms. + double timeInMs = m_gpuTimer.getElapsedMsByTimestampPair(0); + + if (opts->check_data && opts->mem_type != Memtype::Read) { + CheckData(numElem); + } + return timeInMs; +} + +/** + * @brief Create pipeline including + * create device object, command list, command queue + * and synchronization objects. + */ +void GPUMemRwBw::CreatePipeline() { + UINT dxgiFactoryFlags = 0; +#if _DEBUG + // Enable the debug layer (requires the Graphics Tools "optional feature"). + // NOTE: Enabling the debug layer after device creation will invalidate the active device. + { + ComPtr debugController; + if (SUCCEEDED(D3D12GetDebugInterface(IID_PPV_ARGS(&debugController)))) { + debugController->EnableDebugLayer(); + // Enable additional debug layers. 
+ dxgiFactoryFlags |= DXGI_CREATE_FACTORY_DEBUG; + } + } +#endif + ComPtr factory; + ThrowIfFailed(CreateDXGIFactory2(dxgiFactoryFlags, IID_PPV_ARGS(&factory))); + ComPtr hardwareAdapter; + GetHardwareAdapter(factory.Get(), &hardwareAdapter); + ThrowIfFailed(D3D12CreateDevice(hardwareAdapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&m_device))); + D3D12_COMMAND_QUEUE_DESC cqd3 = {}; + cqd3.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; + ThrowIfFailed(m_device->CreateCommandQueue(&cqd3, IID_PPV_ARGS(&m_commandQueue))); + ThrowIfFailed(m_device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&m_commandAllocator))); + // Create the command list. + ThrowIfFailed(m_device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, m_commandAllocator.Get(), nullptr, + IID_PPV_ARGS(&m_commandList))); + // Create synchronization objects. + ThrowIfFailed(m_device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&m_fence))); + m_fenceValue = 1; + // Create an event handle to use for GPU synchronization. + m_eventHandle = CreateEvent(0, false, false, 0); +} + +/** + * @brief Setup GPU pipeline resource including creating root signature, pipeline state and compile shader. + * @throw runtime_error if the root signature cannot be serialized. + */ +void GPUMemRwBw::LoadAssets() { + // Prepare root signature, root parameter can be a table, root descriptor or root constants. + const int nParamter = 3; + CD3DX12_ROOT_PARAMETER slotRootParameter[nParamter]; + // Bind the SRV, CBV and UAV as root descriptors to the root parameters. + slotRootParameter[0].InitAsShaderResourceView(0); + slotRootParameter[1].InitAsConstantBufferView(0); + slotRootParameter[2].InitAsUnorderedAccessView(0); + // Create the root signature. + // A root signature is an array of root parameters.
+ CD3DX12_ROOT_SIGNATURE_DESC rootSigDesc(nParamter, slotRootParameter, 0, nullptr, D3D12_ROOT_SIGNATURE_FLAG_NONE); + ComPtr<ID3DBlob> serializedRootSig = nullptr; + ComPtr<ID3DBlob> errorBlob = nullptr; + HRESULT hr = D3D12SerializeRootSignature(&rootSigDesc, D3D_ROOT_SIGNATURE_VERSION_1, + serializedRootSig.GetAddressOf(), errorBlob.GetAddressOf()); + if (hr != S_OK || errorBlob != nullptr) { + // The error blob is optional: only print it when the serializer actually produced one. + if (errorBlob != nullptr) { + std::cout << "Error: " << (char *)errorBlob->GetBufferPointer() << std::endl; + } + throw runtime_error("Error: D3D12SerializeRootSignature failed."); + } + ThrowIfFailed(m_device->CreateRootSignature(0, serializedRootSig->GetBufferPointer(), + serializedRootSig->GetBufferSize(), + IID_PPV_ARGS(m_rootSignature.GetAddressOf()))); + // Define the number of threads per thread group. + // LPCSTR pointer obtained from myString.c_str() is only valid as long as the myString object exists. + std::string x_str = std::to_string(m_num_thread.x); + LPCSTR x_val = x_str.c_str(); + std::string y_str = std::to_string(m_num_thread.y); + LPCSTR y_val = y_str.c_str(); + std::string z_str = std::to_string(m_num_thread.z); + LPCSTR z_val = z_str.c_str(); + D3D_SHADER_MACRO defines[] = { + {"X", x_val}, + {"Y", y_val}, + {"Z", z_val}, + {nullptr, nullptr} // The last entry must be nullptr to indicate the end of the array + }; + // Load and Compile shader according to user specified. + switch (opts->mem_type) { + case Memtype::Read: + m_shader = CompileShader(L"ReadWrite.hlsl", defines, "Read", "cs_5_0"); + break; + case Memtype::Write: + m_shader = CompileShader(L"ReadWrite.hlsl", defines, "Write", "cs_5_0"); + break; + case Memtype::ReadWrite: + m_shader = CompileShader(L"ReadWrite.hlsl", defines, "ReadWrite", "cs_5_0"); + break; + default: + std::cout << "Error: Invalid memory type." << std::endl; + exit(1); + } + // Describe and create the compute pipeline state object (PSO).
+ D3D12_COMPUTE_PIPELINE_STATE_DESC computePsoDesc = {}; + computePsoDesc.pRootSignature = m_rootSignature.Get(); + computePsoDesc.CS = {reinterpret_cast(m_shader->GetBufferPointer()), m_shader->GetBufferSize()}; + computePsoDesc.Flags = D3D12_PIPELINE_STATE_FLAG_NONE; + ThrowIfFailed(m_device->CreateComputePipelineState(&computePsoDesc, IID_PPV_ARGS(&m_PSO))); + + ExecuteWaitForCommandQueue(); + + // Setup root signature for pipeline. + m_commandList->SetPipelineState(m_PSO.Get()); + m_commandList->SetComputeRootSignature(m_rootSignature.Get()); + if (opts->mem_type == Memtype::Write || opts->mem_type == Memtype::ReadWrite) { + m_commandList->SetComputeRootShaderResourceView(0, m_inputBuffer->GetGPUVirtualAddress()); + } + m_commandList->SetComputeRootConstantBufferView(1, m_constantBuffer->GetGPUVirtualAddress()); + m_commandList->SetComputeRootUnorderedAccessView(2, m_outputBuffer->GetGPUVirtualAddress()); +} + +/** + * @brief Create a default buffer and upload data with the upload buffer. + * @param device the GPU device object. + * @param cmdList the GPU command list object. + * @param initData the data that need to upload. + * @param byteSize the size of data that need to upload. + * @param uploadBuffer the upload buffer used to stage the data. + * @return the created default buffer. + */ +Microsoft::WRL::ComPtr +GPUMemRwBw::CreateDefaultBuffer(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, const void *initData, + UINT64 byteSize, Microsoft::WRL::ComPtr &uploadBuffer) { + ComPtr defaultBuffer; + // Create target default buffer. + CD3DX12_HEAP_PROPERTIES DefaultHeap(D3D12_HEAP_TYPE_DEFAULT); + CD3DX12_RESOURCE_DESC defaultResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(byteSize); + ThrowIfFailed(device->CreateCommittedResource(&DefaultHeap, D3D12_HEAP_FLAG_NONE, &defaultResourceDesc, + D3D12_RESOURCE_STATE_COMMON, nullptr, + IID_PPV_ARGS(defaultBuffer.GetAddressOf()))); + // Create a temporary upload buffer to upload data.
+ CD3DX12_HEAP_PROPERTIES UploadHeap(D3D12_HEAP_TYPE_UPLOAD); + CD3DX12_RESOURCE_DESC UploadResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(byteSize); + ThrowIfFailed(device->CreateCommittedResource(&UploadHeap, D3D12_HEAP_FLAG_NONE, &UploadResourceDesc, + D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, + IID_PPV_ARGS(uploadBuffer.GetAddressOf()))); + // Upload data that pass in. + D3D12_SUBRESOURCE_DATA subResourceData = {}; + subResourceData.pData = initData; + subResourceData.RowPitch = byteSize; + subResourceData.SlicePitch = subResourceData.RowPitch; + // Commit copy command list. + CD3DX12_RESOURCE_BARRIER WriteBarrier = CD3DX12_RESOURCE_BARRIER::Transition( + defaultBuffer.Get(), D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_COPY_DEST); + cmdList->ResourceBarrier(1, &WriteBarrier); + UpdateSubresources<1>(cmdList, defaultBuffer.Get(), uploadBuffer.Get(), 0, 0, 1, &subResourceData); + CD3DX12_RESOURCE_BARRIER ReadBarrier = CD3DX12_RESOURCE_BARRIER::Transition( + defaultBuffer.Get(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_GENERIC_READ); + cmdList->ResourceBarrier(1, &ReadBarrier); + return defaultBuffer; +} + +/** + * @brief Execute the commands and wait until command completed. + * @param dwMilliseconds the maximum time to wait for the command queue, in milliseconds. + * @throw runtime_error if the wait times out or fails. + */ +void GPUMemRwBw::ExecuteWaitForCommandQueue(DWORD dwMilliseconds) { + // Close, execute (and optionally reset) the command list, and also to use a fence to wait for the command queue. + ThrowIfFailed(m_commandList->Close()); + ID3D12CommandList *listsToExecute[] = {m_commandList.Get()}; + m_commandQueue->ExecuteCommandLists(ARRAYSIZE(listsToExecute), listsToExecute); + // Signal and increment the fence value. + const UINT64 fenceL = m_fenceValue; + ThrowIfFailed(m_commandQueue->Signal(m_fence.Get(), fenceL)); + m_fenceValue++; + // Wait until command queue is done.
+ if (m_fence->GetCompletedValue() < fenceL) { + ThrowIfFailed(m_fence->SetEventOnCompletion(fenceL, m_eventHandle)); + // The allocator must not be reset while the GPU is still executing, so a timed out or failed wait is fatal. + if (WaitForSingleObject(m_eventHandle, dwMilliseconds) != WAIT_OBJECT_0) { + throw runtime_error("Error: Wait for command queue timed out or failed."); + } + } + // Reset the command allocator and command list. + ID3D12CommandAllocator *activeAllocator = m_commandAllocator.Get(); + ThrowIfFailed(activeAllocator->Reset()); + ThrowIfFailed(m_commandList->Reset(activeAllocator, nullptr)); +} diff --git a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.h b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.h new file mode 100644 index 000000000..59ca86db8 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.h @@ -0,0 +1,174 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#pragma once + +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers. +#endif + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "../directx_third_party/DXSampleHelper.h" +#include "../directx_third_party/d3dx12.h" +#include "../directx_utils/D3D12Timer.h" +#include "BenchmarkOptions.h" + +// linker +#pragma comment(lib, "dxguid.lib") +#pragma comment(lib, "dxgi.lib") +#pragma comment(lib, "d3d12.lib") +#pragma comment(lib, "d3dcompiler.lib") + +#if defined(_DEBUG) +#include +#endif + +using namespace DirectX; +// Note that while ComPtr is used to manage the lifetime of resources on the CPU, +// it has no understanding of the lifetime of resources on the GPU. Apps must account +// for the GPU lifetime of resources to avoid destroying objects that may still be +// referenced by the GPU. +// An example of this can be found in the class method: OnDestroy().
+using Microsoft::WRL::ComPtr; +using namespace std; + +struct ParameterBuffer { + int numLoop; + UInt3 numThread; + UInt3 numDispatch; +}; + +template <typename T> T *get_rvalue_ptr(T &&v) { return &v; } + +class GPUMemRwBw { + public: + /** + * @brief Constructor, initialize the options. + * @param opts, Options for construct. + * The byte size of the data array is taken from opts->size. + */ + GPUMemRwBw(BenchmarkOptions *opts) : opts(opts) { + // The setting of num_thread need to be consistent with the shader file. + m_num_thread = opts->num_threads; + m_num_elements = opts->size / sizeof(float); + uint32_t numThreadGroup = m_num_elements / (m_num_thread.x * m_num_thread.y * m_num_thread.z); + m_num_dispatch = {numThreadGroup, 1, 1}; + } + + /** + * @brief Destructor, close the event handle; the fence and other COM objects are released by ComPtr. + */ + ~GPUMemRwBw() { if (m_eventHandle != nullptr) CloseHandle(m_eventHandle); } + + /** + * @brief Start and run the benchmark. + */ + void Run(); + + /** + * @brief Memory read write benchmark. + * @param numElem the length of data array. + * @param loops the number of dispatch times for measuring the performance. + * @param numWarmUp the number of warm up dispatch times. + * @return double the time elapsed in ms. + */ + double MemReadWriteBench(SIZE_T numElem, int loops, int numWarmUp); + + /** + * @brief Create pipeline including + * create device object, command list, command queue + * and synchronization objects. + */ + void CreatePipeline(); + + /** + * @brief Setup GPU pipeline resource including creating root signature, pipeline state and compile shader. + */ + void LoadAssets(); + + /** + * @brief Allocate resource on both CPU side and GPU side and construct an array of buffers with given length. + * @param numElement the length of data array. + */ + void PrepareDataAndBuffer(SIZE_T numElement); + + /** + * @brief Create a default buffer and upload data with the upload buffer. + * @param device the GPU device object. + * @param cmdList the GPU command list object. + * @param initData the data that need to upload.
+ * @param byteSize the size of data that need to upload. + * @param uploadBuffer the upload buffer used to stage the data. + * @return the created default buffer. + */ + Microsoft::WRL::ComPtr CreateDefaultBuffer(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, + const void *initData, UINT64 byteSize, + Microsoft::WRL::ComPtr &uploadBuffer); + + /** + * @brief Execute the commands and wait until command completed. + * @param dwMilliseconds the maximum time to wait, in milliseconds. + */ + void ExecuteWaitForCommandQueue(DWORD dwMilliseconds = 30000); + + /** + * @brief Check result correctness. + * @param numElement the length of data array. + * @return true if result is correct. + */ + bool CheckData(SIZE_T numElement); + + private: + // Dispatch layout of command. + UInt3 m_num_dispatch; + // Number of elements in data buffer. + uint32_t m_num_elements = 0; + // Number of threads in each group. + UInt3 m_num_thread; + + // Pipeline objects. + ComPtr m_device = nullptr; + ComPtr m_commandAllocator = nullptr; + ComPtr m_commandQueue = nullptr; + ComPtr m_commandList = nullptr; + + // Upload buffer to upload data from CPU to GPU. + ComPtr m_uploadBuffer = nullptr; + // Input buffer to pass data into GPU. + ComPtr m_inputBuffer = nullptr; + // Readback buffer to copy data from GPU to CPU for data check. + ComPtr m_readbackBuffer = nullptr; + // Output buffer. + ComPtr m_outputBuffer = nullptr; + // Constant buffer. + ComPtr m_constantBuffer = nullptr; + + // Root signature of GPU pipeline. + ComPtr m_rootSignature = nullptr; + // Pipeline object to execute. + ComPtr m_PSO = nullptr; + // Compiled shader object that was loaded. + ComPtr m_shader = nullptr; + + // Synchronization objects. + ComPtr m_fence = nullptr; + HANDLE m_eventHandle = nullptr; + UINT64 m_fenceValue = 0; + + // GPU timer. + D3D12::D3D12Timer m_gpuTimer; + + // User options.
+ BenchmarkOptions *opts; +}; diff --git a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj new file mode 100644 index 000000000..80ab02e37 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj @@ -0,0 +1,105 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 16.0 + Win32Proj + {7880ced5-0e93-4003-9f9b-2ed29bc4bd0f} + GPUMemRwBw + 10.0 + + + + Application + true + v143 + Unicode + + + Application + false + v143 + true + Unicode + + + + + + + + + + + + + + + + Level3 + true + _DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + + + + + Level3 + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + + + + + + + + + + + + + + + + + Compute + 4.0 + Compute + 4.0 + Document + false + copy %(Identity) "$(OutDir)" > NUL + $(OutDir)\%(Identity) + false + copy %(Identity) "$(OutDir)" > NUL + $(OutDir)\%(Identity) + + + + + + \ No newline at end of file diff --git a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/Main.cpp b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/Main.cpp new file mode 100644 index 000000000..7901224e7 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/Main.cpp @@ -0,0 +1,24 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +#include +#include + +#include "GPUMemRwBw.h" + +int main(int argc, char *argv[]) { + BenchmarkOptions option(argc, argv); + option.init(); + if (option.size != -1) { + // Run only one size: option.size holds a value other than the -1 sentinel. + GPUMemRwBw benchmark(&option); + benchmark.Run(); + } else { + // Run all sizes from min_size to max_size, doubling the size on each step. + // NOTE(review): a min_size of 0 never grows under doubling - confirm the options reject it. + for (SIZE_T usize = option.min_size; usize <= option.max_size; usize += usize) { + option.size = usize; + GPUMemRwBw benchmark(&option); + benchmark.Run(); + } + } +} diff --git a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/ReadWrite.hlsl b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/ReadWrite.hlsl new file mode 100644 index 000000000..f27ca2ebe --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/ReadWrite.hlsl @@ -0,0 +1,62 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +StructuredBuffer gInputA : register(t0); +RWStructuredBuffer gOutput : register(u0); + +cbuffer ParamBuffer : register(b0) { + int numLoop; + uint3 numThreads; + uint3 numDispatch; +}; + +[numthreads(X, Y, Z)] +void Read(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID, uint3 dispatchId : SV_DispatchThreadID) +{ + uint idStart = dispatchId.x + + dispatchId.y * numDispatch.x * numThreads.x + + dispatchId.z * numDispatch.x * numThreads.x * numDispatch.y * numThreads.y; + + uint start = idStart * numLoop; + uint end = start + numLoop; + for (uint i = start; i < end; i++) + { + float c = gOutput[i]; + if (c == -1) + { + // This branch should never be taken since gOutput is initialized to zero. + // It only exists to keep the compiler from optimizing the read away.
+ gOutput[i] = 0; + } + } +} + +[numthreads(X, Y, Z)] +void Write(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID, uint3 dispatchId : SV_DispatchThreadID) +{ + uint idStart = dispatchId.x + + dispatchId.y * numDispatch.x * numThreads.x + + dispatchId.z * numDispatch.x * numThreads.x * numDispatch.y * numThreads.y; + + uint start = idStart * numLoop; + uint end = start + numLoop; + for (uint i = start; i < end; i++) + { + gOutput[i] = i % 256; + } +} + +[numthreads(X, Y, Z)] +void ReadWrite(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID, uint3 dispatchId : SV_DispatchThreadID) +{ + uint idStart = dispatchId.x + + dispatchId.y * numDispatch.x * numThreads.x + + dispatchId.z * numDispatch.x * numThreads.x * numDispatch.y * numThreads.y; + + uint start = idStart * numLoop; + uint end = start + numLoop; + for (uint i = start; i < end; i++) + { + gOutput[i] = gInputA[i]; + } +} diff --git a/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h b/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h index ce384272a..848688351 100644 --- a/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h +++ b/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h @@ -7,6 +7,12 @@ #include #include +struct UInt3 { + unsigned int x; + unsigned int y; + unsigned int z; +}; + class Options { protected: char **begin; @@ -43,6 +49,51 @@ class Options { return defaults; } + /** + * @brief Split a comma-separated string and convert each token to an unsigned integer. + * @param str the comma-separated string to split. + * @return the converted values, in the order they appear in 'str'. + * @throw std::invalid_argument if a token cannot be converted.
+ */ + std::vector splitAndConvertToInt(const std::string &str) { + std::vector result; + std::stringstream ss(str); + std::string token; + + while (std::getline(ss, token, ',')) { + try { + result.push_back(std::stoul(token)); + } catch (std::invalid_argument &e) { + throw std::invalid_argument("Invalid argument: " + token + e.what()); + } + } + return result; + } + + /** + * @brief Get the UInt3 type value of cmd line argument, given as three comma-separated unsigned integers. + * @param option the cmd line argument. + * @param defaults the default value. + * @return UInt3 the UInt3 type value of cmd line argument 'option', or 'defaults' when the option has no value. + */ + UInt3 get_cmd_line_argument_uint3(const std::string &option, const UInt3 &defaults) { + if (char *value = get_cmd_option(option)) { + try { + std::vector values = splitAndConvertToInt(value); + if (values.size() != 3) { + std::cout << "Error: Invalid argument - " << option << " should be unsigned int3" << '\n'; + exit(1); + } + return {values[0], values[1], values[2]}; + + } catch (const std::exception &e) { + std::cout << "Error: Invalid argument - " << option << " should be unsigned int3" << e.what() << '\n'; + exit(1); + } + } + return defaults; + } + /** * @brief Get the string type value of cmd line argument. * @param option the cmd line argument. From f25991370770c9f55ea8cf445e01301db61679d6 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Thu, 29 Jun 2023 11:38:01 +0000 Subject: [PATCH 14/33] Benchmarks: Add benchmark - Add source code of DirectxGPUCopy microbenchmark (#486) **Description** Add source code of DirectxGPUCopy microbenchmark.
--- .../BenchmarkOptions.h | 3 +- .../BenchmarkOptions.h | 69 +++++ .../GPUCopyBw.cpp | 241 ++++++++++++++++++ .../directx_gpu_copy_performance/GPUCopyBw.h | 146 +++++++++++ .../GPUCopyBw.vcxproj | 90 +++++++ .../directx_gpu_copy_performance/Main.cpp | 23 ++ .../micro_benchmarks/directx_utils/Options.h | 26 +- 7 files changed, 592 insertions(+), 6 deletions(-) create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/BenchmarkOptions.h create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.cpp create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.h create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.vcxproj create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/Main.cpp diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h index c5207bb4f..8ba9fb913 100644 --- a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h +++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h @@ -37,9 +37,8 @@ class BenchmarkOptions : public Options { * @brief Parse the arguments. 
*/ virtual void parse_arguments() { - num_loops = get_cmd_line_argument_int("--num_loops", 10); - num_warm_up = get_cmd_line_argument_int("--num_loops", 0); + num_warm_up = get_cmd_line_argument_int("--num_warm_up", 0); m = get_cmd_line_argument_int("--m", 16 * 256); n = get_cmd_line_argument_int("--n", 16 * 256); k = get_cmd_line_argument_int("--k", 16 * 256); diff --git a/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/BenchmarkOptions.h b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/BenchmarkOptions.h new file mode 100644 index 000000000..aa0493cd4 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/BenchmarkOptions.h @@ -0,0 +1,69 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include + +#include "../directx_utils/Options.h" + +class BenchmarkOptions : public Options { + + public: + // Size of data for GPU copy. + unsigned long long size; + // Run size from min_size to max_size for GPU copy. + unsigned long long min_size = 0; + // Run size from min_size to max_size for GPU copy. + unsigned long long max_size = 0; + // Number of warm up copy times to run. + int num_warm_up = 0; + // Number of copy times to run. + int num_loops = 0; + // Host-to-device copy mode. + bool htod_enabled = false; + // device-to-host copy mode. + bool dtoh_enabled = false; + // Whether check data after copy. + bool check_data = false; + + /** + * @brief Construct a new BenchmarkOptions object. + */ + BenchmarkOptions(int argc, char *argv[]) : Options(argc, argv) {} + + /** + * @brief Parse the arguments. 
+ */ + virtual void parse_arguments() override { + size = get_cmd_line_argument_int("--size", -1); + num_warm_up = get_cmd_line_argument_int("--warm_up", 20); + num_loops = get_cmd_line_argument_int("--num_loops", 100000); + min_size = get_cmd_line_argument_int("--minbytes", 64); + max_size = get_cmd_line_argument_ulonglong("--maxbytes", 8 * 1024 * 1024); + htod_enabled = get_cmd_line_argument_bool("--htod"); + dtoh_enabled = get_cmd_line_argument_bool("--dtoh"); + check_data = get_cmd_line_argument_bool("--check"); + if (!htod_enabled && !dtoh_enabled) { + std::cerr << "Error: Please specify copy mode!" << std::endl; + exit(-1); + } + } + + /** + * @brief Get the option usage. + */ + void get_option_usage() override { + std::cout << "Usage: " << std::endl; + std::cout << " --size Size of data for GPU copy." << std::endl; + std::cout << " --warm_up Number of warm up copy times to run." << std::endl; + std::cout << " --num_loops Number of copy times to run." << std::endl; + std::cout << " --minbytes Run size from min_size to max_size for GPU copy." << std::endl; + std::cout << " --maxbytes Run size from min_size to max_size for GPU copy." << std::endl; + std::cout << " --htod Host-to-device copy mode." << std::endl; + std::cout << " --dtoh Device-to-host copy mode." << std::endl; + std::cout << " --check Whether check data after copy." << std::endl; + } +}; diff --git a/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.cpp b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.cpp new file mode 100644 index 000000000..c95c79f3f --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.cpp @@ -0,0 +1,241 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#include +#include +#include + +#include "GPUCopyBw.h" + +/** + * @brief Run the benchmark. 
+ */ +void GPUCopyBw::Run() { + CreatePipeline(); + double time_ms = CopyResourceBench(opts->size, opts->num_loops, opts->num_warm_up); + // Bytes per millisecond divided by 1e6 gives GB/s. + double bw = opts->size * opts->num_loops / time_ms / 1e6; + // CopyResourceBench runs the htod copy whenever htod is enabled, so report the mode that was actually measured. + string mode = opts->htod_enabled ? "htod" : "dtoh"; + cout << mode << ": " << opts->size << "B " << bw << " GB/s" << endl; +} + +/** + * @brief Allocate gpu resources: the default buffer, the upload buffer and, in dtoh mode, the readback buffer. + * @param uSize the size in bytes of each buffer. + */ +void GPUCopyBw::InitializeBuffer(SIZE_T uSize) { + m_defaultBufferDesc = CD3DX12_RESOURCE_DESC::Buffer(uSize); + + // The output buffer (created below) is on a default heap, so only the GPU can access it. + auto defaultHeapProperties = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT); + ThrowIfFailed(m_device->CreateCommittedResource(&defaultHeapProperties, D3D12_HEAP_FLAG_NONE, &m_defaultBufferDesc, + D3D12_RESOURCE_STATE_COPY_DEST, nullptr, + IID_PPV_ARGS(&m_defaultBuffer))); + + // Create upload buffer to upload data to GPU. + auto uploadHeapProperties = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_UPLOAD); + ThrowIfFailed(m_device->CreateCommittedResource(&uploadHeapProperties, D3D12_HEAP_FLAG_NONE, &m_defaultBufferDesc, + D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, + IID_PPV_ARGS(&m_uploadBuffer))); + + // Create read back buffer if dtoh mode. + if (opts->dtoh_enabled) { + auto readbackHeapProperties = CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_READBACK); + ThrowIfFailed(m_device->CreateCommittedResource(&readbackHeapProperties, D3D12_HEAP_FLAG_NONE, + &m_defaultBufferDesc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr, + IID_PPV_ARGS(&m_readbackBuffer))); + } +} + +/** + * @brief Allocate data on CPU side to prepare upload. + * @param byteSize the size of data to be uploaded.
+ */ +void GPUCopyBw::PrepareData(SIZE_T byteSize) { + m_pDataBegin = std::make_unique<uint8_t[]>(byteSize); + constexpr int uint8_mod = 256; + // Index with SIZE_T so that buffers larger than INT_MAX bytes do not overflow the loop counter. + for (SIZE_T j = 0; j < byteSize; j++) { + m_pDataBegin[j] = static_cast<uint8_t>(j % uint8_mod); + } +} + +/** + * @brief Check result correctness. + * @param byteSize the size of data to be checked. + * @param pData the byte array that the read back data is expected to equal. + * @return true if the result is correct; always true when not in dtoh mode because nothing is read back. + */ +bool GPUCopyBw::CheckData(SIZE_T byteSize, const uint8_t *pData) { + if (opts->dtoh_enabled) { + D3D12_RANGE readbackBufferRange{0, byteSize}; + uint8_t *pReadbackBufferData{}; + + // Read back data from GPU. + ThrowIfFailed(m_readbackBuffer->Map(0, &readbackBufferRange, reinterpret_cast<void **>(&pReadbackBufferData))); + // Check result correctness. + const bool isCorrect = memcmp(pData, pReadbackBufferData, byteSize) == 0; + // Always unmap, even on mismatch, so the readback buffer is not left mapped. + D3D12_RANGE emptyRange{0, 0}; + m_readbackBuffer->Unmap(0, &emptyRange); + return isCorrect; + } + return true; +} + +/** + * @brief GPU copy benchmark. + * @param size the size of data to copy. + * @param loops the number of copy times to measure the performance. + * @param warm_up the number of warm up copy times. + * @return double the time elapsed in ms. + */ +double GPUCopyBw::CopyResourceBench(SIZE_T size, int loops, int warm_up) { + // Prepare CPU side data buffer. + PrepareData(size); + // Prepare GPU resources and buffers. + InitializeBuffer(size); + // Set data into source buffer. + PrepareSourceBufferData(m_pDataBegin.get(), size); + + // Run the copy command. + gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::copy); + for (int i = 0; i < loops + warm_up; i++) { + if (i == warm_up) { + // Start timestamp. + this->gpuTimer.start(m_commandList.Get(), 0); + } + if (opts->htod_enabled) { + CopyResourceFromUploadToDefault(); + } else if (opts->dtoh_enabled) { + CopyResourceFromDefaultToReadback(); + } + } + // Stop timestamp.
+ this->gpuTimer.stop(m_commandList.Get(), 0); + this->gpuTimer.resolveQueryToCPU(m_commandList.Get(), 0); + + // Close, execute (and optionally reset) the command list, and also to use a fence to wait for the command queue. + this->ExecuteWaitForCopyQueue(); + + // Check if result is correctly copied. + // The code below assumes that the GPU wrote FLOATs to the buffer. + if (opts->check_data) { + bool correctness = CheckData(size, m_pDataBegin.get()); + if (!correctness) { + std::cout << "Error: Result is not correct!" << std::endl; + } + } + + return this->gpuTimer.getElapsedMsByTimestampPair(0); +} + +/** + * @brief Copy data from CPU side to GPU side. + */ +void GPUCopyBw::CopyResourceFromUploadToDefault() { + m_commandList->CopyResource(m_defaultBuffer.Get(), m_uploadBuffer.Get()); +} + +/** + * @brief Copy data from GPU side to GPU side. + */ +void GPUCopyBw::CopyResourceFromDefaultToDefault() { + m_commandList->CopyResource(m_defaultBuffer.Get(), m_defaultDescBuffer.Get()); +} + +/** + * @brief Copy data from GPU side to CPU side. + */ +void GPUCopyBw::CopyResourceFromDefaultToReadback() { + m_commandList->CopyResource(m_readbackBuffer.Get(), m_defaultBuffer.Get()); +} + +/** + * @brief Execute the commands and wait until command completed. + */ +void GPUCopyBw::ExecuteWaitForCopyQueue(DWORD dwMilliseconds) { + // Close, execute (and optionally reset) the command list, and also to use a fence to wait for the command queue. + ThrowIfFailed(m_commandList->Close()); + ID3D12CommandList *listsToExecute[] = {m_commandList.Get()}; + m_commandQueue->ExecuteCommandLists(ARRAYSIZE(listsToExecute), listsToExecute); + // Signal and increment the fence value. + const UINT64 fenceL = m_copyFenceValue; + ThrowIfFailed(m_commandQueue->Signal(m_copyFence.Get(), fenceL)); + m_copyFenceValue++; + // Wait until command queue is done. 
+ if (m_copyFence->GetCompletedValue() < fenceL) { + ThrowIfFailed(m_copyFence->SetEventOnCompletion(fenceL, m_copyEventHandle)); + WaitForSingleObject(m_copyEventHandle, dwMilliseconds); + } + // Reset the command allocator and command list. + ID3D12CommandAllocator *activeAllocator = m_commandAllocator.Get(); + ThrowIfFailed(activeAllocator->Reset()); + ThrowIfFailed(m_commandList->Reset(activeAllocator, nullptr)); +} + +/** + * @brief Prepare data of the source buffer of benchmark. + * @param pData the data that should upload. + * @param byteSize the size of data. + */ +void GPUCopyBw::PrepareSourceBufferData(const void *pData, SIZE_T byteSize) { + // Upload data from CPU to upload buffer. + void *p; + ThrowIfFailed(m_uploadBuffer->Map(0, nullptr, &p)); + memcpy(p, pData, byteSize); + m_uploadBuffer->Unmap(0, nullptr); + + if (opts->dtoh_enabled) { + // Upload data from upload to default buffer. + CopyResourceFromUploadToDefault(); + D3D12_RESOURCE_BARRIER outputBufferResourceBarrier{CD3DX12_RESOURCE_BARRIER::Transition( + m_defaultBuffer.Get(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COPY_SOURCE)}; + m_commandList->ResourceBarrier(1, &outputBufferResourceBarrier); + ExecuteWaitForCopyQueue(); + } +} + +/** + * @brief Create pipeline including + * create device object, command list, command queue + * and synchronization objects. + */ +void GPUCopyBw::CreatePipeline() { + UINT dxgiFactoryFlags = 0; + +#if _DEBUG + // Enable the debug layer (requires the Graphics Tools "optional feature"). + // NOTE: Enabling the debug layer after device creation will invalidate the active device. + { + ComPtr debugController; + if (SUCCEEDED(D3D12GetDebugInterface(IID_PPV_ARGS(&debugController)))) { + debugController->EnableDebugLayer(); + + // Enable additional debug layers. 
+ dxgiFactoryFlags |= DXGI_CREATE_FACTORY_DEBUG; + } + } +#endif + + ComPtr factory; + ThrowIfFailed(CreateDXGIFactory2(dxgiFactoryFlags, IID_PPV_ARGS(&factory))); + + ComPtr hardwareAdapter; + GetHardwareAdapter(factory.Get(), &hardwareAdapter); + + ThrowIfFailed(D3D12CreateDevice(hardwareAdapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&m_device))); + + D3D12_COMMAND_QUEUE_DESC cqd3 = {}; + cqd3.Type = D3D12_COMMAND_LIST_TYPE_COPY; + ThrowIfFailed(m_device->CreateCommandQueue(&cqd3, IID_PPV_ARGS(&m_commandQueue))); + + ThrowIfFailed(m_device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COPY, IID_PPV_ARGS(&m_commandAllocator))); + + // Create the command list. + ThrowIfFailed(m_device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_COPY, m_commandAllocator.Get(), nullptr, + IID_PPV_ARGS(&m_commandList))); + + ThrowIfFailed(m_device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&m_copyFence))); + m_copyFenceValue = 1; + // Create an event handle to use for GPU synchronization. + m_copyEventHandle = CreateEvent(0, false, false, 0); +} diff --git a/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.h b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.h new file mode 100644 index 000000000..945aa2092 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.h @@ -0,0 +1,146 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#pragma once + +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers. 
+#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +// linker +#pragma comment(lib, "dxguid.lib") +#pragma comment(lib, "dxgi.lib") +#pragma comment(lib, "d3d12.lib") +#pragma comment(lib, "d3dcompiler.lib") + +#if defined(_DEBUG) +#include +#endif + +#include "../directx_third_party/DXSampleHelper.h" +#include "../directx_third_party/d3dx12.h" +#include "../directx_utils/D3D12Timer.h" +#include "BenchmarkOptions.h" + +using namespace DirectX; +// Note that while ComPtr is used to manage the lifetime of resources on the CPU, +// it has no understanding of the lifetime of resources on the GPU. Apps must account +// for the GPU lifetime of resources to avoid destroying objects that may still be +// referenced by the GPU. +// An example of this can be found in the class method: OnDestroy(). +using Microsoft::WRL::ComPtr; +using namespace std; + +class GPUCopyBw { + public: + GPUCopyBw(BenchmarkOptions *opts) : opts(opts) {} + ~GPUCopyBw() { if (m_copyEventHandle != nullptr) CloseHandle(m_copyEventHandle); } // Close the event handle; the fence is a COM object released by ComPtr. + + /** + * @brief Run the benchmark. + */ + void Run(); + + /** + * @brief GPU copy benchmark. + * @param size the size of data to copy. + * @param loops the number of copy times to measure the performance. + * @param warm_up the number of warm up copy times. + * @return double the time elapsed in ms. + */ + double CopyResourceBench(SIZE_T size, int loops, int warm_up); + + /** + * @brief Create pipeline including + * create device object, command list, command queue + * and synchronization objects. + */ + void CreatePipeline(); + + /** + * @brief Allocate data on CPU side to prepare upload. + * @param byteSize the size of data to be uploaded. + */ + void PrepareData(SIZE_T byteSize); + + /** + * @brief Allocate gpu resources: the default buffer, the upload buffer and, in dtoh mode, the readback buffer. + * @param uSize the size in bytes of each buffer. + */ + void InitializeBuffer(SIZE_T uSize); + + /** + * @brief Prepare data of the source buffer of benchmark. + * @param pData the data that should upload.
+ * @param byteSize the size of data. + */ + void PrepareSourceBufferData(const void *pData, SIZE_T byteSize); + + /** + * @brief Copy data from CPU side to GPU side. + */ + void CopyResourceFromUploadToDefault(); + + /** + * @brief Copy data from GPU side to CPU side. + */ + void CopyResourceFromDefaultToReadback(); + + /** + * @brief Copy data from GPU side to GPU side. + */ + void CopyResourceFromDefaultToDefault(); + + /** + * @brief Execute the commands and wait until command completed. + */ + void ExecuteWaitForCopyQueue(DWORD dwMilliseconds = 60000); + + /** + * @brief Check result correctness. + * @param byteSize the size of data to be checked. + * @param pData the byte array that expect to be. + * @return true result is correct. + */ + bool CheckData(SIZE_T byteSize, const uint8_t *pData); + + private: + // Pipeline objects. + ComPtr m_device = nullptr; + ComPtr m_commandAllocator = nullptr; + ComPtr m_commandQueue = nullptr; + ComPtr m_commandList = nullptr; + + // App resources. + // Pointer of CPU size resource. + std::unique_ptr m_pDataBegin = nullptr; + // GPU side buffer. + ComPtr m_defaultBuffer = nullptr; + // GPU side buffer as destination if in dtod mode. + ComPtr m_defaultDescBuffer = nullptr; + // Upload buffer to upload data from CPU to GPU. + ComPtr m_uploadBuffer = nullptr; + // Read back buffer to check data correctness. + ComPtr m_readbackBuffer = nullptr; + // Default buffer descriptor. + D3D12_RESOURCE_DESC m_defaultBufferDesc; + + // Synchronization objects. + ComPtr m_copyFence = nullptr; + HANDLE m_copyEventHandle = nullptr; + UINT64 m_copyFenceValue = 0; + + // GPU timer. + D3D12::D3D12Timer gpuTimer; + + // Options. 
+ BenchmarkOptions *opts; +}; diff --git a/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.vcxproj b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.vcxproj new file mode 100644 index 000000000..3be231342 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.vcxproj @@ -0,0 +1,90 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 16.0 + Win32Proj + {f561fb23-0ec2-492f-9c8d-9555a0f6a4f6} + GPUCopyBw + 10.0 + + + + Application + true + v143 + Unicode + + + Application + false + v143 + true + Unicode + + + + + + + + + + + + + + + + Level3 + true + _DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + + + + + Level3 + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + + + Console + true + true + true + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/Main.cpp b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/Main.cpp new file mode 100644 index 000000000..ac12597c5 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/Main.cpp @@ -0,0 +1,23 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +#include +#include + +#include "GPUCopyBw.h" + +int main(int argc, char *argv[]) { + BenchmarkOptions option(argc, argv); + option.init(); + if (option.size != -1) { + // Run only one size + GPUCopyBw benchmark(&option); + benchmark.Run(); + } else { + // Run all sizes + for (SIZE_T usize = option.min_size; usize <= option.max_size; usize += usize) { + GPUCopyBw benchmark(&option); + benchmark.Run(); + } + } +} diff --git a/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h b/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h index 848688351..edb34bcee 100644 --- a/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h +++ b/superbench/benchmarks/micro_benchmarks/directx_utils/Options.h @@ -6,6 +6,7 @@ #include #include #include +#include struct UInt3 { unsigned int x; @@ -55,6 +56,23 @@ class Options { * @param defaults the default value. * @return unsigned long long the unsigned long long type value of cmd line argument 'option'. */ + unsigned long long get_cmd_line_argument_ulonglong(const std::string &option, unsigned long long defaults) { + if (char *value = get_cmd_option(option)) { + try { + return std::stoull(value); + } catch (const std::exception &e) { + std::cout << "Error: Invalid argument - " << option << " should be unsigned long long" << e.what() + << '\n'; + } + } + return defaults; + } + + /** + * @brief Split the string by ',' and convert to unsigned int. + * @param str the string to be split. + * @return std::vector the vector of unsigned int. + */ std::vector splitAndConvertToInt(const std::string &str) { std::vector result; std::stringstream ss(str); @@ -71,10 +89,10 @@ class Options { } /** - * @brief Get the unsigned int type value of cmd line argument. + * @brief Get the unsigned int 3 type value of cmd line argument. * @param option the cmd line argument. * @param defaults the default value. - * @return unsigned int the unsigned int type value of cmd line argument 'option'. 
+ * @return unsigned int the unsigned int 3 type value of cmd line argument 'option'. */ UInt3 get_cmd_line_argument_uint3(const std::string &option, const UInt3 &defaults) { if (char *value = get_cmd_option(option)) { @@ -128,12 +146,12 @@ class Options { /** * @brief Get the option usage. */ - virtual void get_option_usage(){}; + virtual void get_option_usage() = 0; /** * @brief Parse the arguments. */ - virtual void parse_arguments(){}; + virtual void parse_arguments() = 0; public: /** From 7184bdd1ede7037007b9bbf54d2103952191dc57 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Fri, 30 Jun 2023 11:22:46 +0800 Subject: [PATCH 15/33] Benchmarks - Update result parsing in tensorrt inference (#541) * Update result parsing for newer tensorrt versions * Update arguments when load torchvision models --- setup.py | 1 + .../micro_benchmarks/_export_torch_to_onnx.py | 5 +- .../tensorrt_inference_performance.py | 11 +++ .../test_tensorrt_inference_performance.py | 16 +++- ...inference.log => tensorrt_inference.1.log} | 0 tests/data/tensorrt_inference.2.log | 80 +++++++++++++++++++ 6 files changed, 108 insertions(+), 5 deletions(-) rename tests/data/{tensorrt_inference.log => tensorrt_inference.1.log} (100%) create mode 100644 tests/data/tensorrt_inference.2.log diff --git a/setup.py b/setup.py index af65fc690..23c796833 100644 --- a/setup.py +++ b/setup.py @@ -166,6 +166,7 @@ def run(self): 'numpy>=1.19.2', 'omegaconf==2.0.6', 'openpyxl>=3.0.7', + 'packaging>=21.0', 'pandas>=1.1.5', 'pssh @ git+https://github.com/lilydjwg/pssh.git@v2.3.4', 'pyyaml>=5.3', diff --git a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py index cd7c8b134..1e37b793d 100644 --- a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py +++ b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py @@ -5,6 +5,7 @@ from pathlib import Path +from packaging import version import torch.hub import torch.onnx 
import torchvision.models @@ -129,7 +130,9 @@ def export_torchvision_model(self, model_name, batch_size=1): if not self.check_torchvision_model(model_name): return '' file_name = str(self._onnx_model_path / (model_name + '.onnx')) - model = getattr(torchvision.models, model_name)(pretrained=False).eval().cuda() + # the parameter 'pretrained' is deprecated since 0.13 in torchvision + args = {'pretrained': False} if version.parse(torchvision.__version__) < version.parse('0.13') else {} + model = getattr(torchvision.models, model_name)(**args).eval().cuda() dummy_input = torch.randn((batch_size, 3, 224, 224), device='cuda') torch.onnx.export( model, diff --git a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py index a7a4aa17b..306aa2de8 100644 --- a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py +++ b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py @@ -145,6 +145,17 @@ def _process_raw_result(self, cmd_idx, raw_output): self._result.add_result(f'{model}_host_time_{tag}', float(lats[0])) self._result.add_result(f'{model}_end_to_end_time_{tag}', float(lats[1])) success = True + if '[I] Latency:' in line or '[I] GPU Compute Time:' in line: + tm = 'gpu' if '[I] GPU Compute Time:' in line else 'host' + self._result.add_result( + f'{model}_{tm}_time_mean', + float(re.findall(r'mean = (\d+\.\d+) ms', line)[0]), + ) + self._result.add_result( + f'{model}_{tm}_time_99', + float(re.findall(r'\(99\%\) = (\d+\.\d+) ms', line)[0]), + ) + success = True except BaseException as e: self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE) logger.error( diff --git a/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py b/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py index 43277b7a3..301a4a08d 100644 --- 
a/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py +++ b/tests/benchmarks/micro_benchmarks/test_tensorrt_inference_performance.py @@ -116,16 +116,17 @@ def test_tensorrt_inference_params(self): len(test_case.get('pytorch_models', benchmark._pytorch_models)), len(benchmark._commands) ) - @decorator.load_data('tests/data/tensorrt_inference.log') - def test_tensorrt_inference_result_parsing(self, test_raw_log): + @decorator.load_data('tests/data/tensorrt_inference.1.log') + @decorator.load_data('tests/data/tensorrt_inference.2.log') + def test_tensorrt_inference_result_parsing(self, test_raw_log_1, test_raw_log_2): """Test tensorrt-inference benchmark result parsing.""" (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA) benchmark = benchmark_cls(self.benchmark_name, parameters='') benchmark._args = SimpleNamespace(pytorch_models=['model_0', 'model_1'], log_raw_data=False) benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1) - # Positive case - valid raw output - self.assertTrue(benchmark._process_raw_result(0, test_raw_log)) + # Positive case 1 - valid raw output + self.assertTrue(benchmark._process_raw_result(0, test_raw_log_1)) self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code) self.assertEqual(6 + benchmark.default_metric_count, len(benchmark.result)) @@ -134,5 +135,12 @@ def test_tensorrt_inference_result_parsing(self, test_raw_log): self.assertEqual(0.6, benchmark.result[f'model_0_host_time_{tag}'][0]) self.assertEqual(1.0, benchmark.result[f'model_0_end_to_end_time_{tag}'][0]) + # Positive case 2 - valid raw output + self.assertTrue(benchmark._process_raw_result(0, test_raw_log_2)) + self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code) + for tag in ['mean', '99']: + self.assertEqual(1.5, benchmark.result[f'model_0_gpu_time_{tag}'][1]) + self.assertEqual(2.0, benchmark.result[f'model_0_host_time_{tag}'][1]) 
+ # Negative case - invalid raw output self.assertFalse(benchmark._process_raw_result(1, 'Invalid raw output')) diff --git a/tests/data/tensorrt_inference.log b/tests/data/tensorrt_inference.1.log similarity index 100% rename from tests/data/tensorrt_inference.log rename to tests/data/tensorrt_inference.1.log diff --git a/tests/data/tensorrt_inference.2.log b/tests/data/tensorrt_inference.2.log new file mode 100644 index 000000000..b07529e94 --- /dev/null +++ b/tests/data/tensorrt_inference.2.log @@ -0,0 +1,80 @@ +[06/29/2023-08:24:55] [I] === Model Options === +[06/29/2023-08:24:55] [I] Format: ONNX +[06/29/2023-08:24:55] [I] Model: /root/.cache/torch/hub/onnx/resnet50.onnx +[06/29/2023-08:24:55] [I] Output: +[06/29/2023-08:24:55] [I] === Build Options === +[06/29/2023-08:24:55] [I] Max batch: explicit batch +[06/29/2023-08:24:55] [I] Memory Pools: workspace: 8192 MiB, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default +[06/29/2023-08:24:55] [I] minTiming: 1 +[06/29/2023-08:24:55] [I] avgTiming: 8 +[06/29/2023-08:24:55] [I] Precision: FP32+FP16 +[06/29/2023-08:24:55] [I] LayerPrecisions: +[06/29/2023-08:24:55] [I] Calibration: +[06/29/2023-08:24:55] [I] Refit: Disabled +[06/29/2023-08:24:55] [I] Sparsity: Disabled +[06/29/2023-08:24:55] [I] Safe mode: Disabled +[06/29/2023-08:24:55] [I] DirectIO mode: Disabled +[06/29/2023-08:24:55] [I] Restricted mode: Disabled +[06/29/2023-08:24:55] [I] Build only: Disabled +[06/29/2023-08:24:55] [I] Save engine: +[06/29/2023-08:24:55] [I] Load engine: +[06/29/2023-08:24:55] [I] Profiling verbosity: 0 +[06/29/2023-08:24:55] [I] Tactic sources: Using default tactic sources +[06/29/2023-08:24:55] [I] timingCacheMode: local +[06/29/2023-08:24:55] [I] timingCacheFile: +[06/29/2023-08:24:55] [I] Heuristic: Disabled +[06/29/2023-08:24:55] [I] Preview Features: Use default preview flags. 
+[06/29/2023-08:24:55] [I] Input(s)s format: fp32:CHW +[06/29/2023-08:24:55] [I] Output(s)s format: fp32:CHW +[06/29/2023-08:24:55] [I] Input build shape: input=32x3x224x224+32x3x224x224+32x3x224x224 +[06/29/2023-08:24:55] [I] Input calibration shapes: model +[06/29/2023-08:24:55] [I] === System Options === +[06/29/2023-08:24:55] [I] Device: 0 +[06/29/2023-08:24:55] [I] DLACore: +[06/29/2023-08:24:55] [I] Plugins: +[06/29/2023-08:24:55] [I] === Inference Options === +[06/29/2023-08:24:55] [I] Batch: Explicit +[06/29/2023-08:24:55] [I] Input inference shape: input=32x3x224x224 +[06/29/2023-08:24:55] [I] Iterations: 2048 +[06/29/2023-08:24:55] [I] Duration: 3s (+ 200ms warm up) +[06/29/2023-08:24:55] [I] Sleep time: 0ms +[06/29/2023-08:24:55] [I] Idle time: 0ms +[06/29/2023-08:24:55] [I] Streams: 1 +[06/29/2023-08:24:55] [I] ExposeDMA: Disabled +[06/29/2023-08:24:55] [I] Data transfers: Enabled +[06/29/2023-08:24:55] [I] Spin-wait: Disabled +[06/29/2023-08:24:55] [I] Multithreading: Disabled +[06/29/2023-08:24:55] [I] CUDA Graph: Disabled +[06/29/2023-08:24:55] [I] Separate profiling: Disabled +[06/29/2023-08:24:55] [I] Time Deserialize: Disabled +[06/29/2023-08:24:55] [I] Time Refit: Disabled +[06/29/2023-08:24:55] [I] NVTX verbosity: 0 +[06/29/2023-08:24:55] [I] Persistent Cache Ratio: 0 +[06/29/2023-08:24:55] [I] Inputs: +[06/29/2023-08:24:55] [I] === Reporting Options === +[06/29/2023-08:24:55] [I] Verbose: Disabled +[06/29/2023-08:24:55] [I] Averages: 10 inferences +[06/29/2023-08:24:55] [I] Percentiles: 99 +[06/29/2023-08:24:55] [I] Dump refittable layers:Disabled +[06/29/2023-08:24:55] [I] Dump output: Disabled +[06/29/2023-08:24:55] [I] Profile: Disabled +[06/29/2023-08:24:55] [I] Export timing to JSON file: +[06/29/2023-08:24:55] [I] Export output to JSON file: +[06/29/2023-08:24:55] [I] Export profile to JSON file: +[06/29/2023-08:25:38] [I] +[06/29/2023-08:25:38] [I] === Trace details === +[06/29/2023-08:25:38] [I] Trace averages of 10 runs: 
+[06/29/2023-08:25:38] [I] Average on 10 runs - GPU latency: 1.5 ms - Host latency: 2.0 ms (enqueue 0.4 ms) +[06/29/2023-08:25:38] [I] Average on 10 runs - GPU latency: 1.5 ms - Host latency: 2.0 ms (enqueue 0.4 ms) +[06/29/2023-08:25:38] [I] +[06/29/2023-08:25:38] [I] === Performance summary === +[06/29/2023-08:25:38] [I] Throughput: 1000.00 qps +[06/29/2023-08:25:38] [I] Latency: min = 1.9 ms, max = 2.1 ms, mean = 2.0 ms, median = 2.0 ms, percentile(99%) = 2.0 ms +[06/29/2023-08:25:38] [I] Enqueue Time: min = 0.3 ms, max = 0.3 ms, mean = 0.3 ms, median = 0.3 ms, percentile(99%) = 0.3 ms +[06/29/2023-08:25:38] [I] H2D Latency: min = 0.3 ms, max = 0.3 ms, mean = 0.3 ms, median = 0.3 ms, percentile(99%) = 0.3 ms +[06/29/2023-08:25:38] [I] GPU Compute Time: min = 1.4 ms, max = 1.6 ms, mean = 1.5 ms, median = 1.5 ms, percentile(99%) = 1.5 ms +[06/29/2023-08:25:38] [I] D2H Latency: min = 0.03 ms, max = 0.03 ms, mean = 0.03 ms, median = 0.03 ms, percentile(99%) = 0.03 ms +[06/29/2023-08:25:38] [I] Total Host Walltime: 3.0 s +[06/29/2023-08:25:38] [I] Total GPU Compute Time: 2.9 s +[06/29/2023-08:25:38] [I] Explanations of the performance metrics are printed in the verbose logs. 
+[06/29/2023-08:25:38] [I] From c7d0beaf9eded6ae681127194131c7309dd58c9a Mon Sep 17 00:00:00 2001 From: Lei Qu <59161330+quge009@users.noreply.github.com> Date: Fri, 30 Jun 2023 19:17:41 +0800 Subject: [PATCH 16/33] Doc - Update outdate references in micro-benchmarks.md (#544) Modify link for Nvidia bandwidth test tool **Description** previous link is 404 **Minor Revision** update the link value to https://github.com/NVIDIA/cuda-samples/tree/master/Samples/1_Utilities/bandwidthTest --- docs/user-tutorial/benchmarks/micro-benchmarks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md index b2e43db3f..95e087235 100644 --- a/docs/user-tutorial/benchmarks/micro-benchmarks.md +++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md @@ -229,7 +229,7 @@ performed by [Intel MLC Tool](https://www.intel.com/content/www/us/en/developer/ #### Introduction Measure the memory copy bandwidth across PCI-e and memory copy bandwidth between GPUs, -performed by [NVIDIA](https://github.com/NVIDIA/cuda-samples/tree/master/Samples/bandwidthTest) +performed by [NVIDIA](https://github.com/NVIDIA/cuda-samples/tree/master/Samples/1_Utilities/bandwidthTest) or [AMD](https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/1_Utils/hipBusBandwidth) bandwidth test tool. #### Metrics From 97f7b1df8688eac14b524c2be51340d4b48809fe Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Fri, 30 Jun 2023 12:58:41 +0000 Subject: [PATCH 17/33] Benchmarks: microbenchmark - add auto selecting algorithm support for cudnn functions (#540) **Description** add auto selecting algorithm support for cudnn functions. 
**Major Revision** - add auto selecting algorithm support for cudnn functions in source code - add 'auto_algo' option in benchmark - add related test --- .../micro_benchmarks/cudnn_function.py | 9 +++++++ .../convolution_backward_data.h | 12 +++++++++ .../convolution_backward_filter.h | 11 ++++++++ .../cudnn_function/convolution_forward.h | 11 ++++++++ .../cudnn_function/cudnn_config.h | 3 +++ .../cudnn_function/cudnn_function.h | 7 +++++ .../cudnn_function/cudnn_function_helper.h | 20 ++++++++++++-- .../micro_benchmarks/test_cudnn_function.py | 26 +++++++++++++++++-- 8 files changed, 95 insertions(+), 4 deletions(-) diff --git a/superbench/benchmarks/micro_benchmarks/cudnn_function.py b/superbench/benchmarks/micro_benchmarks/cudnn_function.py index 82384ae8b..3bc601742 100644 --- a/superbench/benchmarks/micro_benchmarks/cudnn_function.py +++ b/superbench/benchmarks/micro_benchmarks/cudnn_function.py @@ -357,6 +357,13 @@ def add_parser_arguments(self): required=False, help='The custom json string defining the params in a cudnn function.', ) + self._parser.add_argument( + '--enable_auto_algo', + action='store_true', + default=False, + required=False, + help='Whether to use auto algorithm selection.' + ) def _preprocess(self): """Preprocess/preparation operations before the benchmarking. 
@@ -373,6 +380,8 @@ def _preprocess(self): command += (' --warm_up ' + str(self._args.num_warmup)) command += (' --num_in_step ' + str(self._args.num_in_step)) command += (' --random_seed ' + str(self._args.random_seed)) + if self._args.enable_auto_algo: + command += (' --enable_auto_algo') try: if not self._args.config_json_str: diff --git a/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_backward_data.h b/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_backward_data.h index 7c40b4a22..1a7c207a8 100644 --- a/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_backward_data.h +++ b/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_backward_data.h @@ -32,6 +32,18 @@ template class ConvolutionBackwardDataFunction : publ this->h_desc_.desc(), this->bwd_data_algo_, &this->fwd_workspace_size_)); } + /** + * @brief Find the best algorithm for cudnn convolution functions + */ + virtual void find_best_algo() { + int algo_count; + cudnnConvolutionBwdDataAlgoPerf_t perf_results; + CHECK_CUDNN_ERROR(cudnnFindConvolutionBackwardDataAlgorithm( + this->cudnn_handle, this->w_desc_.desc(), this->x_desc_.desc(), this->conv_desc_.desc(), + this->h_desc_.desc(), 1, &algo_count, &perf_results)); + this->algo_ = perf_results.algo; + } + public: /** * @brief Construct a new Convolution Backward Data Function object diff --git a/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_backward_filter.h b/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_backward_filter.h index 10af651ad..6873ea4c9 100644 --- a/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_backward_filter.h +++ b/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_backward_filter.h @@ -31,6 +31,17 @@ template class ConvolutionBackwardFilterFunction : pu this->cudnn_handle, this->x_desc_.desc(), this->h_desc_.desc(), this->conv_desc_.desc(), this->w_desc_.desc(), this->bwd_filter_algo_, 
&this->fwd_workspace_size_)); } + /** + * @brief Find the best algorithm for cudnn convolution functions + */ + virtual void find_best_algo() { + int algo_count; + cudnnConvolutionBwdFilterAlgoPerf_t perf_results; + CHECK_CUDNN_ERROR(cudnnFindConvolutionBackwardFilterAlgorithm( + this->cudnn_handle, this->x_desc_.desc(), this->h_desc_.desc(), this->conv_desc_.desc(), + this->w_desc_.desc(), 1, &algo_count, &perf_results)); + this->algo_ = perf_results.algo; + } public: /** diff --git a/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_forward.h b/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_forward.h index daca82337..7b1f1764b 100644 --- a/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_forward.h +++ b/superbench/benchmarks/micro_benchmarks/cudnn_function/convolution_forward.h @@ -31,6 +31,17 @@ template class ConvolutionForwardFunction : public Cu this->cudnn_handle, this->x_desc_.desc(), this->w_desc_.desc(), this->conv_desc_.desc(), this->h_desc_.desc(), this->fwd_algo_, &this->fwd_workspace_size_)); } + /** + * @brief Find the best algorithm for cudnn convolution functions + */ + virtual void find_best_algo() { + int algo_count; + cudnnConvolutionFwdAlgoPerf_t perf_results; + CHECK_CUDNN_ERROR(cudnnFindConvolutionForwardAlgorithm(this->cudnn_handle, this->x_desc_.desc(), + this->w_desc_.desc(), this->conv_desc_.desc(), + this->h_desc_.desc(), 1, &algo_count, &perf_results)); + this->algo_ = perf_results.algo; + } public: /** diff --git a/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_config.h b/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_config.h index 37b259fd1..913ec7fc8 100644 --- a/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_config.h +++ b/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_config.h @@ -58,6 +58,7 @@ class CudnnConfig { cudnnDataType_t input_type_; ///< selects the data type in which the computation will be done cudnnDataType_t 
conv_type_; ///< selects the data type in which the convolution will be done std::string function_str_; ///< the str representing the cudnn function with params + bool auto_algo_; ///< whether to use auto algo selection public: void set_num_test(int num_test) { this->num_test = num_test; } @@ -80,6 +81,7 @@ class CudnnConfig { void set_input_type(const cudnnDataType_t &input_type) { input_type_ = input_type; } void set_conv_type(const cudnnDataType_t &conv_type) { input_type_ = conv_type; } void set_function(const std::string &str) { function_str_ = str; } + void set_auto_algo(bool auto_algo) { auto_algo_ = auto_algo; } std::vector &get_input_dims() { return input_dims_; } std::vector &get_input_stride() { return input_stride_; } @@ -98,6 +100,7 @@ class CudnnConfig { std::string &get_name() { return name; } cudnn_function_name_enum get_e_name() { return e_name; } std::string &get_function_str() { return function_str_; } + bool get_auto_algo() { return auto_algo_; } /** * @brief Convert name string to enum name * @return cudnn_function_name_enum diff --git a/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_function.h b/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_function.h index 26b5601ed..f23649f4a 100644 --- a/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_function.h +++ b/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_function.h @@ -45,6 +45,10 @@ template class CudnnFunction : public CudnnConfig { * @brief launch the kernel/function */ virtual void kernel_entry() {} + /** + * @brief Find the best algorithm for cudnn convolution functions + */ + virtual void find_best_algo() {} public: /** @@ -87,6 +91,9 @@ template void CudnnFunction::prepare_for_func // Set Convolution MathType cudnnMathType_t algo = get_use_tensor_op() ? 
CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH; CHECK_CUDNN_ERROR(cudnnSetConvolutionMathType(conv_desc_.desc(), algo)); + if (this->auto_algo_) { + find_best_algo(); + } // Set convolution algorithm and workspace size this->get_workspace_size(); zeros(&fwd_workspace_, std::vector{static_cast(this->fwd_workspace_size_ / sizeof(float)), 1}); diff --git a/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_function_helper.h b/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_function_helper.h index 2ee150599..d1f93a8bd 100644 --- a/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_function_helper.h +++ b/superbench/benchmarks/micro_benchmarks/cudnn_function/cudnn_function_helper.h @@ -67,12 +67,24 @@ class Options { return ""; } + /** @brief Get the bool type value of cmd line argument + * @param option the cmd line argument + * @return bool the bool type value of cmd line argument 'option' + */ + bool get_cmd_line_argument_bool(const std::string &option) { + if (std::find(begin, end, option) != end) { + return true; + } + return false; + } + public: int num_test; int warm_up; int num_in_step; int random_seed; std::string para_info_json; + bool auto_algo; /** * @brief Construct a new Command Line object @@ -91,6 +103,7 @@ class Options { random_seed = get_cmd_line_argument_int("--random_seed"); random_seed = (random_seed == 0 ? time(NULL) : random_seed); para_info_json = get_cmd_line_argument_string("--config_json"); + auto_algo = get_cmd_line_argument_bool("--enable_auto_algo"); para_info_json = para_info_json == "" ? 
R"({"algo":0,"arrayLength":2,"convType":0,"dilationA":[1,1],"filterStrideA":[1,1],"filterDims":[32,128,3,3],"inputDims":[32,128,14,14],"inputStride":[25088,196,14,1],"inputType":0,"mode":1, "name":"cudnnConvolutionBackwardFilter","outputDims":[32,32,14,14],"outputStride":[6272,196,14,1],"padA":[1,1],"tensorOp":false})" @@ -126,8 +139,10 @@ void from_json(const json &j, cudnn_test::CudnnConfig &fn) { fn.set_input_stride(input_stride); auto output_stride = j.at("outputStride").get>(); fn.set_output_stride(output_stride); - auto algo = j.at("algo").get(); - fn.set_algo(algo); + if (j.contains("algo")) { + auto algo = j.at("algo").get(); + fn.set_algo(algo); + } auto padA = j.at("padA").get>(); fn.set_padA(padA); auto filter_strideA = j.at("filterStrideA").get>(); @@ -178,6 +193,7 @@ void run_benchmark(Options &options) { function.set_warm_up(options.warm_up); function.set_num_in_step(options.num_in_step); function.set_random_seed(options.random_seed); + function.set_auto_algo(options.auto_algo); if (function.get_input_type() == CUDNN_DATA_FLOAT && function.get_conv_type() == CUDNN_DATA_FLOAT) { auto p_function = get_cudnn_function_pointer(function); p_function->benchmark(); diff --git a/tests/benchmarks/micro_benchmarks/test_cudnn_function.py b/tests/benchmarks/micro_benchmarks/test_cudnn_function.py index 590e4e519..d6ca117c2 100644 --- a/tests/benchmarks/micro_benchmarks/test_cudnn_function.py +++ b/tests/benchmarks/micro_benchmarks/test_cudnn_function.py @@ -85,8 +85,7 @@ def test_cudnn_functions(): if metric != 'return_code': assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps) - -# Test for custom list configuration + # Test for custom list configuration custom_config_str2 = '{"algo":1,"arrayLength":2,"convType":0,"dilationA":[1,1],"filterStrideA":[1,1],' \ + '"filterDims":[32,128,3,3],"inputDims":[32,32,14,14],"inputStride":[6272, 196, 14, 1],"inputType":2,'\ + '"mode":1,"name":"cudnnConvolutionBackwardData","outputDims":[32, 128, 14, 14],'\ 
@@ -126,3 +125,26 @@ def test_cudnn_functions(): assert (isinstance(benchmark.result[metric][0], numbers.Number)) if metric != 'return_code': assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps) + + # Test for auto_algo parameter + context = BenchmarkRegistry.create_benchmark_context( + 'cudnn-function', + platform=Platform.CUDA, + parameters='--num_warmup 10 --num_steps 10 --num_in_step 100 --enable_auto_algo' + ) + + assert (BenchmarkRegistry.is_benchmark_context_valid(context)) + benchmark = BenchmarkRegistry.launch_benchmark(context) + + # Check basic information. + assert (benchmark) + assert (benchmark._args.enable_auto_algo is True) + + assert (benchmark.return_code == ReturnCode.SUCCESS) + + assert (18 + benchmark.default_metric_count == len(benchmark.result)) + for metric in list(benchmark.result.keys()): + assert (len(benchmark.result[metric]) == 1) + assert (isinstance(benchmark.result[metric][0], numbers.Number)) + if metric != 'return_code': + assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps) From 865472177f47f64895dc02c6fc5e0084a056c665 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Mon, 3 Jul 2023 22:43:21 +0800 Subject: [PATCH 18/33] Benchmarks: Build Pipeline - add AMF in third party and build AMF encoding latency test (#543) **Description** add AMF in third party and build AMF encoding latency test. 
--- dockerfile/directx12.dockerfile | 3 +++ third_party/Makefile | 13 ++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/dockerfile/directx12.dockerfile b/dockerfile/directx12.dockerfile index 1a958d69a..344141266 100644 --- a/dockerfile/directx12.dockerfile +++ b/dockerfile/directx12.dockerfile @@ -59,6 +59,9 @@ RUN python -m pip install setuptools==65.0.0 && \ python -m pip install --no-cache-dir .[amdworker] && \ make directxbuild +ADD third_party third_party +RUN make -C third_party directx_amd + # Run the entrypoint script for enabling vendor-specific graphics APIs RUN powershell -Command "Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope LocalMachine -Force" CMD [ "python", "dockerfile/directx/enable-graphics-apis.py" ] diff --git a/third_party/Makefile b/third_party/Makefile index f131ee3cb..b0c01d453 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -11,7 +11,7 @@ HPCX_HOME ?= /opt/hpcx CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2) ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3) -.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl +.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd # Build all targets. 
all: cuda rocm @@ -19,6 +19,7 @@ cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcne rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest cpu: common cpu_perftest common: cpu_hpl cpu_stream fio +directx_amd: directx_amf_encoding_latency # Create $(SB_MICRO_PATH)/bin and $(SB_MICRO_PATH)/lib, no error if existing, make parent directories as needed. sb_micro_path: @@ -148,3 +149,13 @@ ifneq (,$(wildcard stream-tests/Makefile)) make all cp -v ./stream-tests/stream*.exe $(SB_MICRO_PATH)/bin/ endif + +# Build AMD Encoder Latency Test +directx_amf_encoding_latency: + @if not exist "AMF" (git clone -b v1.4.29 https://github.com/GPUOpen-LibrariesAndSDKs/AMF.git) + @if exist "AMF\amf\public\samples\CPPSamples_vs2019.sln" ( \ + curl -L -o vs_buildtools.exe https://aka.ms/vs/16/release/vs_buildtools.exe && echo "Downloaded vs_buildtools.exe" && \ + start /wait vs_buildtools.exe --quiet --wait --norestart --nocache --installPath C:/temp/BuildTools --add Microsoft.VisualStudio.Workload.VCTools --add Microsoft.VisualStudio.Component.VC.ATLMFC --includeRecommended && echo "Installed VS Build Tools" && \ + del vs_buildtools.exe && echo "Deleted vs_buildtools.exe" && \ + "C:\temp\BuildTools\MSBuild\Current\Bin\MSBuild.exe" "AMF\amf\public\samples\CPPSamples_vs2019.sln" /t:EncoderLatency /p:Configuration=Release /p:OutDir="%SB_MICRO_PATH%\bin" \ + ) From 3704a432b90277612da9b1553cda00725e60b03b Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Wed, 5 Jul 2023 11:33:40 +0800 Subject: [PATCH 19/33] CI/CD - Support DirectX test pipeline (#545) **Description** Support DirectX test pipeline. 
--- .github/workflows/build-win.yml | 24 ++++++++++++++++++++++++ superbench/benchmarks/build.bat | 2 +- tests/common/test_directx_device.py | 15 +++++++++++++++ tests/helper/__init__.py | 4 ++++ tests/helper/decorator.py | 1 + tests/runner/test_ansible.py | 4 +++- 6 files changed, 48 insertions(+), 2 deletions(-) create mode 100644 tests/common/test_directx_device.py create mode 100644 tests/helper/__init__.py diff --git a/.github/workflows/build-win.yml b/.github/workflows/build-win.yml index 7226af8c7..24ed3d12a 100644 --- a/.github/workflows/build-win.yml +++ b/.github/workflows/build-win.yml @@ -44,3 +44,27 @@ jobs: TAG: superbench/main:win2004 USER: ${{ secrets.DOCKERHUB_USERNAME }} PASS: ${{ secrets.DOCKERHUB_TOKEN }} + directx-unit-test: + name: DirectX unit test + needs: docker + runs-on: [self-hosted, windows, x64, win2004] + steps: + - name: Add bash to PATH + shell: pwsh + run: | + echo "$env:PATH;C:\Program Files\Git\bin" | Out-File -FilePath $env:GITHUB_PATH -Append -Encoding utf8 + - name: Bash to get codecov env + run: | + ci_env=`bash <(curl -s https://codecov.io/env)` + echo "ci_env=$ci_env" >> $GITHUB_ENV + shell: bash + - name: Run unit tests inside docker + run: | + $command="curl -s -L https://uploader.codecov.io/latest/windows/codecov.exe -o codecov.exe && python -m pip install .[test] && python -m pytest -v --cov=superbench --cov-report=xml --cov-report=term-missing tests/ -k test_directx && codecov -t ${CODECOV_TOKEN} -cF directx-unit-test" + docker run --rm ` + --isolation process ` + --device class/5B45201D-F2F2-4F3B-85BB-30FF1F953599 ` + -e CI=true $ci_env -e SB_TEST_CUDA="0" -e SB_TEST_ROCM="0" -e SB_TEST_PYTORCH="0" -e SB_TEST_DIRECTX="1" -e CODECOV_TOKEN superbench/main:win2004 cmd /c $command + shell: pwsh + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/superbench/benchmarks/build.bat b/superbench/benchmarks/build.bat index 8639e1771..49c785e18 100644 --- a/superbench/benchmarks/build.bat +++ 
b/superbench/benchmarks/build.bat @@ -12,7 +12,7 @@ for /r %%F in (*.vcxproj) do ( REM Download dependencies "!MSBUILD!" "!PROJ_PATH!" -t:restore -p:RestorePackagesConfig=true REM Build project - "!MSBUILD!" "!PROJ_PATH!" /p:Configuration=Release /p:AdditionalLibraryDirectories="%WindowsSDKDir%\Lib" /p:AdditionalIncludeDirectories="%WindowsSDKDir%\Include" /p:OutDir="%SB_MICRO_PATH%\bin" + "!MSBUILD!" "!PROJ_PATH!" /p:Configuration=Release /p:Platform=x64 /p:AdditionalLibraryDirectories="%WindowsSDKDir%\Lib" /p:AdditionalIncludeDirectories="%WindowsSDKDir%\Include" /p:OutDir="%SB_MICRO_PATH%\bin" ) endlocal diff --git a/tests/common/test_directx_device.py b/tests/common/test_directx_device.py new file mode 100644 index 000000000..b38b495f4 --- /dev/null +++ b/tests/common/test_directx_device.py @@ -0,0 +1,15 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Tests for directx gpu device module.""" + +from superbench.common.devices.gpu import GPU +from tests.helper import decorator + + +@decorator.directx_test +def test_directx_gpu(): + """Test DirectX GPU device.""" + gpu = GPU() + gpu.get_vendor() + assert (gpu.vendor == 'nvidia-graphics' or gpu.vendor == 'amd-graphics') diff --git a/tests/helper/__init__.py b/tests/helper/__init__.py new file mode 100644 index 000000000..e367e58b2 --- /dev/null +++ b/tests/helper/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +"""Helper module for tests.""" diff --git a/tests/helper/decorator.py b/tests/helper/decorator.py index bda2bc5ac..ff08469ac 100644 --- a/tests/helper/decorator.py +++ b/tests/helper/decorator.py @@ -12,6 +12,7 @@ rocm_test = unittest.skipIf(os.environ.get('SB_TEST_ROCM', '0') == '0', 'Skip ROCm tests.') pytorch_test = unittest.skipIf(os.environ.get('SB_TEST_PYTORCH', '1') == '0', 'Skip PyTorch tests.') +directx_test = unittest.skipIf(os.environ.get('SB_TEST_DIRECTX', '0') == '0', 'Skip DirectX tests.') def load_data(filepath): diff --git a/tests/runner/test_ansible.py b/tests/runner/test_ansible.py index 924e1ce2d..550762c43 100644 --- a/tests/runner/test_ansible.py +++ b/tests/runner/test_ansible.py @@ -10,7 +10,9 @@ from omegaconf import OmegaConf -from superbench.runner.ansible import AnsibleClient +from superbench.common.utils import LazyImport + +AnsibleClient = LazyImport('superbench.runner.ansible', 'AnsibleClient') class AnsibleClientTestCase(unittest.TestCase): From f1d608aef77378560f6fb8e795960b4a79059db0 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Wed, 5 Jul 2023 16:56:21 +0800 Subject: [PATCH 20/33] Benchmarks: micro benchmarks - add python code for DirectXGPUCoreFlops (#542) **Description** add python code for DirectX core flops and init DirectX test pipeline. 
**Major Revision** - add python code for DirectX core flops - init DirectX test pipeline **Minor Revision** - add test for DirectX core flops --- .codecov.yml | 2 + .github/workflows/build-win.yml | 11 +- .../benchmarks/micro_benchmarks/__init__.py | 2 + .../directx_gemm_flops_performance.py | 145 ++++++++++++++++++ .../BenchmarkOptions.h | 4 +- .../GPUCore.cpp | 4 +- .../GPUCore.vcxproj | 2 + .../test_directx_gemm_flops_performance.py | 47 ++++++ 8 files changed, 206 insertions(+), 11 deletions(-) create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance.py create mode 100644 tests/benchmarks/micro_benchmarks/test_directx_gemm_flops_performance.py diff --git a/.codecov.yml b/.codecov.yml index 3f36d5612..81d50f8bc 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -17,6 +17,7 @@ coverage: - cpu-python3.6-unit-test - cpu-python3.7-unit-test - cuda-unit-test + - directx-unit-test patch: default: target: 80% @@ -25,3 +26,4 @@ coverage: - cpu-python3.6-unit-test - cpu-python3.7-unit-test - cuda-unit-test + - directx-unit-test diff --git a/.github/workflows/build-win.yml b/.github/workflows/build-win.yml index 24ed3d12a..6283544b7 100644 --- a/.github/workflows/build-win.yml +++ b/.github/workflows/build-win.yml @@ -1,4 +1,4 @@ -name: Build on Windows +name: Build on Windows and run directx unit test on: push: @@ -19,6 +19,10 @@ jobs: uses: actions/checkout@v2 with: submodules: true + - name: Clearnup docker data + run: | + docker system prune -a -f + docker volume prune -a -f - name: Build Docker image working-directory: . 
shell: pwsh @@ -44,11 +48,6 @@ jobs: TAG: superbench/main:win2004 USER: ${{ secrets.DOCKERHUB_USERNAME }} PASS: ${{ secrets.DOCKERHUB_TOKEN }} - directx-unit-test: - name: DirectX unit test - needs: docker - runs-on: [self-hosted, windows, x64, win2004] - steps: - name: Add bash to PATH shell: pwsh run: | diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py index c1cb3a1b9..57304bc43 100644 --- a/superbench/benchmarks/micro_benchmarks/__init__.py +++ b/superbench/benchmarks/micro_benchmarks/__init__.py @@ -31,6 +31,7 @@ from superbench.benchmarks.micro_benchmarks.sharding_matmul import ShardingMatmul from superbench.benchmarks.micro_benchmarks.tcp_connectivity import TCPConnectivityBenchmark from superbench.benchmarks.micro_benchmarks.tensorrt_inference_performance import TensorRTInferenceBenchmark +from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops __all__ = [ 'ComputationCommunicationOverlap', @@ -61,4 +62,5 @@ 'ShardingMatmul', 'TCPConnectivityBenchmark', 'TensorRTInferenceBenchmark', + 'DirectXGPUCoreFlops', ] diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance.py b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance.py new file mode 100644 index 000000000..862367543 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance.py @@ -0,0 +1,145 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Module of the DirectXGPUCoreFlops performance benchmarks.""" + +import os +from superbench.common.utils import logger +from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode +from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke + + +class DirectXGPUCoreFlops(MicroBenchmarkWithInvoke): + """The DirectXGPUCoreFlops benchmark class.""" + def __init__(self, name, parameters=''): + """Constructor. 
+ + Args: + name (str): benchmark name. + parameters (str): benchmark parameters. + """ + super().__init__(name, parameters) + self._bin_name = 'DirectXGPUCoreFlops.exe' + self._support_precisions = ['fp16', 'fp32'] + self._precision_need_to_run = list() + + def add_parser_arguments(self): + """Add the specified arguments.""" + super().add_parser_arguments() + self._parser.add_argument( + '--num_loops', + type=int, + default=10, + required=False, + help='The number of benchmark runs.', + ) + self._parser.add_argument( + '--num_warm_up', + type=int, + default=2, + required=False, + help='The number of warm up runs.', + ) + self._parser.add_argument( + '--n', + type=int, + default=16 * 256, + required=False, + help='The N dim of matmul (N, K) * (K, M).', + ) + self._parser.add_argument( + '--k', + type=int, + default=16 * 256, + required=False, + help='The K dim of matmul (N, K) * (K, M).', + ) + self._parser.add_argument( + '--m', + type=int, + default=16 * 256, + required=False, + help='The M dim of matmul (N, K) * (K, M).', + ) + self._parser.add_argument( + '--precision', + type=str, + nargs='+', + default=list(), + help='Precision for benchmarking. E.g. {}.'.format(' '.join(self._support_precisions)), + ) + + def _preprocess(self): + """Preprocess/preparation operations before the benchmarking. + + Return: + True if _preprocess() succeed. 
+ """ + if not super()._preprocess(): + return False + + if len(self._args.precision) == 0: + self._precision_need_to_run = self._support_precisions + else: + self._args.precision = [p.lower() for p in self._args.precision] + for p in self._args.precision: + if p not in self._support_precisions: + logger.warning( + 'Unsupported precision - benchmark: {}, precision: {}, expected: {}.'.format( + self._name, p, self._support_precisions + ) + ) + else: + self._precision_need_to_run.append(p) + + if len(self._precision_need_to_run) == 0: + self._result.set_return_code(ReturnCode.NO_SUPPORTED_PRECISION) + return False + + for p in self._precision_need_to_run: + command = os.path.join(self._args.bin_dir, self._bin_name) + command += (' --num_loops ' + str(self._args.num_loops)) + command += (' --num_warm_up ' + str(self._args.num_warm_up)) + command += (' --n ' + str(self._args.n)) + command += (' --k ' + str(self._args.k)) + command += (' --m ' + str(self._args.m)) + command += (' --' + p) + self._commands.append(command) + return True + + def _process_raw_result(self, cmd_idx, raw_output): + """Function to process raw results and save the summarized results. + + self._result.add_raw_data() and self._result.add_result() need to be called to save the results. + + Args: + cmd_idx (int): the index of command corresponding with the raw_output. + raw_output (str): raw output string of the micro-benchmark. + + Return: + True if the raw output string is valid and result can be extracted. 
+ """ + precision = self._precision_need_to_run[cmd_idx] + self._result.add_raw_data('raw_output_' + precision, raw_output, self._args.log_raw_data) + valid = True + flops = list() + content = raw_output.splitlines() + try: + for line in content: + if 'TFLOPs' in line: + flops.append(float(line.split()[0])) + except BaseException: + valid = False + finally: + if valid is False or len(flops) == 0: + logger.error( + 'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format( + self._curr_run_index, self._name, raw_output + ) + ) + return False + self._result.add_result(precision + '_flops', max(flops)) + return True + + +BenchmarkRegistry.register_benchmark('directx-gpu-core-flops', DirectXGPUCoreFlops, platform=Platform.DIRECTX) diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h index 8ba9fb913..0a244e5d8 100644 --- a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h +++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h @@ -42,10 +42,10 @@ class BenchmarkOptions : public Options { m = get_cmd_line_argument_int("--m", 16 * 256); n = get_cmd_line_argument_int("--n", 16 * 256); k = get_cmd_line_argument_int("--k", 16 * 256); - if (get_cmd_line_argument_bool("--f16")) { + if (get_cmd_line_argument_bool("--fp16")) { mode_precision = Option::F16; } - if (get_cmd_line_argument_bool("--f32")) { + if (get_cmd_line_argument_bool("--fp32")) { mode_precision = Option::F32; } } diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp index 206c49f90..d41316a01 100644 --- a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp +++ 
b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp @@ -25,7 +25,7 @@ void GPUCore::Run() { int loops = opts->num_loops; std::cout << "GPUCoreFLOPs" << std::endl; - + gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute); switch (opts->mode_precision) { case Option::F32: { // Prepare input and output data and buffers. @@ -37,7 +37,6 @@ void GPUCore::Run() { ExecuteComputeOp(); } for (int i = 0; i < loops; ++i) { - gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute); // Do FLOPs job. double timeInMs = ExecuteComputeOp(); auto flops = (int64_t(m) * n * k + m * n) * 2 * 1e-9 / timeInMs; @@ -55,7 +54,6 @@ void GPUCore::Run() { ExecuteComputeOp(); } for (int i = 0; i < loops; ++i) { - gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute); // Do FLOPs job. double timeInMs = ExecuteComputeOp(); auto flops = (int64_t(m) * n * k + m * n) * 2 * 1e-9 / timeInMs; diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj index 109d39305..f70749b48 100644 --- a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj +++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj @@ -20,12 +20,14 @@ + DirectXGPUCoreFlops Application true v143 Unicode + DirectXGPUCoreFlops Application false v143 diff --git a/tests/benchmarks/micro_benchmarks/test_directx_gemm_flops_performance.py b/tests/benchmarks/micro_benchmarks/test_directx_gemm_flops_performance.py new file mode 100644 index 000000000..7571df752 --- /dev/null +++ b/tests/benchmarks/micro_benchmarks/test_directx_gemm_flops_performance.py @@ -0,0 +1,47 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +"""Tests for DirectXGPUCorefloops benchmark.""" + +import numbers + +from tests.helper import decorator +from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform + + +@decorator.directx_test +def test_directx_gpucoreflops(): + """Test DirectXGPUCoreFlops benchmark.""" + # Test for default configuration + context = BenchmarkRegistry.create_benchmark_context( + 'directx-gpu-core-flops', + platform=Platform.DIRECTX, + parameters=r'--num_loops 10 --n 16384 --k 16384 --m 16384 --precision fp32' + ) + + assert (BenchmarkRegistry.is_benchmark_context_valid(context)) + + benchmark = BenchmarkRegistry.launch_benchmark(context) + + # Check basic information. + assert (benchmark) + assert (benchmark.name == 'directx-gpu-core-flops') + assert (benchmark.type == BenchmarkType.MICRO) + + # Check parameters specified in BenchmarkContext. + assert (benchmark._args.num_loops == 10) + assert (benchmark._args.n == 16384) + assert (benchmark._args.k == 16384) + assert (benchmark._args.m == 16384) + assert (sorted(benchmark._args.precision) == ['fp32']) + + # Check results and metrics. + assert (benchmark.run_count == 1) + assert (benchmark.return_code == ReturnCode.SUCCESS) + assert ('raw_output_fp32' in benchmark.raw_data) + assert (len(benchmark.raw_data['raw_output_fp32']) == 1) + assert (isinstance(benchmark.raw_data['raw_output_fp32'][0], str)) + + assert ('fp32_flops' in benchmark.result) + assert (len(benchmark.result['fp32_flops']) == 1) + assert (isinstance(benchmark.result['fp32_flops'][0], numbers.Number)) From af4cfd5bbfe989b212d5311656be0cbe7cd5ae35 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Wed, 5 Jul 2023 22:07:13 +0800 Subject: [PATCH 21/33] Benchmarks: micro benchmarks - add python code for DirecXGPUMemBw (#547) **Description** add python code for DirecXGPUMemBw. 
--- .github/workflows/build-win.yml | 1 + .../benchmarks/micro_benchmarks/__init__.py | 2 + .../directx_mem_bw_performance.py | 149 ++++++++++++++++++ .../BenchmarkOptions.h | 2 +- .../GPUMemRwBw.vcxproj | 2 + .../benchmarks/micro_benchmarks/micro_base.py | 2 +- superbench/common/utils/process.py | 17 +- .../test_directx_mem_bw_performance.py | 52 ++++++ 8 files changed, 222 insertions(+), 5 deletions(-) create mode 100644 superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance.py create mode 100644 tests/benchmarks/micro_benchmarks/test_directx_mem_bw_performance.py diff --git a/.github/workflows/build-win.yml b/.github/workflows/build-win.yml index 6283544b7..d1b9a1c8d 100644 --- a/.github/workflows/build-win.yml +++ b/.github/workflows/build-win.yml @@ -23,6 +23,7 @@ jobs: run: | docker system prune -a -f docker volume prune -a -f + shell: pwsh - name: Build Docker image working-directory: . shell: pwsh diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py index 57304bc43..9fe14336c 100644 --- a/superbench/benchmarks/micro_benchmarks/__init__.py +++ b/superbench/benchmarks/micro_benchmarks/__init__.py @@ -31,6 +31,7 @@ from superbench.benchmarks.micro_benchmarks.sharding_matmul import ShardingMatmul from superbench.benchmarks.micro_benchmarks.tcp_connectivity import TCPConnectivityBenchmark from superbench.benchmarks.micro_benchmarks.tensorrt_inference_performance import TensorRTInferenceBenchmark +from superbench.benchmarks.micro_benchmarks.directx_mem_bw_performance import DirectXGPUMemBw from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops __all__ = [ @@ -62,5 +63,6 @@ 'ShardingMatmul', 'TCPConnectivityBenchmark', 'TensorRTInferenceBenchmark', + 'DirectXGPUMemBw', 'DirectXGPUCoreFlops', ] diff --git a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance.py b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance.py 
new file mode 100644 index 000000000..ff9d9d239 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance.py @@ -0,0 +1,149 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Module of the DirectXGPUMemBw performance benchmarks.""" + +import os + +from superbench.common.utils import logger +from superbench.benchmarks import BenchmarkRegistry, Platform +from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke + + +class DirectXGPUMemBw(MicroBenchmarkWithInvoke): + """The DirectXGPUMemBw benchmark class.""" + def __init__(self, name, parameters=''): + """Constructor. + + Args: + name (str): benchmark name. + parameters (str): benchmark parameters. + """ + super().__init__(name, parameters) + self._bin_name = 'DirectXGPUMemRwBw.exe' + self._modes = ['read', 'write', 'readwrite'] + + def add_parser_arguments(self): + """Add the specified arguments.""" + super().add_parser_arguments() + self._parser.add_argument( + '--num_warm_up', + type=int, + default=0, + required=False, + help='Number of warm up rounds.', + ) + self._parser.add_argument( + '--num_loop', + type=int, + default=100, + required=False, + help='Number of loop times to measure the performance.', + ) + self._parser.add_argument( + '--size', + type=int, + default=None, + required=False, + help='Size of data for GPU copy.', + ) + self._parser.add_argument( + '--minbytes', + type=int, + default=4096, + required=False, + help='Lower data size bound to test.', + ) + self._parser.add_argument( + '--maxbytes', + type=int, + default=1024 * 1024 * 1024, + required=False, + help='Upper data size bound to test.', + ) + self._parser.add_argument( + '--check_data', + action='store_true', + required=False, + help='Whether check data correctness.', + ) + self._parser.add_argument( + '--mode', + type=str, + nargs='+', + default=list(), + help='Memory operation mode. E.g. 
{}.'.format(' '.join(self._modes)), + ) + + def _preprocess(self): + """Preprocess/preparation operations before the benchmarking.""" + if not super()._preprocess(): + return False + + self._args.mode = [m.lower() for m in self._args.mode] + for mode in self._args.mode: + if mode not in self._modes: + logger.warning( + 'Unsupported mode - benchmark: {}, mode: {}, expected: {}.'.format(self._name, mode, self._modes) + ) + self._args.mode.remove(mode) + + if len(self._args.mode) == 0: + logger.error('No valid operation modes are provided.') + return False + + for mode in self._args.mode: + command = os.path.join(self._args.bin_dir, self._bin_name) + command += (' --num_warm_up ' + str(self._args.num_warm_up)) + command += (' --num_loop ' + str(self._args.num_loop)) + if self._args.size is not None: + command += (' --size ' + str(self._args.size)) + else: + command += (' --minbytes ' + str(self._args.minbytes)) + command += (' --maxbytes ' + str(self._args.maxbytes)) + if self._args.check_data: + command += (' --check_data') + command += (' --' + mode) + self._commands.append(command) + return True + + def _process_raw_result(self, cmd_idx, raw_output): + """Function to process raw results and save the summarized results. + + self._result.add_raw_data() and self._result.add_result() need to be called to save the results. + + Args: + cmd_idx (int): the index of command corresponding with the raw_output. + raw_output (str): raw output string of the micro-benchmark. + + Return: + True if the raw output string is valid and result can be extracted. 
+ """ + mode = self._args.mode[cmd_idx] + self._result.add_raw_data('raw_output_' + mode, raw_output, self._args.log_raw_data) + + valid = True + + content = raw_output.splitlines() + try: + for line in content: + if 'GPUMemBw:' in line: + size = int(line.split()[-3]) + bw = float(line.split()[-2]) + self._result.add_result(f'{mode}_{size}_bw', bw) + if 'error' in line.lower(): + valid = False + except BaseException: + valid = False + finally: + if not valid: + logger.error( + 'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format( + self._curr_run_index, self._name, raw_output + ) + ) + return False + return True + + +BenchmarkRegistry.register_benchmark('directx-gpu-mem-bw', DirectXGPUMemBw, platform=Platform.DIRECTX) diff --git a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h index 7893fe8af..c9d7507a3 100644 --- a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h +++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h @@ -68,7 +68,7 @@ class BenchmarkOptions : public Options { min_size = get_cmd_line_argument_int("--minbytes", 4 * 1024); max_size = get_cmd_line_argument_ulonglong("--maxbytes", static_cast(1LL * 1024 * 1024 * 1024)); - check_data = get_cmd_line_argument_bool("--check"); + check_data = get_cmd_line_argument_bool("--check_data"); if (get_cmd_line_argument_bool("--read")) { mem_type = Memtype::Read; } diff --git a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj index 80ab02e37..b575f8040 100644 --- a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj +++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj @@ -19,12 +19,14 @@ + 
DirectXGPUMemRwBw Application true v143 Unicode + DirectXGPUMemRwBw Application false v143 diff --git a/superbench/benchmarks/micro_benchmarks/micro_base.py b/superbench/benchmarks/micro_benchmarks/micro_base.py index 7a2d36029..e1e854058 100644 --- a/superbench/benchmarks/micro_benchmarks/micro_base.py +++ b/superbench/benchmarks/micro_benchmarks/micro_base.py @@ -180,7 +180,7 @@ def _benchmark(self): ) ) - output = run_command(self._commands[cmd_idx], flush_output=self._args.log_flushing) + output = run_command(self._commands[cmd_idx], flush_output=self._args.log_flushing, cwd=self._args.bin_dir) if output.returncode != 0: self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE) logger.error( diff --git a/superbench/common/utils/process.py b/superbench/common/utils/process.py index 334bf7665..75767ead8 100644 --- a/superbench/common/utils/process.py +++ b/superbench/common/utils/process.py @@ -10,13 +10,14 @@ from superbench.common.utils import stdout_logger -def run_command(command, quiet=False, flush_output=False): +def run_command(command, quiet=False, flush_output=False, cwd=None): """Run command in string format, return the result with stdout and stderr. Args: command (str): command to run. quiet (bool): no stdout display of the command if quiet is True. flush_output (bool): enable real-time output flush or not when running the command. + cwd (str): working directory to run the command. Return: result (subprocess.CompletedProcess): The return value from subprocess.run(). 
@@ -26,7 +27,11 @@ def run_command(command, quiet=False, flush_output=False): try: args = shlex.split(command) process = subprocess.Popen( - args, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True + args, + cwd=os.getcwd() if cwd is None else cwd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True ) output = '' for line in process.stdout: @@ -43,7 +48,13 @@ def run_command(command, quiet=False, flush_output=False): return subprocess.CompletedProcess(args=args, returncode=-1, stdout=str(e)) else: result = subprocess.run( - command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True + command, + cwd=os.getcwd() if cwd is None else cwd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=True, + check=False, + universal_newlines=True ) if not quiet: stdout_logger.log(result.stdout) diff --git a/tests/benchmarks/micro_benchmarks/test_directx_mem_bw_performance.py b/tests/benchmarks/micro_benchmarks/test_directx_mem_bw_performance.py new file mode 100644 index 000000000..baeed54a4 --- /dev/null +++ b/tests/benchmarks/micro_benchmarks/test_directx_mem_bw_performance.py @@ -0,0 +1,52 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Tests for DirectXGPUMemBw benchmark.""" + +import numbers + +from tests.helper import decorator +from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform + + +@decorator.directx_test +def test_directx_gpu_mem_bw(): + """Test DirectXGPUMemBw benchmark.""" + # Test for default configuration + context = BenchmarkRegistry.create_benchmark_context( + 'directx-gpu-mem-bw', + platform=Platform.DIRECTX, + parameters=r'--num_warm_up 0 --num_loop 100 --size 1073741824 --mode read write' + ) + + assert (BenchmarkRegistry.is_benchmark_context_valid(context)) + + benchmark = BenchmarkRegistry.launch_benchmark(context) + + # Check basic information. 
+ assert (benchmark) + assert (benchmark.name == 'directx-gpu-mem-bw') + assert (benchmark.type == BenchmarkType.MICRO) + + # Check parameters specified in BenchmarkContext. + assert (benchmark._args.num_warm_up == 0) + assert (benchmark._args.num_loop == 100) + assert (benchmark._args.size == 1073741824) + assert (sorted(benchmark._args.mode) == ['read', 'write']) + + # Check results and metrics. + assert (benchmark.run_count == 1) + assert (benchmark.return_code == ReturnCode.SUCCESS) + assert ('raw_output_read' in benchmark.raw_data) + assert ('raw_output_write' in benchmark.raw_data) + assert (len(benchmark.raw_data['raw_output_read']) == 1) + assert (len(benchmark.raw_data['raw_output_write']) == 1) + assert (isinstance(benchmark.raw_data['raw_output_read'][0], str)) + assert (isinstance(benchmark.raw_data['raw_output_write'][0], str)) + + assert ('read_1073741824_bw' in benchmark.result) + assert ('write_1073741824_bw' in benchmark.result) + assert (len(benchmark.result['read_1073741824_bw']) == 1) + assert (len(benchmark.result['write_1073741824_bw']) == 1) + assert (isinstance(benchmark.result['read_1073741824_bw'][0], numbers.Number)) + assert (isinstance(benchmark.result['write_1073741824_bw'][0], numbers.Number)) From c8c079c2af0a87d5e3de56e05188c2d9349898d3 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Thu, 6 Jul 2023 00:15:32 +0800 Subject: [PATCH 22/33] Benchmarks: micro benchmarks - add python code for DirectXGPUCopy (#546) **Description** add python code for DirectXGPUCopy. 
--- .../benchmarks/micro_benchmarks/__init__.py | 2 + .../directx_gpu_copy_performance.py | 132 ++++++++++++++++++ .../GPUCopyBw.vcxproj | 2 + .../directx_gpu_copy_performance/Main.cpp | 1 + .../test_directx_gpu_copy_performance.py | 49 +++++++ 5 files changed, 186 insertions(+) create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance.py create mode 100644 tests/benchmarks/micro_benchmarks/test_directx_gpu_copy_performance.py diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py index 9fe14336c..47094aa3f 100644 --- a/superbench/benchmarks/micro_benchmarks/__init__.py +++ b/superbench/benchmarks/micro_benchmarks/__init__.py @@ -31,6 +31,7 @@ from superbench.benchmarks.micro_benchmarks.sharding_matmul import ShardingMatmul from superbench.benchmarks.micro_benchmarks.tcp_connectivity import TCPConnectivityBenchmark from superbench.benchmarks.micro_benchmarks.tensorrt_inference_performance import TensorRTInferenceBenchmark +from superbench.benchmarks.micro_benchmarks.directx_gpu_copy_performance import DirectXGPUCopyBw from superbench.benchmarks.micro_benchmarks.directx_mem_bw_performance import DirectXGPUMemBw from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops @@ -63,6 +64,7 @@ 'ShardingMatmul', 'TCPConnectivityBenchmark', 'TensorRTInferenceBenchmark', + 'DirectXGPUCopyBw', 'DirectXGPUMemBw', 'DirectXGPUCoreFlops', ] diff --git a/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance.py b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance.py new file mode 100644 index 000000000..b114bed68 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance.py @@ -0,0 +1,132 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +"""Module of the DirectXGPUCopyBw performance benchmarks.""" + +import os +from superbench.common.utils import logger +from superbench.benchmarks import BenchmarkRegistry, Platform +from superbench.benchmarks.micro_benchmarks import MemBwBenchmark + + +class DirectXGPUCopyBw(MemBwBenchmark): + """The GPUCopyBw benchmark class.""" + def __init__(self, name, parameters=''): + """Constructor. + + Args: + name (str): benchmark name. + parameters (str): benchmark parameters. + """ + super().__init__(name, parameters) + self._mem_types = ['htod', 'dtoh'] + self._bin_name = 'DirectXGPUCopyBw.exe' + + def add_parser_arguments(self): + """Add the specified arguments.""" + super().add_parser_arguments() + + self._parser.add_argument( + '--size', + type=int, + required=False, + default=None, + help='Size of data for GPU copy.', + ) + self._parser.add_argument( + '--warm_up', + type=int, + required=False, + default=20, + help='Number of warm up copy times to run.', + ) + self._parser.add_argument( + '--num_loops', + type=int, + required=False, + default=1000, + help='Number of copy times to run.', + ) + self._parser.add_argument( + '--minbytes', + type=int, + required=False, + default=64, + help='Run size from min_size to max_size for GPU copy.', + ) + self._parser.add_argument( + '--maxbytes', + type=int, + required=False, + default=8 * 1024 * 1024, + help='Run size from min_size to max_size for GPU copy.', + ) + self._parser.add_argument( + '--check', + action='store_true', + help='Whether check data after copy.', + ) + + def _preprocess(self): + """Preprocess/preparation operations before the benchmarking. + + Return: + True if _preprocess() succeed. + """ + if not super()._preprocess(): + return False + + for mem_type in self._args.mem_type: + # Prepare the command line. 
+ command = os.path.join(self._args.bin_dir, self._bin_name) + command += f' --{mem_type}' + command += ' --warm_up ' + str(self._args.warm_up) + command += ' --num_loops ' + str(self._args.num_loops) + if self._args.size is not None: + command += ' --size ' + str(self._args.size) + else: + command += ' --minbytes ' + str(self._args.minbytes) + command += ' --maxbytes ' + str(self._args.maxbytes) + if self._args.check: + command += ' --check' + self._commands.append(command) + return True + + def _process_raw_result(self, cmd_idx, raw_output): + """Function to process raw results and save the summarized results. + + Args: + cmd_idx (int): the index of command corresponding with the raw_output. + raw_output (str): raw output string of the micro-benchmark. + + Return: + True if the raw output string is valid and result can be extracted. + """ + self._result.add_raw_data('raw_output', raw_output, self._args.log_raw_data) + + try: + lines = raw_output.splitlines() + for line in lines: + if 'GB' in line: + type = line.split()[0].strip(':') + size = int(line.strip().split()[1].strip('B')) + bw = float(line.strip().split()[2]) + self._result.add_result(f'{type}_{size}_bw', bw) + if 'error' in line.lower(): + logger.error( + 'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format( + self._curr_run_index, self._name, raw_output + ) + ) + return False + return True + except Exception as e: + logger.error( + 'The result format is invalid - round: {}, benchmark: {}, raw output: {}, exception: {}.'.format( + self._curr_run_index, self._name, raw_output, str(e) + ) + ) + return False + + +BenchmarkRegistry.register_benchmark('directx-gpu-copy-bw', DirectXGPUCopyBw, platform=Platform.DIRECTX) diff --git a/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.vcxproj b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.vcxproj index 3be231342..cd3b45f61 100644 --- 
a/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.vcxproj +++ b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/GPUCopyBw.vcxproj @@ -19,12 +19,14 @@ + DirectXGPUCopyBw Application true v143 Unicode + DirectXGPUCopyBw Application false v143 diff --git a/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/Main.cpp b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/Main.cpp index ac12597c5..fc47e2f2e 100644 --- a/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/Main.cpp +++ b/superbench/benchmarks/micro_benchmarks/directx_gpu_copy_performance/Main.cpp @@ -16,6 +16,7 @@ int main(int argc, char *argv[]) { } else { // Run all sizes for (SIZE_T usize = option.min_size; usize <= option.max_size; usize += usize) { + option.size = usize; GPUCopyBw benchmark(&option); benchmark.Run(); } diff --git a/tests/benchmarks/micro_benchmarks/test_directx_gpu_copy_performance.py b/tests/benchmarks/micro_benchmarks/test_directx_gpu_copy_performance.py new file mode 100644 index 000000000..49bf73f2b --- /dev/null +++ b/tests/benchmarks/micro_benchmarks/test_directx_gpu_copy_performance.py @@ -0,0 +1,49 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Tests for DirectXGPUCopyBw benchmark.""" + +import numbers + +from tests.helper import decorator +from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform + + +@decorator.directx_test +def test_directx_gpu_copy_bw(): + """Test DirectXGPUCopyBw benchmark.""" + # Test for default configuration + context = BenchmarkRegistry.create_benchmark_context( + 'directx-gpu-copy-bw', + platform=Platform.DIRECTX, + parameters=r'--warm_up 20 --num_loops 1000 --minbytes 64 --maxbytes 8388608 --mem_type htod dtoh' + ) + + assert (BenchmarkRegistry.is_benchmark_context_valid(context)) + + benchmark = BenchmarkRegistry.launch_benchmark(context) + + # Check basic information. 
+ assert (benchmark) + assert (benchmark.name == 'directx-gpu-copy-bw') + assert (benchmark.type == BenchmarkType.MICRO) + + # Check parameters specified in BenchmarkContext. + assert (benchmark._args.warm_up == 20) + assert (benchmark._args.num_loops == 1000) + assert (benchmark._args.minbytes == 64) + assert (benchmark._args.maxbytes == 8388608) + assert (sorted(benchmark._args.mem_type) == ['dtoh', 'htod']) + + # Check results and metrics. + assert (benchmark.run_count == 1) + assert (benchmark.return_code == ReturnCode.SUCCESS) + assert ('raw_output' in benchmark.raw_data) + assert (isinstance(benchmark.raw_data['raw_output'][0], str)) + size = 64 + while size <= 8388608: + for mem_type in ['htod', 'dtoh']: + assert (f'{mem_type}_{size}_bw' in benchmark.result) + assert (len(benchmark.result[f'{mem_type}_{size}_bw']) == 1) + assert (isinstance(benchmark.result[f'{mem_type}_{size}_bw'][0], numbers.Number)) + size *= 2 From e8ac0b1e28a93903d1f03752803cd5c9e059b1f1 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Thu, 6 Jul 2023 15:31:28 +0800 Subject: [PATCH 23/33] Benchmarks: micro benchmarks - add python code for DirectXGPUEncodingLatency (#548) **Description** add python code for DirectXGPUEncodingLatency. 
--- dockerfile/directx12.dockerfile | 4 +- .../benchmarks/micro_benchmarks/__init__.py | 2 + .../directx_gpu_encoding_latency.py | 157 ++++++++++++++++++ .../test_directx_gpu_encoding_latency.py | 56 +++++++ 4 files changed, 217 insertions(+), 2 deletions(-) create mode 100644 superbench/benchmarks/micro_benchmarks/directx_gpu_encoding_latency.py create mode 100644 tests/benchmarks/micro_benchmarks/test_directx_gpu_encoding_latency.py diff --git a/dockerfile/directx12.dockerfile b/dockerfile/directx12.dockerfile index 344141266..cd5ab9ad3 100644 --- a/dockerfile/directx12.dockerfile +++ b/dockerfile/directx12.dockerfile @@ -64,5 +64,5 @@ RUN make -C third_party directx_amd # Run the entrypoint script for enabling vendor-specific graphics APIs RUN powershell -Command "Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope LocalMachine -Force" -CMD [ "python", "dockerfile/directx/enable-graphics-apis.py" ] -ENTRYPOINT [ "cmd.exe" ] +ENTRYPOINT [ "python", "dockerfile/directx/enable-graphics-apis.py" ] +CMD [ "cmd.exe" ] diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py index 47094aa3f..6f3f29953 100644 --- a/superbench/benchmarks/micro_benchmarks/__init__.py +++ b/superbench/benchmarks/micro_benchmarks/__init__.py @@ -31,6 +31,7 @@ from superbench.benchmarks.micro_benchmarks.sharding_matmul import ShardingMatmul from superbench.benchmarks.micro_benchmarks.tcp_connectivity import TCPConnectivityBenchmark from superbench.benchmarks.micro_benchmarks.tensorrt_inference_performance import TensorRTInferenceBenchmark +from superbench.benchmarks.micro_benchmarks.directx_gpu_encoding_latency import DirectXGPUEncodingLatency from superbench.benchmarks.micro_benchmarks.directx_gpu_copy_performance import DirectXGPUCopyBw from superbench.benchmarks.micro_benchmarks.directx_mem_bw_performance import DirectXGPUMemBw from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import 
DirectXGPUCoreFlops @@ -64,6 +65,7 @@ 'ShardingMatmul', 'TCPConnectivityBenchmark', 'TensorRTInferenceBenchmark', + 'DirectXGPUEncodingLatency', 'DirectXGPUCopyBw', 'DirectXGPUMemBw', 'DirectXGPUCoreFlops', diff --git a/superbench/benchmarks/micro_benchmarks/directx_gpu_encoding_latency.py b/superbench/benchmarks/micro_benchmarks/directx_gpu_encoding_latency.py new file mode 100644 index 000000000..70d6c75ad --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_gpu_encoding_latency.py @@ -0,0 +1,157 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Module of the DirectXGPUEncodingLatency benchmarks.""" + +import os + +from superbench.common.utils import logger +from superbench.benchmarks import BenchmarkRegistry, Platform +from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke + + +def create_nv12_file(file_name, num_frames, width, height): + """Create a NV12 file with the specified name, number of frames, width, and height.""" + import numpy as np + # Generate a Y plane of width x height with values from 0-255 + y_plane = np.random.randint(0, 256, (height, width), dtype=np.uint8) + # Generate a UV plane of width x height/2 with values from 0-255 + uv_plane = np.random.randint(0, 256, (height // 2, width), dtype=np.uint8) + # Create the file + with open(f'{file_name}', 'wb') as f: + for _ in range(num_frames): + # Write the Y plane and UV plane to the file + f.write(y_plane.tobytes()) + f.write(uv_plane.tobytes()) + + +class DirectXGPUEncodingLatency(MicroBenchmarkWithInvoke): + """The DirectXGPUEncodingLatency benchmark class.""" + def __init__(self, name, parameters=''): + """Constructor.""" + super().__init__(name, parameters) + self._bin_name = 'EncoderLatency.exe' + self._test_file = 'test_directx_gpu_encoding_latency.nv12' + + def add_parser_arguments(self): + """Add the specified arguments.""" + super().add_parser_arguments() + self._parser.add_argument( + '--algo', + type=str, + 
choices=['ASAP', 'OneInOne'], + default='ASAP', + required=False, + help='The algorithm to use for encoding' + ) + self._parser.add_argument( + '--codec', + type=str, + choices=['AVC', 'H264', 'HEVC', 'H265', 'AV1'], + default='H265', + required=False, + help='The codec to use for encoding' + ) + self._parser.add_argument( + '--format', + type=str, + choices=['RGBA_F16', 'R10G10B10A2', 'NV12', 'P010'], + default='NV12', + required=False, + help='The format to use for encoding' + ) + self._parser.add_argument( + '--frames', type=int, default=500, required=False, help='The number of frames to encode' + ) + self._parser.add_argument( + '--height', type=int, default=720, required=False, help='The height of the input video' + ) + self._parser.add_argument( + '--width', type=int, default=1080, required=False, help='The width of the input video' + ) + self._parser.add_argument('--input_file', type=str, default=None, required=False, help='The input video file') + self._parser.add_argument('--output_file', type=str, default=None, required=False, help='The output video file') + self._parser.add_argument( + '--output_height', type=int, default=720, required=False, help='The height of the output video' + ) + self._parser.add_argument( + '--output_width', type=int, default=1080, required=False, help='The width of the output video' + ) + self._parser.add_argument( + '--vcn', type=int, choices=[0, 1], default=0, required=False, help='The VCN instance to use for encoding' + ) + + def _preprocess(self): + """Preprocess/preparation operations before the benchmarking. + + Return: + True if _preprocess() succeed. 
+ """ + if not super()._preprocess(): + return False + + command = os.path.join(self._args.bin_dir, self._bin_name) + command += f' -ALGORITHM {self._args.algo}' + command += f' -CODEC {self._args.codec}' + command += f' -FORMAT {self._args.format}' + command += f' -FRAMES {self._args.frames}' + command += f' -HEIGHT {self._args.height}' + command += f' -WIDTH {self._args.width}' + if self._args.input_file is not None: + command += f' -INPUT {self._args.input_file}' + else: + if not os.path.exists(f'{self._test_file}'): + create_nv12_file(self._test_file, self._args.frames, self._args.width, self._args.height) + command += f' -INPUT {self._test_file}' + if self._args.output_file is not None: + command += f' -OUTPUT {self._args.output_file}' + command += f' -OUTPUT_HEIGHT {self._args.output_height}' + command += f' -OUTPUT_WIDTH {self._args.output_width}' + command += f' -VCNINSTANCE {self._args.vcn}' + self._commands.append(command) + + return True + + def _process_raw_result(self, cmd_idx, raw_output): + """Function to parse raw results and save the summarized results. + + self._result.add_raw_data() and self._result.add_result() need to be called to save the results. + + Args: + cmd_idx (int): the index of command corresponding with the raw_output. + raw_output (str): raw output string of the micro-benchmark. + + Return: + True if the raw output string is valid and result can be extracted. 
+ """ + self._result.add_raw_data('raw_output', raw_output, self._args.log_raw_data) + + content = raw_output.splitlines() + metrics = {} + + try: + for line in content: + if 'Total' in line: + metrics['fps'] = float(line.split('=')[3].strip().strip('frames').split()[0]) + if 'Latency' in line and 'min' in line.lower(): + metrics['min_lat'] = float(line.split('=')[1].split(',')[1].strip('ms').strip()) + metrics['max_lat'] = float(line.split('=')[1].split(',')[2].strip('ms').strip()) + if 'Latency' in line and 'average' in line.lower(): + metrics['avg_lat'] = float(line.split('=')[1].strip('ms').strip()) + except Exception as e: + logger.error( + 'The result format is invalid - benchmark: {}, raw output: {}, error: {}'.format( + self._name, raw_output, str(e) + ) + ) + return False + + for metric, value in metrics.items(): + self._result.add_result(metric, value) + + return True + + +BenchmarkRegistry.register_benchmark( + 'directx-gpu-encoding-latency', DirectXGPUEncodingLatency, platform=Platform.DIRECTX +) diff --git a/tests/benchmarks/micro_benchmarks/test_directx_gpu_encoding_latency.py b/tests/benchmarks/micro_benchmarks/test_directx_gpu_encoding_latency.py new file mode 100644 index 000000000..c9b5c7121 --- /dev/null +++ b/tests/benchmarks/micro_benchmarks/test_directx_gpu_encoding_latency.py @@ -0,0 +1,56 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +"""Tests for DirectXGPUEncodingLatency benchmark.""" + +import numbers + +from tests.helper import decorator +from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform + + +@decorator.directx_test +def test_directx_gpuencodinglatency(): + """Test DirectXGPUEncodingLatency benchmark.""" + context = BenchmarkRegistry.create_benchmark_context( + 'directx-gpu-encoding-latency', + platform=Platform.DIRECTX, + parameters=r'--algo ASAP --codec H265 --format NV12 --frames 500' + + r' --height 720 --width 1080 --output_height 720 --output_width 1080 --vcn 0' + ) + + assert (BenchmarkRegistry.is_benchmark_context_valid(context)) + + benchmark = BenchmarkRegistry.launch_benchmark(context) + + # Check basic information. + assert (benchmark) + assert (benchmark.name == 'directx-gpu-encoding-latency') + assert (benchmark.type == BenchmarkType.MICRO) + + # Check parameters specified in BenchmarkContext. + assert (benchmark._args.algo == 'ASAP') + assert (benchmark._args.codec == 'H265') + assert (benchmark._args.format == 'NV12') + assert (benchmark._args.frames == 500) + assert (benchmark._args.height == 720) + assert (benchmark._args.width == 1080) + assert (benchmark._args.output_height == 720) + assert (benchmark._args.output_width == 1080) + assert (benchmark._args.vcn == 0) + + # Check results and metrics. 
+ assert (benchmark._args.run_count == 1) + assert (benchmark.return_code == ReturnCode.SUCCESS) + assert ('raw_output' in benchmark.raw_data) + assert (len(benchmark.raw_data['raw_output']) == 1) + assert (isinstance(benchmark.raw_data['raw_output'][0], str)) + + assert ('fps' in benchmark.result) + assert ('min_lat' in benchmark.result) + assert ('max_lat' in benchmark.result) + assert ('avg_lat' in benchmark.result) + assert (isinstance(benchmark.result['fps'][0], numbers.Number)) + assert (isinstance(benchmark.result['min_lat'][0], numbers.Number)) + assert (isinstance(benchmark.result['max_lat'][0], numbers.Number)) + assert (isinstance(benchmark.result['avg_lat'][0], numbers.Number)) From 466b477e9d3cd1c3c62a3ae28c88ad980b6c2a68 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Jul 2023 17:07:35 +0800 Subject: [PATCH 24/33] Bump semver from 5.7.1 to 5.7.2 in /website (#550) Bumps [semver](https://github.com/npm/node-semver) from 5.7.1 to 5.7.2. - [Release notes](https://github.com/npm/node-semver/releases) - [Changelog](https://github.com/npm/node-semver/blob/v5.7.2/CHANGELOG.md) - [Commits](npm/node-semver@v5.7.1...v5.7.2) --- updated-dependencies: - dependency-name: semver dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- website/package-lock.json | 90 +++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/website/package-lock.json b/website/package-lock.json index 7526213de..80c139a56 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -176,9 +176,9 @@ }, "dependencies": { "semver": { - "version": "6.3.0", - "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", - "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==" + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==" } } }, @@ -221,9 +221,9 @@ }, "dependencies": { "semver": { - "version": "6.3.0", - "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", - "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==" + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==" } } }, @@ -265,9 +265,9 @@ }, "dependencies": { "semver": { - "version": "6.3.0", - "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", - "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==" + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==" } } }, @@ -1044,9 +1044,9 @@ }, "dependencies": { "semver": { - "version": "6.3.0", - "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", - "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==" + 
"version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==" } } }, @@ -1199,9 +1199,9 @@ }, "dependencies": { "semver": { - "version": "6.3.0", - "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", - "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==" + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==" } } }, @@ -2441,9 +2441,9 @@ } }, "semver": { - "version": "5.7.1", - "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz", - "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==" + "version": "5.7.2", + "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz", + "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==" } } }, @@ -3233,9 +3233,9 @@ }, "dependencies": { "semver": { - "version": "6.3.0", - "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", - "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==" + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==" } } }, @@ -4747,9 +4747,9 @@ "integrity": "sha1-QRyttXTFoUDTpLGRDUDYDMn0C0A=" }, "semver": { - "version": "5.7.1", - "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz", - "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==" + "version": "5.7.2", + "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz", + 
"integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==" }, "shebang-command": { "version": "1.2.0", @@ -5406,9 +5406,9 @@ } }, "semver": { - "version": "5.7.1", - "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz", - "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==" + "version": "5.7.2", + "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz", + "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==" }, "tapable": { "version": "1.1.3", @@ -7011,9 +7011,9 @@ }, "dependencies": { "semver": { - "version": "6.3.0", - "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", - "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==" + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==" } } }, @@ -7649,9 +7649,9 @@ }, "dependencies": { "semver": { - "version": "6.3.0", - "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", - "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==" + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==" } } }, @@ -8938,9 +8938,9 @@ } }, "semver": { - "version": "5.7.1", - "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz", - "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==" + "version": "5.7.2", + "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz", + "integrity": 
"sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==" } } }, @@ -9287,9 +9287,9 @@ } }, "semver": { - "version": "7.3.5", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.5.tgz", - "integrity": "sha512-PoeGJYh8HK4BTO/a9Tf6ZG3veo/A7ZVsYrSA6J8ny9nb3B1VrpkuN+z9OE5wfE5p6H4LchYZsegiQgbJD94ZFQ==", + "version": "7.5.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz", + "integrity": "sha512-1bCSESV6Pv+i21Hvpxp3Dx+pSD8lIPt8uVjRrxAUt/nbswYc+tK6Y2btiULjd4+fnq15PX+nqQDC7Oft7WkwcA==", "requires": { "lru-cache": "^6.0.0" } @@ -9303,9 +9303,9 @@ }, "dependencies": { "semver": { - "version": "6.3.0", - "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", - "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==" + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==" } } }, @@ -11215,9 +11215,9 @@ } }, "semver": { - "version": "6.3.0", - "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", - "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==" + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==" }, "string_decoder": { "version": "1.1.1", From e1df877bfe4d84b352dff0d84c86b98c36cf3ebc Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Thu, 27 Jul 2023 10:42:31 +0800 Subject: [PATCH 25/33] Release - SuperBench v0.9.0 (#558) **Description** Cherry-pick bug fixes from v0.9.0 to main. 
**Major Revision** - CI/CD: pipeline - clean more disk space to fix rocm building image pipeline(#555 ) - Benchmarks: bug fix - use absolute path for input file in DirectXEncodingLatency(#554) - CI/CD - add push win docker image on release branch in pipeline (#552) - Docs - Upgrade version and release note(#557) --- .github/workflows/build-image.yml | 11 ++++++ .github/workflows/build-win.yml | 29 ++++++++++++-- README.md | 2 +- docs/getting-started/installation.mdx | 2 +- docs/getting-started/run-superbench.md | 2 +- docs/superbench-config.mdx | 2 +- docs/user-tutorial/container-images.mdx | 14 +++++++ docs/user-tutorial/data-diagnosis.md | 2 +- docs/user-tutorial/result-summary.md | 2 +- superbench/__init__.py | 2 +- .../directx_gpu_encoding_latency.py | 4 +- superbench/config/amd_mi100_hpe.yaml | 2 +- superbench/config/amd_mi100_z53.yaml | 2 +- .../inference/standard_nc64as_t4_v3.yaml | 2 +- .../inference/standard_nc96ads_a100_v4.yaml | 2 +- .../inference/standard_nv18ads_a10_v5.yaml | 2 +- superbench/config/azure_ndmv4.yaml | 2 +- superbench/config/azure_ndv4.yaml | 2 +- superbench/config/default.yaml | 2 +- third_party/Makefile | 2 +- website/blog/2023-07-25-release-0-9.md | 38 +++++++++++++++++++ website/docusaurus.config.js | 2 +- website/package-lock.json | 2 +- website/package.json | 2 +- 24 files changed, 109 insertions(+), 25 deletions(-) create mode 100644 website/blog/2023-07-25-release-0-9.md diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index 824418a6f..6b796830a 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -64,6 +64,17 @@ jobs: sudo apt-get clean sudo docker rmi $(sudo docker images --format "{{.Repository}}:{{.Tag}}" --filter=reference="node" --filter=reference="buildpack-deps") df -h + - name: Free Up GitHub Actions Ubuntu Runner Disk Space 🔧 + uses: jlumbroso/free-disk-space@main + with: + # This might remove tools that are actually needed, if set to "true" but 
frees about 6 GB + tool-cache: false + # All of these default to true, but feel free to set to "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + swap-storage: true - name: Prepare metadata id: metadata run: | diff --git a/.github/workflows/build-win.yml b/.github/workflows/build-win.yml index d1b9a1c8d..252783421 100644 --- a/.github/workflows/build-win.yml +++ b/.github/workflows/build-win.yml @@ -12,7 +12,7 @@ on: jobs: docker: - name: Docker build win2004 + name: Docker build win directx12 runs-on: [self-hosted, windows, x64, win2004] steps: - name: Checkout @@ -24,6 +24,25 @@ jobs: docker system prune -a -f docker volume prune -a -f shell: pwsh + - name: Set TAG variable based on the branch + run: | + if ($env:GITHUB_EVENT_NAME -match "release") { + $version = $env:GITHUB_REF.Substring($env:GITHUB_REF.LastIndexOf('/') + 1) + echo "TAG=superbench/superbench:$version-directx12" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + } elseif ($env:GITHUB_REF -match "refs/heads/release/(.*)") { + $version = $Matches[1] + echo "TAG=superbench/release:$version-directx12" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + } elseif ($env:GITHUB_BASEREF -match "release/(.*)"){ + $version = $Matches[1] + echo "TAG=superbench/release:$version-directx12" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + } else { + echo "TAG=superbench/main:directx12" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + } + shell: pwsh + env: + GITHUB_REF: ${{ github.ref }} + GITHUB_BASEREF: ${{ github.base_ref }} + GITHUB_EVENT_NAME: ${{ github.event_name }} - name: Build Docker image working-directory: . shell: pwsh @@ -37,7 +56,7 @@ jobs: --isolation=process ` --tag $env:TAG . 
env: - TAG: superbench/main:win2004 + TAG: ${{ env.TAG }} - name: Push Docker image if: ${{ github.event_name != 'pull_request' }} shell: pwsh @@ -46,7 +65,7 @@ jobs: docker push $env:TAG docker logout env: - TAG: superbench/main:win2004 + TAG: ${{ env.TAG }} USER: ${{ secrets.DOCKERHUB_USERNAME }} PASS: ${{ secrets.DOCKERHUB_TOKEN }} - name: Add bash to PATH @@ -64,7 +83,9 @@ jobs: docker run --rm ` --isolation process ` --device class/5B45201D-F2F2-4F3B-85BB-30FF1F953599 ` - -e CI=true $ci_env -e SB_TEST_CUDA="0" -e SB_TEST_ROCM="0" -e SB_TEST_PYTORCH="0" -e SB_TEST_DIRECTX="1" -e CODECOV_TOKEN superbench/main:win2004 cmd /c $command + -v C:/Windows/System32/DriverStore:C:/Windows/System32/DriverStore ` + -e CI=true $ci_env -e SB_TEST_CUDA="0" -e SB_TEST_ROCM="0" -e SB_TEST_PYTORCH="0" -e SB_TEST_DIRECTX="1" -e CODECOV_TOKEN --entrypoint "cmd" $env:TAG "/c python dockerfile/directx/enable-graphics-apis.py && cmd /c $command" shell: pwsh env: + TAG: ${{ env.TAG }} CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/README.md b/README.md index ffcd51960..cfcd4b6b3 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ __SuperBench__ is a validation and profiling tool for AI infrastructure. -📢 [v0.8.0](https://github.com/microsoft/superbenchmark/releases/tag/v0.8.0) has been released! +📢 [v0.9.0](https://github.com/microsoft/superbenchmark/releases/tag/v0.9.0) has been released! ## _Check [aka.ms/superbench](https://aka.ms/superbench) for more details._ diff --git a/docs/getting-started/installation.mdx b/docs/getting-started/installation.mdx index 82c1fc9c3..8570306c9 100644 --- a/docs/getting-started/installation.mdx +++ b/docs/getting-started/installation.mdx @@ -61,7 +61,7 @@ You can clone the source from GitHub and build it. 
:::note Note You should checkout corresponding tag to use release version, for example, -`git clone -b v0.8.0 https://github.com/microsoft/superbenchmark` +`git clone -b v0.9.0 https://github.com/microsoft/superbenchmark` ::: ```bash diff --git a/docs/getting-started/run-superbench.md b/docs/getting-started/run-superbench.md index 32a8c6d80..16c6d7a21 100644 --- a/docs/getting-started/run-superbench.md +++ b/docs/getting-started/run-superbench.md @@ -27,7 +27,7 @@ sb deploy -f remote.ini --host-password [password] :::note Note You should deploy corresponding Docker image to use release version, for example, -`sb deploy -f local.ini -i superbench/superbench:v0.8.0-cuda12.1` +`sb deploy -f local.ini -i superbench/superbench:v0.9.0-cuda12.1` You should note that version of git repo only determines version of sb CLI, and not the sb container. You should define the container version even if you specified a release version for the git clone. diff --git a/docs/superbench-config.mdx b/docs/superbench-config.mdx index 5720a8125..8893c46b9 100644 --- a/docs/superbench-config.mdx +++ b/docs/superbench-config.mdx @@ -70,7 +70,7 @@ superbench: ```yaml -version: v0.8 +version: v0.9 superbench: enable: benchmark_1 monitor: diff --git a/docs/user-tutorial/container-images.mdx b/docs/user-tutorial/container-images.mdx index 27cf8da6f..5fd11502c 100644 --- a/docs/user-tutorial/container-images.mdx +++ b/docs/user-tutorial/container-images.mdx @@ -23,12 +23,15 @@ available tags are listed below for all stable versions. 
values={[ {label: 'CUDA', value: 'cuda'}, {label: 'ROCm', value: 'rocm'}, + {label: 'DirectX', value: 'directx'}, ] }> | Tag | Description | |-------------------|------------------------------------| +| v0.9.0-cuda12.1 | SuperBench v0.9.0 with CUDA 12.1 | +| v0.9.0-cuda11.1.1 | SuperBench v0.9.0 with CUDA 11.1.1 | | v0.8.0-cuda12.1 | SuperBench v0.8.0 with CUDA 12.1 | | v0.8.0-cuda11.1.1 | SuperBench v0.8.0 with CUDA 11.1.1 | | v0.7.0-cuda11.8 | SuperBench v0.7.0 with CUDA 11.8 | @@ -45,6 +48,10 @@ available tags are listed below for all stable versions. | Tag | Description | |-------------------------------|--------------------------------------------------| +| v0.9.0-rocm5.1.3 | SuperBench v0.9.0 with ROCm 5.1.3 | +| v0.9.0-rocm5.1.1 | SuperBench v0.9.0 with ROCm 5.1.1 | +| v0.9.0-rocm5.0.1 | SuperBench v0.9.0 with ROCm 5.0.1 | +| v0.9.0-rocm5.0 | SuperBench v0.9.0 with ROCm 5.0 | | v0.8.0-rocm5.1.3 | SuperBench v0.8.0 with ROCm 5.1.3 | | v0.8.0-rocm5.1.1 | SuperBench v0.8.0 with ROCm 5.1.1 | | v0.8.0-rocm5.0.1 | SuperBench v0.8.0 with ROCm 5.0.1 | @@ -66,5 +73,12 @@ available tags are listed below for all stable versions. 
| v0.3.0-rocm4.2-pytorch1.7.0 | SuperBench v0.3.0 with ROCm 4.2, PyTorch 1.7.0 | | v0.3.0-rocm4.0-pytorch1.7.0 | SuperBench v0.3.0 with ROCm 4.0, PyTorch 1.7.0 | + + + +| Tag | Description | +|-------------------------------|--------------------------------------------------| +| v0.9.0-directx12 | SuperBench v0.9.0 with DirectX12, Windows10-2004 | + diff --git a/docs/user-tutorial/data-diagnosis.md b/docs/user-tutorial/data-diagnosis.md index 94a2a025d..a0bd99640 100644 --- a/docs/user-tutorial/data-diagnosis.md +++ b/docs/user-tutorial/data-diagnosis.md @@ -65,7 +65,7 @@ superbench: example: ```yaml # SuperBench rules -version: v0.8 +version: v0.9 superbench: rules: failure-rule: diff --git a/docs/user-tutorial/result-summary.md b/docs/user-tutorial/result-summary.md index e53738ff8..7e393a188 100644 --- a/docs/user-tutorial/result-summary.md +++ b/docs/user-tutorial/result-summary.md @@ -58,7 +58,7 @@ superbench: ```yaml title="Example" # SuperBench rules -version: v0.8 +version: v0.9 superbench: rules: kernel_launch: diff --git a/superbench/__init__.py b/superbench/__init__.py index 5b85c9a9a..bc20aebf9 100644 --- a/superbench/__init__.py +++ b/superbench/__init__.py @@ -6,5 +6,5 @@ Provide hardware and software benchmarks for AI systems. 
""" -__version__ = '0.8.0' +__version__ = '0.9.0' __author__ = 'Microsoft' diff --git a/superbench/benchmarks/micro_benchmarks/directx_gpu_encoding_latency.py b/superbench/benchmarks/micro_benchmarks/directx_gpu_encoding_latency.py index 70d6c75ad..ed17ea5bd 100644 --- a/superbench/benchmarks/micro_benchmarks/directx_gpu_encoding_latency.py +++ b/superbench/benchmarks/micro_benchmarks/directx_gpu_encoding_latency.py @@ -98,11 +98,11 @@ def _preprocess(self): command += f' -HEIGHT {self._args.height}' command += f' -WIDTH {self._args.width}' if self._args.input_file is not None: - command += f' -INPUT {self._args.input_file}' + command += f' -INPUT {os.path.abspath(self._args.input_file)}' else: if not os.path.exists(f'{self._test_file}'): create_nv12_file(self._test_file, self._args.frames, self._args.width, self._args.height) - command += f' -INPUT {self._test_file}' + command += f' -INPUT {os.path.abspath(self._test_file)}' if self._args.output_file is not None: command += f' -OUTPUT {self._args.output_file}' command += f' -OUTPUT_HEIGHT {self._args.output_height}' diff --git a/superbench/config/amd_mi100_hpe.yaml b/superbench/config/amd_mi100_hpe.yaml index 150424c0f..718224531 100644 --- a/superbench/config/amd_mi100_hpe.yaml +++ b/superbench/config/amd_mi100_hpe.yaml @@ -3,7 +3,7 @@ # Server: # - Product: HPE Apollo 6500 -version: v0.8 +version: v0.9 superbench: enable: null var: diff --git a/superbench/config/amd_mi100_z53.yaml b/superbench/config/amd_mi100_z53.yaml index 188c93547..8aa8fd85e 100644 --- a/superbench/config/amd_mi100_z53.yaml +++ b/superbench/config/amd_mi100_z53.yaml @@ -4,7 +4,7 @@ # - Product: G482-Z53 # - Link: https://www.gigabyte.cn/FileUpload/Global/MicroSite/553/G482-Z53.html -version: v0.8 +version: v0.9 superbench: enable: null var: diff --git a/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml b/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml index 62e0d6586..5ffa26311 100644 --- 
a/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml +++ b/superbench/config/azure/inference/standard_nc64as_t4_v3.yaml @@ -1,4 +1,4 @@ -version: v0.8 +version: v0.9 superbench: enable: null monitor: diff --git a/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml b/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml index 337affacf..5c78d866d 100644 --- a/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml +++ b/superbench/config/azure/inference/standard_nc96ads_a100_v4.yaml @@ -1,4 +1,4 @@ -version: v0.8 +version: v0.9 superbench: enable: null monitor: diff --git a/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml b/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml index f95469cb0..75375cd79 100644 --- a/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml +++ b/superbench/config/azure/inference/standard_nv18ads_a10_v5.yaml @@ -1,4 +1,4 @@ -version: v0.8 +version: v0.9 superbench: enable: null monitor: diff --git a/superbench/config/azure_ndmv4.yaml b/superbench/config/azure_ndmv4.yaml index e482d6ed0..8aabb65f7 100644 --- a/superbench/config/azure_ndmv4.yaml +++ b/superbench/config/azure_ndmv4.yaml @@ -3,7 +3,7 @@ # Azure NDm A100 v4 # reference: https://docs.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series -version: v0.8 +version: v0.9 superbench: enable: null monitor: diff --git a/superbench/config/azure_ndv4.yaml b/superbench/config/azure_ndv4.yaml index cb9a93ddc..274556842 100644 --- a/superbench/config/azure_ndv4.yaml +++ b/superbench/config/azure_ndv4.yaml @@ -1,5 +1,5 @@ # SuperBench Config -version: v0.8 +version: v0.9 superbench: enable: null monitor: diff --git a/superbench/config/default.yaml b/superbench/config/default.yaml index 60d6be7b0..1a6af7dc5 100644 --- a/superbench/config/default.yaml +++ b/superbench/config/default.yaml @@ -1,5 +1,5 @@ # SuperBench Config -version: v0.8 +version: v0.9 superbench: enable: null monitor: diff --git 
a/third_party/Makefile b/third_party/Makefile index b0c01d453..ec72ccae9 100755 --- a/third_party/Makefile +++ b/third_party/Makefile @@ -157,5 +157,5 @@ directx_amf_encoding_latency: curl -L -o vs_buildtools.exe https://aka.ms/vs/16/release/vs_buildtools.exe && echo "Downloaded vs_buildtools.exe" && \ start /wait vs_buildtools.exe --quiet --wait --norestart --nocache --installPath C:/temp/BuildTools --add Microsoft.VisualStudio.Workload.VCTools --add Microsoft.VisualStudio.Component.VC.ATLMFC --includeRecommended && echo "Installed VS Build Tools" && \ del vs_buildtools.exe && echo "Deleted vs_buildtools.exe" && \ - "C:\temp\BuildTools\MSBuild\Current\Bin\MSBuild.exe" "AMF\amf\public\samples\CPPSamples_vs2019.sln" /t:EncoderLatency /p:Configuration=Release /p:OutDir="%SB_MICRO_PATH%\bin" \ + "C:\temp\BuildTools\MSBuild\Current\Bin\MSBuild.exe" "AMF\amf\public\samples\CPPSamples_vs2019.sln" /t:EncoderLatency /p:Platform=x64 /p:Configuration=Release /p:OutDir="%SB_MICRO_PATH%\bin" \ ) diff --git a/website/blog/2023-07-25-release-0-9.md b/website/blog/2023-07-25-release-0-9.md new file mode 100644 index 000000000..59e931103 --- /dev/null +++ b/website/blog/2023-07-25-release-0-9.md @@ -0,0 +1,38 @@ +--- +slug: release-sb-v0.9 +title: Releasing SuperBench v0.9 +author: Peng Cheng +author_title: SuperBench Team +author_url: https://github.com/cp5555 +author_image_url: https://github.com/cp5555.png +tags: [superbench, announcement, release] +--- + +We are very happy to announce that **SuperBench 0.9.0 version** is officially released today! + +You can install and try superbench by following [Getting Started Tutorial](https://microsoft.github.io/superbenchmark/docs/getting-started/installation). + +## SuperBench 0.9.0 Release Notes + +### SuperBench Improvement +- Support Ctrl+C and interrupt to stop all SuperBench testing. +- Support Windows Docker for VDI/Gaming GPU. +- Support DirectX platform for Nvidia and AMD GPU. 
+- Add System Config Info feature in SB runner to support distributed collection. +- Support DirectX test pipeline. + +### Micro-benchmark Improvement +- Add DirectXGPUCopyBw Benchmark to measure HtoD/DtoH bandwidth by DirectX. +- Add DirectXGPUCoreFLops Benchmark to measure peak FLOPS by DirectX. +- Add DirectXGPUMemBw Benchmark to measure GPU memory bandwidth by DirectX. +- Add DirectXVCNEncodingLatency Benchmark to measure the VCN hardware encoding latency on AMD graphic GPUs. +- Support best algorithm selection in cudnn-function microbenchmark. +- Revise step time collection in distributed inference benchmark. + +### Model Benchmark Improvement +- Fix early stop logic due to num_steps in model benchmarks. +- Support TensorRT models on Nvidia H100. + +### Documentation +- Improve documentation for System Config Info. +- Update outdated references. diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index cc583913d..c1d83edfa 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -101,7 +101,7 @@ module.exports = { announcementBar: { id: 'supportus', content: - '📢 v0.8.0 has been released! ' + + '📢 v0.9.0 has been released! ' + '⭐️ If you like SuperBench, give it a star on GitHub!
⭐️', }, algolia: { diff --git a/website/package-lock.json b/website/package-lock.json index 80c139a56..a2e3b219d 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -1,6 +1,6 @@ { "name": "superbench-website", - "version": "0.8.0", + "version": "0.9.0", "lockfileVersion": 1, "requires": true, "dependencies": { diff --git a/website/package.json b/website/package.json index c761f26d8..38ca1f75a 100644 --- a/website/package.json +++ b/website/package.json @@ -1,6 +1,6 @@ { "name": "superbench-website", - "version": "0.8.0", + "version": "0.9.0", "private": true, "scripts": { "docusaurus": "docusaurus", From 67f2aa7237cefb0cf5b3032c8510a6b432407329 Mon Sep 17 00:00:00 2001 From: pnunna93 <104791500+pnunna93@users.noreply.github.com> Date: Tue, 8 Aug 2023 00:03:32 -0500 Subject: [PATCH 26/33] Benchmarks: model benchmarks - change torch.distributed.launch to torchrun (#556) This PR has following changes - torch.distributed.launch changed to torchrun. torch.distributed.launch is deprecated in latest Pytorch and is recommended to move to torchrun - https://pytorch.org/docs/stable/elastic/run.html - Changes to AMD GPU detection logic. 
The AMD GPU detection logic throws warning when containers have only renderD in /dev/dri, this change would resolve those warnings --------- Co-authored-by: Yuting Jiang --- superbench/common/devices/gpu.py | 2 +- superbench/runner/runner.py | 4 ++-- tests/runner/test_runner.py | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/superbench/common/devices/gpu.py b/superbench/common/devices/gpu.py index e12889e10..3398d707f 100644 --- a/superbench/common/devices/gpu.py +++ b/superbench/common/devices/gpu.py @@ -26,7 +26,7 @@ def get_vendor(self): logger.warning('Cannot find NVIDIA GPU device.') return 'nvidia' if Path('/dev/kfd').is_char_device() and Path('/dev/dri').is_dir(): - if not list(Path('/dev/dri').glob('card*')): + if not list(Path('/dev/dri').glob('renderD*')): logger.warning('Cannot find AMD GPU device.') return 'amd' if list(Path(r'C:\Windows\System32').glob('*DriverStore/FileRepository/nv*.inf_amd64_*/nvapi64.dll')): diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index bd8cc9c83..7e29f4dfe 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -144,8 +144,8 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None): torch_dist_params = '' if mode.node_num == 1 else \ '--nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT ' mode_command = ( - f'python3 -m torch.distributed.launch' - f' --use_env --no_python --nproc_per_node={mode.proc_num} {torch_dist_params}{exec_command}' + f'torchrun' + f' --no_python --nproc_per_node={mode.proc_num} {torch_dist_params}{exec_command}' f' superbench.benchmarks.{benchmark_name}.parameters.distributed_impl=ddp' f' superbench.benchmarks.{benchmark_name}.parameters.distributed_backend=nccl' ) diff --git a/tests/runner/test_runner.py b/tests/runner/test_runner.py index 304a6ba22..250942267 100644 --- a/tests/runner/test_runner.py +++ b/tests/runner/test_runner.py @@ -105,8 +105,8 @@ def 
test_get_mode_command(self): 'node_num': 'all', }, 'expected_command': ( - 'python3 -m torch.distributed.launch ' - '--use_env --no_python --nproc_per_node=1 ' + 'torchrun ' + '--no_python --nproc_per_node=1 ' '--nnodes=$NNODES --node_rank=$NODE_RANK ' '--master_addr=$MASTER_ADDR --master_port=$MASTER_PORT ' f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo ' @@ -123,8 +123,8 @@ def test_get_mode_command(self): 'node_num': 1, }, 'expected_command': ( - 'python3 -m torch.distributed.launch ' - '--use_env --no_python --nproc_per_node=8 ' + 'torchrun ' + '--no_python --nproc_per_node=8 ' f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo ' 'superbench.benchmarks.foo.parameters.distributed_impl=ddp ' 'superbench.benchmarks.foo.parameters.distributed_backend=nccl' From 6c0205cece527ed49959619e74865c1bd5e69e6e Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Fri, 18 Aug 2023 13:17:04 +0800 Subject: [PATCH 27/33] Benchmarks: micro benchmarks - add source code for DirectXRenderPerf (#549) **Description** add source code for DirectXRenderPerf. 
--------- Co-authored-by: yukirora --- .github/workflows/build-image.yml | 2 +- .../BenchmarkOptions.h | 80 ++ .../BufferHelper.cpp | 137 +++ .../directx_render_performance/BufferHelper.h | 126 +++ .../DirectXRenderPerformance.vcxproj | 159 +++ .../GeometryHelper.cpp | 37 + .../GeometryHelper.h | 159 +++ .../directx_render_performance/Main.cpp | 148 +++ .../directx_render_performance/RenderApp.cpp | 388 ++++++++ .../directx_render_performance/RenderApp.h | 221 +++++ .../RenderGeometryPass.cpp | 139 +++ .../RenderGeometryPass.h | 99 ++ .../RenderLightingPass.cpp | 217 +++++ .../RenderLightingPass.h | 119 +++ .../RenderShadowMapPass.cpp | 77 ++ .../RenderShadowMapPass.h | 34 + .../Shaders/Base.hlsl | 134 +++ .../Shaders/DefferredLightingPixel.hlsl | 919 ++++++++++++++++++ .../Shaders/DefferredLightingVertex.hlsl | 199 ++++ .../Shaders/ShadowMap.hlsl | 55 ++ .../directx_third_party/DeviceResources.cpp | 670 +++++++++++++ .../directx_third_party/DeviceResources.h | 138 +++ .../directx_third_party/pch.h | 97 ++ 23 files changed, 4353 insertions(+), 1 deletion(-) create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/BenchmarkOptions.h create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/BufferHelper.cpp create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/BufferHelper.h create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/DirectXRenderPerformance.vcxproj create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/GeometryHelper.cpp create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/GeometryHelper.h create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/Main.cpp create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderApp.cpp create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderApp.h 
create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderGeometryPass.cpp create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderGeometryPass.h create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderLightingPass.cpp create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderLightingPass.h create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderShadowMapPass.cpp create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderShadowMapPass.h create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/Base.hlsl create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/DefferredLightingPixel.hlsl create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/DefferredLightingVertex.hlsl create mode 100644 superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/ShadowMap.hlsl create mode 100644 superbench/benchmarks/micro_benchmarks/directx_third_party/DeviceResources.cpp create mode 100644 superbench/benchmarks/micro_benchmarks/directx_third_party/DeviceResources.h create mode 100644 superbench/benchmarks/micro_benchmarks/directx_third_party/pch.h diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index 6b796830a..e2dad1a66 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -65,7 +65,7 @@ jobs: sudo docker rmi $(sudo docker images --format "{{.Repository}}:{{.Tag}}" --filter=reference="node" --filter=reference="buildpack-deps") df -h - name: Free Up GitHub Actions Ubuntu Runner Disk Space 🔧 - uses: jlumbroso/free-disk-space@main + uses: hirnidrin/free-disk-space@main with: # This might remove tools that are actually needed, if set to "true" but frees about 6 GB tool-cache: false diff --git 
a/superbench/benchmarks/micro_benchmarks/directx_render_performance/BenchmarkOptions.h b/superbench/benchmarks/micro_benchmarks/directx_render_performance/BenchmarkOptions.h new file mode 100644 index 000000000..5c93b9378 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/BenchmarkOptions.h @@ -0,0 +1,80 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "../directx_utils/Options.h" + +using namespace std; + +// enum class for pass type +enum class PassType { GeometryPass, ShadowMapPass, LightingPass }; + +class BenchmarkOptions : public Options { + public: + int m_textureSize = 0; + int m_textureNum = 10; + int m_vertexNum = 3000; + int m_indexNum = 3000; + int m_width = 1080; + int m_height = 720; + int m_warmup = 500; + int m_num_object = 1; + string m_outfile = "outfile.txt"; + PassType m_pass_type = PassType::ShadowMapPass; + int m_num_frames = 3000; + int m_num_light = 1; + bool m_quiet = true; + + BenchmarkOptions(int argc, char *argv[]) : Options(argc, argv) {} + + virtual void get_option_usage() { + cout << "Usage: " << endl; + cout << " --width set the width of the window" << endl; + cout << " --height set the height of the window" << endl; + cout << " --warmup set the warmup frames" << endl; + cout << " --vertex set the number of vertices" << endl; + cout << " --index set the number of indices" << endl; + cout << " --texture_size set the size of textures " << endl; + cout << " --outfile set the output file name" << endl; + cout << " --pass set the pass type" << endl; + cout << " --object set the number of objects" << endl; + cout << " --frame set the number of frames" << endl; + cout << " --light set the number of lights" << endl; + cout << " --quiet disable window" << endl; + } + + virtual void parse_arguments() { + m_width = get_cmd_line_argument_int("--width", 1080); + m_height = 
get_cmd_line_argument_int("--height", 720); + m_warmup = get_cmd_line_argument_int("--warmup", 500); + m_vertexNum = get_cmd_line_argument_int("--vertex", m_vertexNum); + m_indexNum = get_cmd_line_argument_int("--index", m_indexNum); + m_textureSize = get_cmd_line_argument_int("--texture", 3); + m_textureNum = get_cmd_line_argument_int("--texture_num", 3); + m_outfile = get_cmd_line_argument_string("--outfile"); + auto pass = get_cmd_line_argument_string("--pass"); + std::transform(pass.begin(), pass.end(), pass.begin(), [](unsigned char c) { return std::tolower(c); }); + if (pass == "geometry") { + m_pass_type = PassType::GeometryPass; + } else if (pass == "shadow") { + m_pass_type = PassType::ShadowMapPass; + } else if (pass == "lighting") { + m_pass_type = PassType::LightingPass; + } else { + cout << "Error: Invalid pass type: " << pass << endl; + exit(1); + } + m_num_object = get_cmd_line_argument_int("--object", m_num_object); + m_num_frames = get_cmd_line_argument_int("--frame", m_num_frames); + m_num_light = get_cmd_line_argument_int("--light", m_num_light); + m_quiet = get_cmd_line_argument_bool("--quiet"); + }; +}; diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/BufferHelper.cpp b/superbench/benchmarks/micro_benchmarks/directx_render_performance/BufferHelper.cpp new file mode 100644 index 000000000..84cb4e294 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/BufferHelper.cpp @@ -0,0 +1,137 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#include "BufferHelper.h" + +// Function to calculate the byte size of the constant buffer, +// which must be a multiple of 256 bytes. +UINT CalcConstantBufferByteSize(UINT byteSize) { + // Calculate the aligned size. + return (byteSize + 255) & ~255; +} + +/* + * @brief: Create a default buffer. + * @param: device the device of GPU object. + * @param: cmdList the command list of GPU object. 
+ * @param: initData the data to be copied to the default buffer. + * @param: byteSize the size of data. + * @return: the default buffer. + */ +Microsoft::WRL::ComPtr CreateDefaultBuffer(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, + const void *initData, UINT64 byteSize, + Microsoft::WRL::ComPtr &uploadBuffer) { + ComPtr defaultBuffer; + + // Create the actual default buffer resource. + ThrowIfFailed(device->CreateCommittedResource(&CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT), + D3D12_HEAP_FLAG_NONE, &CD3DX12_RESOURCE_DESC::Buffer(byteSize), + D3D12_RESOURCE_STATE_COMMON, nullptr, + IID_PPV_ARGS(defaultBuffer.GetAddressOf()))); + + // In order to copy CPU memory data into our default buffer, we need to create + // an intermediate upload heap. + ThrowIfFailed(device->CreateCommittedResource(&CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_UPLOAD), + D3D12_HEAP_FLAG_NONE, &CD3DX12_RESOURCE_DESC::Buffer(byteSize), + D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, + IID_PPV_ARGS(uploadBuffer.GetAddressOf()))); + + // Describe the data we want to copy into the default buffer. + D3D12_SUBRESOURCE_DATA subResourceData = {}; + subResourceData.pData = initData; + subResourceData.RowPitch = byteSize; + subResourceData.SlicePitch = subResourceData.RowPitch; + + // Schedule to copy the data to the default buffer resource. At a high level, the helper function + // UpdateSubresources will copy the CPU memory into the intermediate upload heap. Then, using + // ID3D12CommandList::CopySubresourceRegion, the intermediate upload heap data will be copied to mBuffer. 
+ cmdList->ResourceBarrier(1, &CD3DX12_RESOURCE_BARRIER::Transition(defaultBuffer.Get(), D3D12_RESOURCE_STATE_COMMON, + D3D12_RESOURCE_STATE_COPY_DEST)); + UpdateSubresources<1>(cmdList, defaultBuffer.Get(), uploadBuffer.Get(), 0, 0, 1, &subResourceData); + cmdList->ResourceBarrier(1, + &CD3DX12_RESOURCE_BARRIER::Transition(defaultBuffer.Get(), D3D12_RESOURCE_STATE_COPY_DEST, + D3D12_RESOURCE_STATE_GENERIC_READ)); + + // Note: uploadBuffer has to be kept alive after the above function calls because + // the command list has not been executed yet that performs the actual copy. + // The caller can Release the uploadBuffer after it knows the copy has been executed. + + return defaultBuffer; +} + +std::vector CreateRandomTexture(const UINT width, const UINT height, const UINT texturePixelSize) { + // Create a buffer to store the texture data + std::vector textureData(width * height * texturePixelSize); + + // Initialize the random number generator + std::random_device rd; + std::mt19937 generator(rd()); + std::uniform_int_distribution distribution(0, 255); + + // Generate random data for the texture + for (UINT i = 0; i < width * height * texturePixelSize; ++i) { + textureData[i] = static_cast(distribution(generator)); + } + return textureData; +} + +void UploadTexture(ID3D12Device *device, ID3D12GraphicsCommandList *pCmdList, const std::vector &textureData, + Microsoft::WRL::ComPtr &texture, const UINT width, const UINT height, + const UINT texturePixelSize) { + // Create the GPU upload buffer. 
+ const UINT64 uploadBufferSize = GetRequiredIntermediateSize(texture.Get(), 0, 1); + + ID3D12Resource *textureUploadHeap; + ThrowIfFailed( + device->CreateCommittedResource(&CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_UPLOAD), D3D12_HEAP_FLAG_NONE, + &CD3DX12_RESOURCE_DESC::Buffer(uploadBufferSize), + D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, IID_PPV_ARGS(&textureUploadHeap))); + + // Copy data to the intermediate upload heap and then schedule a copy + // from the upload heap to the Texture2D. + D3D12_SUBRESOURCE_DATA textureDataDesc = {}; + textureDataDesc.pData = textureData.data(); + textureDataDesc.RowPitch = width * texturePixelSize; + textureDataDesc.SlicePitch = textureDataDesc.RowPitch * height; + + UpdateSubresources(pCmdList, texture.Get(), textureUploadHeap, 0, 0, 1, &textureDataDesc); + pCmdList->ResourceBarrier(1, &CD3DX12_RESOURCE_BARRIER::Transition(texture.Get(), D3D12_RESOURCE_STATE_COPY_DEST, + D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE)); +} + +void CreateTextureResource(ID3D12Device *device, UINT width, UINT height, DXGI_FORMAT format, + Microsoft::WRL::ComPtr &textureResource, UINT16 arraySize) { + D3D12_RESOURCE_DESC textureDesc = {}; + textureDesc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D; + textureDesc.Width = width; + textureDesc.Height = height; + textureDesc.DepthOrArraySize = arraySize; + textureDesc.MipLevels = 1; + textureDesc.Format = format; + textureDesc.SampleDesc.Count = 1; + textureDesc.SampleDesc.Quality = 0; + textureDesc.Layout = D3D12_TEXTURE_LAYOUT_UNKNOWN; + textureDesc.Flags = D3D12_RESOURCE_FLAG_NONE; + + ThrowIfFailed(device->CreateCommittedResource(&CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_DEFAULT), + D3D12_HEAP_FLAG_NONE, &textureDesc, D3D12_RESOURCE_STATE_COPY_DEST, + nullptr, IID_PPV_ARGS(&textureResource))); +} + +void Texture2D(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, + Microsoft::WRL::ComPtr &textureResource, int width, int height, DXGI_FORMAT format) { + CreateTextureResource(device, width, 
height, format, textureResource, 1); + auto textureData = CreateRandomTexture(width, height); + UploadTexture(device, cmdList, textureData, textureResource, width, height); +} + +void TextureCube(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, + Microsoft::WRL::ComPtr &textureResource, int width, int height, DXGI_FORMAT format) { + CreateTextureResource(device, width, height, format, textureResource, 6); + std::vector textureCubeData; + for (int i = 0; i < 6; ++i) { + auto textureData = CreateRandomTexture(width, height); + textureCubeData.insert(textureCubeData.end(), textureData.begin(), textureData.end()); + } + UploadTexture(device, cmdList, textureCubeData, textureResource, width, height); +} diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/BufferHelper.h b/superbench/benchmarks/micro_benchmarks/directx_render_performance/BufferHelper.h new file mode 100644 index 000000000..1ab91ddb6 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/BufferHelper.h @@ -0,0 +1,126 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#pragma once + +#include + +#include "../directx_third_party/DXSampleHelper.h" +#include "../directx_third_party/d3dx12.h" + +// Helper class for creating and uploading resources to the GPU. 
+// The UploadBuffer constructor calls CalcConstantBufferByteSize, which this header otherwise only
+// declares further down; declare it ahead of the class template so the call resolves where it is written.
+UINT CalcConstantBufferByteSize(UINT byteSize);
+
+/*
+ * @brief: A committed buffer in an upload heap that stores elementCount elements of type T.
+ * @note: The per-element size is sizeof(T), or CalcConstantBufferByteSize(sizeof(T)) for constant buffers.
+ */
+template <typename T> class UploadBuffer {
+  public:
+    /*
+     * @brief: Create the upload-heap buffer.
+     * @param: device the device used to create the committed resource.
+     * @param: elementCount the number of elements the buffer holds.
+     * @param: isConstantBuffer whether the per-element size comes from CalcConstantBufferByteSize.
+     */
+    UploadBuffer(ID3D12Device *device, UINT elementCount, bool isConstantBuffer)
+        : m_elementCount(elementCount), m_isConstantBuffer(isConstantBuffer) {
+        m_elementByteSize = sizeof(T);
+
+        if (isConstantBuffer)
+            m_elementByteSize = CalcConstantBufferByteSize(sizeof(T));
+
+        // Use named locals: taking the address of a temporary is a compiler extension, not standard C++.
+        const CD3DX12_HEAP_PROPERTIES heapProperties(D3D12_HEAP_TYPE_UPLOAD);
+        // Widen before multiplying so a large element count cannot wrap in 32 bits.
+        const CD3DX12_RESOURCE_DESC bufferDesc =
+            CD3DX12_RESOURCE_DESC::Buffer(static_cast<UINT64>(m_elementByteSize) * elementCount);
+        ThrowIfFailed(device->CreateCommittedResource(&heapProperties, D3D12_HEAP_FLAG_NONE, &bufferDesc,
+                                                      D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
+                                                      IID_PPV_ARGS(&m_uploadBuffer)));
+    }
+
+    UploadBuffer(const UploadBuffer &rhs) = delete;
+    UploadBuffer &operator=(const UploadBuffer &rhs) = delete;
+    // CopyData pairs every Map with an Unmap, so nothing is left mapped here and no Unmap is issued;
+    // the ComPtr member releases the resource.
+    ~UploadBuffer() = default;
+
+    /*
+     * @return: the underlying buffer resource (not AddRef'ed).
+     */
+    ID3D12Resource *Resource() const { return m_uploadBuffer.Get(); }
+
+    /*
+     * @brief: Copy one element into the buffer.
+     * @param: elementIndex the index of the destination element, in [0, elementCount).
+     * @param: data the value to copy; sizeof(T) bytes are written.
+     */
+    void CopyData(int elementIndex, const T &data) {
+        // Reject indices outside the buffer instead of writing past the mapped allocation.
+        if (elementIndex < 0 || static_cast<UINT>(elementIndex) >= m_elementCount)
+            ThrowIfFailed(E_INVALIDARG);
+
+        BYTE *mappedData = nullptr;
+        ThrowIfFailed(m_uploadBuffer->Map(0, nullptr, reinterpret_cast<void **>(&mappedData)));
+        // Compute the byte offset in size_t so it cannot wrap in 32-bit arithmetic.
+        memcpy(mappedData + static_cast<size_t>(elementIndex) * m_elementByteSize, &data, sizeof(T));
+        m_uploadBuffer->Unmap(0, nullptr);
+    }
+
+  private:
+    Microsoft::WRL::ComPtr<ID3D12Resource> m_uploadBuffer;
+
+    UINT m_elementByteSize = 0; // size of one element slot in bytes
+    UINT m_elementCount = 0;    // number of element slots in the buffer
+    bool m_isConstantBuffer = false;
+};
+
+/*
+ * @brief: Create a default buffer.
+ * @param: device the device of GPU object.
+ * @param: cmdList the command list of GPU object.
+ * @param: initData the data to be copied to the default buffer.
+ * @param: byteSize the size of data.
+ * @return: the default buffer.
+ */
+Microsoft::WRL::ComPtr<ID3D12Resource> CreateDefaultBuffer(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList,
+                                                           const void *initData, UINT64 byteSize,
+                                                           Microsoft::WRL::ComPtr<ID3D12Resource> &uploadBuffer);
+
+/*
+ * @brief: Calculate the size of constant buffer.
+ */
+UINT CalcConstantBufferByteSize(UINT byteSize);
+
+/*
+ * @brief: Create a random texture.
+ * @param: width the width of texture.
+ * @param: height the height of texture.
+ * @param: texturePixelSize the size of texture pixel. + * @return: the random texture data. + */ +std::vector CreateRandomTexture(const UINT width, const UINT height, const UINT texturePixelSize = 4); + +/* + * @brief: Upload the texture to GPU. + * @param: device the device of GPU object. + * @param: pCmdList the command list of GPU object. + * @param: textureData the texture data to be uploaded. + * @param: texture the texture resource. + * @param: width the width of texture. + * @param: height the height of texture. + * @param: texturePixelSize the size of texture pixel. + */ +void UploadTexture(ID3D12Device *device, ID3D12GraphicsCommandList *pCmdList, const std::vector &textureData, + Microsoft::WRL::ComPtr &texture, const UINT width, const UINT height, + const UINT texturePixelSize = 4); + +/* + * @brief: Create a texture resource. + * @param: device the device of GPU object. + * @param: width the width of texture. + * @param: height the height of texture. + * @param: format the format of texture. + * @param: textureResource the texture resource. + * @param: arraySize the size of texture array. + */ +void CreateTextureResource(ID3D12Device *device, UINT width, UINT height, DXGI_FORMAT format, + Microsoft::WRL::ComPtr &textureResource, UINT16 arraySize); + +/* + * @brief: Create a random texture resource and upload it to GPU. + * @param: device the device of GPU object. + * @param: cmdList the command list of GPU object. + * @param: textureResource the texture resource. + * @param: width the width of texture. + * @param: height the height of texture. + * @param: format the format of texture. + */ +void Texture2D(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, + Microsoft::WRL::ComPtr &textureResource, int width, int height, DXGI_FORMAT format); + +/* + * @brief: Create a random texture cube resource and upload it to GPU. + * @param: device the device of GPU object. + * @param: cmdList the command list of GPU object. 
+ * @param: textureResource the texture resource. + * @param: width the width of texture. + * @param: height the height of texture. + * @param: format the format of texture. + */ +void TextureCube(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, + Microsoft::WRL::ComPtr &textureResource, int width, int height, DXGI_FORMAT format); diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/DirectXRenderPerformance.vcxproj b/superbench/benchmarks/micro_benchmarks/directx_render_performance/DirectXRenderPerformance.vcxproj new file mode 100644 index 000000000..46a40606f --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/DirectXRenderPerformance.vcxproj @@ -0,0 +1,159 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 16.0 + Win32Proj + {627418c9-578a-47a9-8579-45c0e08fe528} + DirectXRenderPerformance + 10.0 + + + + Application + true + v143 + Unicode + + + Application + false + v143 + true + Unicode + + + + + + + + + + + + + + + + Level3 + true + false + + + Windows + true + + + 5.1 + + + + + Level3 + true + true + true + false + + + Windows + true + true + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Compute + 4.0 + Compute + 4.0 + Document + false + copy %(Identity) "$(OutDir)\Shaders" > NUL + $(OutDir)\%(Identity) + false + copy %(Identity) "$(OutDir)\Shaders" > NUL + $(OutDir)\%(Identity) + + + Compute + 4.0 + Compute + 4.0 + Document + false + copy %(Identity) "$(OutDir)\Shaders" > NUL + $(OutDir)\%(Identity) + false + copy %(Identity) "$(OutDir)\Shaders" > NUL + $(OutDir)\%(Identity) + + + Compute + 4.0 + Compute + 4.0 + Document + false + copy %(Identity) "$(OutDir\Shaders)" > NUL + $(OutDir)\%(Identity) + false + copy %(Identity) "$(OutDir)\Shaders" > NUL + $(OutDir)\%(Identity) + + + Compute + 4.0 + Compute + 4.0 + Document + false + copy %(Identity) "$(OutDir)\Shaders" > NUL + $(OutDir)\%(Identity) + false + copy %(Identity) "$(OutDir)\Shaders" > NUL + 
$(OutDir)\%(Identity)
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/GeometryHelper.cpp b/superbench/benchmarks/micro_benchmarks/directx_render_performance/GeometryHelper.cpp
new file mode 100644
index 000000000..fe5a5bdee
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/GeometryHelper.cpp
@@ -0,0 +1,48 @@
+
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include "GeometryHelper.h"
+
+namespace MathHelper {
+namespace {
+// Returns the calling thread's random engine, seeded once from std::random_device on first use,
+// so successive calls continue one sequence instead of constructing a new device and engine each time.
+std::default_random_engine &RandomEngine() {
+    thread_local std::default_random_engine engine = [] {
+        // Also seed the C generator from the clock, once, in case other code relies on rand().
+        srand((unsigned int)time(NULL));
+        return std::default_random_engine(std::random_device{}());
+    }();
+    return engine;
+}
+} // namespace
+
+// Returns the 4x4 identity matrix.
+DirectX::XMFLOAT4X4 Identity4x4() {
+    DirectX::XMFLOAT4X4 identity;
+    DirectX::XMStoreFloat4x4(&identity, DirectX::XMMatrixIdentity());
+    return identity;
+}
+
+// Returns a random float in [0, n), or 0 when n is not positive.
+float genRand2N_f(int n) {
+    if (n <= 0) {
+        return 0.0f; // std::uniform_real_distribution requires a non-empty range
+    }
+    std::uniform_real_distribution<float> distribution(0.0f, static_cast<float>(n));
+    return distribution(RandomEngine());
+}
+
+// Returns a random uint16_t in [0, n), or 0 when n is less than 2.
+uint16_t genRand2N_large(int n) {
+    if (n <= 1) {
+        return 0;
+    }
+    // std::uniform_int_distribution draws from a closed range, so the largest value is n - 1;
+    // it is also capped at 65535, the largest value the return type can hold.
+    const int upper = (n > 65536 ? 65536 : n) - 1;
+    std::uniform_int_distribution<int> distribution(0, upper);
+    return static_cast<uint16_t>(distribution(RandomEngine()));
+}
+} // namespace MathHelper
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/GeometryHelper.h b/superbench/benchmarks/micro_benchmarks/directx_render_performance/GeometryHelper.h
new file mode 100644
index 000000000..1445ca5fd
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/GeometryHelper.h
@@ -0,0 +1,159 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+ +#pragma once + +#include +#include +#include +#include + +#include + +#include "../directx_third_party/DXSampleHelper.h" +#include "../directx_third_party/pch.h" +#include "BufferHelper.h" + +using Microsoft::WRL::ComPtr; + +namespace MathHelper { +const float Infinity = FLT_MAX; +const float Pi = 3.1415926535f; +// Create identity4*4 matrix +DirectX::XMFLOAT4X4 Identity4x4(); +// Returns random float in [0, n). +float genRand2N_f(int n); +// Returns random uint16_t in [0, n). +uint16_t genRand2N_large(int n); +} // namespace MathHelper + +// Simple struct to represent a vertex. +class Vertex { + public: + Vertex() { + x = MathHelper::genRand2N_f(2) - 1; + y = MathHelper::genRand2N_f(2) - 1; + z = MathHelper::genRand2N_f(2) - 1; + } + Vertex(float x, float y, float z) : x(x), y(y), z(z) {} + Vertex(const Vertex &v) : x(v.x), y(v.y), z(v.z) {} + float x, y, z; // Position + // You can add other attributes such as color, normal, texture coordinates etc. +}; + +// Simple struct to represent a Geometry object. +struct Geometry { + std::unique_ptr vertexData = nullptr; + std::vector indexData; + UINT vertexNum; + UINT indexNum; + UINT vertexByteSize; + UINT indexByteSize; + UINT vertexByteStride; +}; + +// Create a random geometry data buffer. +template std::unique_ptr CreateRandomGeometry(const UINT vertexNum, const UINT indexNum) { + static_assert(std::is_base_of::value, "T must be a Vertex or derived from Vertex"); + std::unique_ptr geo = make_unique(); + // Create the vertices. + // Allocate memory and reinterpret_cast it to Vertex array + geo->vertexData.reset(reinterpret_cast(new T[vertexNum])); + + // Fill in the random vertex data. + for (UINT i = 0; i < vertexNum; i++) { + // Here you need to reinterpret_cast it back to T for accessing/modifying + T &v = reinterpret_cast(geo->vertexData[i]); + v = T(); + } + + // Create the indices. + // Fill in the random index data. 
+ for (UINT i = 0; i < indexNum; i++) { + geo->indexData.push_back(MathHelper::genRand2N_large(vertexNum)); + } + geo->vertexNum = vertexNum; + geo->indexNum = indexNum; + geo->vertexByteStride = sizeof(T); + geo->vertexByteSize = sizeof(T) * vertexNum; + geo->indexByteSize = sizeof(std::uint16_t) * indexNum; + return geo; +} + +// Helpter class to manage geometry data buffer on GPU. +struct GeometryResource { + ComPtr VertexBufferCPU = nullptr; + ComPtr IndexBufferCPU = nullptr; + + ComPtr VertexBufferGPU = nullptr; + ComPtr IndexBufferGPU = nullptr; + + ComPtr VertexBufferUploader = nullptr; + ComPtr IndexBufferUploader = nullptr; + + // Data about the buffers. + UINT VertexByteStride = 0; + UINT VertexBufferByteSize = 0; + DXGI_FORMAT IndexFormat = DXGI_FORMAT_R16_UINT; + UINT IndexBufferByteSize = 0; + + D3D12_PRIMITIVE_TOPOLOGY PrimitiveType = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST; + + UINT IndexCount = 0; + UINT StartIndexLocation = 0; + INT BaseVertexLocation = 0; + + /* + * @brief Get the vertex buffer view. + */ + D3D12_VERTEX_BUFFER_VIEW VertexBufferView() const { + D3D12_VERTEX_BUFFER_VIEW vbv; + vbv.BufferLocation = VertexBufferGPU->GetGPUVirtualAddress(); + vbv.StrideInBytes = VertexByteStride; + vbv.SizeInBytes = VertexBufferByteSize; + return vbv; + } + + /* + * @brief Get the index buffer view. + */ + D3D12_INDEX_BUFFER_VIEW IndexBufferView() const { + D3D12_INDEX_BUFFER_VIEW ibv; + ibv.BufferLocation = IndexBufferGPU->GetGPUVirtualAddress(); + ibv.Format = IndexFormat; + ibv.SizeInBytes = IndexBufferByteSize; + return ibv; + } + + /* + * @brief Upload geometry data and set necessary information about the geometry. 
+ */ + void Create(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, std::unique_ptr &geoData) { + if (device == nullptr) { + throw std::runtime_error("device is nullptr"); + } + if (cmdList == nullptr) { + throw std::runtime_error("cmdList is nullptr"); + } + if (geoData == nullptr) { + throw std::runtime_error("geoData is nullptr"); + } + auto geometry = geoData.get(); + ThrowIfFailed(D3DCreateBlob(geometry->vertexByteSize, &this->VertexBufferCPU)); + CopyMemory(this->VertexBufferCPU->GetBufferPointer(), geometry->vertexData.get(), geometry->vertexByteSize); + + ThrowIfFailed(D3DCreateBlob(geometry->indexByteSize, &this->IndexBufferCPU)); + CopyMemory(this->IndexBufferCPU->GetBufferPointer(), geometry->indexData.data(), geometry->indexByteSize); + + this->VertexBufferGPU = CreateDefaultBuffer(device, cmdList, geometry->vertexData.get(), + geometry->vertexByteSize, this->VertexBufferUploader); + + this->IndexBufferGPU = CreateDefaultBuffer(device, cmdList, geometry->indexData.data(), geometry->indexByteSize, + this->IndexBufferUploader); + + this->VertexByteStride = geometry->vertexByteStride; + this->VertexBufferByteSize = geometry->vertexByteSize; + this->IndexBufferByteSize = geometry->indexByteSize; + this->IndexCount = geometry->indexNum; + } +}; diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/Main.cpp b/superbench/benchmarks/micro_benchmarks/directx_render_performance/Main.cpp new file mode 100644 index 000000000..3add03fc0 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/Main.cpp @@ -0,0 +1,148 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#include "RenderGeometryPass.h" +#include "RenderLightingPass.h" +#include "RenderShadowMapPass.h" +#include +#include +#include +#include +#include +#include +#include + +/* + * @brief: Main message handler for the sample. 
+ */
+LRESULT WindowProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam) {
+    // Handle window event.
+    switch (message) {
+    case WM_CLOSE:
+        DestroyWindow(hWnd);
+        break;
+    case WM_DESTROY:
+        // Post WM_QUIT so that the application's message loop can end.
+        PostQuitMessage(0);
+        break;
+    default:
+        return DefWindowProc(hWnd, message, wParam, lParam);
+    }
+    return 0;
+}
+
+/*
+ * @brief: Main window procedure.
+ */
+static LRESULT CALLBACK MainWndProc(HWND hwnd, UINT msg, WPARAM wParam, LPARAM lParam) {
+    // Forward hwnd on because we can get messages (e.g., WM_CREATE)
+    // before CreateWindow returns, and thus before mhMainWnd is valid.
+    return WindowProc(hwnd, msg, wParam, lParam);
+}
+
+/*
+ * @brief: Register the window class and create the main window.
+ * @param: hInstance the application instance that owns the window class.
+ * @param: width the requested client-area width.
+ * @param: height the requested client-area height.
+ * @param: hMainWnd receives the handle of the created window.
+ * @param: winTitle the window title.
+ * @param: quiet_mode when true the window is created but not shown.
+ * @return: true on success, false if class registration or window creation fails.
+ */
+bool InitMainWindow(HINSTANCE hInstance, int width, int height, HWND &hMainWnd, const std::wstring &winTitle,
+                    bool quiet_mode) {
+    // Value-initialize so that no field of the class description is left indeterminate.
+    WNDCLASS wc = {};
+    wc.style = CS_HREDRAW | CS_VREDRAW;
+    wc.lpfnWndProc = MainWndProc;
+    wc.cbClsExtra = 0;
+    wc.cbWndExtra = 0;
+    wc.hInstance = hInstance;
+    wc.hIcon = LoadIcon(0, IDI_APPLICATION);
+    wc.hCursor = LoadCursor(0, IDC_ARROW);
+    wc.hbrBackground = (HBRUSH)GetStockObject(NULL_BRUSH);
+    wc.lpszMenuName = 0;
+    wc.lpszClassName = L"MainWnd";
+
+    if (!RegisterClass(&wc)) {
+        return false;
+    }
+
+    // Compute window rectangle dimensions based on requested client area dimensions.
+    RECT R = {0, 0, width, height};
+    AdjustWindowRect(&R, WS_OVERLAPPEDWINDOW, false);
+    width = R.right - R.left;
+    height = R.bottom - R.top;
+
+    hMainWnd = CreateWindow(wc.lpszClassName, winTitle.c_str(), WS_OVERLAPPEDWINDOW, CW_USEDEFAULT, CW_USEDEFAULT,
+                            width, height, 0, 0, hInstance, 0);
+    if (!hMainWnd) {
+        return false;
+    }
+
+    // In quiet mode the window exists but is never shown.
+    if (!quiet_mode) {
+        ShowWindow(hMainWnd, SW_SHOW);
+        UpdateWindow(hMainWnd);
+    }
+    return true;
+}
+
+/*
+ * @brief: Load the render microbenchmark according to the pass type.
+ */ +std::unique_ptr get_render_pointer(BenchmarkOptions &args, HINSTANCE hInstance, HWND hMainWnd, + std::wstring &winTitle) { + if (args.m_pass_type == PassType::GeometryPass) { + return std::make_unique(&args, hInstance, hMainWnd, winTitle); + } else if (args.m_pass_type == PassType::ShadowMapPass) { + return std::make_unique(&args, hInstance, hMainWnd, winTitle); + } else if (args.m_pass_type == PassType::LightingPass) { + return std::make_unique(&args, hInstance, hMainWnd, winTitle); + } else + throw "invalid pass name"; +} + +/* + * @brief: Main entry point for a Windows application. + */ +int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine, int nCmdShow) { + // Enable console attach and redirect stdout/stderr to console. + if (AttachConsole(ATTACH_PARENT_PROCESS) || AllocConsole()) { + FILE *stream; + if (freopen_s(&stream, "CONOUT$", "w", stdout) == 0) { + printf("Hello, Console!\n"); + } + if (freopen_s(&stream, "CONOUT$", "w", stderr) == 0) { + fprintf(stderr, "Hello, Error Console!\n"); + } + // Or use std::cout + std::cout << "Hello from std::cout" << std::endl; + } + + MSG msg = {0}; + try { + // Parse command line arguments. + BenchmarkOptions args(__argc, __argv); + args.init(); + // Create the main window. + HWND hMainWnd; + std::wstring_convert, wchar_t> converter; + std::wstring winTitle = converter.from_bytes(""); + if (!InitMainWindow(hInstance, args.m_width, args.m_height, hMainWnd, winTitle, args.m_quiet)) + return -1; + + // Create the render microbenchmark. + auto app_sample = get_render_pointer(args, hInstance, hMainWnd, winTitle); + app_sample->Initialize(); + app_sample->LoadAssets(); + + while (msg.message != WM_QUIT) { + // If there are Window messages then process them. + // We need to handle message here otherwise it is no response. + if (PeekMessage(&msg, 0, 0, 0, PM_REMOVE)) { + TranslateMessage(&msg); + DispatchMessage(&msg); + } else { + // Update and render per frame. 
+ app_sample->Tick(); + } + } + } catch (const std::exception &e) { + std::cerr << e.what() << '\n'; + } + + return (int)msg.wParam; +} diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderApp.cpp b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderApp.cpp new file mode 100644 index 000000000..fa4a8f0eb --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderApp.cpp @@ -0,0 +1,388 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#include "RenderApp.h" +#include "../directx_third_party/d3dx12.h" + +RenderApp::RenderApp(BenchmarkOptions *args) { + if (args == nullptr) { + throw std::runtime_error("BenchmarkOptions is nullptr"); + } + m_opts = args; + m_width = args->m_width; + m_height = args->m_height; + m_deviceResources = std::make_unique(DXGI_FORMAT_B8G8R8A8_UNORM_SRGB, DXGI_FORMAT_D32_FLOAT, + m_swapChainBufferCount, D3D_FEATURE_LEVEL_11_0, + DX::DeviceResources::c_AllowTearing); +} + +RenderApp::RenderApp(BenchmarkOptions *args, HINSTANCE hInstance, HWND hMainWnd, std::wstring &winTitle) + : RenderApp(args) { + m_hinstance = hInstance; + m_hMainWnd = hMainWnd; + m_winTitle = winTitle; +} + +RenderApp::~RenderApp() { + if (m_outfile.is_open()) { + m_outfile.close(); + } + if (m_deviceResources) { + m_deviceResources->WaitForGpu(); + } +} + +void RenderApp::Initialize() { + if (m_deviceResources == nullptr) { + throw std::runtime_error("DeviceResources is nullptr"); + } + m_deviceResources->SetWindow(m_hMainWnd, m_width, m_height); + m_deviceResources->CreateDeviceResources(); + CreateDeviceDependentResources(); + + m_deviceResources->CreateWindowSizeDependentResources(); + CreateWindowSizeDependentResources(); + + // Wait until initialization is complete. + // Execute the initialization commands. 
+ m_deviceResources->WaitForGpu(); + + auto device = m_deviceResources->GetD3DDevice(); + auto commandQueue = m_deviceResources->GetCommandQueue(); + m_gpuTimer.init(device, commandQueue, m_maxTimerNum, D3D12::QueueType::compute); + m_outfile.open(m_opts->m_outfile, std::ios_base::out); +} + +void RenderApp::CreateDeviceDependentResources() { + auto device = m_deviceResources->GetD3DDevice(); + if (device == nullptr) { + throw std::runtime_error("D3D12Device is nullptr"); + } + // Create a fence for synchronizing between different frames + ThrowIfFailed(device->CreateFence(m_deviceResources->GetCurrentFrameIndex(), D3D12_FENCE_FLAG_NONE, + IID_PPV_ARGS(m_fence.ReleaseAndGetAddressOf()))); + + // Start off the fence with the current frame index + uint64_t currentIdx = m_deviceResources->GetCurrentFrameIndex(); + m_deviceResources->GetCommandQueue()->Signal(m_fence.Get(), currentIdx); + + CreateRootSignatures(device); + BuildPipelineStates(device); +} + +void RenderApp::CreateWindowSizeDependentResources() { + auto device = m_deviceResources->GetD3DDevice(); + auto rtvHeap = m_deviceResources->m_rtvDescriptorHeap.Get(); + auto pCmdList = m_deviceResources->GetCommandList(); + auto cmdListAlloc = m_deviceResources->GetCommandAllocator(); + auto cmdQueue = m_deviceResources->GetCommandQueue(); + if (device == nullptr) { + throw std::runtime_error("D3D12Device is nullptr"); + } + if (rtvHeap == nullptr) { + throw std::runtime_error("RTVDescriptorHeap is nullptr"); + } + if (pCmdList == nullptr) { + throw std::runtime_error("CommandList is nullptr"); + } + if (cmdListAlloc == nullptr) { + throw std::runtime_error("CommandAllocator is nullptr"); + } + if (cmdQueue == nullptr) { + throw std::runtime_error("CommandQueue is nullptr"); + } + + ThrowIfFailed(cmdListAlloc->Reset()); + ThrowIfFailed(pCmdList->Reset(cmdListAlloc, nullptr)); + + // Prepare and init GPU resources. 
+ if (m_numPassRenderTargets > 0) + m_renderTargets.resize(m_numPassRenderTargets); + if (m_numShaderResource > 0) + m_shaderResources.resize(m_numShaderResource); + CreateRenderTargetView(device, m_width, m_height, rtvHeap); + CreateShaderResourceView(device, pCmdList, m_width, m_height); + + // Send the command list off to the GPU for processing. + ThrowIfFailed(pCmdList->Close()); + ID3D12CommandList *commandLists[] = {pCmdList}; + cmdQueue->ExecuteCommandLists(1, commandLists); +} + +void RenderApp::CreateRootSignatures(ID3D12Device *device) { + std::vector rootParameters; + int numRootParameters = DefineRootParameters(rootParameters); + CD3DX12_ROOT_SIGNATURE_DESC rootSignatureDesc = {}; + rootSignatureDesc.NumParameters = numRootParameters; + rootSignatureDesc.pParameters = rootParameters.data(); + std::vector samplers; + auto numSamplers = DefineStaticSamplers(samplers); + rootSignatureDesc.NumStaticSamplers = (UINT)numSamplers; + rootSignatureDesc.pStaticSamplers = samplers.data(); + rootSignatureDesc.Flags = D3D12_ROOT_SIGNATURE_FLAG_ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT; + + ID3DBlob *serializedRootSignature = nullptr; + ID3DBlob *errorBlob = nullptr; + auto hr = (D3D12SerializeRootSignature(&rootSignatureDesc, D3D_ROOT_SIGNATURE_VERSION_1_0, &serializedRootSignature, + &errorBlob)); + if (hr != S_OK || errorBlob != nullptr) { + std::cout << ((char *)errorBlob->GetBufferPointer()) << std::endl; + } + + ThrowIfFailed(device->CreateRootSignature(0, serializedRootSignature->GetBufferPointer(), + serializedRootSignature->GetBufferSize(), + IID_PPV_ARGS(&m_rootSignature))); +} + +void RenderApp::CreateRenderTargetResource(ID3D12Device *device, UINT width, UINT height, DXGI_FORMAT format, + D3D12_RESOURCE_FLAGS flags, + Microsoft::WRL::ComPtr &renderTarget) { + // Create the render target resources: + D3D12_CLEAR_VALUE m_clearValue = {}; // Specify a clear value for the render target (optional) + m_clearValue.Format = format; + m_clearValue.Color[0] = 0.0f; // Red 
component + m_clearValue.Color[1] = 0.0f; // Green component + m_clearValue.Color[2] = 0.0f; // Blue component + m_clearValue.Color[3] = 1.0f; // Alpha component + + D3D12_HEAP_PROPERTIES heapProperties = {}; // Specify heap properties for the render target (optional) + heapProperties.Type = D3D12_HEAP_TYPE_DEFAULT; + + D3D12_RESOURCE_DESC resourceDesc = {}; // Specify resource properties for the render target + resourceDesc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D; + resourceDesc.Alignment = 0; + resourceDesc.Width = width; + resourceDesc.Height = height; + resourceDesc.DepthOrArraySize = 1; + resourceDesc.MipLevels = 1; + resourceDesc.Format = format; + resourceDesc.SampleDesc.Count = 1; + resourceDesc.SampleDesc.Quality = 0; + resourceDesc.Layout = D3D12_TEXTURE_LAYOUT_UNKNOWN; + resourceDesc.Flags = flags; + + // Create the render target resource + ThrowIfFailed(device->CreateCommittedResource(&heapProperties, D3D12_HEAP_FLAG_NONE, &resourceDesc, + D3D12_RESOURCE_STATE_COMMON, &m_clearValue, + IID_PPV_ARGS(&renderTarget))); +} + +CD3DX12_CPU_DESCRIPTOR_HANDLE RenderApp::GetRenderTargetView(ID3D12Device *device) { + const CD3DX12_CPU_DESCRIPTOR_HANDLE rtvDescriptor(m_rtvDescriptorHeap->GetCPUDescriptorHandleForHeapStart()); + + return rtvDescriptor; +} + +void RenderApp::CreateRenderTargetView(ID3D12Device *device, UINT width, UINT height, ID3D12DescriptorHeap *rtvHeap) { + D3D12_DESCRIPTOR_HEAP_DESC rtvDescriptorHeapDesc = {}; + rtvDescriptorHeapDesc.NumDescriptors = m_numPassRenderTargets; + rtvDescriptorHeapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_RTV; + ThrowIfFailed(device->CreateDescriptorHeap(&rtvDescriptorHeapDesc, + IID_PPV_ARGS(m_rtvDescriptorHeap.ReleaseAndGetAddressOf()))); + m_rtvDescriptorSize = device->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_RTV); + + // Define the render target properties + DXGI_FORMAT format = m_renderTargetFormat; // Pixel format of the render target + D3D12_RESOURCE_FLAGS flags = 
D3D12_RESOURCE_FLAG_ALLOW_RENDER_TARGET; // Specify the resource flags + + // Create the render target resources. + for (int i = 0; i < m_numPassRenderTargets; ++i) { + CreateRenderTargetResource(device, width, height, format, flags, m_renderTargets[i]); + } + + auto rtvHandle = GetRenderTargetView(device); + + // Create a RTV for each custom render target. + for (UINT i = 0; i < m_numPassRenderTargets; ++i) { + // Create the RTV descriptor + device->CreateRenderTargetView(m_renderTargets[i].Get(), nullptr, rtvHandle); + // Increment the handle to the next descriptor + rtvHandle.Offset(1, m_rtvDescriptorSize); + } +} + +D3D12_GRAPHICS_PIPELINE_STATE_DESC RenderApp::DefinePSODesc(const std::vector &inputLayout, + ComPtr vertexShader, + ComPtr pixelShader) { + // Describe and create the graphics pipeline state object (PSO). + D3D12_GRAPHICS_PIPELINE_STATE_DESC psoDesc = {}; + ZeroMemory(&psoDesc, sizeof(D3D12_GRAPHICS_PIPELINE_STATE_DESC)); + psoDesc.InputLayout = {inputLayout.data(), (UINT)inputLayout.size()}; + psoDesc.pRootSignature = m_rootSignature.Get(); + psoDesc.VS = {reinterpret_cast(vertexShader->GetBufferPointer()), vertexShader->GetBufferSize()}; + psoDesc.PS = {reinterpret_cast(pixelShader->GetBufferPointer()), pixelShader->GetBufferSize()}; + + psoDesc.RasterizerState = CD3DX12_RASTERIZER_DESC(D3D12_DEFAULT); + psoDesc.BlendState = CD3DX12_BLEND_DESC(D3D12_DEFAULT); + psoDesc.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC(D3D12_DEFAULT); + psoDesc.SampleMask = UINT_MAX; + psoDesc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE; + psoDesc.DSVFormat = DXGI_FORMAT_UNKNOWN; // No depth-stencil + psoDesc.NumRenderTargets = m_numPassRenderTargets; + for (int i = 0; i < m_numPassRenderTargets; i++) { + psoDesc.RTVFormats[i] = m_renderTargetFormat; + } + psoDesc.SampleDesc.Count = 1; + psoDesc.SampleDesc.Quality = 0; + + return psoDesc; +} + +void RenderApp::LoadAssets() { + auto device = m_deviceResources->GetD3DDevice(); + auto pCmdList = 
m_deviceResources->GetCommandList(); + auto cmdListAlloc = m_deviceResources->GetCommandAllocator(); + auto cmdQueue = m_deviceResources->GetCommandQueue(); + + ThrowIfFailed(cmdListAlloc->Reset()); + ThrowIfFailed(pCmdList->Reset(cmdListAlloc, nullptr)); + + CreateConstantBufferResources(device); + UpdateConstantBufferData(); + BuildShapeGeometry(device, pCmdList); + + ThrowIfFailed(pCmdList->Close()); + ID3D12CommandList *commandLists[] = {pCmdList}; + cmdQueue->ExecuteCommandLists(1, commandLists); + + this->m_deviceResources->WaitForGpu(); +} + +void RenderApp::Tick() { + auto device = m_deviceResources->GetD3DDevice(); + auto pCmdList = m_deviceResources->GetCommandList(); + auto cmdListAlloc = m_deviceResources->GetCommandAllocator(); + auto cmdQueue = m_deviceResources->GetCommandQueue(); + Update(); + Render(); + this->m_deviceResources->WaitForGpu(); + CalculateFrameStats(); +} + +void RenderApp::Update() { + // Check to see if the GPU is keeping up + auto const frameIdx = m_frameIndex; + auto const numBackBuffers = m_deviceResources->GetBackBufferCount(); + uint64_t completedValue = m_fence->GetCompletedValue(); + if ((frameIdx > + completedValue) // if frame index is reset to zero it may temporarily be smaller than the last GPU signal + && (frameIdx - completedValue > numBackBuffers)) { + // GPU not caught up, wait for at least one available frame + ThrowIfFailed(m_fence->SetEventOnCompletion(frameIdx - numBackBuffers, m_fenceEvent.Get())); + WaitForSingleObjectEx(m_fenceEvent.Get(), INFINITE, FALSE); + } +} + +void RenderApp::CalculateFrameStats() { + auto timeInMs = m_gpuTimer.getElapsedMsByTimestampPair(m_gpuTimerIdx); + m_frameTimeList.push_back(timeInMs); + m_gpuTimerIdx++; + if (m_gpuTimerIdx == m_maxTimerNum) { + m_gpuTimerIdx = 0; + } + + m_frameIndex++; + if (m_frameIndex < m_opts->m_warmup) { + m_frameTimeList.clear(); + } else { + cout << m_frameTimeList.back() << endl; + m_outfile << m_frameTimeList.back() << endl; + } + + if (m_frameIndex == 
m_opts->m_warmup + m_opts->m_num_frames) { + // Calculate the median + double median = 0; + std::sort(m_frameTimeList.begin(), m_frameTimeList.end()); + int size = m_frameTimeList.size(); + if (m_frameTimeList.size() % 2 == 0) { + median = (m_frameTimeList[size / 2 - 1] + m_frameTimeList[size / 2]) / 2; + } else { + median = m_frameTimeList[size / 2]; + } + m_outfile << "Mean: " << median << std::endl; + std::cout << "Mean: " << median << std::endl; + PostMessage(m_hMainWnd, WM_CLOSE, 0, 0); + } +} + +void RenderApp::ClearRenderTargetView() { + auto commandList = m_deviceResources->GetCommandList(); + auto device = m_deviceResources->GetD3DDevice(); + + // Clear the views. + auto rtvDescriptor = GetRenderTargetView(device); + auto const dsvDescriptor = m_deviceResources->GetDepthStencilView(); + float clearColor[4] = {0.0f, 0.0f, 0.0f, 1.0f}; + + std::vector rtvHandles(m_numPassRenderTargets); + for (int i = 0; i < m_numPassRenderTargets; i++) { + commandList->ClearRenderTargetView(rtvDescriptor, clearColor, 0, nullptr); + rtvHandles[i] = rtvDescriptor; + rtvDescriptor.Offset(1, m_rtvDescriptorSize); + } + + rtvDescriptor = GetRenderTargetView(device); + commandList->ClearDepthStencilView(dsvDescriptor, D3D12_CLEAR_FLAG_DEPTH, 1.0f, 0, 0, nullptr); + // Indicate that the back buffer will be used as a render target. + commandList->OMSetRenderTargets(m_numPassRenderTargets, rtvHandles.data(), FALSE, nullptr); + + // Set the viewport and scissor rect. 
+ auto const viewport = m_deviceResources->GetScreenViewport(); + auto const scissorRect = m_deviceResources->GetScissorRect(); + commandList->RSSetViewports(1, &viewport); + commandList->RSSetScissorRects(1, &scissorRect); +} + +void RenderApp::PrepareRenderTarget(ID3D12GraphicsCommandList *pCommandList) { + for (int i = 0; i < m_numPassRenderTargets; i++) { + // Transition from COMMON to RENDER_TARGET + D3D12_RESOURCE_BARRIER barrier = {}; + barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION; + barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE; + barrier.Transition.pResource = m_renderTargets[i].Get(); + barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES; + barrier.Transition.StateBefore = D3D12_RESOURCE_STATE_COMMON; + barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_RENDER_TARGET; + pCommandList->ResourceBarrier(1, &barrier); + } +} + +void RenderApp::RestoreRenderTarget(ID3D12GraphicsCommandList *pCommandList) { + for (int i = 0; i < m_numPassRenderTargets; i++) { + // Indicate that the back buffer will now be used to present. 
+ pCommandList->ResourceBarrier(1, &CD3DX12_RESOURCE_BARRIER::Transition(m_renderTargets[i].Get(), + D3D12_RESOURCE_STATE_RENDER_TARGET, + D3D12_RESOURCE_STATE_COMMON)); + } +} + +void RenderApp::Render() { + auto device = m_deviceResources->GetD3DDevice(); + auto cmdList = m_deviceResources->GetCommandList(); + auto cmdQueue = m_deviceResources->GetCommandQueue(); + m_deviceResources->Prepare(); + PrepareRenderTarget(cmdList); + ClearRenderTargetView(); + SetStatesBeforeDraw(cmdList); + eventStart(cmdList); + Draw(cmdList); + eventEnd(cmdList); + RestoreRenderTarget(cmdList); + m_deviceResources->Present(); + // GPU will signal an increasing value each frame + m_deviceResources->GetCommandQueue()->Signal(m_fence.Get(), m_frameIndex); +} + +void RenderApp::DrawRenderItems(ID3D12GraphicsCommandList *pCmdList, int drawNum) { + auto ri = m_geometry.get(); + for (int i = 0; i < drawNum; ++i) { + pCmdList->DrawIndexedInstanced(ri->IndexCount, 1, ri->StartIndexLocation, ri->BaseVertexLocation, 0); + } +} + +void RenderApp::BuildShapeGeometry(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList) { + // Create random geometry. + std::unique_ptr geoData = CreateRandomGeometry(m_opts->m_vertexNum, m_opts->m_indexNum); + m_geometry = std::make_unique(); + m_geometry->Create(device, cmdList, geoData); +} \ No newline at end of file diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderApp.h b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderApp.h new file mode 100644 index 000000000..3912010e6 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderApp.h @@ -0,0 +1,221 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "../directx_third_party/DeviceResources.h" +#include "../directx_utils/D3D12Timer.h" + +#include "BenchmarkOptions.h" +#include "GeometryHelper.h" + +using Microsoft::WRL::ComPtr; +using namespace DirectX; +using namespace std; + +class RenderApp { + public: + RenderApp(BenchmarkOptions *args); + RenderApp(BenchmarkOptions *args, HINSTANCE hInstance, HWND hMainWnd, std::wstring &winTitle); + RenderApp(const RenderApp &rhs) = delete; + RenderApp &operator=(const RenderApp &rhs) = delete; + ~RenderApp(); + + /* + * @brief: Execute the update and render per frame. + */ + void Tick(); + /* + * @brief: Initialize the application. + */ + virtual void Initialize(); + /* + * @brief: Prepare the data assets needed for render. + */ + virtual void LoadAssets(); + /* + * @brief: Calculate the frame stats. + */ + void CalculateFrameStats(); + /* + * @brief: Update to run next frame. + */ + void Update(); + /* + * @brief: Executes basic render loop . + */ + void Render(); + + protected: + /* + * @brief: Define the root parameters. + * @param: rootParameters The root parameters to be defined. + * @return: The number of root parameters. + */ + virtual int DefineRootParameters(std::vector &rootParameters) = 0; + /* + * @brief: Define the static samplers. + * @param: samplers The static samplers to be defined. + * @return: The number of static samplers. + */ + virtual int DefineStaticSamplers(std::vector &samplers) = 0; + /* + * @brief: Build the pipeline states. + * @param: device The device to build the pipeline states. + */ + virtual void BuildPipelineStates(ID3D12Device *device) = 0; + /* + * @brief: Create the shader resource view. + * @param: device The device to create the shader resource view. + * @param: cmdList The command list to create the shader resource view. + * @param: width The width of the shader resource view. + * @param: height The height of the shader resource view. 
+ */ + virtual void CreateShaderResourceView(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, int width, + int height) = 0; + /* + * @brief: Create the constant buffer resources. + * @param: device The device to create the constant buffer resources. + */ + virtual void CreateConstantBufferResources(ID3D12Device *device) = 0; + /* + * @brief: Update the constant buffer data. + */ + virtual void UpdateConstantBufferData() = 0; + /* + * @brief: Render and draw defined by pass. + * @param: cmdList The command list to draw the render items. + */ + virtual void Draw(ID3D12GraphicsCommandList *cmdList) = 0; + /* + * @brief: Set the states before draw. + * @param: cmdList The command list to set the states before draw. + */ + virtual void SetStatesBeforeDraw(ID3D12GraphicsCommandList *cmdList) = 0; + /* + * @brief: Create the device dependent resources. + */ + virtual void CreateDeviceDependentResources(); + /* + * @brief: Create the window size dependent resources. + */ + virtual void CreateWindowSizeDependentResources(); + /* + * @brief: Create the root signature. + * @param: device The device to create the root signature. + */ + virtual void CreateRootSignatures(ID3D12Device *device); + /* + * @brief: Build the geometry. + */ + virtual void BuildShapeGeometry(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList); + /* + * @brief: Draw the render items. + * @param: pCmdList The command list to draw the render item. + * @param: drawTimes The times to draw the render item. + */ + virtual void DrawRenderItems(ID3D12GraphicsCommandList *pCmdList, int drawTimes); + /* + * @brief: Create the render target view. + * @param: device The device to create the render target view. + * @param: width The width of the render target view. + * @param: height The height of the render target view. + * @param: rtvHeap The descriptor heap to create the render target view. 
+ */ + virtual void CreateRenderTargetView(ID3D12Device *device, UINT width, UINT height, ID3D12DescriptorHeap *rtvHeap); + /* + * @brief: Create the Render target resource. + * @param: device The device to create the render target resource. + * @param: width The width of the render target resource. + * @param: height The height of the render target resource. + * @param: format The format of the render target resource. + * @param: flags The flags of the render target resource. + * @param: renderTarget The render target resource to be created. + */ + virtual void CreateRenderTargetResource(ID3D12Device *device, UINT width, UINT height, DXGI_FORMAT format, + D3D12_RESOURCE_FLAGS flags, + Microsoft::WRL::ComPtr &renderTarget); + /* + * @brief: Define the pipeline state description. + * @param: inputLayout The input layout of the pipeline state description. + * @param: vertexShader The vertex shader of the pipeline state description. + * @param: pixelShader The pixel shader of the pipeline state description. + * @return: The pipeline state description. + */ + D3D12_GRAPHICS_PIPELINE_STATE_DESC DefinePSODesc(const std::vector &inputLayout, + ComPtr vertexShader, ComPtr pixelShader); + /* + * @brief: Prepare the render target state to draw. + */ + void PrepareRenderTarget(ID3D12GraphicsCommandList *pCommandList); + /* + * @brief: restore render target state. + */ + void RestoreRenderTarget(ID3D12GraphicsCommandList *pCommandList); + /* + * @brief: Clear, bind the render target view and set the viewport and scissor rect. + */ + void ClearRenderTargetView(); + /* + * @brief: Get the first render target view of the pass. + */ + CD3DX12_CPU_DESCRIPTOR_HANDLE GetRenderTargetView(ID3D12Device *device); + + // Window info. + std::wstring m_winTitle; + int m_width = 1280; + int m_height = 720; + HINSTANCE m_hinstance = nullptr; + HWND m_hMainWnd = nullptr; + int m_swapChainBufferCount = 2; + // Device resources. 
+ std::unique_ptr m_deviceResources; + D3D_DRIVER_TYPE m_d3dDriverType = D3D_DRIVER_TYPE_HARDWARE; + // Root signature. + ComPtr m_rootSignature = nullptr; + // Render target view. + ComPtr m_rtvDescriptorHeap = nullptr; + DXGI_FORMAT m_renderTargetFormat = DXGI_FORMAT_R16G16B16A16_FLOAT; + DXGI_FORMAT m_colorFormat = DXGI_FORMAT_R8G8B8A8_UNORM; + UINT m_numPassRenderTargets = 1; // Number of render targets + std::vector> m_renderTargets; // Array of render target resources + UINT m_rtvDescriptorSize = 0; + // Shader resource view. + UINT m_numShaderResource = 0; // Number of ShaderResources + std::vector> m_shaderResources; + UINT m_cbvSrvDescriptorSize = 0; + ComPtr m_srvDescriptorHeap = nullptr; + // PSO objects. + std::unordered_map> m_PSOs; + std::unique_ptr m_geometry; + // A synchronization fence and an event. These members will be used + // to synchronize the CPU with the GPU so that there will be no + // contention for the constant buffers. + Microsoft::WRL::ComPtr m_fence; + Microsoft::WRL::Wrappers::Event m_fenceEvent; + + // Frame + UINT64 m_frameIndex = 0; + vector m_frameTimeList; + // Benchmark options. + BenchmarkOptions *m_opts; + ofstream m_outfile; + // GPU timer + D3D12::D3D12Timer m_gpuTimer; + int m_maxTimerNum = 500; + int m_gpuTimerIdx = 0; + + void eventStart(ID3D12GraphicsCommandList *pCommandList) { m_gpuTimer.start(pCommandList, m_gpuTimerIdx); } + + void eventEnd(ID3D12GraphicsCommandList *pCommandList) { + m_gpuTimer.stop(pCommandList, m_gpuTimerIdx); + m_gpuTimer.resolveQueryToCPU(pCommandList, m_gpuTimerIdx); + } +}; diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderGeometryPass.cpp b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderGeometryPass.cpp new file mode 100644 index 000000000..3b3537e5d --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderGeometryPass.cpp @@ -0,0 +1,139 @@ +// Copyright (c) Microsoft Corporation. 
+// Licensed under the MIT License. + +#include "RenderGeometryPass.h" + +int RenderGeometryPass::DefineRootParameters(std::vector &rootParameters) { + int numRootParams = 5; + rootParameters.resize(numRootParams); + + std::unique_ptr texTable0 = std::make_unique(); + texTable0->Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0, 0); + std::unique_ptr texTable1 = std::make_unique(); + texTable1->Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, m_numShaderResource - 1, 1, 0); + + rootParameters[0].InitAsConstantBufferView(0); // obj cb + rootParameters[1].InitAsConstantBufferView(1); // pass cb + rootParameters[2].InitAsConstantBufferView(2); // material cb + rootParameters[3].InitAsDescriptorTable(1, texTable0.release(), D3D12_SHADER_VISIBILITY_PIXEL); // cube texture + rootParameters[4].InitAsDescriptorTable(1, texTable1.release(), D3D12_SHADER_VISIBILITY_PIXEL); // texture array + + return numRootParams; +} + +int RenderGeometryPass::DefineStaticSamplers(std::vector &samplers) { + int samplersCount = 1; + samplers.resize(samplersCount); + + CD3DX12_STATIC_SAMPLER_DESC anisotropicWrap(0, // shaderRegister + D3D12_FILTER_ANISOTROPIC, // filter + D3D12_TEXTURE_ADDRESS_MODE_WRAP, // addressU + D3D12_TEXTURE_ADDRESS_MODE_WRAP, // addressV + D3D12_TEXTURE_ADDRESS_MODE_WRAP, // addressW + 0.0f, // mipLODBias + 8); // maxAnisotropy + samplers[0] = anisotropicWrap; + + return samplersCount; +} + +void RenderGeometryPass::CreateShaderResourceView(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, int width, + int height) { + // Create a descriptor heap that will store the SRV: + D3D12_DESCRIPTOR_HEAP_DESC srvHeapDesc = {}; + srvHeapDesc.NumDescriptors = m_numShaderResource; + srvHeapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; + srvHeapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE; + ThrowIfFailed(device->CreateDescriptorHeap(&srvHeapDesc, IID_PPV_ARGS(&m_srvDescriptorHeap))); + + CD3DX12_CPU_DESCRIPTOR_HANDLE 
cpuHandle(m_srvDescriptorHeap->GetCPUDescriptorHandleForHeapStart()); + m_cbvSrvDescriptorSize = device->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); + + // Whole screen texture. + TextureCube(device, cmdList, m_shaderResources[0], m_width, m_height, m_colorFormat); + D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = {}; + srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING; + srvDesc.Format = m_shaderResources[0]->GetDesc().Format; + srvDesc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURECUBE; + srvDesc.Texture2D.MipLevels = m_shaderResources[0]->GetDesc().MipLevels; + device->CreateShaderResourceView(m_shaderResources[0].Get(), &srvDesc, cpuHandle); + cpuHandle.Offset(m_cbvSrvDescriptorSize); + + // Small texture. + for (int i = 1; i < m_numShaderResource; i++) { + Texture2D(device, cmdList, m_shaderResources[i], width, height, m_colorFormat); + D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = {}; + srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING; + srvDesc.Format = m_shaderResources[i]->GetDesc().Format; + srvDesc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D; + srvDesc.Texture2D.MipLevels = m_shaderResources[i]->GetDesc().MipLevels; + device->CreateShaderResourceView(m_shaderResources[i].Get(), &srvDesc, cpuHandle); + cpuHandle.Offset(m_cbvSrvDescriptorSize); + } +} + +void RenderGeometryPass::CreateConstantBufferResources(ID3D12Device *device) { + m_viewCB = std::make_unique>(device, 1, true); + m_objectCB = std::make_unique>(device, 1, true); + m_materialCB = std::make_unique>(device, 1, true); +} + +void RenderGeometryPass::UpdateConstantBufferData() { + BaseViewConstantBuffer viewCBData; + ObjectConstantBuffer objectCBData; + MaterialConstantBuffer materialCBData; + m_viewCB->CopyData(0, viewCBData); + m_objectCB->CopyData(0, objectCBData); + m_materialCB->CopyData(0, materialCBData); +} + +void RenderGeometryPass::BuildShapeGeometry(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList) { + // 
Create random geometry. + std::unique_ptr geoData = CreateRandomGeometry(m_opts->m_vertexNum, m_opts->m_indexNum); + m_geometry = std::make_unique(); + m_geometry->Create(device, cmdList, geoData); +} + +void RenderGeometryPass::BuildPipelineStates(ID3D12Device *device) { + std::string textureCount_str = std::to_string(m_numShaderResource - 1); + LPCSTR textureCount = textureCount_str.c_str(); + D3D_SHADER_MACRO defines[] = { + {"TEXTURECOUNT", textureCount}, + {nullptr, nullptr}}; // The last entry must be nullptr to indicate the end of the array + ComPtr vertexShader = CompileShader(L"Shaders/Base.hlsl", defines, "VS", "vs_5_1"); + ComPtr pixelShader = CompileShader(L"Shaders/Base.hlsl", defines, "PS", "ps_5_1"); + + // Define shader input layout. + std::vector inputLayout = { + {"POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0, D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0}, + {"NORMAL", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 12, D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0}, + {"TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 24, D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0}, + {"TANGENT", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 32, D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0}, + }; + + auto psoDesc = DefinePSODesc(inputLayout, vertexShader, pixelShader); + ThrowIfFailed(device->CreateGraphicsPipelineState(&psoDesc, IID_PPV_ARGS(&m_PSOs["deferredBase"]))); +} + +void RenderGeometryPass::Draw(ID3D12GraphicsCommandList *cmdList) { DrawRenderItems(cmdList, m_opts->m_num_object); } + +void RenderGeometryPass::SetStatesBeforeDraw(ID3D12GraphicsCommandList *cmdList) { + cmdList->SetPipelineState(m_PSOs["deferredBase"].Get()); + cmdList->SetGraphicsRootSignature(m_rootSignature.Get()); + ID3D12DescriptorHeap *heaps[] = {m_srvDescriptorHeap.Get()}; + cmdList->SetDescriptorHeaps(_countof(heaps), heaps); + + cmdList->SetGraphicsRootConstantBufferView(0, m_objectCB.get()->Resource()->GetGPUVirtualAddress()); + cmdList->SetGraphicsRootConstantBufferView(1, 
m_viewCB.get()->Resource()->GetGPUVirtualAddress()); + cmdList->SetGraphicsRootConstantBufferView(2, m_materialCB.get()->Resource()->GetGPUVirtualAddress()); + CD3DX12_GPU_DESCRIPTOR_HANDLE srvHandle(m_srvDescriptorHeap->GetGPUDescriptorHandleForHeapStart()); + cmdList->SetGraphicsRootDescriptorTable(3, srvHandle); + srvHandle.Offset(1, m_cbvSrvDescriptorSize); + cmdList->SetGraphicsRootDescriptorTable(4, m_srvDescriptorHeap->GetGPUDescriptorHandleForHeapStart()); + + auto ri = m_geometry.get(); + // Set vertex and index buffers + cmdList->IASetVertexBuffers(0, 1, &ri->VertexBufferView()); + cmdList->IASetIndexBuffer(&ri->IndexBufferView()); + cmdList->IASetPrimitiveTopology(ri->PrimitiveType); +} \ No newline at end of file diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderGeometryPass.h b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderGeometryPass.h new file mode 100644 index 000000000..db1056844 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderGeometryPass.h @@ -0,0 +1,99 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +#include "RenderApp.h" + +class GeometryVertex : Vertex { + public: + GeometryVertex() : Vertex() { + float tx = MathHelper::genRand2N_f(2) - 1; + float ty = MathHelper::genRand2N_f(2) - 1; + float tz = MathHelper::genRand2N_f(2) - 1; + + float nx = MathHelper::genRand2N_f(2) - 1; + float ny = MathHelper::genRand2N_f(2) - 1; + float nz = MathHelper::genRand2N_f(2) - 1; + + float u = MathHelper::genRand2N_f(1); + float v = MathHelper::genRand2N_f(1); + + Normal = {tx, ty, tz}; + TangentU = {nx, ny, nz}; + TexC = {u, v}; + } + GeometryVertex(const DirectX::XMFLOAT3 &p, const DirectX::XMFLOAT3 &n, const DirectX::XMFLOAT2 &uv, + const DirectX::XMFLOAT3 &t) + : Vertex(p.x, p.y, p.z), Normal(n), TangentU(t), TexC(uv) {} + GeometryVertex(float px, float py, float pz, float nx, float ny, float nz, float tx, float ty, float tz, float u, + float v) + : Vertex(px, py, pz), Normal(nx, ny, nz), TangentU(tx, ty, tz), TexC(u, v) {} + GeometryVertex(const GeometryVertex &rhs) { + Normal = rhs.Normal; + TangentU = rhs.TangentU; + TexC = rhs.TexC; + x = rhs.x; + y = rhs.y; + z = rhs.z; + } + + DirectX::XMFLOAT3 Normal; + DirectX::XMFLOAT2 TexC; + DirectX::XMFLOAT3 TangentU; +}; + +struct ObjectConstantBuffer { + DirectX::XMFLOAT4X4 World = MathHelper::Identity4x4(); + DirectX::XMFLOAT4X4 TexTransform = MathHelper::Identity4x4(); + UINT MaterialIndex; +}; + +struct BaseViewConstantBuffer { + DirectX::XMFLOAT4X4 View = MathHelper::Identity4x4(); + DirectX::XMFLOAT4X4 ViewProj = MathHelper::Identity4x4(); + DirectX::XMFLOAT3 EyePosW = {0.0f, 0.0f, 0.0f}; +}; + +struct MaterialConstantBuffer { + DirectX::XMFLOAT4 DiffuseAlbedo = {1.0f, 1.0f, 1.0f, 1.0f}; + DirectX::XMFLOAT3 FresnelR0 = {0.01f, 0.01f, 0.01f}; + float Roughness = 0.5f; + + // Used in texture mapping. 
+ DirectX::XMFLOAT4X4 MatTransform = MathHelper::Identity4x4(); + + UINT DiffuseMapIndex = 0; + UINT NormalMapIndex = 1; +}; + +class RenderGeometryPass : public RenderApp { + public: + RenderGeometryPass(BenchmarkOptions *args) : RenderApp(args) { + // screen + texture size + m_numShaderResource = args->m_textureNum + 1; + m_numPassRenderTargets = 3; + } + RenderGeometryPass(BenchmarkOptions *args, HINSTANCE hInstance, HWND hMainWnd, std::wstring &winTitle) + : RenderApp(args, hInstance, hMainWnd, winTitle) { + m_numShaderResource = args->m_textureNum + 1; + m_numPassRenderTargets = 3; + } + RenderGeometryPass(const RenderGeometryPass &rhs) = delete; + RenderGeometryPass &operator=(const RenderGeometryPass &rhs) = delete; + ~RenderGeometryPass() = default; + + protected: + virtual int DefineRootParameters(std::vector &rootParameters) override; + virtual int DefineStaticSamplers(std::vector &samplers) override; + virtual void CreateShaderResourceView(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, int width, + int height) override; + virtual void CreateConstantBufferResources(ID3D12Device *device) override; + virtual void UpdateConstantBufferData() override; + virtual void BuildPipelineStates(ID3D12Device *device) override; + virtual void Draw(ID3D12GraphicsCommandList *cmdList) override; + virtual void SetStatesBeforeDraw(ID3D12GraphicsCommandList *cmdList) override; + virtual void BuildShapeGeometry(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList) override; + + std::unique_ptr> m_objectCB = nullptr; + std::unique_ptr> m_viewCB = nullptr; + std::unique_ptr> m_materialCB = nullptr; +}; diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderLightingPass.cpp b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderLightingPass.cpp new file mode 100644 index 000000000..418657149 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderLightingPass.cpp @@ -0,0 +1,217 @@ 
+// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#include "RenderLightingPass.h" + +void RenderLightingPass::CreateConstantBufferResources(ID3D12Device *device) { + m_stencilingCB = std::make_unique>(device, 1, true); + m_viewCB = std::make_unique>(device, 1, true); + m_lightingCB = std::make_unique>(device, 1, true); + m_shadowProjectionCB = std::make_unique>(device, 1, true); +} + +void RenderLightingPass::UpdateConstantBufferData() { + StencilingConstantBuffer stencilCBData; + ViewConstantBuffer viewDBData; + DeferredLightUniformsConstantBuffer lightingCBData; + ShadowProjectionConstantBuffer shadowProjectionCBData; + + m_stencilingCB.get()->CopyData(0, stencilCBData); + m_viewCB.get()->CopyData(0, viewDBData); + m_lightingCB.get()->CopyData(0, lightingCBData); + m_shadowProjectionCB.get()->CopyData(0, shadowProjectionCBData); +} + +void RenderLightingPass::CreateShaderResourceView(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, int width, + int height) { + // Create a descriptor heap that will store the SRV: + D3D12_DESCRIPTOR_HEAP_DESC srvHeapDesc = {}; + srvHeapDesc.NumDescriptors = m_numShaderResource; + srvHeapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; + srvHeapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE; + ThrowIfFailed(device->CreateDescriptorHeap(&srvHeapDesc, IID_PPV_ARGS(&m_srvDescriptorHeap))); + + CD3DX12_CPU_DESCRIPTOR_HANDLE cpuHandle(m_srvDescriptorHeap->GetCPUDescriptorHandleForHeapStart()); + m_cbvSrvDescriptorSize = device->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); + + // Fill out the heap with actual descriptors. 
+ for (int i = 0; i < m_numShaderResource; i++) { + Texture2D(device, cmdList, m_shaderResources[i], width, height, m_colorFormat); + D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = {}; + srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING; + srvDesc.Format = m_shaderResources[i]->GetDesc().Format; + srvDesc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D; + srvDesc.Texture2D.MipLevels = m_shaderResources[i]->GetDesc().MipLevels; + device->CreateShaderResourceView(m_shaderResources[i].Get(), &srvDesc, cpuHandle); + cpuHandle.Offset(m_cbvSrvDescriptorSize); + } +} + +int RenderLightingPass::DefineRootParameters(std::vector &rootParameters) { + const int numRootParameters = 5; + rootParameters.resize(numRootParameters); + // Root signature defines what resources are bound to the graphics pipeline. + int rootParametersIndex = 0; + + // Create root signatures consisting of 3 constant buffers. + rootParameters[rootParametersIndex].InitAsConstantBufferView(rootParametersIndex, 0, + D3D12_SHADER_VISIBILITY_VERTEX); + rootParametersIndex++; + rootParameters[rootParametersIndex].InitAsConstantBufferView(rootParametersIndex, 0, D3D12_SHADER_VISIBILITY_ALL); + rootParametersIndex++; + rootParameters[rootParametersIndex].InitAsConstantBufferView(rootParametersIndex, 0, D3D12_SHADER_VISIBILITY_PIXEL); + rootParametersIndex++; + rootParameters[rootParametersIndex].InitAsConstantBufferView(rootParametersIndex, 0, D3D12_SHADER_VISIBILITY_PIXEL); + rootParametersIndex++; + + // SRV root parameter + std::unique_ptr descriptorRange = std::make_unique(1); + descriptorRange[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, m_numShaderResource, 0, + 0); // Using valid D3D12_DESCRIPTOR_RANGE_TYPE_SRV + rootParameters[rootParametersIndex].InitAsDescriptorTable(1, descriptorRange.release(), + D3D12_SHADER_VISIBILITY_PIXEL); + rootParametersIndex++; + + return numRootParameters; +} + +void RenderLightingPass::BuildPipelineStates(ID3D12Device *device) { + // Define shader input layout. 
+ std::vector inputLayout = {D3D12_INPUT_ELEMENT_DESC{ + "POSITION", // SemanticName + 0, // SemanticIndex + DXGI_FORMAT_R32G32B32_FLOAT, // Format + 0, // InputSlot + D3D12_APPEND_ALIGNED_ELEMENT, // AlignedByteOffset + D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, // InputSlotClass + 0 // InstanceDataStepRate + }}; + // Create the pipeline state, which includes compiling and loading shaders. + ComPtr vertexShader = + CompileShader(L"Shaders/DefferredLightingVertex.hlsl", nullptr, "RadialVertexMain", "vs_5_1"); + ComPtr pixelShader = + CompileShader(L"Shaders/DefferredLightingPixel.hlsl", nullptr, "DeferredLightPixelMain", "ps_5_1"); + auto psoDesc = DefinePSODesc(inputLayout, vertexShader, pixelShader); + ThrowIfFailed(device->CreateGraphicsPipelineState(&psoDesc, IID_PPV_ARGS(&m_PSOs["deferredLighting"]))); + + vertexShader = CompileShader(L"Shaders/DefferredLightingVertex.hlsl", nullptr, "RadialVertexMain", "vs_5_1"); + pixelShader = + CompileShader(L"Shaders/DefferredLightingPixel.hlsl", nullptr, "MainOnePassPointLightShadowPS", "ps_5_1"); + psoDesc = DefinePSODesc(inputLayout, vertexShader, pixelShader); + ThrowIfFailed(device->CreateGraphicsPipelineState(&psoDesc, IID_PPV_ARGS(&m_PSOs["ShadowProjection"]))); +} + +void RenderLightingPass::SetStatesBeforeDraw(ID3D12GraphicsCommandList *cmdList) { + ID3D12DescriptorHeap *ppHeaps[] = {m_srvDescriptorHeap.Get()}; + cmdList->SetDescriptorHeaps(ARRAYSIZE(ppHeaps), ppHeaps); + cmdList->SetGraphicsRootSignature(m_rootSignature.Get()); + cmdList->SetGraphicsRootConstantBufferView(0, m_stencilingCB.get()->Resource()->GetGPUVirtualAddress()); + cmdList->SetGraphicsRootConstantBufferView(1, m_viewCB.get()->Resource()->GetGPUVirtualAddress()); + cmdList->SetGraphicsRootConstantBufferView(2, m_lightingCB.get()->Resource()->GetGPUVirtualAddress()); + cmdList->SetGraphicsRootConstantBufferView(3, m_shadowProjectionCB.get()->Resource()->GetGPUVirtualAddress()); + cmdList->SetGraphicsRootDescriptorTable(4, 
m_srvDescriptorHeap->GetGPUDescriptorHandleForHeapStart()); + + auto ri = m_geometry.get(); + // Set vertex and index buffers + cmdList->IASetVertexBuffers(0, 1, &ri->VertexBufferView()); + cmdList->IASetIndexBuffer(&ri->IndexBufferView()); + cmdList->IASetPrimitiveTopology(ri->PrimitiveType); +} + +void RenderLightingPass::Draw(ID3D12GraphicsCommandList *cmdList) { + DrawShadowProjection(cmdList); + DrawLighting(cmdList); +} + +void RenderLightingPass::DrawShadowProjection(ID3D12GraphicsCommandList *cmdList) { + cmdList->SetPipelineState(m_PSOs["ShadowProjection"].Get()); + DrawRenderItems(cmdList, m_opts->m_num_light); +} + +void RenderLightingPass::DrawLighting(ID3D12GraphicsCommandList *cmdList) { + + cmdList->SetPipelineState(m_PSOs["deferredLighting"].Get()); + DrawRenderItems(cmdList, m_opts->m_num_light); +} + +/* + * @brief: Get the samplers. + * @return: The static samplers. + */ +int RenderLightingPass::DefineStaticSamplers(std::vector &samplerData) + +{ + int samplersCount = 10; + samplerData.resize(samplersCount); + + int samplerIndex = 0; + CD3DX12_STATIC_SAMPLER_DESC SceneTexturesStruct_SceneDepthTextureSampler( + samplerIndex, // shaderRegister + D3D12_FILTER_MIN_MAG_MIP_POINT, // filter + D3D12_TEXTURE_ADDRESS_MODE_CLAMP, // addressU + D3D12_TEXTURE_ADDRESS_MODE_CLAMP, // addressV + D3D12_TEXTURE_ADDRESS_MODE_CLAMP); // addressW + + samplerData[samplerIndex++] = SceneTexturesStruct_SceneDepthTextureSampler; + const CD3DX12_STATIC_SAMPLER_DESC SceneTexturesStruct_GBufferATextureSampler( + samplerIndex, // shaderRegister + D3D12_FILTER_MIN_MAG_MIP_POINT, // filter + D3D12_TEXTURE_ADDRESS_MODE_CLAMP, // addressU + D3D12_TEXTURE_ADDRESS_MODE_CLAMP, // addressV + D3D12_TEXTURE_ADDRESS_MODE_CLAMP); // addressW + samplerData[samplerIndex++] = SceneTexturesStruct_GBufferATextureSampler; + const CD3DX12_STATIC_SAMPLER_DESC SceneTexturesStruct_GBufferBTextureSampler( + samplerIndex, // shaderRegister + D3D12_FILTER_MIN_MAG_MIP_POINT, // filter + 
D3D12_TEXTURE_ADDRESS_MODE_CLAMP, // addressU + D3D12_TEXTURE_ADDRESS_MODE_CLAMP, // addressV + D3D12_TEXTURE_ADDRESS_MODE_CLAMP); // addressW + samplerData[samplerIndex++] = SceneTexturesStruct_GBufferBTextureSampler; + const CD3DX12_STATIC_SAMPLER_DESC SceneTexturesStruct_GBufferCTextureSampler( + samplerIndex, // shaderRegister + D3D12_FILTER_MIN_MAG_MIP_POINT, // filter + D3D12_TEXTURE_ADDRESS_MODE_CLAMP, // addressU + D3D12_TEXTURE_ADDRESS_MODE_CLAMP, // addressV + D3D12_TEXTURE_ADDRESS_MODE_CLAMP); // addressW + samplerData[samplerIndex++] = SceneTexturesStruct_GBufferCTextureSampler; + const CD3DX12_STATIC_SAMPLER_DESC SceneTexturesStruct_GBufferDTextureSampler( + samplerIndex, // shaderRegister + D3D12_FILTER_MIN_MAG_MIP_POINT, // filter + D3D12_TEXTURE_ADDRESS_MODE_CLAMP, // addressU + D3D12_TEXTURE_ADDRESS_MODE_CLAMP, // addressV + D3D12_TEXTURE_ADDRESS_MODE_CLAMP); // addressW + samplerData[samplerIndex++] = SceneTexturesStruct_GBufferDTextureSampler; + const CD3DX12_STATIC_SAMPLER_DESC SceneTexturesStruct_GBufferETextureSampler( + samplerIndex, // shaderRegister + D3D12_FILTER_MIN_MAG_MIP_POINT, // filter + D3D12_TEXTURE_ADDRESS_MODE_CLAMP, // addressU + D3D12_TEXTURE_ADDRESS_MODE_CLAMP, // addressV + D3D12_TEXTURE_ADDRESS_MODE_CLAMP); // addressW + samplerData[samplerIndex++] = SceneTexturesStruct_GBufferETextureSampler; + const CD3DX12_STATIC_SAMPLER_DESC SceneTexturesStruct_ScreenSpaceAOTextureSampler( + samplerIndex, // shaderRegister + D3D12_FILTER_MIN_MAG_MIP_POINT, // filter + D3D12_TEXTURE_ADDRESS_MODE_CLAMP, // addressU + D3D12_TEXTURE_ADDRESS_MODE_CLAMP, // addressV + D3D12_TEXTURE_ADDRESS_MODE_CLAMP); // addressW + samplerData[samplerIndex++] = SceneTexturesStruct_ScreenSpaceAOTextureSampler; + const CD3DX12_STATIC_SAMPLER_DESC LightAttenuationTextureSampler( + samplerIndex, D3D12_FILTER_MIN_MAG_MIP_POINT, D3D12_TEXTURE_ADDRESS_MODE_WRAP, D3D12_TEXTURE_ADDRESS_MODE_WRAP, + D3D12_TEXTURE_ADDRESS_MODE_WRAP); + samplerData[samplerIndex++] = 
LightAttenuationTextureSampler; + const CD3DX12_STATIC_SAMPLER_DESC SceneTexturesStruct_CustomDepthTextureSampler( + 8, // shaderRegister + D3D12_FILTER_MIN_MAG_MIP_POINT, // filter + D3D12_TEXTURE_ADDRESS_MODE_CLAMP, // addressU + D3D12_TEXTURE_ADDRESS_MODE_CLAMP, // addressV + D3D12_TEXTURE_ADDRESS_MODE_CLAMP); // addressW + samplerData[samplerIndex++] = SceneTexturesStruct_CustomDepthTextureSampler; + const CD3DX12_STATIC_SAMPLER_DESC ShadowDepthCubeTextureSampler(9, // shaderRegister + D3D12_FILTER_MIN_MAG_LINEAR_MIP_POINT, // filter + D3D12_TEXTURE_ADDRESS_MODE_CLAMP, // addressU + D3D12_TEXTURE_ADDRESS_MODE_CLAMP, // addressV + D3D12_TEXTURE_ADDRESS_MODE_CLAMP); // addressW + samplerData[samplerIndex++] = ShadowDepthCubeTextureSampler; + + return samplersCount; +} diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderLightingPass.h b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderLightingPass.h new file mode 100644 index 000000000..8a2388bf1 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderLightingPass.h @@ -0,0 +1,119 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +#pragma once + +#include "RenderApp.h" + +struct DeferredLightUniformsConstantBuffer { + XMFLOAT4 DeferredLightUniforms_ShadowMapChannelMask = {0.00, 0.00, 0.00, 0.00}; + XMFLOAT2 DeferredLightUniforms_DistanceFadeMAD = {0.00, 0.00}; + float DeferredLightUniforms_ContactShadowLength = 0.00; + float DeferredLightUniforms_VolumetricScatteringIntensity = 1.00; + UINT DeferredLightUniforms_ShadowedBits = 3; + UINT DeferredLightUniforms_LightingChannelMask = 1; + float PrePadding_DeferredLightUniforms_40 = 0.00; + float PrePadding_DeferredLightUniforms_44 = 0.00; + XMFLOAT3 DeferredLightUniforms_Position = {722.74805, 2515.36084, 94.87169}; + float DeferredLightUniforms_InvRadius = 0.00195; + XMFLOAT3 DeferredLightUniforms_Color = {8.64818, 6.97867, 4.4531}; + float DeferredLightUniforms_FalloffExponent = 8.00; + XMFLOAT3 DeferredLightUniforms_Direction = {1.00, 0.00, 0.00}; + float DeferredLightUniforms_SpecularScale = 1.00; + XMFLOAT3 DeferredLightUniforms_Tangent = {0.00, 0.00, 1.00}; + float DeferredLightUniforms_SourceRadius = 0.00; + XMFLOAT2 DeferredLightUniforms_SpotAngles = {2.00, 1.00}; + float DeferredLightUniforms_SoftSourceRadius = 0.00; + float DeferredLightUniforms_SourceLength = 0.00; + float DeferredLightUniforms_RectLightBarnCosAngle = 2652.84375; + float DeferredLightUniforms_RectLightBarnLength = 5.89947E-43; +}; + +struct ViewConstantBuffer { + XMFLOAT4 View_InvDeviceZToWorldZTransform = {0.00, 0.00, 0.10, -1.00000E-08}; + XMFLOAT4 View_TemporalAAParams = {0.00, 1.00, 0.00, 0.00}; + XMFLOAT4 View_BufferSizeAndInvSize = {1384.00, 676.00, 0.00072, 0.00148}; + XMFLOAT4 View_DiffuseOverrideParameter = {0.00, 0.00, 0.00, 1.00}; + XMFLOAT4 View_SpecularOverrideParameter = {0.00, 0.00, 0.00, 1.00}; + XMFLOAT4X4 View_ClipToView = {0.00, 0.48821, 0.00, 0.00, 0.00, 0.00, 0.00, 0.10, + 0.00, 0.00, 1.00, 0.00, 0, 0, 0, 0}; + XMFLOAT4X4 View_ViewToClip = { + 1.00, 0.00, 0.00, 0.00, 0.00, 2.04831, 0.00, 0.00, 0.00, 0.00, 0.00, 1.00, 0.00, 0.00, 10.00, 0.00, + 
}; + XMFLOAT4X4 View_ScreenToWorld = {-0.04472, 0.00587, 0.48612, 0.00, -0.98725, 0.12963, -0.09239, 0.00, + -7.70195, 2584.20215, 184.65012, 1.00, 0, 0, 0, 0}; + XMFLOAT3 View_WorldCameraOrigin = {-7.70195, 2584.20215, 184.65012}; + float padding0 = 0; + + XMFLOAT3 View_PreViewTranslation = {7.70195, -2584.20215, -184.65012}; + float padding1 = 0; + XMFLOAT4 View_ScreenPositionScaleBias = {0.49928, -0.49926, 0.49926, 0.49928}; + + XMFLOAT4X4 View_TranslatedWorldToClip = { + 1.00, 0.00, 0.00, 0.00, 0.00, 2.04831, 0.00, 0.00, 0.00, 0.00, 0.00, 1.00, 0.00, 0.00, 10.00, 0.00, + }; + UINT View_StateFrameIndexMod8View_StateFrameIndexMod8 = 1; + XMFLOAT3 Padding = {0, 0, 0}; // Add padding to maintain 16-byte alignment +}; + +struct StencilingConstantBuffer { + XMFLOAT4 StencilingGeometryPosAndScale = {715.04608, -68.84131, -89.77843, 530.0614}; + XMFLOAT4 StencilingConeParameters = {0.00, 0.00, 0.00, 0.00}; + XMFLOAT4X4 StencilingConeTransform = {0.00, 0.00, -0.005, 0.00, -1.00, 1.00, 0.50, 1.00, + 1.00, 0.00, -1.00143, -1.00, 0.00, -1.00, 0.00, 0.00}; + + XMFLOAT3 StencilingPreViewTranslation = {1.00, 0.00, 0.00}; +}; + +struct ShadowProjectionConstantBuffer { + XMFLOAT4 LightPositionAndInvRadius = {722.74805, 2515.36084, 94.87169, 0.00195}; + XMFLOAT4 PointLightDepthBiasAndProjParameters = {0.025, 0.00, -0.99805, -1.00}; + XMFLOAT4X4 ShadowViewProjectionMatrices[6] = {{0.00, 0.00, -1.00196, -1.00, 0.00, -1.00, 0.00, 0.00, 1.00, 0.00, + 0.00, 0.00, -94.87168, 2515.3606, -725.16437, -722.74805}, + {0.00, 0.00, 1.00196, 1.00, 0.00, -1.00, 0.00, 0.00, -1.00, 0.00, + 0.00, 0.00, 94.87168, 2515.3606, 723.16046, 722.74805}, + {-1.00, 0.00, 0.00, 0.00, 0.00, 0.00, -1.00196, -1.00, 0.00, 1.00, + 0.00, 0.00, -722.74799, -94.87168, 2519.28125, 2515.36084}, + {-1.00, 0.00, 0.00, 0.00, 0.00, 0.00, 1.00196, 1.00, 0.00, -1.00, + 0.00, 0.00, -722.74799, 94.87168, -2521.28516, -2515.36084}, + {-1.00, 0.00, 0.00, 0.00, 0.00, -1.00, 0.00, 0.00, 0.00, 0.00, + -1.00196, -1.00, -722.74799, 
2515.3606, 94.05539, 94.87169}, + {1.00, 0.00, 0.00, 0.00, 0.00, -1.00, 0.00, 0.00, 0.00, 0.00, 1.00196, + 1.00, 722.74799, 2515.3606, -96.05931, -94.87169}}; + + float ShadowSharpen = 1; + float ShadowFadeFraction = 1; + float InvShadowmapResolution = 0.00098; +}; + +class RenderLightingPass : public RenderApp { + public: + RenderLightingPass(BenchmarkOptions *opts) : RenderApp(opts) { + m_numShaderResource = 10; + m_numPassRenderTargets = 1; + } + RenderLightingPass(BenchmarkOptions *args, HINSTANCE hInstance, HWND hMainWnd, std::wstring &winTitle) + : RenderApp(args, hInstance, hMainWnd, winTitle) {} + RenderLightingPass(const RenderLightingPass &rhs) = delete; + RenderLightingPass &operator=(const RenderLightingPass &rhs) = delete; + ~RenderLightingPass() = default; + + void DrawShadowProjection(ID3D12GraphicsCommandList *cmdList); + void DrawLighting(ID3D12GraphicsCommandList *cmdList); + + protected: + virtual int DefineRootParameters(std::vector &rootParameters) override; + virtual int DefineStaticSamplers(std::vector &samplers) override; + virtual void CreateShaderResourceView(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, int width, + int height) override; + virtual void CreateConstantBufferResources(ID3D12Device *device) override; + virtual void UpdateConstantBufferData() override; + virtual void BuildPipelineStates(ID3D12Device *device) override; + virtual void Draw(ID3D12GraphicsCommandList *cmdList) override; + virtual void SetStatesBeforeDraw(ID3D12GraphicsCommandList *cmdList) override; + + std::unique_ptr> m_stencilingCB = nullptr; + std::unique_ptr> m_viewCB = nullptr; + std::unique_ptr> m_lightingCB = nullptr; + std::unique_ptr> m_shadowProjectionCB = nullptr; +}; diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderShadowMapPass.cpp b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderShadowMapPass.cpp new file mode 100644 index 000000000..d058c3937 --- /dev/null +++ 
b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderShadowMapPass.cpp @@ -0,0 +1,77 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#include "RenderShadowMapPass.h" + +int RenderShadowMapPass::DefineRootParameters(std::vector &rootParameters) { + const int numRootParameters = 1; + + rootParameters.resize(numRootParameters); + rootParameters[0].InitAsConstantBufferView(0); // obj cb + return numRootParameters; +} + +int RenderShadowMapPass::DefineStaticSamplers(std::vector &samplers) { return 0; } + +void RenderShadowMapPass::CreateShaderResourceView(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, int width, + int height) { + return; +} + +void RenderShadowMapPass::CreateConstantBufferResources(ID3D12Device *device) { + m_viewCB = std::make_unique>(device, 1, true); +} + +void RenderShadowMapPass::UpdateConstantBufferData() { + ShadowViewConstantBuffer viewDBData; + viewDBData.world = MathHelper::Identity4x4(); + viewDBData.projection = MathHelper::Identity4x4(); + m_viewCB.get()->CopyData(0, viewDBData); +} + +void RenderShadowMapPass::BuildPipelineStates(ID3D12Device *device) { + // Define shader input layout. + std::vector inputLayout = { + {"POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0, D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0}, + {"TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 24, D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0} + + }; + // Create the pipeline state, which includes compiling and loading shaders. 
+ ComPtr vertexShader = CompileShader(L"Shaders/ShadowMap.hlsl", nullptr, "VS", "vs_5_1"); + ComPtr pixelShader = CompileShader(L"Shaders/ShadowMap.hlsl", nullptr, "PS", "ps_5_1"); + + CD3DX12_DEPTH_STENCIL_DESC depthStencilDesc(D3D12_DEFAULT); + depthStencilDesc.DepthEnable = true; + depthStencilDesc.DepthWriteMask = D3D12_DEPTH_WRITE_MASK_ALL; + depthStencilDesc.DepthFunc = D3D12_COMPARISON_FUNC_LESS; + depthStencilDesc.StencilEnable = false; + + auto psoDesc = DefinePSODesc(inputLayout, vertexShader, pixelShader); + psoDesc.DSVFormat = m_deviceResources->m_depthBufferFormat; + psoDesc.RasterizerState.DepthBias = 100000; + psoDesc.RasterizerState.DepthBiasClamp = 0.0f; + psoDesc.RasterizerState.SlopeScaledDepthBias = 1.0f; + psoDesc.DepthStencilState = depthStencilDesc; + + ThrowIfFailed(device->CreateGraphicsPipelineState(&psoDesc, IID_PPV_ARGS(&m_PSOs["ShadowMap"]))); +} + +void RenderShadowMapPass::SetStatesBeforeDraw(ID3D12GraphicsCommandList *cmdList) { + // Set necessary state. + cmdList->SetPipelineState(m_PSOs["ShadowMap"].Get()); + cmdList->SetGraphicsRootSignature(m_rootSignature.Get()); + auto dsv = m_deviceResources->GetDepthStencilView(); + cmdList->ClearDepthStencilView(dsv, D3D12_CLEAR_FLAG_DEPTH | D3D12_CLEAR_FLAG_STENCIL, 1.0f, 0, 0, nullptr); + auto device = m_deviceResources->GetD3DDevice(); + cmdList->OMSetRenderTargets(1, &GetRenderTargetView(device), true, &dsv); + // Set root arguments. 
+ cmdList->SetGraphicsRootConstantBufferView(0, m_viewCB->Resource()->GetGPUVirtualAddress()); + + auto ri = m_geometry.get(); + // Set vertex and index buffers + cmdList->IASetVertexBuffers(0, 1, &ri->VertexBufferView()); + cmdList->IASetIndexBuffer(&ri->IndexBufferView()); + cmdList->IASetPrimitiveTopology(ri->PrimitiveType); +} + +void RenderShadowMapPass::Draw(ID3D12GraphicsCommandList *cmdList) { DrawRenderItems(cmdList, m_opts->m_num_object); } diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderShadowMapPass.h b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderShadowMapPass.h new file mode 100644 index 000000000..c6f3a6909 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/RenderShadowMapPass.h @@ -0,0 +1,34 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#pragma once + +#include "RenderApp.h" + +struct ShadowViewConstantBuffer { + XMFLOAT4X4 world; + XMFLOAT4X4 projection; +}; + +class RenderShadowMapPass : public RenderApp { + public: + RenderShadowMapPass(BenchmarkOptions *opts) : RenderApp(opts) {} + RenderShadowMapPass(BenchmarkOptions *args, HINSTANCE hInstance, HWND hMainWnd, std::wstring &winTitle) + : RenderApp(args, hInstance, hMainWnd, winTitle) {} + RenderShadowMapPass(const RenderShadowMapPass &rhs) = delete; + RenderShadowMapPass &operator=(const RenderShadowMapPass &rhs) = delete; + ~RenderShadowMapPass() = default; + + protected: + virtual int DefineRootParameters(std::vector &rootParameters) override; + virtual int DefineStaticSamplers(std::vector &samplers) override; + virtual void CreateShaderResourceView(ID3D12Device *device, ID3D12GraphicsCommandList *cmdList, int width, + int height) override; + virtual void CreateConstantBufferResources(ID3D12Device *device) override; + virtual void UpdateConstantBufferData() override; + virtual void BuildPipelineStates(ID3D12Device *device) override; + virtual void 
Draw(ID3D12GraphicsCommandList *cmdList) override; + virtual void SetStatesBeforeDraw(ID3D12GraphicsCommandList *cmdList) override; + + std::unique_ptr> m_viewCB = nullptr; +}; diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/Base.hlsl b/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/Base.hlsl new file mode 100644 index 000000000..3f7643d3c --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/Base.hlsl @@ -0,0 +1,134 @@ +// Scene cube. +TextureCube gCubeMap : register(t0); + +// An array of textures, which is only supported in shader model 5.1+. Unlike Texture2DArray, the textures +// in this array can be different sizes and formats, making it more flexible than texture arrays. +Texture2D gTextureMaps[TEXTURECOUNT] : register(t1); + + +SamplerState gsamAnisotropicWrap : register(s0); + + +// Constant data that varies per object. +cbuffer D3DObjectConstantBuffer : register(b0) +{ + float4x4 gWorld; + float4x4 gTexTransform; + uint gMaterialIndex; +}; + +// Constant data that varies per render pass. +cbuffer PassConstantBuffer : register(b1) +{ + float4x4 gView; + float4x4 gViewProj; + float3 gEyePosW; +}; + +cbuffer MaterialDataConstantBuffer : register(b2) +{ + float4 DiffuseAlbedo; + float3 FresnelR0; + float Roughness; + float4x4 MatTransform; + uint DiffuseMapIndex; + uint NormalMapIndex; +}; + + +//--------------------------------------------------------------------------------------- +// Transforms a normal map sample to world space. +//--------------------------------------------------------------------------------------- +float3 NormalSampleToWorldSpace(float3 normalMapSample, float3 unitNormalW, float3 tangentW) +{ + // Transform from [0,1] to [-1,1].
+ float3 normalT = 2.0f * normalMapSample - 1.0f; + + float3 N = unitNormalW; + float3 T = normalize(tangentW - dot(tangentW, N) * N); + float3 B = cross(N, T); + + float3x3 TBN = float3x3(T, B, N); + + // Transform to world space. + float3 bumpedNormalW = mul(normalT, TBN); + + return bumpedNormalW; +} + + +struct VertexIn +{ + float3 PosL : POSITION; + float3 NormalL : NORMAL; + float2 TexC : TEXCOORD; + float3 TangentU : TANGENT; +}; + +struct VertexOut +{ + float4 PosH : SV_POSITION; + float3 PosW : POSITION; + float3 NormalW : NORMAL; + float3 TangentW : TANGENT; + float2 TexC : TEXCOORD; +}; + +struct PixelOut +{ + float4 position : SV_Target0; + float4 normal : SV_Target1; + float4 color : SV_Target2; +}; + +float3 SchlickFresnel(float3 R0, float3 normal, float3 lightVec) +{ + float cosIncidentAngle = saturate(dot(normal, lightVec)); + + float f0 = 1.0f - cosIncidentAngle; + float3 reflectPercent = R0 + (1.0f - R0) * (f0 * f0 * f0 * f0 * f0); + + return reflectPercent; +} + +VertexOut VS(VertexIn vin) +{ + VertexOut vout = (VertexOut)0.0f; + + float4 posW = mul(float4(vin.PosL, 1.0f), gWorld); + vout.PosW = posW.xyz; + vout.PosH = mul(posW, gViewProj); + vout.NormalW = mul(vin.NormalL, (float3x3) gWorld); + vout.TangentW = mul(vin.TangentU, (float3x3) gWorld); + float4 texC = mul(float4(vin.TexC, 0.0f, 1.0f), gTexTransform); + vout.TexC = mul(texC, MatTransform).xy; + + return vout; +} + +PixelOut PS(VertexOut pin) +{ + // Normalize the world-space normal.
+ pin.NormalW = normalize(pin.NormalW); + + float4 normalSample = gTextureMaps[NormalMapIndex].Sample(gsamAnisotropicWrap, pin.TexC); + float3 bumpedNormalW = NormalSampleToWorldSpace(normalSample.xyz, pin.NormalW, pin.TangentW); + float4 diffuseAlbedo = DiffuseAlbedo * + gTextureMaps[DiffuseMapIndex].Sample(gsamAnisotropicWrap, pin.TexC); + + const float shininess = (1.0f - Roughness) * normalSample.a; + + PixelOut pout; + pout.position = float4(pin.PosW, FresnelR0.x); + pout.normal = float4(bumpedNormalW, shininess); + + float3 toEyeW = normalize(gEyePosW - pin.PosW); + float3 ref = reflect(-toEyeW, bumpedNormalW); + float4 reflectColor = gCubeMap.Sample(gsamAnisotropicWrap, ref); + float3 fresnelFactor = SchlickFresnel(FresnelR0, bumpedNormalW, ref); + pout.color = float4(diffuseAlbedo.xyz + shininess * fresnelFactor * reflectColor.xyz, 1.0f); + + return pout; +} + + diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/DefferredLightingPixel.hlsl b/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/DefferredLightingPixel.hlsl new file mode 100644 index 000000000..29aa971a1 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/DefferredLightingPixel.hlsl @@ -0,0 +1,919 @@ + + +// Constant buffers +cbuffer ViewConstantBuffer : register(b1) +{ + float4 View_InvDeviceZToWorldZTransform; + float4 View_TemporalAAParams; + float4 View_BufferSizeAndInvSize; + float4 View_DiffuseOverrideParameter; + float4 View_SpecularOverrideParameter; + float4x4 View_ClipToView; + float4x4 View_ViewToClip; + float4x4 View_ScreenToWorld; + float3 View_WorldCameraOrigin; + float Padding0; + float3 View_PreViewTranslation; + float Padding1; + float4 View_ScreenPositionScaleBias; + float4x4 View_TranslatedWorldToClip; + uint View_StateFrameIndexMod8; + float3 Padding; // Add padding to maintain 16-byte alignment +}; + + + +cbuffer DeferredLightUniformsConstantBuffer : register(b2) +{ + float4 
DeferredLightUniforms_ShadowMapChannelMask; + float2 DeferredLightUniforms_DistanceFadeMAD; + float DeferredLightUniforms_ContactShadowLength; + float DeferredLightUniforms_VolumetricScatteringIntensity; + uint DeferredLightUniforms_ShadowedBits; + uint DeferredLightUniforms_LightingChannelMask; + float3 DeferredLightUniforms_Position; + float DeferredLightUniforms_InvRadius; + float3 DeferredLightUniforms_Color; + float DeferredLightUniforms_FalloffExponent; + float3 DeferredLightUniforms_Direction; + float DeferredLightUniforms_SpecularScale; + float3 DeferredLightUniforms_Tangent; + float DeferredLightUniforms_SourceRadius; + float2 DeferredLightUniforms_SpotAngles; + float DeferredLightUniforms_SoftSourceRadius; + float DeferredLightUniforms_SourceLength; + float DeferredLightUniforms_RectLightBarnCosAngle; + float DeferredLightUniforms_RectLightBarnLength; +}; + +cbuffer ShadowConstantBuffer : register(b3) +{ + float4 LightPositionAndInvRadius; + float4 PointLightDepthBiasAndProjParameters; + float4x4 ShadowViewProjectionMatrices[6]; + float ShadowSharpen; + float ShadowFadeFraction; + float InvShadowmapResolution; +} + + + +// Texture declarations +Texture2D SceneTexturesStruct_SceneDepthTexture: register(t0); +Texture2D SceneTexturesStruct_GBufferATexture : register(t1); +Texture2D SceneTexturesStruct_GBufferBTexture : register(t2); +Texture2D SceneTexturesStruct_GBufferCTexture : register(t3); +Texture2D SceneTexturesStruct_GBufferDTexture : register(t4); +Texture2D SceneTexturesStruct_GBufferETexture : register(t5); +Texture2D SceneTexturesStruct_ScreenSpaceAOTexture : register(t6); +Texture2D LightAttenuationTexture: register(t7); +Texture2D SceneTexturesStruct_CustomDepthTexture : register(t8); +Texture2D ShadowDepthCubeTexture : register(t9); + + + +// Sampler declarations (assuming sampler registers are in the same register space) +SamplerState SceneTexturesStruct_SceneDepthTextureSampler : register(s0); +SamplerState 
SceneTexturesStruct_GBufferATextureSampler : register(s1); +SamplerState SceneTexturesStruct_GBufferBTextureSampler : register(s2); +SamplerState SceneTexturesStruct_GBufferCTextureSampler : register(s3); +SamplerState SceneTexturesStruct_GBufferDTextureSampler : register(s4); +SamplerState SceneTexturesStruct_GBufferETextureSampler : register(s5); +SamplerState SceneTexturesStruct_ScreenSpaceAOTextureSampler : register(s6); +SamplerState LightAttenuationTextureSampler : register(s7); +SamplerState SceneTexturesStruct_CustomDepthTextureSampler : register(s8); +SamplerComparisonState ShadowDepthCubeTextureSampler : register(s9); + +const static float PI = 3.1415926535897932f; +const static float MaxHalfFloat = 65504.0f; + +struct FLightAccumulator { + float3 Diffuse; + float3 Specular; + float3 Transmission; + float EstimatedCost; + float3 TotalLight; +}; + +struct FGBufferData { + float3 WorldNormal; + float PerObjectGBufferData; + float Metallic; + float Specular; + float Roughness; + uint ShadingModelID; + uint SelectiveOutputMask; + float3 BaseColor; + float GBufferAO; + float IndirectIrradiance; + float4 CustomData; + float4 PrecomputedShadowFactors; + float CustomDepth; + uint CustomStencil; + float Depth; + float3 StoredBaseColor; + float StoredMetallic; + float StoredSpecular; + float3 SpecularColor; + float3 DiffuseColor; + float4 Velocity; +}; + + +struct FDeferredLightData { + float3 Position; + float InvRadius; + float3 Color; + float FalloffExponent; + float3 Direction; + float3 Tangent; + float2 SpotAngles; + float SourceRadius; + float SourceLength; + float SoftSourceRadius; + float SpecularScale; + float ContactShadowLength; + bool ContactShadowLengthInWS; + float2 DistanceFadeMAD; + float4 ShadowMapChannelMask; + uint ShadowedBits; + bool bInverseSquared; + bool bRadialLight; + bool bSpotLight; + bool bRectLight; + float RectLightBarnCosAngle; + float RectLightBarnLength; +}; + + +struct FShadowTerms { + float SurfaceShadow; + float 
TransmissionShadow; + float TransmissionThickness; +}; + +struct FDirectLighting { + float3 Diffuse; + float3 Specular; + float3 Transmission; +}; + +struct FRectTexture { + float Dummy; +}; + +struct FCapsuleLight { + float Length; + float Radius; + float SoftRadius; + float DistBiasSqr; + float3 LightPos[2]; +}; + +struct FRect { + float Dummy; +}; + + +struct FAreaLight +{ + float SphereSinAlpha; + float SphereSinAlphaSoft; + float LineCosSubtended; + float FalloffColor; + FRect Rect; // Assuming FRect is a custom struct representing a rectangle + bool bIsRect; + FRectTexture Texture; +}; + + +struct BxDFContext { + float NoL; // Normal dot Light + float NoV; // Normal dot View + float VoL; // View dot Light + float NoH; // Normal dot Half + float VoH; // View dot Half +}; + +struct FScreenSpaceData { + FGBufferData GBuffer; + float AmbientOcclusion; +}; + +Texture2D DummyRectLightTextureForCapsuleCompilerWarning; +Texture2D DeferredLightUniforms_SourceTexture; + +FLightAccumulator LightAccumulator_Init() { + FLightAccumulator acc; + acc.TotalLight = float3(0, 0, 0); + acc.EstimatedCost = 0; + return acc; +} + +FRectTexture InitRectTexture(Texture2D SourceTexture) { + FRectTexture Output; + Output.Dummy = 0; + + return Output; +} + +float4 Texture2DSampleLevel(Texture2D Tex, SamplerState Sampler, float2 UV, + float Mip) { + return Tex.SampleLevel(Sampler, UV, Mip); +} + +float ConvertFromDeviceZ(float DeviceZ) { + + return DeviceZ * View_InvDeviceZToWorldZTransform[0] + + View_InvDeviceZToWorldZTransform[1] + + 1.0f / (DeviceZ * View_InvDeviceZToWorldZTransform[2] - + View_InvDeviceZToWorldZTransform[3]); +} + +float CalcSceneDepth(float2 ScreenUV) { + + return ConvertFromDeviceZ( + Texture2DSampleLevel(SceneTexturesStruct_SceneDepthTexture, + SceneTexturesStruct_SceneDepthTextureSampler, + ScreenUV, 0) + .r); +} + + +bool CheckerFromPixelPos(uint2 PixelPos) { + + uint TemporalAASampleIndex = View_TemporalAAParams.x; + + return (PixelPos.x + PixelPos.y + 
TemporalAASampleIndex) % 2; +} + +bool UseSubsurfaceProfile(int ShadingModel) { + return ShadingModel == 5 || ShadingModel == 9; +} + +bool CheckerFromSceneColorUV(float2 UVSceneColor) { + + uint2 PixelPos = uint2(UVSceneColor * View_BufferSizeAndInvSize.xy); + + return CheckerFromPixelPos(PixelPos); +} + +float3 DecodeNormal(float3 N) { return N * 2 - 1; } + +uint DecodeShadingModelId(float InPackedChannel) { + return ((uint)round(InPackedChannel * (float)0xFF)) & 0xF; +} + +uint DecodeSelectiveOutputMask(float InPackedChannel) { + return ((uint)round(InPackedChannel * (float)0xFF)) & ~0xF; +} + +float3 DecodeBaseColor(float3 BaseColor) { return BaseColor; } + +float DecodeIndirectIrradiance(float IndirectIrradiance) { + + const float OneOverPreExposure = 1.f; + + float LogL = IndirectIrradiance; + const float LogBlackPoint = 0.00390625; + return OneOverPreExposure * (exp2(LogL * 16 - 8) - LogBlackPoint); +} + +float DielectricSpecularToF0(float Specular) { return 0.08f * Specular; } + +float Lerp(float a, float b, float t) { return a + (b - a) * t; } + +float3 ComputeF0(float Specular, float3 BaseColor, float Metallic) { + float4 F0 = DielectricSpecularToF0(Specular); + return lerp(F0.xxx, BaseColor, Metallic.xxx); +} + +FGBufferData DecodeGBufferData(float4 InGBufferA, float4 InGBufferB, + float4 InGBufferC, float4 InGBufferD, + float4 InGBufferE, float4 InGBufferVelocity, + float CustomNativeDepth, uint CustomStencil, + float SceneDepth, bool bGetNormalizedNormal, + bool bChecker) { + FGBufferData GBuffer; + + GBuffer.WorldNormal = DecodeNormal(InGBufferA.xyz); + if (bGetNormalizedNormal) { + GBuffer.WorldNormal = normalize(GBuffer.WorldNormal); + } + + GBuffer.PerObjectGBufferData = InGBufferA.a; + GBuffer.Metallic = InGBufferB.r; + GBuffer.Specular = InGBufferB.g; + GBuffer.Roughness = InGBufferB.b; + + GBuffer.ShadingModelID = DecodeShadingModelId(InGBufferB.a); + GBuffer.SelectiveOutputMask = DecodeSelectiveOutputMask(InGBufferB.a); + + GBuffer.BaseColor = 
DecodeBaseColor(InGBufferC.rgb); + + GBuffer.GBufferAO = 1; + GBuffer.IndirectIrradiance = DecodeIndirectIrradiance(InGBufferC.a); + + GBuffer.CustomData = + !(GBuffer.SelectiveOutputMask & (1 << 4)) ? InGBufferD : 0; + + GBuffer.PrecomputedShadowFactors = + !(GBuffer.SelectiveOutputMask & (1 << 5)) + ? InGBufferE + : ((GBuffer.SelectiveOutputMask & (1 << 6)) ? 0 : 1); + GBuffer.CustomDepth = ConvertFromDeviceZ(CustomNativeDepth); + GBuffer.CustomStencil = CustomStencil; + GBuffer.Depth = SceneDepth; + + GBuffer.StoredBaseColor = GBuffer.BaseColor; + GBuffer.StoredMetallic = GBuffer.Metallic; + GBuffer.StoredSpecular = GBuffer.Specular; + + [flatten] if (GBuffer.ShadingModelID == 9) { GBuffer.Metallic = 0.0; } + + { + GBuffer.SpecularColor = + ComputeF0(GBuffer.Specular, GBuffer.BaseColor, GBuffer.Metallic); + + GBuffer.DiffuseColor = + GBuffer.BaseColor - GBuffer.BaseColor * GBuffer.Metallic; + + { + + GBuffer.DiffuseColor = + GBuffer.DiffuseColor * View_DiffuseOverrideParameter.www + + View_DiffuseOverrideParameter.xyz; + GBuffer.SpecularColor = + GBuffer.SpecularColor * View_SpecularOverrideParameter.w + + View_SpecularOverrideParameter.xyz; + } + } + + GBuffer.Velocity = + !(GBuffer.SelectiveOutputMask & (1 << 7)) ? 
InGBufferVelocity : 0; + + return GBuffer; +} + +FGBufferData GetGBufferData(float2 UV, bool bGetNormalizedNormal = true) { + float4 GBufferA = + Texture2DSampleLevel(SceneTexturesStruct_GBufferATexture, + SceneTexturesStruct_GBufferATextureSampler, UV, 0); + float4 GBufferB = + Texture2DSampleLevel(SceneTexturesStruct_GBufferBTexture, + SceneTexturesStruct_GBufferBTextureSampler, UV, 0); + float4 GBufferC = + Texture2DSampleLevel(SceneTexturesStruct_GBufferCTexture, + SceneTexturesStruct_GBufferCTextureSampler, UV, 0); + float4 GBufferD = + Texture2DSampleLevel(SceneTexturesStruct_GBufferDTexture, + SceneTexturesStruct_GBufferDTextureSampler, UV, 0); + float CustomNativeDepth = + Texture2DSampleLevel(SceneTexturesStruct_CustomDepthTexture, + SceneTexturesStruct_CustomDepthTextureSampler, UV, 0) + .r; + uint CustomStencil; + + float4 GBufferE = + Texture2DSampleLevel(SceneTexturesStruct_GBufferETexture, + SceneTexturesStruct_GBufferETextureSampler, UV, 0); + + float4 GBufferVelocity = 0; + + float SceneDepth = CalcSceneDepth(UV); + + return DecodeGBufferData(GBufferA, GBufferB, GBufferC, GBufferD, GBufferE, + GBufferVelocity, CustomNativeDepth, CustomStencil, + SceneDepth, bGetNormalizedNormal, + CheckerFromSceneColorUV(UV)); +} + + +FScreenSpaceData GetScreenSpaceData(float2 UV, + bool bGetNormalizedNormal = true) { + FScreenSpaceData Out; + + Out.GBuffer = GetGBufferData(UV, bGetNormalizedNormal); + float4 ScreenSpaceAO = Texture2DSampleLevel( + SceneTexturesStruct_ScreenSpaceAOTexture, + SceneTexturesStruct_ScreenSpaceAOTextureSampler, UV, 0); + + Out.AmbientOcclusion = ScreenSpaceAO.r; + + return Out; +} + +FDeferredLightData SetupLightDataForStandardDeferred() { + + FDeferredLightData LightData; + LightData.Position = DeferredLightUniforms_Position; + LightData.InvRadius = DeferredLightUniforms_InvRadius; + LightData.Color = DeferredLightUniforms_Color; + LightData.FalloffExponent = DeferredLightUniforms_FalloffExponent; + LightData.Direction = 
DeferredLightUniforms_Direction; + LightData.Tangent = DeferredLightUniforms_Tangent; + LightData.SpotAngles = DeferredLightUniforms_SpotAngles; + LightData.SourceRadius = DeferredLightUniforms_SourceRadius, + LightData.SourceLength = DeferredLightUniforms_SourceLength; + LightData.SoftSourceRadius = DeferredLightUniforms_SoftSourceRadius; + LightData.SpecularScale = DeferredLightUniforms_SpecularScale; + LightData.ContactShadowLength = + abs(DeferredLightUniforms_ContactShadowLength); + LightData.ContactShadowLengthInWS = + DeferredLightUniforms_ContactShadowLength < 0.0f; + LightData.DistanceFadeMAD = DeferredLightUniforms_DistanceFadeMAD; + LightData.ShadowMapChannelMask = DeferredLightUniforms_ShadowMapChannelMask; + LightData.ShadowedBits = DeferredLightUniforms_ShadowedBits; + + LightData.bInverseSquared = 0; + LightData.bRadialLight = 1 > 0; + + LightData.bSpotLight = 1 > 0; + LightData.bRectLight = 1 == 2; + + LightData.RectLightBarnCosAngle = DeferredLightUniforms_RectLightBarnCosAngle; + LightData.RectLightBarnLength = DeferredLightUniforms_RectLightBarnLength; + + return LightData; +} + + +float InterleavedGradientNoise(float2 uv, float FrameId) { + + uv += FrameId * (float2(47, 17) * 0.695f); + + const float3 magic = float3(0.06711056f, 0.00583715f, 52.9829189f); + return frac(magic.z * frac(dot(uv, magic.xy))); +} + +float4 Square(float4 x) { + return x * x; +} + +float Square(float x) { + return x * x; +} + +float2 Square(float2 x) { + return x * x; +} + +float3 Square(float3 x) { + return x * x; +} + +float4 GetPerPixelLightAttenuation(float2 UV) { + return Square(Texture2DSampleLevel(LightAttenuationTexture, + LightAttenuationTextureSampler, UV, 0)); +} + +float RadialAttenuation(float3 WorldLightVector, float FalloffExponent) { + float NormalizeDistanceSquared = dot(WorldLightVector, WorldLightVector); + + return pow(1.0f - saturate(NormalizeDistanceSquared), FalloffExponent); +} + +float SpotAttenuation(float3 L, float3 SpotDirection, float2 
SpotAngles) { + float ConeAngleFalloff = + Square(saturate((dot(L, -SpotDirection) - SpotAngles.x) * SpotAngles.y)); + return ConeAngleFalloff; +} + + +float GetLocalLightAttenuation(float3 WorldPosition, + FDeferredLightData LightData, + inout float3 ToLight, inout float3 L) { + ToLight = LightData.Position - WorldPosition; + + float DistanceSqr = dot(ToLight, ToLight); + L = ToLight * rsqrt(DistanceSqr); + + float LightMask; + if (LightData.bInverseSquared) { + LightMask = + Square(saturate(1 - Square(DistanceSqr * Square(LightData.InvRadius)))); + } + else { + LightMask = RadialAttenuation(ToLight * LightData.InvRadius, + LightData.FalloffExponent); + } + + if (LightData.bSpotLight) { + LightMask *= SpotAttenuation(L, -LightData.Direction, LightData.SpotAngles); + } + + if (LightData.bRectLight) { + + LightMask = dot(LightData.Direction, L) < 0 ? 0 : LightMask; + } + + return LightMask; +} + + +// Example implementation of DistanceFromCameraFade() function +float DistanceFromCameraFade(float depth, FDeferredLightData lightData, float3 worldPosition, float3 cameraOrigin) +{ + // Calculate the distance between the world position and the camera origin + float distance = length(worldPosition - cameraOrigin); + + // Apply a fade function based on the distance + float fadeFactor = saturate(1.0f - distance / depth); + + return fadeFactor; +} + + +// Main function +void GetShadowTerms(FGBufferData GBuffer, FDeferredLightData LightData, + float3 WorldPosition, float3 L, float4 LightAttenuation, + float Dither, inout FShadowTerms Shadow) { + float ContactShadowLength = 0.0f; + const float ContactShadowLengthScreenScale = + View_ClipToView[1][1] * GBuffer.Depth; + + if (LightData.ShadowedBits) { + + float UsesStaticShadowMap = + dot(LightData.ShadowMapChannelMask, float4(1, 1, 1, 1)); + float StaticShadowing = lerp( + 1, + dot(GBuffer.PrecomputedShadowFactors, LightData.ShadowMapChannelMask), + UsesStaticShadowMap); + + if (LightData.bRadialLight) { + + 
Shadow.SurfaceShadow = LightAttenuation.z * StaticShadowing; + + Shadow.TransmissionShadow = LightAttenuation.w * StaticShadowing; + + Shadow.TransmissionThickness = LightAttenuation.w; + } + else { + + float DynamicShadowFraction = DistanceFromCameraFade( + GBuffer.Depth, LightData, WorldPosition, View_WorldCameraOrigin); + + Shadow.SurfaceShadow = + lerp(LightAttenuation.x, StaticShadowing, DynamicShadowFraction); + + Shadow.TransmissionShadow = + min(lerp(LightAttenuation.y, StaticShadowing, DynamicShadowFraction), + LightAttenuation.w); + + Shadow.SurfaceShadow *= LightAttenuation.z; + Shadow.TransmissionShadow *= LightAttenuation.z; + + Shadow.TransmissionThickness = + min(LightAttenuation.y, LightAttenuation.w); + } + + if (LightData.ShadowedBits > 1 && + LightData.ContactShadowLength > 0) { + ContactShadowLength = + LightData.ContactShadowLength * + (LightData.ContactShadowLengthInWS ? 1.0f + : ContactShadowLengthScreenScale); + } + } +} + + +void Init(inout BxDFContext Context, float3 N, float3 V, float3 L) { + Context.NoL = dot(N, L); + Context.NoV = dot(N, V); + Context.VoL = dot(V, L); + float InvLenH = rsqrt(2 + 2 * Context.VoL); + Context.NoH = saturate((Context.NoL + Context.NoV) * InvLenH); + Context.VoH = saturate(InvLenH + InvLenH * Context.VoL); +} + +float3 Diffuse_Lambert(float3 DiffuseColor) { + return DiffuseColor * (1 / PI); +} + +float3 SpecularGGX(float Roughness, float3 SpecularColor, BxDFContext Context, float NoL, FAreaLight AreaLight) { + // Calculation of GGX Specular term is complex, and involves Fresnel, Geometric, and Distribution functions. + // Here, we'll simplify it with a placeholder function. In actual practice, this function should compute the complete microfacet specular BRDF. 
+ float D = max(0.0, Context.NoH); // Placeholder distribution term (D) + float G = min(1.0, Context.NoV * Context.NoL); // Placeholder geometric term (G) + float3 F = SpecularColor; // Placeholder Fresnel term (F) + + // Combine all the terms + return (D * G * F) / (4 * NoL * Context.NoV); // Microfacet specular BRDF +} + +FDirectLighting DefaultLitBxDF(FGBufferData GBuffer, float3 N, float3 V, + float3 L, float Falloff, float NoL, + FAreaLight AreaLight, FShadowTerms Shadow) { + BxDFContext Context; + Init(Context, N, V, L); + Context.NoV = saturate(abs(Context.NoV) + 1e-5); + + FDirectLighting Lighting; + Lighting.Diffuse = AreaLight.FalloffColor * (Falloff * NoL) * + Diffuse_Lambert(GBuffer.DiffuseColor); + + + Lighting.Specular = AreaLight.FalloffColor * (Falloff * NoL) * + SpecularGGX(GBuffer.Roughness, GBuffer.SpecularColor, + Context, NoL, AreaLight); + + Lighting.Transmission = 0; + return Lighting; +} + +float Pow2(float x) { + return x * x; +} + +FDirectLighting IntegrateBxDF(FGBufferData GBuffer, float3 N, float3 V, FCapsuleLight Capsule, FShadowTerms Shadow, bool bInverseSquared) { + float NoL; + float Falloff; + float LineCosSubtended = 1; + + float DistSqr = dot(Capsule.LightPos[0], Capsule.LightPos[0]); + Falloff = rcp(DistSqr + Capsule.DistBiasSqr); + + float3 L = Capsule.LightPos[0] * rsqrt(DistSqr); + NoL = dot(N, L); + + NoL = saturate(NoL); + Falloff = bInverseSquared ? 
Falloff : 1;
+
+    float3 ToLight = Capsule.LightPos[0];
+
+    DistSqr = dot(ToLight, ToLight);
+    float InvDist = rsqrt(DistSqr);
+    L = ToLight * InvDist;
+
+    GBuffer.Roughness = max(GBuffer.Roughness, 0.02);
+    float a = Pow2(GBuffer.Roughness);
+
+    FAreaLight AreaLight;
+    AreaLight.SphereSinAlpha = saturate(Capsule.Radius * InvDist * (1 - a));
+    AreaLight.SphereSinAlphaSoft = saturate(Capsule.SoftRadius * InvDist);
+    AreaLight.LineCosSubtended = LineCosSubtended;
+    AreaLight.FalloffColor = 1;
+    AreaLight.Rect = (FRect)0;
+    AreaLight.bIsRect = false;
+    AreaLight.Texture = InitRectTexture(DummyRectLightTextureForCapsuleCompilerWarning);
+
+    return DefaultLitBxDF(GBuffer, N, V, L, Falloff, NoL, AreaLight, Shadow);
+}
+
+FLightAccumulator LightAccumulator_Add(
+    FLightAccumulator In, float3 TotalLight, float3 ScatterableLight,
+    float3 CommonMultiplier,
+    const bool bNeedsSeparateSubsurfaceLightAccumulation) {
+    // Accumulates TotalLight * CommonMultiplier into the copy and returns it; ScatterableLight and the subsurface flag are accepted but never read here.
+    In.TotalLight += TotalLight * CommonMultiplier;
+    return In;
+}
+
+float4 LightAccumulator_GetResult(FLightAccumulator In) {
+    float4 Ret;
+    // Packs the accumulated light into rgb; the fourth component is always 0.
+    Ret = float4(In.TotalLight, 0);
+    return Ret;
+}
+// Builds a capsule from LightData: the two end points sit half of SourceLength either side of ToLight along LightData.Tangent.
+FCapsuleLight GetCapsule(float3 ToLight, FDeferredLightData LightData) {
+    FCapsuleLight Capsule;
+    Capsule.Length = LightData.SourceLength;
+    Capsule.Radius = LightData.SourceRadius;
+    Capsule.SoftRadius = LightData.SoftSourceRadius;
+    Capsule.DistBiasSqr = 1.0f; // constant, not taken from LightData
+    Capsule.LightPos[0] = ToLight - 0.5 * Capsule.Length * LightData.Tangent;
+    Capsule.LightPos[1] = ToLight + 0.5 * Capsule.Length * LightData.Tangent;
+    return Capsule;
+}
+
+// Shades one pixel for one light: attenuation mask, shadow terms, BxDF integration, then accumulation. Returns the accumulator result (rgb = light, w = 0).
+float4 GetDynamicLighting(
+    float3 WorldPosition,
+    float3 CameraVector,             // negated to obtain the view vector V
+    FGBufferData GBuffer,
+    float AmbientOcclusion,          // initial value of Shadow.SurfaceShadow
+    uint ShadingModelID,             // not read; the body uses GBuffer.ShadingModelID instead
+    FDeferredLightData LightData,
+    float4 LightAttenuation,         // forwarded to GetShadowTerms
+    float Dither,                    // forwarded to GetShadowTerms
+    uint2 SVPos,                     // not read in this body
+    FRectTexture SourceTexture       // not read in this body
+) {
+    FLightAccumulator LightAccumulator = LightAccumulator_Init();
+    LightAccumulator.EstimatedCost += 0.3f;
+
+    float3 V = -CameraVector;
+    float3 N = GBuffer.WorldNormal;
+
+    float3 L = LightData.Direction;
+    float3 ToLight = L;
+
+    float LightMask = 1;
+    if (LightData.bRadialLight) {
+        LightMask = GetLocalLightAttenuation(WorldPosition, LightData, ToLight, L);
+    }
+
+    if (LightMask > 0) {
+        FShadowTerms Shadow;
+        Shadow.SurfaceShadow = AmbientOcclusion;
+        Shadow.TransmissionShadow = 1;
+        Shadow.TransmissionThickness = 1;
+        GetShadowTerms(GBuffer, LightData, WorldPosition, L, LightAttenuation, Dither, Shadow);
+
+        LightAccumulator.EstimatedCost += 0.3f;
+
+        if (Shadow.SurfaceShadow + Shadow.TransmissionShadow > 0) {
+            bool bNeedsSeparateSubsurfaceLightAccumulation = UseSubsurfaceProfile(GBuffer.ShadingModelID);
+            float3 LightColor = LightData.Color;
+
+            FDirectLighting Lighting;
+
+            FCapsuleLight Capsule = GetCapsule(ToLight, LightData);
+            Lighting = IntegrateBxDF(GBuffer, N, V, Capsule, Shadow, LightData.bInverseSquared);
+
+            Lighting.Specular *= LightData.SpecularScale;
+            // Diffuse + specular are weighted by the surface shadow, transmission by the transmission shadow.
+            LightAccumulator = LightAccumulator_Add(
+                LightAccumulator,
+                Lighting.Diffuse + Lighting.Specular,
+                Lighting.Diffuse,
+                LightColor * LightMask * Shadow.SurfaceShadow,
+                bNeedsSeparateSubsurfaceLightAccumulation
+            );
+            LightAccumulator = LightAccumulator_Add(
+                LightAccumulator,
+                Lighting.Transmission,
+                Lighting.Transmission,
+                LightColor * LightMask * Shadow.TransmissionShadow,
+                bNeedsSeparateSubsurfaceLightAccumulation
+            );
+
+            LightAccumulator.EstimatedCost += 0.4f;
+        }
+    }
+
+    return LightAccumulator_GetResult(LightAccumulator);
+}
+
+// Stub: ignores all four arguments and always returns 1.0, so the caller's multiply leaves the colour unchanged.
+float ComputeLightProfileMultiplier(
+    float3 WorldPosition,
+    float3 LightPosition,
+    float3 LightDirection,
+    float3 LightTangent
+) {
+    return 1.0f;
+}
+
+// Interpolants read by the pixel shaders below from vout.
+struct VertexOutput
+{
+    float4 OutScreenPosition : TEXCOORD0;
+    float4 OutPosition : SV_POSITION;
+};
+
+// Deferred-light pixel shader: returns 0 when the GBuffer ShadingModelID is 0, otherwise the result of GetDynamicLighting for the reconstructed world position.
+float4 DeferredLightPixelMain(VertexOutput vout) : SV_TARGET0
+{
+
+    //printf("DeferredLightPixelMain\n");
+
+    float4 InScreenPosition = vout.OutScreenPosition;
+    float4 SVPos = vout.OutPosition;
+    float4 OutColor = 0;
+
+    float2 ScreenUV = InScreenPosition.xy / InScreenPosition.w * View_ScreenPositionScaleBias.xy + View_ScreenPositionScaleBias.wz;
+
+    FScreenSpaceData ScreenSpaceData = GetScreenSpaceData(ScreenUV);
+
+    if (ScreenSpaceData.GBuffer.ShadingModelID > 0)
+    {
+        float SceneDepth = CalcSceneDepth(ScreenUV);
+        // The xy position is scaled by SceneDepth only when View_ViewToClip[3][3] < 1, otherwise by 1.
+        float2 ClipPosition = InScreenPosition.xy / InScreenPosition.w * (View_ViewToClip[3][3] < 1.0f ? SceneDepth : 1.0f);
+        float4 position = mul(float4(ClipPosition, SceneDepth, 1), View_ScreenToWorld);
+        float3 WorldPosition = position.xyz;
+        float3 CameraVector = normalize(WorldPosition - View_WorldCameraOrigin);
+
+        FDeferredLightData LightData = SetupLightDataForStandardDeferred();
+
+        float Dither = InterleavedGradientNoise(SVPos.xy, View_StateFrameIndexMod8);
+
+        FRectTexture RectTexture = InitRectTexture(DeferredLightUniforms_SourceTexture);
+        OutColor = GetDynamicLighting(WorldPosition, CameraVector, ScreenSpaceData.GBuffer, ScreenSpaceData.AmbientOcclusion, ScreenSpaceData.GBuffer.ShadingModelID, LightData, GetPerPixelLightAttenuation(ScreenUV), Dither, uint2(SVPos.xy), RectTexture);
+        OutColor *= ComputeLightProfileMultiplier(WorldPosition, DeferredLightUniforms_Position, -DeferredLightUniforms_Direction, DeferredLightUniforms_Tangent);
+    }
+
+    return OutColor;
+}
+
+
+// 29 two-dimensional sample offsets; CubemapHardwarePCF applies .x along SideVector and .y along UpVector and scales the depth bias by each offset's length.
+static const float2 DiscSamples29[]=
+{
+    float2(0.000000, 2.500000),
+    float2(1.016842, 2.283864),
+    float2(1.857862, 1.672826),
+    float2(2.377641, 0.772542),
+    float2(2.486305, -0.261321),
+    float2(2.165063, -1.250000),
+    float2(1.469463, -2.022543),
+    float2(0.519779, -2.445369),
+    float2(-0.519779, -2.445369),
+    float2(-1.469463, -2.022542),
+    float2(-2.165064, -1.250000),
+    float2(-2.486305, -0.261321),
+    float2(-2.377641, 0.772543),
+    float2(-1.857862, 1.672827),
+    float2(-1.016841, 2.283864),
+    float2(0.091021, -0.642186),
+    float2(0.698035, 0.100940),
+    float2(0.959731, -1.169393),
+    float2(-1.053880, 1.180380),
+    float2(-1.479156, -0.606937),
+    float2(-0.839488, -1.320002),
+    float2(1.438566, 0.705359),
+    float2(0.067064, -1.605197),
+    float2(0.728706, 1.344722),
+    float2(1.521424, -0.380184),
+    float2(-0.199515, 1.590091),
+    float2(-1.524323, 0.364010),
+    float2(-0.692694, -0.086749),
+    float2(-0.082476, 0.654088),
+};
+
+// Returns 1 when Distance * LightInvRadius >= 1; otherwise the mean of 29 SampleCmpLevelZero comparisons taken around the normalized light vector.
+float CubemapHardwarePCF(float3 WorldPosition, float3 LightPosition, float LightInvRadius, float DepthBias)
+{
+    float Shadow = 1;
+    float3 LightVector = LightPosition - WorldPosition.xyz;
+    float Distance = length(LightVector);
+    [branch]
+    if (Distance * LightInvRadius < 1.0f)
+    {
+        float3 NormalizedLightVector = LightVector / Distance;
+        float3 SideVector = normalize(cross(NormalizedLightVector, float3(0, 0, 1)));
+        float3 UpVector = cross(SideVector, NormalizedLightVector);
+        SideVector *= InvShadowmapResolution;
+        UpVector *= InvShadowmapResolution;
+        float3 AbsLightVector = abs(LightVector);
+        float MaxCoordinate = max(AbsLightVector.x, max(AbsLightVector.y, AbsLightVector.z));
+        int CubeFaceIndex = 0; // index into ShadowViewProjectionMatrices: dominant axis picks the pair, sign picks even/odd
+        if (MaxCoordinate == AbsLightVector.x)
+        {
+            CubeFaceIndex = AbsLightVector.x == LightVector.x ? 0 : 1;
+        }
+        else if (MaxCoordinate == AbsLightVector.y)
+        {
+            CubeFaceIndex = AbsLightVector.y == LightVector.y ? 2 : 3;
+        }
+        else
+        {
+            CubeFaceIndex = AbsLightVector.z == LightVector.z ? 4 : 5;
+        }
+        float4 ShadowPosition = mul(float4(WorldPosition.xyz, 1), ShadowViewProjectionMatrices[CubeFaceIndex]);
+        float CompareDistance = ShadowPosition.z / ShadowPosition.w;
+        float ShadowDepthBias = - DepthBias / ShadowPosition.w;
+        Shadow = 0;
+        [unroll] for(int i = 0; i < 29; ++i)
+        {
+            float3 SamplePos = NormalizedLightVector + SideVector * DiscSamples29[i].x + UpVector * DiscSamples29[i].y;
+            Shadow += ShadowDepthCubeTexture.SampleCmpLevelZero(
+                ShadowDepthCubeTextureSampler,
+                SamplePos.xy, // NOTE(review): only .xy of the 3-component direction is passed; confirm against the declared type of ShadowDepthCubeTexture
+                CompareDistance + ShadowDepthBias * length(DiscSamples29[i])).r;
+        }
+        Shadow /= 29;
+    }
+    return Shadow;
+}
+
+// Encodes an attenuation value as its square root.
+float EncodeLightAttenuation( float InColor)
+{
+    return sqrt(InColor);
+}
+
+// Point-light shadow pass: reconstructs the world position from scene depth, filters the shadow map, sharpens and fades it, and writes the encoded value to .b and .a.
+float4 MainOnePassPointLightShadowPS(
+    VertexOutput vout
+    ): SV_TARGET0
+{
+    float4 OutColor;
+    float2 ScreenUV = float2( vout.OutPosition.xy * View_BufferSizeAndInvSize.zw );
+    float SceneW = CalcSceneDepth( ScreenUV );
+    float2 ScreenPosition = ( ScreenUV.xy - View_ScreenPositionScaleBias.wz ) / View_ScreenPositionScaleBias.xy;
+    float4 position = mul(float4(ScreenPosition.xy * SceneW, SceneW, 1), View_ScreenToWorld);
+    float3 WorldPosition = position.xyz;
+    float3 LightVector = LightPositionAndInvRadius.xyz - WorldPosition.xyz; // not read again in this function
+    float Shadow = CubemapHardwarePCF(WorldPosition, LightPositionAndInvRadius.xyz, LightPositionAndInvRadius.w, PointLightDepthBiasAndProjParameters.x);
+    Shadow = saturate( (Shadow - 0.5) * ShadowSharpen + 0.5 );
+    float FadedShadow = lerp(1.0f, Square(Shadow), ShadowFadeFraction);
+    OutColor.b = EncodeLightAttenuation(FadedShadow);
+    OutColor.rga = 1; // r and g stay 1; a is overwritten on the next line
+    OutColor.a = OutColor.b;
+    return OutColor;
+}
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/DefferredLightingVertex.hlsl
b/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/DefferredLightingVertex.hlsl
new file mode 100644
index 000000000..6ad7bea59
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/DefferredLightingVertex.hlsl
@@ -0,0 +1,147 @@
+// Vertex stage of the deferred-lighting pass. A vertex comes either from the bound mesh
+// (NumSides == 0) or is generated from SV_VertexID as part of a cone with a rounded cap.
+
+cbuffer StencilingParametersConstantBuffer : register(b0)
+{
+    float4 StencilingGeometryPosAndScale; // xyz: offset added to the mesh vertex, w: uniform scale
+    float4 StencilingConeParameters;      // x: NumSides, y: NumSlices, z: ConeAngle, w: SphereRadius
+    float4x4 StencilingConeTransform;
+    float3 StencilingPreViewTranslation;
+    float padding;
+};
+
+
+cbuffer ViewConstantBuffer : register(b1)
+{
+    float4 View_InvDeviceZToWorldZTransform;
+    float4 View_TemporalAAParams;
+    float4 View_BufferSizeAndInvSize;
+    float4 View_DiffuseOverrideParameter;
+    float4 View_SpecularOverrideParameter;
+    float4x4 View_ClipToView;
+    float4x4 View_ViewToClip;
+    float4x4 View_ScreenToWorld;
+    float3 View_WorldCameraOrigin;
+    float Padding0;
+    float3 View_PreViewTranslation;
+    float Padding1;
+    float4 View_ScreenPositionScaleBias;
+    float4x4 View_TranslatedWorldToClip;
+    uint View_StateFrameIndexMod8;
+    float3 Padding; // Add padding to maintain 16-byte alignment
+};
+
+
+struct VertexInput
+{
+    float3 InPosition : POSITION;
+};
+
+
+struct VertexOutput
+{
+    float4 OutScreenPosition : TEXCOORD0;
+    float4 OutPosition : SV_POSITION;
+};
+
+const static float PI = 3.1415926535897932f;
+const static float MaxHalfFloat = 65504.0f;
+
+// Returns the position that both entry points below feed into View_TranslatedWorldToClip.
+// NumSides == 0: InPosition scaled and offset by StencilingGeometryPosAndScale.
+// NumSides != 0: a vertex generated from InVertexId; ids below NumSides * NumSlices lie on the cone side, the rest on the cap.
+// NOTE(review): both branches divide by (NumSlices - 1); callers are assumed to supply NumSlices >= 2.
+float3 ComputeStencilingWorldPosition(float3 InPosition, uint InVertexId)
+{
+    float3 WorldPosition = {0, 0, 0};
+    uint NumSides = StencilingConeParameters.x;
+
+    if (NumSides != 0)
+    {
+        float SphereRadius = StencilingConeParameters.w;
+        float ConeAngle = StencilingConeParameters.z;
+        const float InvCosRadiansPerSide = 1.0f / cos(PI / (float)NumSides);
+
+        const float ZRadius = SphereRadius * cos(ConeAngle);
+        const float TanConeAngle = tan(ConeAngle);
+
+        uint NumSlices = StencilingConeParameters.y;
+        uint CapIndexStart = NumSides * NumSlices;
+
+        float3 LocalPosition;
+        if (InVertexId < CapIndexStart)
+        {
+            // Cone side: slices are spaced evenly along local X from 0 to ZRadius.
+            uint SliceIndex = InVertexId / NumSides;
+            uint SideIndex = InVertexId % NumSides;
+
+            const float CurrentAngle = SideIndex * 2 * PI / (float)NumSides;
+            const float DistanceDownConeDirection =
+                ZRadius * SliceIndex / (float)(NumSlices - 1);
+            const float SliceRadius =
+                DistanceDownConeDirection * TanConeAngle * InvCosRadiansPerSide;
+
+            LocalPosition = float3(
+                DistanceDownConeDirection,
+                SliceRadius * sin(CurrentAngle), SliceRadius * cos(CurrentAngle));
+        }
+        else
+        {
+            // Cap: rings whose local X is chosen so the un-widened ring lies on the sphere of SphereRadius.
+            const float CapRadius = ZRadius * tan(ConeAngle);
+
+            uint VertexId = InVertexId - CapIndexStart;
+            uint SliceIndex = VertexId / NumSides;
+            uint SideIndex = VertexId % NumSides;
+
+            const float UnadjustedSliceRadius =
+                CapRadius * SliceIndex / (float)(NumSlices - 1);
+
+            const float SliceRadius = UnadjustedSliceRadius * InvCosRadiansPerSide;
+
+            const float ZDistance =
+                sqrt(SphereRadius * SphereRadius -
+                     UnadjustedSliceRadius * UnadjustedSliceRadius);
+
+            const float CurrentAngle = SideIndex * 2 * PI / (float)NumSides;
+            LocalPosition =
+                float3(ZDistance, SliceRadius * sin(CurrentAngle),
+                       SliceRadius * cos(CurrentAngle));
+        }
+
+        WorldPosition =
+            mul(float4(LocalPosition, 1), StencilingConeTransform).xyz +
+            StencilingPreViewTranslation;
+    }
+    else
+    {
+        WorldPosition = InPosition * StencilingGeometryPosAndScale.w +
+                        StencilingGeometryPosAndScale.xyz;
+    }
+
+    return WorldPosition;
+}
+
+// Entry point kept under its original name; produces exactly the same output as RadialVertexMain.
+VertexOutput RadialVertexMain1(VertexInput input, uint InVertexId : SV_VertexID)
+{
+    VertexOutput output;
+    const float3 WorldPosition = ComputeStencilingWorldPosition(input.InPosition, InVertexId);
+
+    output.OutScreenPosition = output.OutPosition =
+        mul(float4(WorldPosition, 1), View_TranslatedWorldToClip);
+
+    return output;
+}
+
+// Entry point: writes the projected position to both OutPosition and OutScreenPosition.
+VertexOutput RadialVertexMain(VertexInput input, uint InVertexId : SV_VertexID)
+{
+    VertexOutput output;
+    const float3 WorldPosition = ComputeStencilingWorldPosition(input.InPosition, InVertexId);
+
+    output.OutScreenPosition = output.OutPosition =
+        mul(float4(WorldPosition, 1), View_TranslatedWorldToClip);
+
+    return output;
+}
diff --git a/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/ShadowMap.hlsl
b/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/ShadowMap.hlsl
new file mode 100644
index 000000000..b31cdc1e3
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_render_performance/Shaders/ShadowMap.hlsl
@@ -0,0 +1,55 @@
+// Shadow-map pass: VS outputs a biased position for each vertex; PS has an empty body.
+cbuffer ObjectConstantBuffer : register(b0)
+{
+    float4x4 gWorld;    // applied to the object-space vertex in VS
+    float4x4 gViewProj; // passed by VS to SetShadowDepthOutputs
+};
+
+struct VertexIn
+{
+    float3 PosL : POSITION;
+    float2 TexC : TEXCOORD0; // not read by VS
+};
+
+struct VertexOut
+{
+    float4 PosH : SV_POSITION;
+};
+// Writes the transformed position to OutPosition (with z replaced by the biased depth times w) and the biased depth itself to ShadowDepth.
+void SetShadowDepthOutputs(float4 WorldPosition, float4x4 gViewProj, out float4 OutPosition, out float ShadowDepth)
+{
+    // Transform the world-space position by the gViewProj argument (the parameter shadows the cbuffer member of the same name)
+    OutPosition = mul(WorldPosition, gViewProj);
+
+    float DepthBias = 0.01;
+    float InvMaxSubjectDepth = 0.001;
+
+    // Scale z by InvMaxSubjectDepth and add the constant bias; z is then stored pre-multiplied by w (presumably so z / w equals ShadowDepth after the perspective divide)
+    ShadowDepth = OutPosition.z * InvMaxSubjectDepth + DepthBias;
+    OutPosition.z = ShadowDepth * OutPosition.w;
+}
+
+// Vertex shader: moves the vertex to world space with gWorld, then lets SetShadowDepthOutputs fill vout.PosH.
+VertexOut VS(VertexIn vin)
+{
+    VertexOut vout;
+
+    // Transform the vertex position from object / local space to world space
+    float4 WorldPos = mul(float4(vin.PosL, 1.0), gWorld);
+
+    float dummy; // receives the ShadowDepth output, which this shader does not use
+
+    SetShadowDepthOutputs(
+        WorldPos,
+        gViewProj,
+        vout.PosH,
+        dummy
+    );
+
+    return vout;
+}
+
+void PS(VertexOut pin)
+{
+    // Empty body: this pixel shader returns void and writes no colour output
+}
diff --git a/superbench/benchmarks/micro_benchmarks/directx_third_party/DeviceResources.cpp b/superbench/benchmarks/micro_benchmarks/directx_third_party/DeviceResources.cpp
new file mode 100644
index 000000000..9fa3a3bd6
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_third_party/DeviceResources.cpp
@@ -0,0 +1,670 @@
+//
+// DeviceResources.cpp - A wrapper for the Direct3D 12 device and swapchain
+//
+
+#include "DeviceResources.h"
+
+using namespace DirectX;
+using namespace DX;
+
+using Microsoft::WRL::ComPtr;
+
+#ifdef __clang__
+#pragma clang diagnostic ignored
"-Wcovered-switch-default" +#pragma clang diagnostic ignored "-Wswitch-enum" +#endif + +#pragma warning(disable : 4061) + +namespace { +inline DXGI_FORMAT NoSRGB(DXGI_FORMAT fmt) noexcept { + switch (fmt) { + case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB: + return DXGI_FORMAT_R8G8B8A8_UNORM; + case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB: + return DXGI_FORMAT_B8G8R8A8_UNORM; + case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB: + return DXGI_FORMAT_B8G8R8X8_UNORM; + default: + return fmt; + } +} + +inline long ComputeIntersectionArea(long ax1, long ay1, long ax2, long ay2, long bx1, long by1, long bx2, + long by2) noexcept { + return std::max(0l, std::min(ax2, bx2) - std::max(ax1, bx1)) * + std::max(0l, std::min(ay2, by2) - std::max(ay1, by1)); +} +} // namespace + +// Constructor for DeviceResources. +DeviceResources::DeviceResources(DXGI_FORMAT backBufferFormat, DXGI_FORMAT depthBufferFormat, UINT backBufferCount, + D3D_FEATURE_LEVEL minFeatureLevel, unsigned int flags) noexcept(false) + : m_backBufferIndex(0), m_fenceValues{}, m_rtvDescriptorSize(0), m_screenViewport{}, m_scissorRect{}, + m_backBufferFormat(backBufferFormat), m_depthBufferFormat(depthBufferFormat), m_backBufferCount(backBufferCount), + m_d3dMinFeatureLevel(minFeatureLevel), m_window(nullptr), m_d3dFeatureLevel(D3D_FEATURE_LEVEL_11_0), + m_dxgiFactoryFlags(0), m_outputSize{0, 0, 1, 1}, m_colorSpace(DXGI_COLOR_SPACE_RGB_FULL_G22_NONE_P709), + m_options(flags), m_deviceNotify(nullptr) { + if (backBufferCount < 2 || backBufferCount > MAX_BACK_BUFFER_COUNT) { + throw std::out_of_range("invalid backBufferCount"); + } + + if (minFeatureLevel < D3D_FEATURE_LEVEL_11_0) { + throw std::out_of_range("minFeatureLevel too low"); + } +} + +// Destructor for DeviceResources. +DeviceResources::~DeviceResources() { + // Ensure that the GPU is no longer referencing resources that are about to be destroyed. + WaitForGpu(); +} + +// Configures the Direct3D device, and stores handles to it and the device context. 
+void DeviceResources::CreateDeviceResources() {
+#if defined(_DEBUG)
+    // Enable the debug layer (requires the Graphics Tools "optional feature").
+    //
+    // NOTE: Enabling the debug layer after device creation will invalidate the active device.
+    {
+        ComPtr<ID3D12Debug> debugController;
+        if (SUCCEEDED(D3D12GetDebugInterface(IID_PPV_ARGS(debugController.GetAddressOf())))) {
+            debugController->EnableDebugLayer();
+        } else {
+            OutputDebugStringA("WARNING: Direct3D Debug Device is not available\n");
+        }
+
+        ComPtr<IDXGIInfoQueue> dxgiInfoQueue;
+        if (SUCCEEDED(DXGIGetDebugInterface1(0, IID_PPV_ARGS(dxgiInfoQueue.GetAddressOf())))) {
+            m_dxgiFactoryFlags = DXGI_CREATE_FACTORY_DEBUG;
+
+            dxgiInfoQueue->SetBreakOnSeverity(DXGI_DEBUG_ALL, DXGI_INFO_QUEUE_MESSAGE_SEVERITY_ERROR, true);
+            dxgiInfoQueue->SetBreakOnSeverity(DXGI_DEBUG_ALL, DXGI_INFO_QUEUE_MESSAGE_SEVERITY_CORRUPTION, true);
+
+            DXGI_INFO_QUEUE_MESSAGE_ID hide[] = {
+                80 /* IDXGISwapChain::GetContainingOutput: The swapchain's adapter does not control the output on which
+                      the swapchain's window resides. */
+                ,
+            };
+            DXGI_INFO_QUEUE_FILTER filter = {};
+            filter.DenyList.NumIDs = static_cast<UINT>(std::size(hide));
+            filter.DenyList.pIDList = hide;
+            dxgiInfoQueue->AddStorageFilterEntries(DXGI_DEBUG_DXGI, &filter);
+        }
+    }
+#endif
+
+    ThrowIfFailed(CreateDXGIFactory2(m_dxgiFactoryFlags, IID_PPV_ARGS(m_dxgiFactory.ReleaseAndGetAddressOf())));
+
+    // Determines whether tearing support is available for fullscreen borderless windows.
+    if (m_options & c_AllowTearing) {
+        BOOL allowTearing = FALSE;
+        HRESULT hr =
+            m_dxgiFactory->CheckFeatureSupport(DXGI_FEATURE_PRESENT_ALLOW_TEARING, &allowTearing, sizeof(allowTearing));
+        if (FAILED(hr) || !allowTearing) {
+            m_options &= ~c_AllowTearing;
+#ifdef _DEBUG
+            OutputDebugStringA("WARNING: Variable refresh rate displays not supported");
+#endif
+        }
+    }
+
+    ComPtr<IDXGIAdapter1> adapter;
+    GetAdapter(adapter.GetAddressOf());
+
+    // Create the DX12 API device object.
+    HRESULT hr =
+        D3D12CreateDevice(adapter.Get(), m_d3dMinFeatureLevel, IID_PPV_ARGS(m_d3dDevice.ReleaseAndGetAddressOf()));
+    ThrowIfFailed(hr);
+
+    m_d3dDevice->SetName(L"DeviceResources");
+
+#ifndef NDEBUG
+    // Configure debug device (if active).
+    ComPtr<ID3D12InfoQueue> d3dInfoQueue;
+    if (SUCCEEDED(m_d3dDevice.As(&d3dInfoQueue))) {
+#ifdef _DEBUG
+        d3dInfoQueue->SetBreakOnSeverity(D3D12_MESSAGE_SEVERITY_CORRUPTION, true);
+        d3dInfoQueue->SetBreakOnSeverity(D3D12_MESSAGE_SEVERITY_ERROR, true);
+#endif
+        D3D12_MESSAGE_ID hide[] = {
+            D3D12_MESSAGE_ID_MAP_INVALID_NULLRANGE,
+            D3D12_MESSAGE_ID_UNMAP_INVALID_NULLRANGE,
+            // Workarounds for debug layer issues on hybrid-graphics systems
+            D3D12_MESSAGE_ID_EXECUTECOMMANDLISTS_WRONGSWAPCHAINBUFFERREFERENCE,
+            D3D12_MESSAGE_ID_RESOURCE_BARRIER_MISMATCHING_COMMAND_LIST_TYPE,
+        };
+        D3D12_INFO_QUEUE_FILTER filter = {};
+        filter.DenyList.NumIDs = static_cast<UINT>(std::size(hide));
+        filter.DenyList.pIDList = hide;
+        d3dInfoQueue->AddStorageFilterEntries(&filter);
+    }
+#endif
+
+    // Determine maximum supported feature level for this device
+    static const D3D_FEATURE_LEVEL s_featureLevels[] = {
+#if defined(NTDDI_WIN10_FE) || defined(USING_D3D12_AGILITY_SDK)
+        D3D_FEATURE_LEVEL_12_2,
+#endif
+        D3D_FEATURE_LEVEL_12_1,
+        D3D_FEATURE_LEVEL_12_0,
+        D3D_FEATURE_LEVEL_11_1,
+        D3D_FEATURE_LEVEL_11_0,
+    };
+
+    D3D12_FEATURE_DATA_FEATURE_LEVELS featLevels = {static_cast<UINT>(std::size(s_featureLevels)), s_featureLevels,
+                                                    D3D_FEATURE_LEVEL_11_0};
+
+    hr = m_d3dDevice->CheckFeatureSupport(D3D12_FEATURE_FEATURE_LEVELS, &featLevels, sizeof(featLevels));
+    if (SUCCEEDED(hr)) {
+        m_d3dFeatureLevel = featLevels.MaxSupportedFeatureLevel;
+    } else {
+        m_d3dFeatureLevel = m_d3dMinFeatureLevel;
+    }
+
+    // Create the command queue.
+    D3D12_COMMAND_QUEUE_DESC queueDesc = {};
+    queueDesc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE;
+    queueDesc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
+
+    ThrowIfFailed(m_d3dDevice->CreateCommandQueue(&queueDesc, IID_PPV_ARGS(m_commandQueue.ReleaseAndGetAddressOf())));
+
+    m_commandQueue->SetName(L"DeviceResources");
+
+    // Create descriptor heaps for render target views and depth stencil views.
+    D3D12_DESCRIPTOR_HEAP_DESC rtvDescriptorHeapDesc = {};
+    rtvDescriptorHeapDesc.NumDescriptors = m_backBufferCount;
+    rtvDescriptorHeapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_RTV;
+
+    ThrowIfFailed(m_d3dDevice->CreateDescriptorHeap(&rtvDescriptorHeapDesc,
+                                                    IID_PPV_ARGS(m_rtvDescriptorHeap.ReleaseAndGetAddressOf())));
+
+    m_rtvDescriptorHeap->SetName(L"DeviceResources");
+
+    m_rtvDescriptorSize = m_d3dDevice->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_RTV);
+
+    if (m_depthBufferFormat != DXGI_FORMAT_UNKNOWN) {
+        D3D12_DESCRIPTOR_HEAP_DESC dsvDescriptorHeapDesc = {};
+        dsvDescriptorHeapDesc.NumDescriptors = 1;
+        dsvDescriptorHeapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_DSV;
+
+        ThrowIfFailed(m_d3dDevice->CreateDescriptorHeap(&dsvDescriptorHeapDesc,
+                                                        IID_PPV_ARGS(m_dsvDescriptorHeap.ReleaseAndGetAddressOf())));
+
+        m_dsvDescriptorHeap->SetName(L"DeviceResources");
+    }
+
+    // Create a command allocator for each back buffer that will be rendered to.
+    for (UINT n = 0; n < m_backBufferCount; n++) {
+        ThrowIfFailed(m_d3dDevice->CreateCommandAllocator(
+            D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(m_commandAllocators[n].ReleaseAndGetAddressOf())));
+
+        wchar_t name[25] = {};
+        swprintf_s(name, L"Render target %u", n); // NOTE(review): allocators get the same debug label as the back buffers
+        m_commandAllocators[n]->SetName(name);
+    }
+
+    // Create a command list for recording graphics commands.
+    ThrowIfFailed(m_d3dDevice->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, m_commandAllocators[0].Get(),
+                                                 nullptr, IID_PPV_ARGS(m_commandList.ReleaseAndGetAddressOf())));
+    ThrowIfFailed(m_commandList->Close());
+
+    m_commandList->SetName(L"DeviceResources");
+
+    // Create a fence for tracking GPU execution progress.
+    ThrowIfFailed(m_d3dDevice->CreateFence(m_fenceValues[m_backBufferIndex], D3D12_FENCE_FLAG_NONE,
+                                           IID_PPV_ARGS(m_fence.ReleaseAndGetAddressOf())));
+    m_fenceValues[m_backBufferIndex]++;
+
+    m_fence->SetName(L"DeviceResources");
+
+    m_fenceEvent.Attach(CreateEventEx(nullptr, nullptr, 0, EVENT_MODIFY_STATE | SYNCHRONIZE));
+    if (!m_fenceEvent.IsValid()) {
+        throw std::system_error(std::error_code(static_cast<int>(GetLastError()), std::system_category()),
+                                "CreateEventEx");
+    }
+}
+
+// Recreates the swap chain buffers, their views, the depth buffer, viewport and scissor; throws std::logic_error if SetWindow was never called.
+void DeviceResources::CreateWindowSizeDependentResources() {
+    if (!m_window) {
+        throw std::logic_error("Call SetWindow with a valid Win32 window handle");
+    }
+
+    // Wait until all previous GPU work is complete.
+    WaitForGpu();
+
+    // Release resources that are tied to the swap chain and update fence values.
+    for (UINT n = 0; n < m_backBufferCount; n++) {
+        m_renderTargets[n].Reset();
+        m_fenceValues[n] = m_fenceValues[m_backBufferIndex];
+    }
+
+    // Determine the render target size in pixels (never smaller than 1x1).
+    const UINT backBufferWidth = std::max<UINT>(static_cast<UINT>(m_outputSize.right - m_outputSize.left), 1u);
+    const UINT backBufferHeight = std::max<UINT>(static_cast<UINT>(m_outputSize.bottom - m_outputSize.top), 1u);
+    const DXGI_FORMAT backBufferFormat = NoSRGB(m_backBufferFormat);
+
+    // If the swap chain already exists, resize it, otherwise create one.
+    if (m_swapChain) {
+        // If the swap chain already exists, resize it.
+        HRESULT hr = m_swapChain->ResizeBuffers(m_backBufferCount, backBufferWidth, backBufferHeight, backBufferFormat,
+                                                (m_options & c_AllowTearing) ? DXGI_SWAP_CHAIN_FLAG_ALLOW_TEARING : 0u);
+
+        if (hr == DXGI_ERROR_DEVICE_REMOVED || hr == DXGI_ERROR_DEVICE_RESET) {
+#ifdef _DEBUG
+            char buff[64] = {};
+            sprintf_s(buff, "Device Lost on ResizeBuffers: Reason code 0x%08X\n",
+                      static_cast<unsigned int>(
+                          (hr == DXGI_ERROR_DEVICE_REMOVED) ? m_d3dDevice->GetDeviceRemovedReason() : hr));
+            OutputDebugStringA(buff);
+#endif
+            // If the device was removed for any reason, a new device and swap chain will need to be created.
+            HandleDeviceLost();
+
+            // Everything is set up now. Do not continue execution of this method. HandleDeviceLost will reenter this
+            // method and correctly set up the new device.
+            return;
+        } else {
+            ThrowIfFailed(hr);
+        }
+    } else {
+        // Create a descriptor for the swap chain.
+        DXGI_SWAP_CHAIN_DESC1 swapChainDesc = {};
+        swapChainDesc.Width = backBufferWidth;
+        swapChainDesc.Height = backBufferHeight;
+        swapChainDesc.Format = backBufferFormat;
+        swapChainDesc.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT;
+        swapChainDesc.BufferCount = m_backBufferCount;
+        swapChainDesc.SampleDesc.Count = 1;
+        swapChainDesc.SampleDesc.Quality = 0;
+        swapChainDesc.Scaling = DXGI_SCALING_STRETCH;
+        swapChainDesc.SwapEffect = DXGI_SWAP_EFFECT_FLIP_DISCARD;
+        swapChainDesc.AlphaMode = DXGI_ALPHA_MODE_IGNORE;
+        swapChainDesc.Flags = (m_options & c_AllowTearing) ? DXGI_SWAP_CHAIN_FLAG_ALLOW_TEARING : 0u;
+
+        DXGI_SWAP_CHAIN_FULLSCREEN_DESC fsSwapChainDesc = {};
+        fsSwapChainDesc.Windowed = TRUE;
+
+        // Create a swap chain for the window.
+        ComPtr<IDXGISwapChain1> swapChain;
+        ThrowIfFailed(m_dxgiFactory->CreateSwapChainForHwnd(m_commandQueue.Get(), m_window, &swapChainDesc,
+                                                            &fsSwapChainDesc, nullptr, swapChain.GetAddressOf()));
+
+        ThrowIfFailed(swapChain.As(&m_swapChain));
+
+        // This class does not support exclusive full-screen mode and prevents DXGI from responding to the ALT+ENTER
+        // shortcut
+        ThrowIfFailed(m_dxgiFactory->MakeWindowAssociation(m_window, DXGI_MWA_NO_ALT_ENTER));
+    }
+
+    // Handle color space settings for HDR
+    UpdateColorSpace();
+
+    // Obtain the back buffers for this window which will be the final render targets
+    // and create render target views for each of them.
+    for (UINT n = 0; n < m_backBufferCount; n++) {
+        ThrowIfFailed(m_swapChain->GetBuffer(n, IID_PPV_ARGS(m_renderTargets[n].GetAddressOf())));
+
+        wchar_t name[25] = {};
+        swprintf_s(name, L"Render target %u", n);
+        m_renderTargets[n]->SetName(name);
+
+        D3D12_RENDER_TARGET_VIEW_DESC rtvDesc = {};
+        rtvDesc.Format = m_backBufferFormat;
+        rtvDesc.ViewDimension = D3D12_RTV_DIMENSION_TEXTURE2D;
+
+        const CD3DX12_CPU_DESCRIPTOR_HANDLE rtvDescriptor(m_rtvDescriptorHeap->GetCPUDescriptorHandleForHeapStart(),
+                                                          static_cast<INT>(n), m_rtvDescriptorSize);
+        m_d3dDevice->CreateRenderTargetView(m_renderTargets[n].Get(), &rtvDesc, rtvDescriptor);
+    }
+
+    // Reset the index to the current back buffer.
+    m_backBufferIndex = m_swapChain->GetCurrentBackBufferIndex();
+
+    if (m_depthBufferFormat != DXGI_FORMAT_UNKNOWN) {
+        // Allocate a 2-D surface as the depth/stencil buffer and create a depth/stencil view
+        // on this surface.
+        const CD3DX12_HEAP_PROPERTIES depthHeapProperties(D3D12_HEAP_TYPE_DEFAULT);
+
+        D3D12_RESOURCE_DESC depthStencilDesc =
+            CD3DX12_RESOURCE_DESC::Tex2D(m_depthBufferFormat, backBufferWidth, backBufferHeight,
+                                         1, // This depth stencil view has only one texture.
+                                         1  // Use a single mipmap level.
+            );
+        depthStencilDesc.Flags |= D3D12_RESOURCE_FLAG_ALLOW_DEPTH_STENCIL;
+
+        const CD3DX12_CLEAR_VALUE depthOptimizedClearValue(m_depthBufferFormat,
+                                                           (m_options & c_ReverseDepth) ? 0.0f : 1.0f, 0u);
+
+        ThrowIfFailed(m_d3dDevice->CreateCommittedResource(
+            &depthHeapProperties, D3D12_HEAP_FLAG_NONE, &depthStencilDesc, D3D12_RESOURCE_STATE_DEPTH_WRITE,
+            &depthOptimizedClearValue, IID_PPV_ARGS(m_depthStencil.ReleaseAndGetAddressOf())));
+
+        m_depthStencil->SetName(L"Depth stencil");
+
+        D3D12_DEPTH_STENCIL_VIEW_DESC dsvDesc = {};
+        dsvDesc.Format = m_depthBufferFormat;
+        dsvDesc.ViewDimension = D3D12_DSV_DIMENSION_TEXTURE2D;
+
+        m_d3dDevice->CreateDepthStencilView(m_depthStencil.Get(), &dsvDesc,
+                                            m_dsvDescriptorHeap->GetCPUDescriptorHandleForHeapStart());
+    }
+
+    // Set the 3D rendering viewport and scissor rectangle to target the entire window.
+    m_screenViewport.TopLeftX = m_screenViewport.TopLeftY = 0.f;
+    m_screenViewport.Width = static_cast<float>(backBufferWidth);
+    m_screenViewport.Height = static_cast<float>(backBufferHeight);
+    m_screenViewport.MinDepth = D3D12_MIN_DEPTH;
+    m_screenViewport.MaxDepth = D3D12_MAX_DEPTH;
+
+    m_scissorRect.left = m_scissorRect.top = 0;
+    m_scissorRect.right = static_cast<LONG>(backBufferWidth);
+    m_scissorRect.bottom = static_cast<LONG>(backBufferHeight);
+}
+
+// Records the window handle and its client size; called when the Win32 window is created (or re-created).
+void DeviceResources::SetWindow(HWND window, int width, int height) noexcept {
+    m_window = window;
+
+    m_outputSize.left = m_outputSize.top = 0;
+    m_outputSize.right = static_cast<long>(width);
+    m_outputSize.bottom = static_cast<long>(height);
+}
+
+// This method is called when the Win32 window changes size.
+bool DeviceResources::WindowSizeChanged(int width, int height) { + if (!m_window) + return false; + + RECT newRc; + newRc.left = newRc.top = 0; + newRc.right = static_cast(width); + newRc.bottom = static_cast(height); + if (newRc.right == m_outputSize.right && newRc.bottom == m_outputSize.bottom) { + // Handle color space settings for HDR + UpdateColorSpace(); + + return false; + } + + m_outputSize = newRc; + CreateWindowSizeDependentResources(); + return true; +} + +// Recreate all device resources and set them back to the current state. +void DeviceResources::HandleDeviceLost() { + if (m_deviceNotify) { + m_deviceNotify->OnDeviceLost(); + } + + for (UINT n = 0; n < m_backBufferCount; n++) { + m_commandAllocators[n].Reset(); + m_renderTargets[n].Reset(); + } + + m_depthStencil.Reset(); + m_commandQueue.Reset(); + m_commandList.Reset(); + m_fence.Reset(); + m_rtvDescriptorHeap.Reset(); + m_dsvDescriptorHeap.Reset(); + m_swapChain.Reset(); + m_d3dDevice.Reset(); + m_dxgiFactory.Reset(); + +#ifdef _DEBUG + { + ComPtr dxgiDebug; + if (SUCCEEDED(DXGIGetDebugInterface1(0, IID_PPV_ARGS(&dxgiDebug)))) { + dxgiDebug->ReportLiveObjects(DXGI_DEBUG_ALL, + DXGI_DEBUG_RLO_FLAGS(DXGI_DEBUG_RLO_SUMMARY | DXGI_DEBUG_RLO_IGNORE_INTERNAL)); + } + } +#endif + + CreateDeviceResources(); + CreateWindowSizeDependentResources(); + + if (m_deviceNotify) { + m_deviceNotify->OnDeviceRestored(); + } +} + +// Prepare the command list and render target for rendering. +void DeviceResources::Prepare(D3D12_RESOURCE_STATES beforeState, D3D12_RESOURCE_STATES afterState) { + // Reset command list and allocator. + ThrowIfFailed(m_commandAllocators[m_backBufferIndex]->Reset()); + ThrowIfFailed(m_commandList->Reset(m_commandAllocators[m_backBufferIndex].Get(), nullptr)); + + if (beforeState != afterState) { + // Transition the render target into the correct state to allow for drawing into it. 
+ const D3D12_RESOURCE_BARRIER barrier = + CD3DX12_RESOURCE_BARRIER::Transition(m_renderTargets[m_backBufferIndex].Get(), beforeState, afterState); + m_commandList->ResourceBarrier(1, &barrier); + } +} + +// Present the contents of the swap chain to the screen. +void DeviceResources::Present(D3D12_RESOURCE_STATES beforeState) { + if (beforeState != D3D12_RESOURCE_STATE_PRESENT) { + // Transition the render target to the state that allows it to be presented to the display. + const D3D12_RESOURCE_BARRIER barrier = CD3DX12_RESOURCE_BARRIER::Transition( + m_renderTargets[m_backBufferIndex].Get(), beforeState, D3D12_RESOURCE_STATE_PRESENT); + m_commandList->ResourceBarrier(1, &barrier); + } + + // Send the command list off to the GPU for processing. + ThrowIfFailed(m_commandList->Close()); + m_commandQueue->ExecuteCommandLists(1, CommandListCast(m_commandList.GetAddressOf())); + + HRESULT hr; + if (m_options & c_AllowTearing) { + // Recommended to always use tearing if supported when using a sync interval of 0. + // Note this will fail if in true 'fullscreen' mode. + hr = m_swapChain->Present(0, DXGI_PRESENT_ALLOW_TEARING); + } else { + // The first argument instructs DXGI to block until VSync, putting the application + // to sleep until the next VSync. This ensures we don't waste any cycles rendering + // frames that will never be displayed to the screen. + hr = m_swapChain->Present(1, 0); + } + + // If the device was reset we must completely reinitialize the renderer. + if (hr == DXGI_ERROR_DEVICE_REMOVED || hr == DXGI_ERROR_DEVICE_RESET) { +#ifdef _DEBUG + char buff[64] = {}; + sprintf_s( + buff, "Device Lost on Present: Reason code 0x%08X\n", + static_cast((hr == DXGI_ERROR_DEVICE_REMOVED) ? m_d3dDevice->GetDeviceRemovedReason() : hr)); + OutputDebugStringA(buff); +#endif + HandleDeviceLost(); + } else { + ThrowIfFailed(hr); + + MoveToNextFrame(); + + if (!m_dxgiFactory->IsCurrent()) { + UpdateColorSpace(); + } + } +} + +// Wait for pending GPU work to complete. 
+void DeviceResources::WaitForGpu() noexcept {
+    // Best effort: every failure below is ignored rather than thrown, in keeping with noexcept.
+    if (m_commandQueue && m_fence && m_fenceEvent.IsValid()) {
+        // Schedule a Signal command in the GPU queue.
+        const UINT64 fenceValue = m_fenceValues[m_backBufferIndex];
+        if (SUCCEEDED(m_commandQueue->Signal(m_fence.Get(), fenceValue))) {
+            // Wait until the Signal has been processed.
+            if (SUCCEEDED(m_fence->SetEventOnCompletion(fenceValue, m_fenceEvent.Get()))) {
+                std::ignore = WaitForSingleObjectEx(m_fenceEvent.Get(), INFINITE, FALSE);
+
+                // Increment the fence value for the current frame.
+                m_fenceValues[m_backBufferIndex]++;
+            }
+        }
+    }
+}
+
+// Prepare to render the next frame.
+// Signals the fence for the frame just submitted, switches to the swap chain's current back buffer
+// and blocks until that buffer's previous work has completed.
+void DeviceResources::MoveToNextFrame() {
+    // Schedule a Signal command in the queue.
+    const UINT64 currentFenceValue = m_fenceValues[m_backBufferIndex];
+    ThrowIfFailed(m_commandQueue->Signal(m_fence.Get(), currentFenceValue));
+
+    // Update the back buffer index.
+    m_backBufferIndex = m_swapChain->GetCurrentBackBufferIndex();
+
+    // If the next frame is not ready to be rendered yet, wait until it is ready.
+    if (m_fence->GetCompletedValue() < m_fenceValues[m_backBufferIndex]) {
+        ThrowIfFailed(m_fence->SetEventOnCompletion(m_fenceValues[m_backBufferIndex], m_fenceEvent.Get()));
+        std::ignore = WaitForSingleObjectEx(m_fenceEvent.Get(), INFINITE, FALSE);
+    }
+
+    // Set the fence value for the next frame.
+    m_fenceValues[m_backBufferIndex] = currentFenceValue + 1;
+}
+
+// This method acquires the first available hardware adapter that supports Direct3D 12.
+// If no such adapter can be found, try WARP. Otherwise throw an exception.
+void DeviceResources::GetAdapter(IDXGIAdapter1 **ppAdapter) {
+    // On success *ppAdapter receives the adapter detached from the local ComPtr, so the caller
+    // takes over that reference. Throws std::runtime_error when no adapter qualifies.
+    *ppAdapter = nullptr;
+
+    // NOTE(review): ComPtr template arguments and cast target types are missing throughout this copy
+    // (e.g. "ComPtr adapter;"), presumably lost in extraction - restore them from upstream.
+    ComPtr adapter;
+    // Adapters are enumerated in high-performance preference order; the first hardware adapter on
+    // which a D3D12 device could be created at m_d3dMinFeatureLevel wins.
+    for (UINT adapterIndex = 0; SUCCEEDED(m_dxgiFactory->EnumAdapterByGpuPreference(
+                                    adapterIndex, DXGI_GPU_PREFERENCE_HIGH_PERFORMANCE, IID_PPV_ARGS(adapter.ReleaseAndGetAddressOf())));
+         adapterIndex++) {
+        DXGI_ADAPTER_DESC1 desc;
+        ThrowIfFailed(adapter->GetDesc1(&desc));
+
+        if (desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE) {
+            // Don't select the Basic Render Driver adapter.
+            continue;
+        }
+
+        // Check to see if the adapter supports Direct3D 12, but don't create the actual device yet.
+        if (SUCCEEDED(D3D12CreateDevice(adapter.Get(), m_d3dMinFeatureLevel, __uuidof(ID3D12Device), nullptr))) {
+#ifdef _DEBUG
+            wchar_t buff[256] = {};
+            swprintf_s(buff, L"Direct3D Adapter (%u): VID:%04X, PID:%04X - %ls\n", adapterIndex, desc.VendorId,
+                       desc.DeviceId, desc.Description);
+            OutputDebugStringW(buff);
+#endif
+            break;
+        }
+    }
+
+    // The WARP fallback is compiled in only when NDEBUG is not defined.
+#if !defined(NDEBUG)
+    if (!adapter) {
+        // Try WARP12 instead
+        if (FAILED(m_dxgiFactory->EnumWarpAdapter(IID_PPV_ARGS(adapter.ReleaseAndGetAddressOf())))) {
+            throw std::runtime_error("WARP12 not available. Enable the 'Graphics Tools' optional feature");
+        }
+
+        OutputDebugStringA("Direct3D Adapter - WARP12\n");
+    }
+#endif
+
+    if (!adapter) {
+        throw std::runtime_error("No Direct3D 12 device found");
+    }
+
+    *ppAdapter = adapter.Detach();
+}
+
+// Sets the color space for the swap chain in order to handle HDR output.
+// m_colorSpace is always updated; the swap chain itself is only switched when it reports
+// present support for the chosen color space.
+void DeviceResources::UpdateColorSpace() {
+    if (!m_dxgiFactory)
+        return;
+
+    if (!m_dxgiFactory->IsCurrent()) {
+        // Output information is cached on the DXGI Factory. If it is stale we need to create a new factory.
+        ThrowIfFailed(CreateDXGIFactory2(m_dxgiFactoryFlags, IID_PPV_ARGS(m_dxgiFactory.ReleaseAndGetAddressOf())));
+    }
+
+    // Default (non-HDR) color space; replaced below only when HDR is both requested and available.
+    DXGI_COLOR_SPACE_TYPE colorSpace = DXGI_COLOR_SPACE_RGB_FULL_G22_NONE_P709;
+
+    bool isDisplayHDR10 = false;
+
+    if (m_swapChain) {
+        // To detect HDR support, we will need to check the color space in the primary
+        // DXGI output associated with the app at this point in time
+        // (using window/display intersection).
+
+        // Get the rectangle bounds of the app window.
+        RECT windowBounds;
+        if (!GetWindowRect(m_window, &windowBounds))
+            throw std::system_error(std::error_code(static_cast(GetLastError()), std::system_category()),
+                                    "GetWindowRect");
+
+        const long ax1 = windowBounds.left;
+        const long ay1 = windowBounds.top;
+        const long ax2 = windowBounds.right;
+        const long ay2 = windowBounds.bottom;
+
+        ComPtr bestOutput;
+        // Starts below zero so the first enumerated output is accepted even with no overlap at all.
+        long bestIntersectArea = -1;
+
+        ComPtr adapter;
+        for (UINT adapterIndex = 0;
+             SUCCEEDED(m_dxgiFactory->EnumAdapters(adapterIndex, adapter.ReleaseAndGetAddressOf())); ++adapterIndex) {
+            ComPtr output;
+            for (UINT outputIndex = 0; SUCCEEDED(adapter->EnumOutputs(outputIndex, output.ReleaseAndGetAddressOf()));
+                 ++outputIndex) {
+                // Get the rectangle bounds of current output.
+                DXGI_OUTPUT_DESC desc;
+                ThrowIfFailed(output->GetDesc(&desc));
+                const auto &r = desc.DesktopCoordinates;
+
+                // Compute the intersection
+                // (ComputeIntersectionArea is defined outside this view; presumably it returns the overlap area.)
+                const long intersectArea =
+                    ComputeIntersectionArea(ax1, ay1, ax2, ay2, r.left, r.top, r.right, r.bottom);
+                if (intersectArea > bestIntersectArea) {
+                    bestOutput.Swap(output);
+                    bestIntersectArea = intersectArea;
+                }
+            }
+        }
+
+        if (bestOutput) {
+            ComPtr output6;
+            if (SUCCEEDED(bestOutput.As(&output6))) {
+                DXGI_OUTPUT_DESC1 desc;
+                ThrowIfFailed(output6->GetDesc1(&desc));
+
+                if (desc.ColorSpace == DXGI_COLOR_SPACE_RGB_FULL_G2084_NONE_P2020) {
+                    // Display output is HDR10.
+                    isDisplayHDR10 = true;
+                }
+            }
+        }
+    }
+
+    // HDR is used only when the caller asked for it (c_EnableHDR) and the display reports HDR10;
+    // back buffer formats other than the two below keep the default color space.
+    if ((m_options & c_EnableHDR) && isDisplayHDR10) {
+        switch (m_backBufferFormat) {
+        case DXGI_FORMAT_R10G10B10A2_UNORM:
+            // The application creates the HDR10 signal.
+            colorSpace = DXGI_COLOR_SPACE_RGB_FULL_G2084_NONE_P2020;
+            break;
+
+        case DXGI_FORMAT_R16G16B16A16_FLOAT:
+            // The system creates the HDR10 signal; application uses linear values.
+            colorSpace = DXGI_COLOR_SPACE_RGB_FULL_G10_NONE_P709;
+            break;
+
+        default:
+            break;
+        }
+    }
+
+    m_colorSpace = colorSpace;
+
+    UINT colorSpaceSupport = 0;
+    if (m_swapChain && SUCCEEDED(m_swapChain->CheckColorSpaceSupport(colorSpace, &colorSpaceSupport)) &&
+        (colorSpaceSupport & DXGI_SWAP_CHAIN_COLOR_SPACE_SUPPORT_FLAG_PRESENT)) {
+        ThrowIfFailed(m_swapChain->SetColorSpace1(colorSpace));
+    }
+}
diff --git a/superbench/benchmarks/micro_benchmarks/directx_third_party/DeviceResources.h b/superbench/benchmarks/micro_benchmarks/directx_third_party/DeviceResources.h
new file mode 100644
index 000000000..792e533e8
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_third_party/DeviceResources.h
@@ -0,0 +1,138 @@
+//
+// DeviceResources.h - A wrapper for the Direct3D 12 device and swapchain
+//
+
+#pragma once
+
+#include
+#include
+
+#include "pch.h"
+
+namespace DX {
+// Provides an interface for an application that owns DeviceResources to be notified of the device being lost or
+// created.
+interface IDeviceNotify {
+    virtual void OnDeviceLost() = 0;
+    virtual void OnDeviceRestored() = 0;
+
+  protected:
+    ~IDeviceNotify() = default;
+};
+
+// Controls all the DirectX device resources.
+class DeviceResources {
+  public:
+    static constexpr unsigned int c_AllowTearing = 0x1; // Present() uses sync interval 0 with DXGI_PRESENT_ALLOW_TEARING
+    static constexpr unsigned int c_EnableHDR = 0x2;    // UpdateColorSpace() only picks an HDR color space when set
+    static constexpr unsigned int c_ReverseDepth = 0x4;
+
+    DeviceResources(DXGI_FORMAT backBufferFormat = DXGI_FORMAT_B8G8R8A8_UNORM,
+                    DXGI_FORMAT depthBufferFormat = DXGI_FORMAT_D32_FLOAT, UINT backBufferCount = 2,
+                    D3D_FEATURE_LEVEL minFeatureLevel = D3D_FEATURE_LEVEL_11_0, unsigned int flags = 0) noexcept(false);
+    ~DeviceResources();
+
+    DeviceResources(DeviceResources &&) = default;
+    DeviceResources &operator=(DeviceResources &&) = default;
+
+    DeviceResources(DeviceResources const &) = delete; // move-only: copying is disabled
+    DeviceResources &operator=(DeviceResources const &) = delete;
+
+    void CreateDeviceResources();
+    void CreateWindowSizeDependentResources();
+    void SetWindow(HWND window, int width, int height) noexcept;
+    bool WindowSizeChanged(int width, int height); // true only when the size changed and resources were rebuilt
+    void HandleDeviceLost();
+    void RegisterDeviceNotify(IDeviceNotify *deviceNotify) noexcept { m_deviceNotify = deviceNotify; }
+    void Prepare(D3D12_RESOURCE_STATES beforeState = D3D12_RESOURCE_STATE_PRESENT,
+                 D3D12_RESOURCE_STATES afterState = D3D12_RESOURCE_STATE_RENDER_TARGET);
+    void Present(D3D12_RESOURCE_STATES beforeState = D3D12_RESOURCE_STATE_RENDER_TARGET);
+    void WaitForGpu() noexcept;
+    void UpdateColorSpace();
+
+    // Device Accessors.
+    RECT GetOutputSize() const noexcept { return m_outputSize; }
+
+    // Direct3D Accessors. These return raw, non-owning pointers obtained from the ComPtr members.
+    auto GetD3DDevice() const noexcept { return m_d3dDevice.Get(); }
+    auto GetSwapChain() const noexcept { return m_swapChain.Get(); }
+    auto GetDXGIFactory() const noexcept { return m_dxgiFactory.Get(); }
+    HWND GetWindow() const noexcept { return m_window; }
+    D3D_FEATURE_LEVEL GetDeviceFeatureLevel() const noexcept { return m_d3dFeatureLevel; }
+    ID3D12Resource *GetRenderTarget() const noexcept { return m_renderTargets[m_backBufferIndex].Get(); }
+    ID3D12Resource *GetDepthStencil() const noexcept { return m_depthStencil.Get(); }
+    ID3D12CommandQueue *GetCommandQueue() const noexcept { return m_commandQueue.Get(); }
+    ID3D12CommandAllocator *GetCommandAllocator() const noexcept {
+        return m_commandAllocators[m_backBufferIndex].Get();
+    }
+    auto GetCommandList() const noexcept { return m_commandList.Get(); }
+    DXGI_FORMAT GetBackBufferFormat() const noexcept { return m_backBufferFormat; }
+    DXGI_FORMAT GetDepthBufferFormat() const noexcept { return m_depthBufferFormat; }
+    D3D12_VIEWPORT GetScreenViewport() const noexcept { return m_screenViewport; }
+    D3D12_RECT GetScissorRect() const noexcept { return m_scissorRect; }
+    UINT GetCurrentFrameIndex() const noexcept { return m_backBufferIndex; }
+    UINT GetBackBufferCount() const noexcept { return m_backBufferCount; }
+    DXGI_COLOR_SPACE_TYPE GetColorSpace() const noexcept { return m_colorSpace; }
+    unsigned int GetDeviceOptions() const noexcept { return m_options; }
+
+    CD3DX12_CPU_DESCRIPTOR_HANDLE GetRenderTargetView() const noexcept {
+        return CD3DX12_CPU_DESCRIPTOR_HANDLE(m_rtvDescriptorHeap->GetCPUDescriptorHandleForHeapStart(),
+                                             static_cast(m_backBufferIndex), m_rtvDescriptorSize);
+    }
+    CD3DX12_CPU_DESCRIPTOR_HANDLE GetDepthStencilView() const noexcept {
+        return CD3DX12_CPU_DESCRIPTOR_HANDLE(m_dsvDescriptorHeap->GetCPUDescriptorHandleForHeapStart());
+    }
+
+    void MoveToNextFrame();
+    void GetAdapter(IDXGIAdapter1 **ppAdapter);
+
+    static constexpr size_t MAX_BACK_BUFFER_COUNT = 3; // capacity of the per-back-buffer arrays below
+
+    UINT m_backBufferIndex; // index of the current back buffer into the per-back-buffer arrays
+
+    // Direct3D objects. NOTE(review): the ComPtr template arguments are missing in this copy - restore from upstream.
+    Microsoft::WRL::ComPtr m_d3dDevice;
+    Microsoft::WRL::ComPtr m_commandList;
+    Microsoft::WRL::ComPtr m_commandQueue;
+    Microsoft::WRL::ComPtr m_commandAllocators[MAX_BACK_BUFFER_COUNT];
+
+    // Swap chain objects.
+    Microsoft::WRL::ComPtr m_dxgiFactory;
+    Microsoft::WRL::ComPtr m_swapChain;
+    Microsoft::WRL::ComPtr m_renderTargets[MAX_BACK_BUFFER_COUNT];
+    Microsoft::WRL::ComPtr m_depthStencil;
+
+    // Presentation fence objects.
+    Microsoft::WRL::ComPtr m_fence;
+    UINT64 m_fenceValues[MAX_BACK_BUFFER_COUNT]; // one fence value per back buffer, indexed by m_backBufferIndex
+    Microsoft::WRL::Wrappers::Event m_fenceEvent;
+
+    // Direct3D rendering objects.
+    Microsoft::WRL::ComPtr m_rtvDescriptorHeap;
+    Microsoft::WRL::ComPtr m_dsvDescriptorHeap;
+    UINT m_rtvDescriptorSize;
+    D3D12_VIEWPORT m_screenViewport;
+    D3D12_RECT m_scissorRect;
+
+    // Direct3D properties.
+    DXGI_FORMAT m_backBufferFormat;
+    DXGI_FORMAT m_depthBufferFormat;
+    UINT m_backBufferCount;
+    D3D_FEATURE_LEVEL m_d3dMinFeatureLevel;
+
+    // Cached device properties.
+    HWND m_window;
+    D3D_FEATURE_LEVEL m_d3dFeatureLevel;
+    DWORD m_dxgiFactoryFlags;
+    RECT m_outputSize;
+
+    // HDR Support
+    DXGI_COLOR_SPACE_TYPE m_colorSpace;
+
+    // DeviceResources options (see flags above)
+    unsigned int m_options;
+
+    // The IDeviceNotify can be held directly as it owns the DeviceResources.
+    IDeviceNotify *m_deviceNotify;
+};
+} // namespace DX
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/directx_third_party/pch.h b/superbench/benchmarks/micro_benchmarks/directx_third_party/pch.h
new file mode 100644
index 000000000..c51edb428
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_third_party/pch.h
@@ -0,0 +1,97 @@
+//--------------------------------------------------------------------------------------
+// pch.h
+//
+// Header for standard system include files.
+//
+// Advanced Technology Group (ATG)
+// Copyright (C) Microsoft Corporation. All rights reserved.
+//--------------------------------------------------------------------------------------
+
+#pragma once
+
+#include
+#define _WIN32_WINNT 0x0A00
+#include
+
+// Use the C++ standard templated min/max
+#define NOMINMAX
+
+// DirectX apps don't need GDI
+#define NODRAWTEXT
+#define NOGDI
+#define NOBITMAP
+
+// Include <mcx.h> if you need this
+#define NOMCX
+
+// Include <winsvc.h> if you need this
+#define NOSERVICE
+
+// WinHelp is deprecated
+#define NOHELP
+
+#define WIN32_LEAN_AND_MEAN
+#include
+
+#include
+#include
+
+#include
+
+#if defined(NTDDI_WIN10_RS2)
+#include
+#else
+#include
+#endif
+
+#include
+#include
+
+#include "d3dx12.h"
+
+#include
+#include
+#include
+#include
+
+#ifdef _DEBUG
+#include
+#endif
+
+#include
+
+// To use graphics and CPU markup events with the latest version of PIX, change this to include <pix3.h>
+// then add the NuGet package WinPixEventRuntime to the project.
+#include
+
+#include
+#include
+
+#pragma comment(lib, "D3Dcompiler.lib")
+#pragma comment(lib, "d3d12.lib")
+#pragma comment(lib, "dxgi.lib")
+#pragma comment(lib, "dxguid.lib")
+
+namespace DX {
+// Helper class for COM exceptions: carries the failing HRESULT and formats it on demand in what().
+class com_exception : public std::exception {
+  public:
+    com_exception(HRESULT hr) noexcept : result(hr) {}
+
+    const char *what() const override {
+        static char s_str[64] = {}; // NOTE: one static buffer shared by every call, so a later what() overwrites the text
+        sprintf_s(s_str, "Failure with HRESULT of %08X", static_cast(result));
+        return s_str;
+    }
+
+  private:
+    HRESULT result;
+};
+
+// Helper utility converts D3D API failures into exceptions (throws com_exception when FAILED(hr)).
+inline void ThrowIfFailed(HRESULT hr) {
+    if (FAILED(hr)) {
+        throw com_exception(hr);
+    }
+}
+} // namespace DX
From 27a10811afb4f2f9c5404d02b1056391f14f4b1a Mon Sep 17 00:00:00 2001
From: Yuting Jiang
Date: Tue, 22 Aug 2023 18:56:33 +0800
Subject: [PATCH 28/33] Benchmarks: micro benchmark - source code for evaluating NVDEC decoding performance (#560)

**Description**
source code for evaluating NVDEC decoding performance.
--------- Co-authored-by: yukirora --- .azure-pipelines/cuda-unit-test.yml | 6 +- .github/workflows/codeql-analysis.yml | 4 + .gitignore | 3 - dockerfile/cuda11.1.1.dockerfile | 5 + dockerfile/cuda12.1.dockerfile | 5 + .../cuda_decode_performance/AppDecPerf.cpp | 454 +++++++ .../cuda_decode_performance/CMakeLists.txt | 117 ++ .../OptimizedNvDecoder.cpp | 263 ++++ .../OptimizedNvDecoder.h | 52 + .../cuda_decode_performance/ThreadPoolUtils.h | 99 ++ .../Video_Codec_SDK/Interface/cuviddec.h | 1173 +++++++++++++++++ .../Video_Codec_SDK/Interface/nvcuvid.h | 486 +++++++ .../Lib/linux/stubs/x86_64/libnvcuvid.so | Bin 0 -> 3528 bytes .../Samples/NvCodec/NvDecoder/NvDecoder.cpp | 709 ++++++++++ .../Samples/NvCodec/NvDecoder/NvDecoder.h | 528 ++++++++ .../Samples/Utils/FFmpegDemuxer.h | 379 ++++++ .../Samples/Utils/FFmpegStreamer.h | 148 +++ .../Video_Codec_SDK/Samples/Utils/Logger.h | 235 ++++ .../Samples/Utils/NvCodecUtils.h | 547 ++++++++ 19 files changed, 5208 insertions(+), 5 deletions(-) create mode 100644 superbench/benchmarks/micro_benchmarks/cuda_decode_performance/AppDecPerf.cpp create mode 100644 superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt create mode 100644 superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.cpp create mode 100644 superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.h create mode 100644 superbench/benchmarks/micro_benchmarks/cuda_decode_performance/ThreadPoolUtils.h create mode 100644 third_party/Video_Codec_SDK/Interface/cuviddec.h create mode 100644 third_party/Video_Codec_SDK/Interface/nvcuvid.h create mode 100644 third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/libnvcuvid.so create mode 100644 third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.cpp create mode 100644 third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.h create mode 100644 third_party/Video_Codec_SDK/Samples/Utils/FFmpegDemuxer.h create mode 100644 
third_party/Video_Codec_SDK/Samples/Utils/FFmpegStreamer.h create mode 100644 third_party/Video_Codec_SDK/Samples/Utils/Logger.h create mode 100644 third_party/Video_Codec_SDK/Samples/Utils/NvCodecUtils.h diff --git a/.azure-pipelines/cuda-unit-test.yml b/.azure-pipelines/cuda-unit-test.yml index 3afcd49fd..2d953d659 100644 --- a/.azure-pipelines/cuda-unit-test.yml +++ b/.azure-pipelines/cuda-unit-test.yml @@ -11,7 +11,7 @@ pool: container: image: nvcr.io/nvidia/pytorch:20.12-py3 - options: '-v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker' + options: '-v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker -v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/:/usr/lib/sudo/' steps: - script: | @@ -21,6 +21,8 @@ steps: python3 -m pip install --upgrade pip setuptools==65.7 python3 -m pip install .[test,nvworker] make postinstall + sudo DEBIAN_FRONTEND=noninteractive apt-get update + sudo DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev displayName: Install dependencies - script: | python3 setup.py lint @@ -31,7 +33,7 @@ steps: - script: | SB_MICRO_PATH=$PWD python3 setup.py test displayName: Run unit tests - timeoutInMinutes: 15 + timeoutInMinutes: 30 - script: | bash <(curl -s https://codecov.io/bash) -cF cuda-unit-test displayName: Report coverage results diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index ef9f652b7..e53acebf6 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -49,6 +49,10 @@ jobs: steps: - name: Checkout uses: actions/checkout@v3 + - name: Install Dependency + run: | + DEBIAN_FRONTEND=noninteractive apt-get update + DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo - name: Initialize CodeQL uses: github/codeql-action/init@v2 with: diff --git a/.gitignore 
b/.gitignore index e1ab18ca4..5888455a8 100644 --- a/.gitignore +++ b/.gitignore @@ -9,9 +9,6 @@ __pycache__/ *.py[cod] *$py.class -# C extensions -*.so - # Distribution / packaging .Python build/ diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile index 8b92c5463..d7feb2baa 100644 --- a/dockerfile/cuda11.1.1.dockerfile +++ b/dockerfile/cuda11.1.1.dockerfile @@ -26,13 +26,18 @@ RUN apt-get update && \ build-essential \ curl \ dmidecode \ + ffmpeg \ git \ iproute2 \ jq \ libaio-dev \ + libavcodec-dev \ + libavformat-dev \ + libavutil-dev \ libcap2 \ libnuma-dev \ libpci-dev \ + libswresample-dev \ libtinfo5 \ libtool \ lshw \ diff --git a/dockerfile/cuda12.1.dockerfile b/dockerfile/cuda12.1.dockerfile index 4a257bf43..2f9e430fa 100644 --- a/dockerfile/cuda12.1.dockerfile +++ b/dockerfile/cuda12.1.dockerfile @@ -25,14 +25,19 @@ RUN apt-get update && \ build-essential \ curl \ dmidecode \ + ffmpeg \ git \ iproute2 \ jq \ libaio-dev \ + libavcodec-dev \ + libavformat-dev \ + libavutil-dev \ libboost-program-options-dev \ libcap2 \ libnuma-dev \ libpci-dev \ + libswresample-dev \ libtinfo5 \ libtool \ lshw \ diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/AppDecPerf.cpp b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/AppDecPerf.cpp new file mode 100644 index 000000000..1ae5ae121 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/AppDecPerf.cpp @@ -0,0 +1,454 @@ +// Copyright(c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../Utils/FFmpegDemuxer.h" +#include "../Utils/NvCodecUtils.h" +#include "OptimizedNvDecoder.h" +#include "ThreadPoolUtils.h" + +// Define logger which need in third party utils +simplelogger::Logger *logger = simplelogger::LoggerFactory::CreateConsoleLogger(); + +// Define the codec map +std::map codecMap = { + {"mpeg1", cudaVideoCodec_MPEG1}, {"mpeg2", cudaVideoCodec_MPEG2}, {"mpeg4", cudaVideoCodec_MPEG4}, + {"vc1", cudaVideoCodec_VC1}, {"h264", cudaVideoCodec_H264}, {"jpeg", cudaVideoCodec_JPEG}, + {"h264_svc", cudaVideoCodec_H264_SVC}, {"h264_mvc", cudaVideoCodec_H264_MVC}, {"hevc", cudaVideoCodec_HEVC}, + {"vp8", cudaVideoCodec_VP8}, {"vp9", cudaVideoCodec_VP9}, {"av1", cudaVideoCodec_AV1}}; + +/** + * @brief Function to decode video file using OptimizedNvDecoder interface + * @param pDec - Handle to OptimizedNvDecoder + * @param demuxer - Pointer to an FFmpegDemuxer instance + * @param pnFrame - Variable to record the number of frames decoded + * @param ex - Stores current exception in case of failure + */ +void DecProc(OptimizedNvDecoder *pDec, const char *szInFilePath, int *pnFrame, std::exception_ptr &ex) { + try { + std::unique_ptr demuxer(new FFmpegDemuxer(szInFilePath)); + int nVideoBytes = 0, nFrameReturned = 0, nFrame = 0; + uint8_t *pVideo = NULL, *pFrame = NULL; + do { + // Demux video from file using FFmpegDemuxer + demuxer->Demux(&pVideo, &nVideoBytes); + // Decode the video frame from demuxed packet + nFrameReturned = pDec->Decode(pVideo, nVideoBytes); + if (!nFrame && nFrameReturned) + LOG(INFO) << pDec->GetVideoInfo(); + nFrame += nFrameReturned; + } while (nVideoBytes); + *pnFrame = nFrame; + } catch (std::exception &) { + ex = std::current_exception(); + } +} + +/** + * @brief Function to show help message and exit + */ +void ShowHelpAndExit(const char *szBadOption = NULL) { + std::ostringstream oss; + bool 
bThrowError = false; + if (szBadOption) { + bThrowError = true; + oss << "Error parsing \"" << szBadOption << "\"" << std::endl; + } + oss << "Options:" << std::endl + << "-i Input file path. No default value. One of -i and -multi_input is required." << std::endl + << "-o Output file path of raw data. No default value. Optional." << std::endl + << "-gpu Ordinal of GPU to use. Default 0. Optional." << std::endl + << "-thread Number of decoding thread. Default 5. Optional." << std::endl + << "-total Number of total video to test. Default 100. Optional." << std::endl + << "-single (No value) Use single cuda context for every thread. Default is multi-context, one context " + "per thread." + << std::endl + << "-host (No value) Copy frame to host memory .Default is device memory)" << std::endl + << "-multi_input The file path which lists the path of multiple video in each line." << std::endl + << "-codec The codec of video to test. Default H264." << std::endl; + if (bThrowError) { + throw std::invalid_argument(oss.str()); + } else { + std::cout << oss.str(); + exit(0); + } +} + +/** + * @brief Function to parse commandline arguments + */ +void ParseCommandLine(int argc, char *argv[], char *szInputFileName, int &iGpu, int &nThread, int &nTotalVideo, + bool &bSingle, bool &bHost, std::string &inputFilesListPath, std::string &outputFile, + cudaVideoCodec &codec) { + for (int i = 1; i < argc; i++) { + if (!_stricmp(argv[i], "-h")) { + ShowHelpAndExit(); + } + if (!_stricmp(argv[i], "-i")) { + if (++i == argc) { + ShowHelpAndExit("-i"); + } + sprintf(szInputFileName, "%s", argv[i]); + continue; + } + if (!_stricmp(argv[i], "-o")) { + if (++i == argc) { + ShowHelpAndExit("-o"); + } + outputFile = std::string(argv[i]); + continue; + } + if (!_stricmp(argv[i], "-gpu")) { + if (++i == argc) { + ShowHelpAndExit("-gpu"); + } + iGpu = atoi(argv[i]); + continue; + } + if (!_stricmp(argv[i], "-thread")) { + if (++i == argc) { + ShowHelpAndExit("-thread"); + } + nThread = 
atoi(argv[i]); + continue; + } + if (!_stricmp(argv[i], "-total")) { + if (++i == argc) { + ShowHelpAndExit("-total"); + } + nTotalVideo = atoi(argv[i]); + continue; + } + if (!_stricmp(argv[i], "-multi_input")) { + if (++i == argc) { + ShowHelpAndExit("-multi_input"); + } + inputFilesListPath = std::string(argv[i]); + continue; + } + if (!_stricmp(argv[i], "-single")) { + bSingle = true; + continue; + } + if (!_stricmp(argv[i], "-host")) { + bHost = true; + continue; + } + if (!_stricmp(argv[i], "-codec")) { + if (++i == argc) { + ShowHelpAndExit("-codec"); + } + std::string codecName = std::string(argv[i]); + std::transform(codecName.begin(), codecName.end(), codecName.begin(), + [](unsigned char c) { return std::tolower(c); }); + if (codecMap.find(codecName) != codecMap.end()) { + codec = codecMap[codecName]; + } else { + std::cout << "Codec name not found in the map." << std::endl; + exit(1); + } + continue; + } + ShowHelpAndExit(argv[i]); + } +} + +/** + * @brief Function to create cuda context and initialize decoder + */ +OptimizedNvDecoder *InitOptimizedNvDecoder(int i, const CUdevice &cuDevice, CUcontext &cuContext, bool bSingle, + bool bHost, cudaVideoCodec codec, CUVIDDECODECAPS decodecaps) { + if (!bSingle) { + ck(cuCtxCreate(&cuContext, 0, cuDevice)); + } + OptimizedNvDecoder *sessionObject = new OptimizedNvDecoder(cuContext, !bHost, codec, decodecaps); + sessionObject->setDecoderSessionID(i); + return sessionObject; +} + +/** + * @brief Function to decode a video in a thread and measure the latency + */ +double DecodeVideo(size_t i, const std::vector &vDec, const char *szInFilePath, int *pnFrame, + std::exception_ptr &ex) { + try { + OptimizedNvDecoder *pDec = vDec[i]; + auto start = std::chrono::high_resolution_clock::now(); + DecProc(pDec, szInFilePath, pnFrame, ex); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsedTime = std::chrono::duration_cast(end - start).count(); + std::cout << "Decode finished --" + << " duration:" << 
elapsedTime << " frames:" << *pnFrame << std::endl; + return elapsedTime / 1000.0f; + } catch (const std::exception &e) { + std::cerr << "Exception in deocding: " << e.what() << std::endl; + return 0; + } +} + +/** + * @brief Function to read the video paths from a file + */ +std::vector ReadMultipleVideoFiles(const std::string &filepath) { + std::ifstream file(filepath); + if (!file) { + std::cerr << "Error opening the file." << std::endl; + exit(1); + } + std::string line; + std::vector tokens; + while (std::getline(file, line)) { + tokens.push_back(line); + } + file.close(); + return tokens; +} + +/** + * @brief Function to get the decoder capability + */ +void GetDefaultDecoderCaps(CUVIDDECODECAPS &decodecaps, cudaVideoCodec codec) { + memset(&decodecaps, 0, sizeof(decodecaps)); + decodecaps.eCodecType = codec; + decodecaps.eChromaFormat = cudaVideoChromaFormat_420; + decodecaps.nBitDepthMinus8 = 0; + NVDEC_API_CALL(cuvidGetDecoderCaps(&decodecaps)); +} + +/** + * @brief Function to initialize the cuda device, cuda context, query the decoder capability and create decoder for + * each thread + */ +void InitializeContext(std::vector &vDec, int iGpu, int nThread, bool bSingle, bool bHost, + cudaVideoCodec codec) { + ck(cuInit(0)); + int nGpu = 0; + ck(cuDeviceGetCount(&nGpu)); + if (iGpu < 0 || iGpu >= nGpu) { + std::cout << "GPU ordinal out of range. 
Should be within [" << 0 << ", " << nGpu - 1 << "]" << std::endl; + exit(1); + } + CUdevice cuDevice = 0; + ck(cuDeviceGet(&cuDevice, iGpu)); + char szDeviceName[80]; + ck(cuDeviceGetName(szDeviceName, sizeof(szDeviceName), cuDevice)); + std::cout << "GPU in use: " << szDeviceName << std::endl; + + CUcontext cuContext = NULL; + ck(cuCtxCreate(&cuContext, 0, cuDevice)); + + CUVIDDECODECAPS decodecaps; + GetDefaultDecoderCaps(decodecaps, codec); + + ThreadPool threadPool(nThread); + std::vector> futures; + for (int i = 0; i < nThread; i++) { + futures.push_back( + threadPool.enqueue(InitOptimizedNvDecoder, cuDevice, cuContext, bSingle, bHost, codec, decodecaps)); + } + for (auto &future : futures) { + vDec.push_back(future.get()); // Retrieve the results from each task + } +} + +/** + * @brief Function to write the latency and FPS data of each video to a file + */ +void WriteRawData(std::vector &vDec, int nThread, const std::vector &data, + std::vector &frames, std::string filename) { + // Open the output file stream + std::ofstream outputFile(filename); + outputFile << "Frame Latency" << std::endl; + for (int i = 0; i < nThread; i++) { + for (const auto &tuple : vDec[i]->GetFrameLatency()) { + int frame = std::get<0>(tuple); + double latency = std::get<1>(tuple); + outputFile << "Frame: " << frame << ", Latency: " << latency << std::endl; + } + } + outputFile << "Video Latency" << std::endl; + for (int i = 0; i < data.size(); i++) { + outputFile << data[i] << std::endl; + } + outputFile << "Video FPS" << std::endl; + for (int i = 0; i < data.size(); i++) { + outputFile << frames[i] / data[i] << std::endl; + } + + // Close the file stream + outputFile.close(); +} + +/** + * @brief Function to calculate the statistical metrics + */ +std::tuple +CalMetrics(const std::vector &originData) { + std::vector data = originData; + double sum = std::accumulate(data.begin(), data.end(), 0.0); + double mean = sum / data.size(); + double min = *std::min_element(data.begin(), 
data.end()); + double max = *std::max_element(data.begin(), data.end()); + std::sort(data.begin(), data.end()); + double p50 = data[data.size() / 2]; + double p90 = data[static_cast(data.size() * 0.9)]; + double p95 = data[static_cast(data.size() * 0.95)]; + double p99 = data[static_cast(data.size() * 0.99)]; + return std::make_tuple(sum, mean, min, max, p50, p90, p95, p99); +} + +/** + * @brief Function to generate the total file list for the given total number of videos. + * If the number of videos is less than the total number of videos, the list will be repeated. + * If the number of videos is greater than the total number of videos, the list will be truncated. + */ +std::vector GenerateTotalFileList(const std::string &inputFilesListPath, int nTotalVideo, + const char *szInFilePath) { + std::vector files; + if (inputFilesListPath.size() != 0) { + auto videofiles = ReadMultipleVideoFiles(inputFilesListPath); + int smallerSize = videofiles.size(); + + if (nTotalVideo > smallerSize) { + int numIterations = nTotalVideo / smallerSize; + + for (int i = 0; i < numIterations; i++) { + files.insert(files.end(), videofiles.begin(), videofiles.end()); + } + + int remainingElements = nTotalVideo - (numIterations * smallerSize); + files.insert(files.end(), videofiles.begin(), videofiles.begin() + remainingElements); + } else { + files = std::vector(videofiles.begin(), videofiles.begin() + nTotalVideo); + } + + std::cout << "Multifile mode - " << nTotalVideo << "videos will be decoded" << std::endl; + } else { + for (int i = 0; i < nTotalVideo; i++) { + files.push_back(std::string(szInFilePath)); + } + } + return files; +} + +/** + * @brief Function to run the decoding tasks in parallel with thread pool to decode all the videos and record the total + * latency and the total number of frames + */ +float run(std::vector &vDec, int nThread, std::vector &files, + std::vector &vnFrame, std::vector &vExceptionPtrs, int *nTotalFrames, + std::vector &vnLatency, std::vector 
&frLatency, std::vector &vnFPS) { + std::vector> decodeLatencyFutures; + ThreadPool threadPool(nThread); + // Enqueue the video decoding task into thread pool + auto start = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < files.size(); i++) { + auto filePath = files[i].c_str(); + CheckInputFile(filePath); + decodeLatencyFutures.push_back( + threadPool.enqueue(DecodeVideo, vDec, filePath, &vnFrame[i], std::ref(vExceptionPtrs[i]))); + } + // Wait until decoding tasks finished + for (int i = 0; i < files.size(); i++) { + auto decodeLatency = decodeLatencyFutures[i].get(); + vnLatency.push_back(decodeLatency); + *nTotalFrames += vnFrame[i]; + } + auto elapsedTime = + (std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start) + .count()) / + 1000.0f; + for (int i = 0; i < nThread; i++) { + for (const auto &tuple : vDec[i]->GetFrameLatency()) { + int frame = std::get<0>(tuple); + double latency = std::get<1>(tuple); + if (frame > 0) { + frLatency.push_back(latency / frame); + } + } + } + for (int i = 0; i < vnLatency.size(); i++) { + if (vnLatency[i] != 0) { + vnFPS.push_back(vnFrame[i] / vnLatency[i]); + } + } + + // Record the total time + return elapsedTime; +} + +int main(int argc, char **argv) { + char szInFilePath[256] = ""; + int iGpu = 0; + int nThread = 5; + int nTotalVideo = 100; + bool bSingle = false; + bool bHost = false; + std::string inputFilesListPath = ""; + std::string outputFilePath = ""; + std::vector vExceptionPtrs(nTotalVideo); + cudaVideoCodec codec = cudaVideoCodec_H264; + try { + // Parse the command line arguments + ParseCommandLine(argc, argv, szInFilePath, iGpu, nThread, nTotalVideo, bSingle, bHost, inputFilesListPath, + outputFilePath, codec); + auto files = GenerateTotalFileList(inputFilesListPath, nTotalVideo, szInFilePath); + + // Initialize and prepare the decoder context for each thread + std::vector vDec; + InitializeContext(vDec, iGpu, nThread, bSingle, bHost, codec); + + // Decode all video with 
thread pool + std::vector vnFrame(nTotalVideo); + int nTotalFrames = 0; + std::vector vnLatency; + std::vector frLatency; + std::vector videoFPS; + auto elapsedTime = + run(vDec, nThread, files, vnFrame, vExceptionPtrs, &nTotalFrames, vnLatency, frLatency, videoFPS); + + // Calculate and output the raw data into file and metrics into stdout + double sum, mean, min, max, p50, p90, p95, p99; + std::tie(sum, mean, min, max, p50, p90, p95, p99) = CalMetrics(vnLatency); + std::cout << "Total Frames Decoded=" << nTotalFrames << " FPS=" << nTotalFrames / elapsedTime << std::endl; + std::cout << "Mean Latency for each video=" << mean * 1000 << " P50 Latency=" << p50 * 1000 + << " P90 Latency=" << p90 * 1000 << " P95 Latency=" << p95 * 1000 << " P99 Latency=" << p99 * 1000 + << "ms" << std::endl; + + std::tie(sum, mean, min, max, p50, p90, p95, p99) = CalMetrics(videoFPS); + std::cout << "Mean FPS for each video=" << mean << " P50 FPS=" << p50 << " P90 FPS=" << p90 + << " P95 FPS=" << p95 << " P99 FPS=" << p99 << std::endl; + std::tie(sum, mean, min, max, p50, p90, p95, p99) = CalMetrics(frLatency); + std::cout << "Mean Latency for each frame=" << mean * 1000 << " P50 Latency=" << p50 * 1000 + << " P90 Latency=" << p90 * 1000 << " P95 Latency=" << p95 * 1000 << " P99 Latency=" << p99 * 1000 + << "ms" << std::endl; + if (outputFilePath.size() != 0) { + WriteRawData(vDec, nThread, vnLatency, vnFrame, outputFilePath); + } + // Deinitialization + for (int i = 0; i < nThread; i++) { + delete (vDec[i]); + } + for (const auto &exceptionPtr : vExceptionPtrs) { // run() binds one slot per video, not per thread + if (exceptionPtr) { + std::rethrow_exception(exceptionPtr); + } + } + } catch (const std::exception &ex) { + std::cout << ex.what(); + exit(1); + } + return 0; +} diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt new file mode 100644 index 000000000..83cb15067 --- /dev/null +++
b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/CMakeLists.txt @@ -0,0 +1,117 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +cmake_minimum_required(VERSION 3.18) +project(cuda_decode_performance) + +find_package(CUDA QUIET) +if(CUDA_FOUND) + set(CMAKE_CXX_STANDARD 17) + set(CMAKE_CXX_STANDARD_REQUIRED ON) + + set(THIRD_PARTY_SAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Samples) + set(NVCODEC_PUBLIC_INTERFACE_DIR ${THIRD_PARTY_SAMPLE_DIR}/../Interface) + set(NVCODEC_UTILS_DIR ${THIRD_PARTY_SAMPLE_DIR}/Utils) + set(NV_CODEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec) + set(NV_DEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec/NvDecoder) + + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + find_package(PkgConfig REQUIRED) + pkg_check_modules(PC_AVCODEC REQUIRED IMPORTED_TARGET libavcodec) + pkg_check_modules(PC_AVFORMAT REQUIRED IMPORTED_TARGET libavformat) + pkg_check_modules(PC_AVUTIL REQUIRED IMPORTED_TARGET libavutil) + pkg_check_modules(PC_SWRESAMPLE REQUIRED IMPORTED_TARGET libswresample) + + set(NV_FFMPEG_HDRS ${PC_AVCODEC_INCLUDE_DIRS}) + find_library(AVCODEC_LIBRARY NAMES avcodec + HINTS + ${PC_AVCODEC_LIBDIR} + ${PC_AVCODEC_LIBRARY_DIRS} + ) + find_library(AVFORMAT_LIBRARY NAMES avformat + HINTS + ${PC_AVFORMAT_LIBDIR} + ${PC_AVFORMAT_LIBRARY_DIRS} + ) + find_library(AVUTIL_LIBRARY NAMES avutil + HINTS + ${PC_AVUTIL_LIBDIR} + ${PC_AVUTIL_LIBRARY_DIRS} + ) + find_library(SWRESAMPLE_LIBRARY NAMES swresample + HINTS + ${PC_SWRESAMPLE_LIBDIR} + ${PC_SWRESAMPLE_LIBRARY_DIRS} + ) + set(AVCODEC_LIB ${AVCODEC_LIBRARY}) + set(AVFORMAT_LIB ${AVFORMAT_LIBRARY}) + set(AVUTIL_LIB ${AVUTIL_LIBRARY}) + set(SWRESAMPLE_LIB ${SWRESAMPLE_LIBRARY}) + endif() + + set(APP_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/AppDecPerf.cpp + ) + + set(NV_DEC_SOURCES + ${NV_DEC_DIR}/NvDecoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.cpp + ) + + set(NV_DEC_HDRS + ${NV_DEC_DIR}/NvDecoder.h + 
${NVCODEC_PUBLIC_INTERFACE_DIR}/cuviddec.h + ${NVCODEC_PUBLIC_INTERFACE_DIR}/nvcuvid.h + ${NVCODEC_UTILS_DIR}/NvCodecUtils.h + ${NVCODEC_UTILS_DIR}/FFmpegDemuxer.h + ${CMAKE_CURRENT_SOURCE_DIR}/ThreadPoolUtils.h + ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.h + ) + + source_group( "headers" FILES ${NV_DEC_HDRS} ) + source_group( "sources" FILES ${APP_SOURCES} ${NV_DEC_SOURCES}) + set(CMAKE_LIBRARY_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib64;${CUDA_TOOLKIT_ROOT_DIR}/lib;${CMAKE_LIBRARY_PATH}") + find_package(CUDA) + set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=\"sm_50,compute_50\") + if ( CMAKE_COMPILER_IS_GNUCC ) + if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" ) + list(APPEND CUDA_NVCC_FLAGS -std=c++11) + endif() + endif() + + # Check if the file exists + if (NOT EXISTS "/usr/local/lib/libnvcuvid.so" ) + execute_process( + COMMAND sudo ln -s /usr/lib/x86_64-linux-gnu/libnvcuvid.so.1 /usr/local/lib/libnvcuvid.so + RESULT_VARIABLE result + ) + if(result) + message(FATAL_ERROR "Failed to create symbolic link for nvcuvid lib: ${result}") + endif() + endif () + + find_library(CUVID_LIB nvcuvid + HINTS + "/usr/local/lib/" + "${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/" + ) + + cuda_add_executable(${PROJECT_NAME} ${APP_SOURCES} ${NV_DEC_SOURCES} ${NV_DEC_HDRS}) + + set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + + target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS} + ${NVCODEC_PUBLIC_INTERFACE_DIR} + ${NVCODEC_UTILS_DIR} + ${NV_CODEC_DIR} + ${NV_APPDEC_COMMON_DIR} + ${NV_FFMPEG_HDRS} + ${THIRD_PARTY_SAMPLE_DIR} + ) + + target_link_libraries(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY} ${CMAKE_DL_LIBS} ${CUVID_LIB} ${AVCODEC_LIB} + ${AVFORMAT_LIB} ${AVUTIL_LIB} ${SWRESAMPLE_LIB}) + + install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION 
bin LIBRARY DESTINATION lib) +endif() diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.cpp b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.cpp new file mode 100644 index 000000000..ee23391b7 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.cpp @@ -0,0 +1,263 @@ +// Copyright(c) Microsoft Corporation. +// Licensed under the MIT License. + +#include + +#include "OptimizedNvDecoder.h" + +int OptimizedNvDecoder::Decode(const uint8_t *pData, int nSize, int nFlags, int64_t nTimestamp) { + m_nDecodedFrame = 0; + m_nDecodedFrameReturned = 0; + CUVIDSOURCEDATAPACKET packet = {0}; + packet.payload = pData; + packet.payload_size = nSize; + packet.flags = nFlags | CUVID_PKT_TIMESTAMP; + packet.timestamp = nTimestamp; + if (!pData || nSize == 0) { + packet.flags |= CUVID_PKT_ENDOFSTREAM; + } + auto start = std::chrono::high_resolution_clock::now(); + NVDEC_API_CALL(cuvidParseVideoData(m_hParser, &packet)); + int64_t elapsedTime = + std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start) + .count(); + frameLatency.push_back(std::make_tuple(m_nDecodedFrame, elapsedTime / 1000.0f / 1000.0f)); + return m_nDecodedFrame; +} + +OptimizedNvDecoder::OptimizedNvDecoder(CUcontext &cuContext, bool bUseDeviceFrame, cudaVideoCodec eCodec, + CUVIDDECODECAPS decodecaps, bool bLowLatency, bool bDeviceFramePitched, + const Rect *pCropRect, const Dim *pResizeDim, bool extract_user_SEI_Message, + int maxWidth, int maxHeight, unsigned int clkRate, bool force_zero_latency) { + m_cuContext = cuContext; + m_bUseDeviceFrame = bUseDeviceFrame; + m_eCodec = eCodec; + m_bDeviceFramePitched = bDeviceFramePitched; + m_bExtractSEIMessage = extract_user_SEI_Message; + m_nMaxWidth = maxWidth; + m_nMaxHeight = maxHeight; + m_bForce_zero_latency = force_zero_latency; + if (pCropRect) + m_cropRect = *pCropRect; + if (pResizeDim) + m_resizeDim = *pResizeDim; + 
+ CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext)); + NVDEC_API_CALL(cuvidCtxLockCreate(&m_ctxLock, cuContext)); + + ck(cuStreamCreate(&m_cuvidStream, CU_STREAM_DEFAULT)); + + decoderSessionID = 0; + + if (m_bExtractSEIMessage) { + m_fpSEI = fopen("sei_message.txt", "wb"); + m_pCurrSEIMessage = new CUVIDSEIMESSAGEINFO; + memset(&m_SEIMessagesDisplayOrder, 0, sizeof(m_SEIMessagesDisplayOrder)); + } + CUVIDPARSERPARAMS videoParserParameters = {}; + videoParserParameters.CodecType = eCodec; + videoParserParameters.ulMaxNumDecodeSurfaces = 1; + videoParserParameters.ulClockRate = clkRate; + videoParserParameters.ulMaxDisplayDelay = bLowLatency ? 0 : 1; + videoParserParameters.pUserData = this; + videoParserParameters.pfnSequenceCallback = HandleVideoSequenceProc; + videoParserParameters.pfnDecodePicture = HandlePictureDecodeProc; + videoParserParameters.pfnDisplayPicture = m_bForce_zero_latency ? NULL : HandlePictureDisplayProc; + videoParserParameters.pfnGetOperatingPoint = HandleOperatingPointProc; + videoParserParameters.pfnGetSEIMsg = m_bExtractSEIMessage ? HandleSEIMessagesProc : NULL; + NVDEC_API_CALL(cuvidCreateVideoParser(&m_hParser, &videoParserParameters)); + // reuse the decodecaps queried before + m_decodecaps = decodecaps; + CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL)); +} + +int OptimizedNvDecoder::HandleVideoSequence(CUVIDEOFORMAT *pVideoFormat) { + START_TIMER + m_videoInfo.str(""); + m_videoInfo.clear(); + m_videoInfo << "Video Input Information" << std::endl + << "\tCodec : " << GetVideoCodecString(pVideoFormat->codec) << std::endl + << "\tFrame rate : " << pVideoFormat->frame_rate.numerator << "/" + << pVideoFormat->frame_rate.denominator << " = " + << 1.0 * pVideoFormat->frame_rate.numerator / pVideoFormat->frame_rate.denominator << " fps" + << std::endl + << "\tSequence : " << (pVideoFormat->progressive_sequence ? 
"Progressive" : "Interlaced") + << std::endl + << "\tCoded size : [" << pVideoFormat->coded_width << ", " << pVideoFormat->coded_height << "]" + << std::endl + << "\tDisplay area : [" << pVideoFormat->display_area.left << ", " << pVideoFormat->display_area.top + << ", " << pVideoFormat->display_area.right << ", " << pVideoFormat->display_area.bottom << "]" + << std::endl + << "\tChroma : " << GetVideoChromaFormatString(pVideoFormat->chroma_format) << std::endl + << "\tBit depth : " << pVideoFormat->bit_depth_luma_minus8 + 8; + m_videoInfo << std::endl; + + int nDecodeSurface = pVideoFormat->min_num_decode_surfaces; + + // re-call the cuvidGetDecoderCaps when the video codeoc and format change + if (m_decodecaps.eCodecType != pVideoFormat->codec || m_decodecaps.eChromaFormat != pVideoFormat->chroma_format || + m_decodecaps.nBitDepthMinus8 != pVideoFormat->bit_depth_luma_minus8) { + m_decodecaps.eCodecType = pVideoFormat->codec; + m_decodecaps.eChromaFormat = pVideoFormat->chroma_format; + m_decodecaps.nBitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8; + + CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext)); + NVDEC_API_CALL(cuvidGetDecoderCaps(&m_decodecaps)); + CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL)); + } + + if (!m_decodecaps.bIsSupported) { + NVDEC_THROW_ERROR("Codec not supported on this GPU", CUDA_ERROR_NOT_SUPPORTED); + return nDecodeSurface; + } + + if ((pVideoFormat->coded_width > m_decodecaps.nMaxWidth) || + (pVideoFormat->coded_height > m_decodecaps.nMaxHeight)) { + + std::ostringstream errorString; + errorString << std::endl + << "Resolution : " << pVideoFormat->coded_width << "x" << pVideoFormat->coded_height + << std::endl + << "Max Supported (wxh) : " << m_decodecaps.nMaxWidth << "x" << m_decodecaps.nMaxHeight << std::endl + << "Resolution not supported on this GPU"; + + const std::string cErr = errorString.str(); + NVDEC_THROW_ERROR(cErr, CUDA_ERROR_NOT_SUPPORTED); + return nDecodeSurface; + } + if ((pVideoFormat->coded_width >> 4) * 
(pVideoFormat->coded_height >> 4) > m_decodecaps.nMaxMBCount) { + + std::ostringstream errorString; + errorString << std::endl + << "MBCount : " << (pVideoFormat->coded_width >> 4) * (pVideoFormat->coded_height >> 4) + << std::endl + << "Max Supported mbcnt : " << m_decodecaps.nMaxMBCount << std::endl + << "MBCount not supported on this GPU"; + NVDEC_THROW_ERROR(errorString.str(), CUDA_ERROR_NOT_SUPPORTED); + return nDecodeSurface; + } + + if (m_nWidth && m_nLumaHeight && m_nChromaHeight) { + + // cuvidCreateDecoder() has been called before, and now there's possible config change + return ReconfigureDecoder(pVideoFormat); + } + + // eCodec has been set in the constructor (for parser). Here it's set again for potential correction + m_eCodec = pVideoFormat->codec; + m_eChromaFormat = pVideoFormat->chroma_format; + m_nBitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8; + m_nBPP = m_nBitDepthMinus8 > 0 ? 2 : 1; + + // Set the output surface format same as chroma format + if (m_eChromaFormat == cudaVideoChromaFormat_420 || m_eChromaFormat == cudaVideoChromaFormat_Monochrome) + m_eOutputFormat = + pVideoFormat->bit_depth_luma_minus8 ? cudaVideoSurfaceFormat_P016 : cudaVideoSurfaceFormat_NV12; + else if (m_eChromaFormat == cudaVideoChromaFormat_444) + m_eOutputFormat = + pVideoFormat->bit_depth_luma_minus8 ? cudaVideoSurfaceFormat_YUV444_16Bit : cudaVideoSurfaceFormat_YUV444; + else if (m_eChromaFormat == cudaVideoChromaFormat_422) + m_eOutputFormat = cudaVideoSurfaceFormat_NV12; // no 4:2:2 output format supported yet so make 420 default + + // Check if output format supported.
If not, check fallback options + if (!(m_decodecaps.nOutputFormatMask & (1 << m_eOutputFormat))) { + if (m_decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12)) + m_eOutputFormat = cudaVideoSurfaceFormat_NV12; + else if (m_decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_P016)) + m_eOutputFormat = cudaVideoSurfaceFormat_P016; + else if (m_decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444)) + m_eOutputFormat = cudaVideoSurfaceFormat_YUV444; + else if (m_decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444_16Bit)) + m_eOutputFormat = cudaVideoSurfaceFormat_YUV444_16Bit; + else + NVDEC_THROW_ERROR("No supported output format found", CUDA_ERROR_NOT_SUPPORTED); + } + m_videoFormat = *pVideoFormat; + + CUVIDDECODECREATEINFO videoDecodeCreateInfo = {0}; + videoDecodeCreateInfo.CodecType = pVideoFormat->codec; + videoDecodeCreateInfo.ChromaFormat = pVideoFormat->chroma_format; + videoDecodeCreateInfo.OutputFormat = m_eOutputFormat; + videoDecodeCreateInfo.bitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8; + if (pVideoFormat->progressive_sequence) + videoDecodeCreateInfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Weave; + else + videoDecodeCreateInfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Adaptive; + videoDecodeCreateInfo.ulNumOutputSurfaces = 2; + // With PreferCUVID, JPEG is still decoded by CUDA while video is decoded by NVDEC hardware + videoDecodeCreateInfo.ulCreationFlags = cudaVideoCreate_PreferCUVID; + videoDecodeCreateInfo.ulNumDecodeSurfaces = nDecodeSurface; + videoDecodeCreateInfo.vidLock = m_ctxLock; + videoDecodeCreateInfo.ulWidth = pVideoFormat->coded_width; + videoDecodeCreateInfo.ulHeight = pVideoFormat->coded_height; + // AV1 has max width/height of sequence in sequence header + if (pVideoFormat->codec == cudaVideoCodec_AV1 && pVideoFormat->seqhdr_data_length > 0) { + CUVIDEOFORMATEX *vidFormatEx = (CUVIDEOFORMATEX *)pVideoFormat; + if (m_nMaxWidth < pVideoFormat->coded_width) { + m_nMaxWidth =
vidFormatEx->av1.max_width; + } + if (m_nMaxHeight < pVideoFormat->coded_height) { + m_nMaxHeight = vidFormatEx->av1.max_height; + } + } + if (m_nMaxWidth < (int)pVideoFormat->coded_width) + m_nMaxWidth = pVideoFormat->coded_width; + if (m_nMaxHeight < (int)pVideoFormat->coded_height) + m_nMaxHeight = pVideoFormat->coded_height; + videoDecodeCreateInfo.ulMaxWidth = m_nMaxWidth; + videoDecodeCreateInfo.ulMaxHeight = m_nMaxHeight; + + if (!(m_cropRect.r && m_cropRect.b) && !(m_resizeDim.w && m_resizeDim.h)) { + m_nWidth = pVideoFormat->display_area.right - pVideoFormat->display_area.left; + m_nLumaHeight = pVideoFormat->display_area.bottom - pVideoFormat->display_area.top; + videoDecodeCreateInfo.ulTargetWidth = pVideoFormat->coded_width; + videoDecodeCreateInfo.ulTargetHeight = pVideoFormat->coded_height; + } else { + if (m_resizeDim.w && m_resizeDim.h) { + videoDecodeCreateInfo.display_area.left = pVideoFormat->display_area.left; + videoDecodeCreateInfo.display_area.top = pVideoFormat->display_area.top; + videoDecodeCreateInfo.display_area.right = pVideoFormat->display_area.right; + videoDecodeCreateInfo.display_area.bottom = pVideoFormat->display_area.bottom; + m_nWidth = m_resizeDim.w; + m_nLumaHeight = m_resizeDim.h; + } + + if (m_cropRect.r && m_cropRect.b) { + videoDecodeCreateInfo.display_area.left = m_cropRect.l; + videoDecodeCreateInfo.display_area.top = m_cropRect.t; + videoDecodeCreateInfo.display_area.right = m_cropRect.r; + videoDecodeCreateInfo.display_area.bottom = m_cropRect.b; + m_nWidth = m_cropRect.r - m_cropRect.l; + m_nLumaHeight = m_cropRect.b - m_cropRect.t; + } + videoDecodeCreateInfo.ulTargetWidth = m_nWidth; + videoDecodeCreateInfo.ulTargetHeight = m_nLumaHeight; + } + + m_nChromaHeight = (int)(ceil(m_nLumaHeight * GetChromaHeightFactor(m_eOutputFormat))); + m_nNumChromaPlanes = GetChromaPlaneCount(m_eOutputFormat); + m_nSurfaceHeight = videoDecodeCreateInfo.ulTargetHeight; + m_nSurfaceWidth = videoDecodeCreateInfo.ulTargetWidth; + 
m_displayRect.b = videoDecodeCreateInfo.display_area.bottom; + m_displayRect.t = videoDecodeCreateInfo.display_area.top; + m_displayRect.l = videoDecodeCreateInfo.display_area.left; + m_displayRect.r = videoDecodeCreateInfo.display_area.right; + + m_videoInfo << "Video Decoding Params:" << std::endl + << "\tNum Surfaces : " << videoDecodeCreateInfo.ulNumDecodeSurfaces << std::endl + << "\tCrop : [" << videoDecodeCreateInfo.display_area.left << ", " + << videoDecodeCreateInfo.display_area.top << ", " << videoDecodeCreateInfo.display_area.right << ", " + << videoDecodeCreateInfo.display_area.bottom << "]" << std::endl + << "\tResize : " << videoDecodeCreateInfo.ulTargetWidth << "x" + << videoDecodeCreateInfo.ulTargetHeight << std::endl + << "\tDeinterlace : " + << std::vector{"Weave", "Bob", "Adaptive"}[videoDecodeCreateInfo.DeinterlaceMode]; + m_videoInfo << std::endl; + + CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext)); + NVDEC_API_CALL(cuvidCreateDecoder(&m_hDecoder, &videoDecodeCreateInfo)); + CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL)); + STOP_TIMER("Session Initialization Time: "); + NvDecoder::addDecoderSessionOverHead(getDecoderSessionID(), elapsedTime); + return nDecodeSurface; +} diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.h b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.h new file mode 100644 index 000000000..f9881c80d --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/OptimizedNvDecoder.h @@ -0,0 +1,52 @@ +// Copyright(c) Microsoft Corporation. +// Licensed under the MIT License. + +#include "NvDecoder/NvDecoder.h" + +// This class is derived from NvDecoder class and is used to optimize the cuvidGetDecoderCaps overhead +class OptimizedNvDecoder : public NvDecoder { + + public: + OptimizedNvDecoder() {} + /** + * @brief This function is used to initialize the decoder session. 
+ * Application must call this function to initialize the decoder, before + * starting to decode any frames. + * The only difference from the original function is to add a new member m_decodecaps. + * The rest is the same as the original function; refer to NvDecoder.cpp in the NVIDIA Video Codec SDK. + */ + OptimizedNvDecoder(CUcontext &cuContext, bool bUseDeviceFrame, cudaVideoCodec eCodec, CUVIDDECODECAPS decodecaps, + bool bLowLatency = false, bool bDeviceFramePitched = false, const Rect *pCropRect = NULL, + const Dim *pResizeDim = NULL, bool extract_user_SEI_Message = false, int maxWidth = 0, + int maxHeight = 0, unsigned int clkRate = 1000, bool force_zero_latency = false); + + /** + * @brief This function replaces the original Decode function in order to record the latency at frame level. + */ + int Decode(const uint8_t *pData, int nSize, int nFlags = 0, int64_t nTimestamp = 0); + /** + * @brief This function returns a reference to the frameLatency vector. + */ + std::vector> &GetFrameLatency() { return frameLatency; } + + protected: + /** + * @brief Callback function to be registered for getting a callback when decoding of sequence starts + */ + static int CUDAAPI HandleVideoSequenceProc(void *pUserData, CUVIDEOFORMAT *pVideoFormat) { + if (pUserData == nullptr) { + throw std::runtime_error("pUserData is nullptr"); + } + return ((OptimizedNvDecoder *)pUserData)->HandleVideoSequence(pVideoFormat); + } + /** + * @brief Define the new handler when decoding of sequence starts. + * The only change is to re-query decoder caps when the video codec or format changes. + * The rest is the same as the original function; refer to NvDecoder.cpp in the NVIDIA Video Codec SDK.
+ */ + int HandleVideoSequence(CUVIDEOFORMAT *pVideoFormat); + + CUVIDDECODECAPS m_decodecaps; + + std::vector> frameLatency; +}; diff --git a/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/ThreadPoolUtils.h b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/ThreadPoolUtils.h new file mode 100644 index 000000000..5592b76e7 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/cuda_decode_performance/ThreadPoolUtils.h @@ -0,0 +1,99 @@ +// Copyright(c) Microsoft Corporation. +// Licensed under the MIT License. + +#include +#include +#include +#include +#include +#include +#include + +// ThreadPool is a simple thread pool implementation that supports enqueueing the task with the index of thread to use +// and custom arguments like task(thread_index, *args). +class ThreadPool { + public: + /** + * @brief Construct a new ThreadPool object with the given number of threads. + */ + ThreadPool(size_t numThreads) { + for (size_t i = 0; i < numThreads; ++i) { + threads.emplace_back(&ThreadPool::worker, this, i); + } + } + /** + * @brief Destroy the ThreadPool object and join all threads. + */ + ~ThreadPool() { + { + std::unique_lock lock(mutex); + stop = true; + } + cv.notify_all(); + + for (auto &thread : threads) { + thread.join(); + } + } + /** + * @brief TaskWrapper is a wrapper of the task with the index of thread to use and custom arguments like + * task(thread_index, *args). + */ + template struct TaskWrapper { + std::shared_ptr> task; + + template TaskWrapper(Callable &&f, CallableArgs &&...args) { + task = std::make_shared>( + [f, args...](size_t threadIdx) mutable { return f(threadIdx, args...); }); + } + + void operator()(size_t threadIdx) { (*task)(threadIdx); } + }; + /** + * @brief Enqueue enqueues the task with custom arguments and return the results of task when finished. 
+ */ + template + auto enqueue(F &&f, Args &&...args) -> std::future::type> { + using ReturnType = typename std::result_of::type; + + TaskWrapper wrapper(std::forward(f), std::forward(args)...); + std::future res = wrapper.task->get_future(); + + { + std::unique_lock lock(mutex); + tasks.emplace(std::move(wrapper)); + } + cv.notify_one(); + + return res; + } + + private: + /** + * @brief The worker function that dequeues the task and executes it for each thread index. + */ + void worker(size_t threadIdx) { + while (true) { + std::function task; + { + std::unique_lock lock(mutex); + cv.wait(lock, [this] { return stop || !tasks.empty(); }); + + if (stop && tasks.empty()) { + return; + } + + task = tasks.front(); + tasks.pop(); + } + + task(threadIdx); + } + } + + std::vector threads; + std::queue> tasks; + std::mutex mutex; + std::condition_variable cv; + bool stop = false; +}; diff --git a/third_party/Video_Codec_SDK/Interface/cuviddec.h b/third_party/Video_Codec_SDK/Interface/cuviddec.h new file mode 100644 index 000000000..1d13eec83 --- /dev/null +++ b/third_party/Video_Codec_SDK/Interface/cuviddec.h @@ -0,0 +1,1173 @@ +/* + * This copyright notice applies to this header file only: + * + * Copyright (c) 2010-2023 NVIDIA Corporation + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the software, and to permit persons to whom the + * software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/*****************************************************************************************************/ +//! \file cuviddec.h +//! NVDECODE API provides video decoding interface to NVIDIA GPU devices. +//! This file contains constants, structure definitions and function prototypes used for decoding. +/*****************************************************************************************************/ + +#if !defined(__CUDA_VIDEO_H__) +#define __CUDA_VIDEO_H__ + +#ifndef __cuda_cuda_h__ +#include +#endif // __cuda_cuda_h__ + +#if defined(_WIN64) || defined(__LP64__) || defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) +#if (CUDA_VERSION >= 3020) && (!defined(CUDA_FORCE_API_VERSION) || (CUDA_FORCE_API_VERSION >= 3020)) +#define __CUVID_DEVPTR64 +#endif +#endif + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +typedef void *CUvideodecoder; +typedef struct _CUcontextlock_st *CUvideoctxlock; + +/*********************************************************************************/ +//! \enum cudaVideoCodec +//! Video codec enums +//! 
These enums are used in CUVIDDECODECREATEINFO and CUVIDDECODECAPS structures +/*********************************************************************************/ +typedef enum cudaVideoCodec_enum { + cudaVideoCodec_MPEG1 = 0, /**< MPEG1 */ + cudaVideoCodec_MPEG2, /**< MPEG2 */ + cudaVideoCodec_MPEG4, /**< MPEG4 */ + cudaVideoCodec_VC1, /**< VC1 */ + cudaVideoCodec_H264, /**< H264 */ + cudaVideoCodec_JPEG, /**< JPEG */ + cudaVideoCodec_H264_SVC, /**< H264-SVC */ + cudaVideoCodec_H264_MVC, /**< H264-MVC */ + cudaVideoCodec_HEVC, /**< HEVC */ + cudaVideoCodec_VP8, /**< VP8 */ + cudaVideoCodec_VP9, /**< VP9 */ + cudaVideoCodec_AV1, /**< AV1 */ + cudaVideoCodec_NumCodecs, /**< Max codecs */ + // Uncompressed YUV + cudaVideoCodec_YUV420 = (('I' << 24) | ('Y' << 16) | ('U' << 8) | ('V')), /**< Y,U,V (4:2:0) */ + cudaVideoCodec_YV12 = (('Y' << 24) | ('V' << 16) | ('1' << 8) | ('2')), /**< Y,V,U (4:2:0) */ + cudaVideoCodec_NV12 = (('N' << 24) | ('V' << 16) | ('1' << 8) | ('2')), /**< Y,UV (4:2:0) */ + cudaVideoCodec_YUYV = (('Y' << 24) | ('U' << 16) | ('Y' << 8) | ('V')), /**< YUYV/YUY2 (4:2:2) */ + cudaVideoCodec_UYVY = (('U' << 24) | ('Y' << 16) | ('V' << 8) | ('Y')) /**< UYVY (4:2:2) */ +} cudaVideoCodec; + +/*********************************************************************************/ +//! \enum cudaVideoSurfaceFormat +//! Video surface format enums used for output format of decoded output +//! These enums are used in CUVIDDECODECREATEINFO structure +/*********************************************************************************/ +typedef enum cudaVideoSurfaceFormat_enum { + cudaVideoSurfaceFormat_NV12 = 0, /**< Semi-Planar YUV [Y plane followed by interleaved UV plane] */ + cudaVideoSurfaceFormat_P016 = 1, /**< 16 bit Semi-Planar YUV [Y plane followed by interleaved UV plane]. 
+ Can be used for 10 bit(6LSB bits 0), 12 bit (4LSB bits 0) */ + cudaVideoSurfaceFormat_YUV444 = 2, /**< Planar YUV [Y plane followed by U and V planes] */ + cudaVideoSurfaceFormat_YUV444_16Bit = 3, /**< 16 bit Planar YUV [Y plane followed by U and V planes]. + Can be used for 10 bit(6LSB bits 0), 12 bit (4LSB bits 0) */ +} cudaVideoSurfaceFormat; + +/******************************************************************************************************************/ +//! \enum cudaVideoDeinterlaceMode +//! Deinterlacing mode enums +//! These enums are used in CUVIDDECODECREATEINFO structure +//! Use cudaVideoDeinterlaceMode_Weave for progressive content and for content that doesn't need deinterlacing +//! cudaVideoDeinterlaceMode_Adaptive needs more video memory than other DImodes +/******************************************************************************************************************/ +typedef enum cudaVideoDeinterlaceMode_enum { + cudaVideoDeinterlaceMode_Weave = 0, /**< Weave both fields (no deinterlacing) */ + cudaVideoDeinterlaceMode_Bob, /**< Drop one field */ + cudaVideoDeinterlaceMode_Adaptive /**< Adaptive deinterlacing */ +} cudaVideoDeinterlaceMode; + +/**************************************************************************************************************/ +//! \enum cudaVideoChromaFormat +//! Chroma format enums +//! These enums are used in CUVIDDECODECREATEINFO and CUVIDDECODECAPS structures +/**************************************************************************************************************/ +typedef enum cudaVideoChromaFormat_enum { + cudaVideoChromaFormat_Monochrome = 0, /**< MonoChrome */ + cudaVideoChromaFormat_420, /**< YUV 4:2:0 */ + cudaVideoChromaFormat_422, /**< YUV 4:2:2 */ + cudaVideoChromaFormat_444 /**< YUV 4:4:4 */ +} cudaVideoChromaFormat; + +/*************************************************************************************************************/ +//! \enum cudaVideoCreateFlags +//! 
Decoder flag enums to select preferred decode path +//! cudaVideoCreate_Default and cudaVideoCreate_PreferCUVID are most optimized, use these whenever possible +/*************************************************************************************************************/ +typedef enum cudaVideoCreateFlags_enum { + cudaVideoCreate_Default = 0x00, /**< Default operation mode: use dedicated video engines */ + cudaVideoCreate_PreferCUDA = + 0x01, /**< Use CUDA-based decoder (requires valid vidLock object for multi-threading) */ + cudaVideoCreate_PreferDXVA = 0x02, /**< Go through DXVA internally if possible (requires D3D9 interop) */ + cudaVideoCreate_PreferCUVID = 0x04 /**< Use dedicated video engines directly */ +} cudaVideoCreateFlags; + +/*************************************************************************/ +//! \enum cuvidDecodeStatus +//! Decode status enums +//! These enums are used in CUVIDGETDECODESTATUS structure +/*************************************************************************/ +typedef enum cuvidDecodeStatus_enum { + cuvidDecodeStatus_Invalid = 0, // Decode status is not valid + cuvidDecodeStatus_InProgress = 1, // Decode is in progress + cuvidDecodeStatus_Success = 2, // Decode is completed without any errors + // 3 to 7 enums are reserved for future use + cuvidDecodeStatus_Error = 8, // Decode is completed with an error (error is not concealed) + cuvidDecodeStatus_Error_Concealed = 9, // Decode is completed with an error and error is concealed +} cuvidDecodeStatus; + +/**************************************************************************************************************/ +//! \struct CUVIDDECODECAPS; +//! 
This structure is used in cuvidGetDecoderCaps API +/**************************************************************************************************************/ +typedef struct _CUVIDDECODECAPS { + cudaVideoCodec eCodecType; /**< IN: cudaVideoCodec_XXX */ + cudaVideoChromaFormat eChromaFormat; /**< IN: cudaVideoChromaFormat_XXX */ + unsigned int nBitDepthMinus8; /**< IN: The Value "BitDepth minus 8" */ + unsigned int reserved1[3]; /**< Reserved for future use - set to zero */ + + unsigned char bIsSupported; /**< OUT: 1 if codec supported, 0 if not supported */ + unsigned char nNumNVDECs; /**< OUT: Number of NVDECs that can support IN params */ + unsigned short nOutputFormatMask; /**< OUT: each bit represents corresponding cudaVideoSurfaceFormat enum */ + unsigned int nMaxWidth; /**< OUT: Max supported coded width in pixels */ + unsigned int nMaxHeight; /**< OUT: Max supported coded height in pixels */ + unsigned int nMaxMBCount; /**< OUT: Max supported macroblock count + CodedWidth*CodedHeight/256 must be <= nMaxMBCount */ + unsigned short nMinWidth; /**< OUT: Min supported coded width in pixels */ + unsigned short nMinHeight; /**< OUT: Min supported coded height in pixels */ + unsigned char bIsHistogramSupported; /**< OUT: 1 if Y component histogram output is supported, 0 if not + Note: histogram is computed on original picture data before + any post-processing like scaling, cropping, etc. is applied */ + unsigned char nCounterBitDepth; /**< OUT: histogram counter bit depth */ + unsigned short nMaxHistogramBins; /**< OUT: Max number of histogram bins */ + unsigned int reserved3[10]; /**< Reserved for future use - set to zero */ +} CUVIDDECODECAPS; + +/**************************************************************************************************************/ +//! \struct CUVIDDECODECREATEINFO +//! 
This structure is used in cuvidCreateDecoder API +/**************************************************************************************************************/ +typedef struct _CUVIDDECODECREATEINFO { + unsigned long ulWidth; /**< IN: Coded sequence width in pixels */ + unsigned long ulHeight; /**< IN: Coded sequence height in pixels */ + unsigned long ulNumDecodeSurfaces; /**< IN: Maximum number of internal decode surfaces */ + cudaVideoCodec CodecType; /**< IN: cudaVideoCodec_XXX */ + cudaVideoChromaFormat ChromaFormat; /**< IN: cudaVideoChromaFormat_XXX */ + unsigned long ulCreationFlags; /**< IN: Decoder creation flags (cudaVideoCreateFlags_XXX) */ + unsigned long bitDepthMinus8; /**< IN: The value "BitDepth minus 8" */ + unsigned long ulIntraDecodeOnly; /**< IN: Set 1 only if video has all intra frames (default value is 0). This will + optimize video memory for Intra frames only decoding. The support is limited + to specific codecs - H264, HEVC, VP9, the flag will be ignored for codecs + which are not supported. However decoding might fail if the flag is enabled in + case of supported codecs for regular bit streams having P and/or B frames. 
*/ + unsigned long ulMaxWidth; /**< IN: Coded sequence max width in pixels used with reconfigure Decoder */ + unsigned long ulMaxHeight; /**< IN: Coded sequence max height in pixels used with reconfigure Decoder */ + unsigned long Reserved1; /**< Reserved for future use - set to zero */ + /** + * IN: area of the frame that should be displayed + */ + struct { + short left; + short top; + short right; + short bottom; + } display_area; + + cudaVideoSurfaceFormat OutputFormat; /**< IN: cudaVideoSurfaceFormat_XXX */ + cudaVideoDeinterlaceMode DeinterlaceMode; /**< IN: cudaVideoDeinterlaceMode_XXX */ + unsigned long ulTargetWidth; /**< IN: Post-processed output width (Should be aligned to 2) */ + unsigned long ulTargetHeight; /**< IN: Post-processed output height (Should be aligned to 2) */ + unsigned long ulNumOutputSurfaces; /**< IN: Maximum number of output surfaces simultaneously mapped */ + CUvideoctxlock vidLock; /**< IN: If non-NULL, context lock used for synchronizing ownership of + the cuda context. Needed for cudaVideoCreate_PreferCUDA decode */ + /** + * IN: target rectangle in the output frame (for aspect ratio conversion) + * if a null rectangle is specified, {0,0,ulTargetWidth,ulTargetHeight} will be used + */ + struct { + short left; + short top; + short right; + short bottom; + } target_rect; + + unsigned long enableHistogram; /**< IN: enable histogram output, if supported */ + unsigned long Reserved2[4]; /**< Reserved for future use - set to zero */ +} CUVIDDECODECREATEINFO; + +/*********************************************************/ +//! \struct CUVIDH264DPBENTRY +//! H.264 DPB entry +//! 
This structure is used in CUVIDH264PICPARAMS structure +/*********************************************************/ +typedef struct _CUVIDH264DPBENTRY { + int PicIdx; /**< picture index of reference frame */ + int FrameIdx; /**< frame_num(short-term) or LongTermFrameIdx(long-term) */ + int is_long_term; /**< 0=short term reference, 1=long term reference */ + int not_existing; /**< non-existing reference frame (corresponding PicIdx should be set to -1) */ + int used_for_reference; /**< 0=unused, 1=top_field, 2=bottom_field, 3=both_fields */ + int FieldOrderCnt[2]; /**< field order count of top and bottom fields */ +} CUVIDH264DPBENTRY; + +/************************************************************/ +//! \struct CUVIDH264MVCEXT +//! H.264 MVC picture parameters ext +//! This structure is used in CUVIDH264PICPARAMS structure +/************************************************************/ +typedef struct _CUVIDH264MVCEXT { + int num_views_minus1; /**< Max number of coded views minus 1 in video : Range - 0 to 1023 */ + int view_id; /**< view identifier */ + unsigned char inter_view_flag; /**< 1 if used for inter-view prediction, 0 if not */ + unsigned char num_inter_view_refs_l0; /**< number of inter-view ref pics in RefPicList0 */ + unsigned char num_inter_view_refs_l1; /**< number of inter-view ref pics in RefPicList1 */ + unsigned char MVCReserved8Bits; /**< Reserved bits */ + int InterViewRefsL0[16]; /**< view id of the i-th view component for inter-view prediction in RefPicList0 */ + int InterViewRefsL1[16]; /**< view id of the i-th view component for inter-view prediction in RefPicList1 */ +} CUVIDH264MVCEXT; + +/*********************************************************/ +//! \struct CUVIDH264SVCEXT +//! H.264 SVC picture parameters ext +//! 
This structure is used in CUVIDH264PICPARAMS structure +/*********************************************************/ +typedef struct _CUVIDH264SVCEXT { + unsigned char profile_idc; + unsigned char level_idc; + unsigned char DQId; + unsigned char DQIdMax; + unsigned char disable_inter_layer_deblocking_filter_idc; + unsigned char ref_layer_chroma_phase_y_plus1; + signed char inter_layer_slice_alpha_c0_offset_div2; + signed char inter_layer_slice_beta_offset_div2; + + unsigned short DPBEntryValidFlag; + unsigned char inter_layer_deblocking_filter_control_present_flag; + unsigned char extended_spatial_scalability_idc; + unsigned char adaptive_tcoeff_level_prediction_flag; + unsigned char slice_header_restriction_flag; + unsigned char chroma_phase_x_plus1_flag; + unsigned char chroma_phase_y_plus1; + + unsigned char tcoeff_level_prediction_flag; + unsigned char constrained_intra_resampling_flag; + unsigned char ref_layer_chroma_phase_x_plus1_flag; + unsigned char store_ref_base_pic_flag; + unsigned char Reserved8BitsA; + unsigned char Reserved8BitsB; + + short scaled_ref_layer_left_offset; + short scaled_ref_layer_top_offset; + short scaled_ref_layer_right_offset; + short scaled_ref_layer_bottom_offset; + unsigned short Reserved16Bits; + struct _CUVIDPICPARAMS *pNextLayer; /**< Points to the picparams for the next layer to be decoded. + Linked list ends at the target layer. */ + int bRefBaseLayer; /**< whether to store ref base pic */ +} CUVIDH264SVCEXT; + +/******************************************************/ +//! \struct CUVIDH264PICPARAMS +//! H.264 picture parameters +//! 
This structure is used in CUVIDPICPARAMS structure +/******************************************************/ +typedef struct _CUVIDH264PICPARAMS { + // SPS + int log2_max_frame_num_minus4; + int pic_order_cnt_type; + int log2_max_pic_order_cnt_lsb_minus4; + int delta_pic_order_always_zero_flag; + int frame_mbs_only_flag; + int direct_8x8_inference_flag; + int num_ref_frames; // NOTE: shall meet level 4.1 restrictions + unsigned char residual_colour_transform_flag; + unsigned char bit_depth_luma_minus8; // Must be 0 (only 8-bit supported) + unsigned char bit_depth_chroma_minus8; // Must be 0 (only 8-bit supported) + unsigned char qpprime_y_zero_transform_bypass_flag; + // PPS + int entropy_coding_mode_flag; + int pic_order_present_flag; + int num_ref_idx_l0_active_minus1; + int num_ref_idx_l1_active_minus1; + int weighted_pred_flag; + int weighted_bipred_idc; + int pic_init_qp_minus26; + int deblocking_filter_control_present_flag; + int redundant_pic_cnt_present_flag; + int transform_8x8_mode_flag; + int MbaffFrameFlag; + int constrained_intra_pred_flag; + int chroma_qp_index_offset; + int second_chroma_qp_index_offset; + int ref_pic_flag; + int frame_num; + int CurrFieldOrderCnt[2]; + // DPB + CUVIDH264DPBENTRY dpb[16]; // List of reference frames within the DPB + // Quantization Matrices (raster-order) + unsigned char WeightScale4x4[6][16]; + unsigned char WeightScale8x8[2][64]; + // FMO/ASO + unsigned char fmo_aso_enable; + unsigned char num_slice_groups_minus1; + unsigned char slice_group_map_type; + signed char pic_init_qs_minus26; + unsigned int slice_group_change_rate_minus1; + union { + unsigned long long slice_group_map_addr; + const unsigned char *pMb2SliceGroupMap; + } fmo; + unsigned int Reserved[12]; + // SVC/MVC + union { + CUVIDH264MVCEXT mvcext; + CUVIDH264SVCEXT svcext; + }; +} CUVIDH264PICPARAMS; + +/********************************************************/ +//! \struct CUVIDMPEG2PICPARAMS +//! MPEG-2 picture parameters +//! 
This structure is used in CUVIDPICPARAMS structure +/********************************************************/ +typedef struct _CUVIDMPEG2PICPARAMS { + int ForwardRefIdx; // Picture index of forward reference (P/B-frames) + int BackwardRefIdx; // Picture index of backward reference (B-frames) + int picture_coding_type; + int full_pel_forward_vector; + int full_pel_backward_vector; + int f_code[2][2]; + int intra_dc_precision; + int frame_pred_frame_dct; + int concealment_motion_vectors; + int q_scale_type; + int intra_vlc_format; + int alternate_scan; + int top_field_first; + // Quantization matrices (raster order) + unsigned char QuantMatrixIntra[64]; + unsigned char QuantMatrixInter[64]; +} CUVIDMPEG2PICPARAMS; + +// MPEG-4 has VOP types instead of Picture types +#define I_VOP 0 +#define P_VOP 1 +#define B_VOP 2 +#define S_VOP 3 + +/*******************************************************/ +//! \struct CUVIDMPEG4PICPARAMS +//! MPEG-4 picture parameters +//! This structure is used in CUVIDPICPARAMS structure +/*******************************************************/ +typedef struct _CUVIDMPEG4PICPARAMS { + int ForwardRefIdx; // Picture index of forward reference (P/B-frames) + int BackwardRefIdx; // Picture index of backward reference (B-frames) + // VOL + int video_object_layer_width; + int video_object_layer_height; + int vop_time_increment_bitcount; + int top_field_first; + int resync_marker_disable; + int quant_type; + int quarter_sample; + int short_video_header; + int divx_flags; + // VOP + int vop_coding_type; + int vop_coded; + int vop_rounding_type; + int alternate_vertical_scan_flag; + int interlaced; + int vop_fcode_forward; + int vop_fcode_backward; + int trd[2]; + int trb[2]; + // Quantization matrices (raster order) + unsigned char QuantMatrixIntra[64]; + unsigned char QuantMatrixInter[64]; + int gmc_enabled; +} CUVIDMPEG4PICPARAMS; + +/********************************************************/ +//! \struct CUVIDVC1PICPARAMS +//! 
VC1 picture parameters +//! This structure is used in CUVIDPICPARAMS structure +/********************************************************/ +typedef struct _CUVIDVC1PICPARAMS { + int ForwardRefIdx; /**< Picture index of forward reference (P/B-frames) */ + int BackwardRefIdx; /**< Picture index of backward reference (B-frames) */ + int FrameWidth; /**< Actual frame width */ + int FrameHeight; /**< Actual frame height */ + // PICTURE + int intra_pic_flag; /**< Set to 1 for I,BI frames */ + int ref_pic_flag; /**< Set to 1 for I,P frames */ + int progressive_fcm; /**< Progressive frame */ + // SEQUENCE + int profile; + int postprocflag; + int pulldown; + int interlace; + int tfcntrflag; + int finterpflag; + int psf; + int multires; + int syncmarker; + int rangered; + int maxbframes; + // ENTRYPOINT + int panscan_flag; + int refdist_flag; + int extended_mv; + int dquant; + int vstransform; + int loopfilter; + int fastuvmc; + int overlap; + int quantizer; + int extended_dmv; + int range_mapy_flag; + int range_mapy; + int range_mapuv_flag; + int range_mapuv; + int rangeredfrm; // range reduction state +} CUVIDVC1PICPARAMS; + +/***********************************************************/ +//! \struct CUVIDJPEGPICPARAMS +//! JPEG picture parameters +//! This structure is used in CUVIDPICPARAMS structure +/***********************************************************/ +typedef struct _CUVIDJPEGPICPARAMS { + int Reserved; +} CUVIDJPEGPICPARAMS; + +/*******************************************************/ +//! \struct CUVIDHEVCPICPARAMS +//! HEVC picture parameters +//! 
This structure is used in CUVIDPICPARAMS structure +/*******************************************************/ +typedef struct _CUVIDHEVCPICPARAMS { + // sps + int pic_width_in_luma_samples; + int pic_height_in_luma_samples; + unsigned char log2_min_luma_coding_block_size_minus3; + unsigned char log2_diff_max_min_luma_coding_block_size; + unsigned char log2_min_transform_block_size_minus2; + unsigned char log2_diff_max_min_transform_block_size; + unsigned char pcm_enabled_flag; + unsigned char log2_min_pcm_luma_coding_block_size_minus3; + unsigned char log2_diff_max_min_pcm_luma_coding_block_size; + unsigned char pcm_sample_bit_depth_luma_minus1; + + unsigned char pcm_sample_bit_depth_chroma_minus1; + unsigned char pcm_loop_filter_disabled_flag; + unsigned char strong_intra_smoothing_enabled_flag; + unsigned char max_transform_hierarchy_depth_intra; + unsigned char max_transform_hierarchy_depth_inter; + unsigned char amp_enabled_flag; + unsigned char separate_colour_plane_flag; + unsigned char log2_max_pic_order_cnt_lsb_minus4; + + unsigned char num_short_term_ref_pic_sets; + unsigned char long_term_ref_pics_present_flag; + unsigned char num_long_term_ref_pics_sps; + unsigned char sps_temporal_mvp_enabled_flag; + unsigned char sample_adaptive_offset_enabled_flag; + unsigned char scaling_list_enable_flag; + unsigned char IrapPicFlag; + unsigned char IdrPicFlag; + + unsigned char bit_depth_luma_minus8; + unsigned char bit_depth_chroma_minus8; + // sps/pps extension fields + unsigned char log2_max_transform_skip_block_size_minus2; + unsigned char log2_sao_offset_scale_luma; + unsigned char log2_sao_offset_scale_chroma; + unsigned char high_precision_offsets_enabled_flag; + unsigned char reserved1[10]; + + // pps + unsigned char dependent_slice_segments_enabled_flag; + unsigned char slice_segment_header_extension_present_flag; + unsigned char sign_data_hiding_enabled_flag; + unsigned char cu_qp_delta_enabled_flag; + unsigned char diff_cu_qp_delta_depth; + signed char 
init_qp_minus26; + signed char pps_cb_qp_offset; + signed char pps_cr_qp_offset; + + unsigned char constrained_intra_pred_flag; + unsigned char weighted_pred_flag; + unsigned char weighted_bipred_flag; + unsigned char transform_skip_enabled_flag; + unsigned char transquant_bypass_enabled_flag; + unsigned char entropy_coding_sync_enabled_flag; + unsigned char log2_parallel_merge_level_minus2; + unsigned char num_extra_slice_header_bits; + + unsigned char loop_filter_across_tiles_enabled_flag; + unsigned char loop_filter_across_slices_enabled_flag; + unsigned char output_flag_present_flag; + unsigned char num_ref_idx_l0_default_active_minus1; + unsigned char num_ref_idx_l1_default_active_minus1; + unsigned char lists_modification_present_flag; + unsigned char cabac_init_present_flag; + unsigned char pps_slice_chroma_qp_offsets_present_flag; + + unsigned char deblocking_filter_override_enabled_flag; + unsigned char pps_deblocking_filter_disabled_flag; + signed char pps_beta_offset_div2; + signed char pps_tc_offset_div2; + unsigned char tiles_enabled_flag; + unsigned char uniform_spacing_flag; + unsigned char num_tile_columns_minus1; + unsigned char num_tile_rows_minus1; + + unsigned short column_width_minus1[21]; + unsigned short row_height_minus1[21]; + + // sps and pps extension HEVC-main 444 + unsigned char sps_range_extension_flag; + unsigned char transform_skip_rotation_enabled_flag; + unsigned char transform_skip_context_enabled_flag; + unsigned char implicit_rdpcm_enabled_flag; + + unsigned char explicit_rdpcm_enabled_flag; + unsigned char extended_precision_processing_flag; + unsigned char intra_smoothing_disabled_flag; + unsigned char persistent_rice_adaptation_enabled_flag; + + unsigned char cabac_bypass_alignment_enabled_flag; + unsigned char pps_range_extension_flag; + unsigned char cross_component_prediction_enabled_flag; + unsigned char chroma_qp_offset_list_enabled_flag; + + unsigned char diff_cu_chroma_qp_offset_depth; + unsigned char 
chroma_qp_offset_list_len_minus1; + signed char cb_qp_offset_list[6]; + + signed char cr_qp_offset_list[6]; + unsigned char reserved2[2]; + + unsigned int reserved3[8]; + + // RefPicSets + int NumBitsForShortTermRPSInSlice; + int NumDeltaPocsOfRefRpsIdx; + int NumPocTotalCurr; + int NumPocStCurrBefore; + int NumPocStCurrAfter; + int NumPocLtCurr; + int CurrPicOrderCntVal; + int RefPicIdx[16]; // [refpic] Indices of valid reference pictures (-1 if unused for reference) + int PicOrderCntVal[16]; // [refpic] + unsigned char IsLongTerm[16]; // [refpic] 0=not a long-term reference, 1=long-term reference + unsigned char RefPicSetStCurrBefore[8]; // [0..NumPocStCurrBefore-1] -> refpic (0..15) + unsigned char RefPicSetStCurrAfter[8]; // [0..NumPocStCurrAfter-1] -> refpic (0..15) + unsigned char RefPicSetLtCurr[8]; // [0..NumPocLtCurr-1] -> refpic (0..15) + unsigned char RefPicSetInterLayer0[8]; + unsigned char RefPicSetInterLayer1[8]; + unsigned int reserved4[12]; + + // scaling lists (diag order) + unsigned char ScalingList4x4[6][16]; // [matrixId][i] + unsigned char ScalingList8x8[6][64]; // [matrixId][i] + unsigned char ScalingList16x16[6][64]; // [matrixId][i] + unsigned char ScalingList32x32[2][64]; // [matrixId][i] + unsigned char ScalingListDCCoeff16x16[6]; // [matrixId] + unsigned char ScalingListDCCoeff32x32[2]; // [matrixId] +} CUVIDHEVCPICPARAMS; + +/***********************************************************/ +//! \struct CUVIDVP8PICPARAMS +//! VP8 picture parameters +//! 
This structure is used in CUVIDPICPARAMS structure +/***********************************************************/ +typedef struct _CUVIDVP8PICPARAMS { + int width; + int height; + unsigned int first_partition_size; + // Frame Indexes + unsigned char LastRefIdx; + unsigned char GoldenRefIdx; + unsigned char AltRefIdx; + union { + struct { + unsigned char frame_type : 1; /**< 0 = KEYFRAME, 1 = INTERFRAME */ + unsigned char version : 3; + unsigned char show_frame : 1; + unsigned char update_mb_segmentation_data : 1; /**< Must be 0 if segmentation is not enabled */ + unsigned char Reserved2Bits : 2; + } vp8_frame_tag; + unsigned char wFrameTagFlags; + }; + unsigned char Reserved1[4]; + unsigned int Reserved2[3]; +} CUVIDVP8PICPARAMS; + +/***********************************************************/ +//! \struct CUVIDVP9PICPARAMS +//! VP9 picture parameters +//! This structure is used in CUVIDPICPARAMS structure +/***********************************************************/ +typedef struct _CUVIDVP9PICPARAMS { + unsigned int width; + unsigned int height; + + // Frame Indices + unsigned char LastRefIdx; + unsigned char GoldenRefIdx; + unsigned char AltRefIdx; + unsigned char colorSpace; + + unsigned short profile : 3; + unsigned short frameContextIdx : 2; + unsigned short frameType : 1; + unsigned short showFrame : 1; + unsigned short errorResilient : 1; + unsigned short frameParallelDecoding : 1; + unsigned short subSamplingX : 1; + unsigned short subSamplingY : 1; + unsigned short intraOnly : 1; + unsigned short allow_high_precision_mv : 1; + unsigned short refreshEntropyProbs : 1; + unsigned short reserved2Bits : 2; + + unsigned short reserved16Bits; + + unsigned char refFrameSignBias[4]; + + unsigned char bitDepthMinus8Luma; + unsigned char bitDepthMinus8Chroma; + unsigned char loopFilterLevel; + unsigned char loopFilterSharpness; + + unsigned char modeRefLfEnabled; + unsigned char log2_tile_columns; + unsigned char log2_tile_rows; + + unsigned char segmentEnabled : 
1; + unsigned char segmentMapUpdate : 1; + unsigned char segmentMapTemporalUpdate : 1; + unsigned char segmentFeatureMode : 1; + unsigned char reserved4Bits : 4; + + unsigned char segmentFeatureEnable[8][4]; + short segmentFeatureData[8][4]; + unsigned char mb_segment_tree_probs[7]; + unsigned char segment_pred_probs[3]; + unsigned char reservedSegment16Bits[2]; + + int qpYAc; + int qpYDc; + int qpChDc; + int qpChAc; + + unsigned int activeRefIdx[3]; + unsigned int resetFrameContext; + unsigned int mcomp_filter_type; + unsigned int mbRefLfDelta[4]; + unsigned int mbModeLfDelta[2]; + unsigned int frameTagSize; + unsigned int offsetToDctParts; + unsigned int reserved128Bits[4]; + +} CUVIDVP9PICPARAMS; + +/***********************************************************/ +//! \struct CUVIDAV1PICPARAMS +//! AV1 picture parameters +//! This structure is used in CUVIDPICPARAMS structure +/***********************************************************/ +typedef struct _CUVIDAV1PICPARAMS { + unsigned int width; // coded width, if superres enabled then it is upscaled width + unsigned int height; // coded height + unsigned int frame_offset; // defined as order_hint in AV1 specification + int decodePicIdx; // decoded output pic index, if film grain enabled, it will keep decoded (without film grain) + // output It can be used as reference frame for future frames + + // sequence header + unsigned int profile : 3; // 0 = profile0, 1 = profile1, 2 = profile2 + unsigned int use_128x128_superblock : 1; // superblock size 0:64x64, 1: 128x128 + unsigned int subsampling_x : 1; // (subsampling_x, _y) 1,1 = 420, 1,0 = 422, 0,0 = 444 + unsigned int subsampling_y : 1; + unsigned int mono_chrome : 1; // for monochrome content, mono_chrome = 1 and (subsampling_x, _y) should be 1,1 + unsigned int bit_depth_minus8 : 4; // bit depth minus 8 + unsigned int enable_filter_intra : 1; // tool enable in seq level, 0 : disable 1: frame header control + unsigned int enable_intra_edge_filter : 1; // intra edge 
filtering process, 0 : disable 1: enabled + unsigned int enable_interintra_compound : 1; // interintra, 0 : not present 1: present + unsigned int enable_masked_compound : 1; // 1: mode info for inter blocks may contain the syntax element + // compound_type. 0: syntax element compound_type will not be present + unsigned int enable_dual_filter : 1; // vertical and horiz filter selection, 1: enable and 0: disable + unsigned int enable_order_hint : 1; // order hint, and related tools, 1: enable and 0: disable + unsigned int order_hint_bits_minus1 : 3; // is used to compute OrderHintBits + unsigned int enable_jnt_comp : 1; // joint compound modes, 1: enable and 0: disable + unsigned int enable_superres : 1; // superres in seq level, 0 : disable 1: frame level control + unsigned int enable_cdef : 1; // cdef filtering in seq level, 0 : disable 1: frame level control + unsigned int enable_restoration : 1; // loop restoration filtering in seq level, 0 : disable 1: frame level control + unsigned int enable_fgs : 1; // defined as film_grain_params_present in AV1 specification + unsigned int reserved0_7bits : 7; // reserved bits; must be set to 0 + + // frame header + unsigned int frame_type : 2; // 0:Key frame, 1:Inter frame, 2:intra only, 3:s-frame + unsigned int show_frame : 1; // show_frame = 1 implies that frame should be immediately output once decoded + unsigned int disable_cdf_update : 1; // CDF update during symbol decoding, 1: disabled, 0: enabled + unsigned int + allow_screen_content_tools : 1; // 1: intra blocks may use palette encoding, 0: palette encoding is never used + unsigned int force_integer_mv : 1; // 1: motion vectors will always be integers, 0: can contain fractional bits + unsigned int coded_denom : 3; // coded_denom of the superres scale as specified in AV1 specification + unsigned int allow_intrabc : 1; // 1: intra block copy may be used, 0: intra block copy is not allowed + unsigned int allow_high_precision_mv : 1; // 1/8 precision mv enable + 
unsigned int interp_filter : 3; // interpolation filter. Refer to section 6.8.9 of the AV1 specification + // Version 1.0.0 with Errata 1 + unsigned int switchable_motion_mode : 1; // defined as is_motion_mode_switchable in AV1 specification + unsigned int use_ref_frame_mvs : 1; // 1: current frame can use the previous frame mv information, 0: will not use. + unsigned int disable_frame_end_update_cdf : 1; // 1: indicates that the end of frame CDF update is disabled + unsigned int delta_q_present : 1; // quantizer index delta values are present in the block level + unsigned int delta_q_res : 2; // left shift which should be applied to decoded quantizer index delta values + unsigned int using_qmatrix : 1; // 1: quantizer matrix will be used to compute quantizers + unsigned int coded_lossless : 1; // 1: all segments use lossless coding + unsigned int use_superres : 1; // 1: superres enabled for frame + unsigned int tx_mode : 2; // 0: ONLY4x4,1:LARGEST,2:SELECT + unsigned int reference_mode : 1; // 0: SINGLE, 1: SELECT + unsigned int + allow_warped_motion : 1; // 1: allow_warped_motion may be present, 0: allow_warped_motion will not be present + unsigned int reduced_tx_set : 1; // 1: frame is restricted to subset of the full set of transform types, 0: no such + // restriction + unsigned int skip_mode : 1; // 1: most of the mode info is skipped, 0: mode info is not skipped + unsigned int reserved1_3bits : 3; // reserved bits; must be set to 0 + + // tiling info + unsigned int num_tile_cols : 8; // number of tiles across the frame., max is 64 + unsigned int num_tile_rows : 8; // number of tiles down the frame., max is 64 + unsigned int context_update_tile_id : 16; // specifies which tile to use for the CDF update + unsigned short tile_widths[64]; // Width of each column in superblocks + unsigned short tile_heights[64]; // height of each row in superblocks + + // CDEF - refer to section 6.10.14 of the AV1 specification Version 1.0.0 with Errata 1 + unsigned char 
cdef_damping_minus_3 : 2; // controls the amount of damping in the deringing filter + unsigned char cdef_bits : 2; // the number of bits needed to specify which CDEF filter to apply + unsigned char reserved2_4bits : 4; // reserved bits; must be set to 0 + unsigned char cdef_y_strength[8]; // 0-3 bits: y_pri_strength, 4-7 bits y_sec_strength + unsigned char cdef_uv_strength[8]; // 0-3 bits: uv_pri_strength, 4-7 bits uv_sec_strength + + // SkipModeFrames + unsigned char + SkipModeFrame0 : 4; // specifies the frames to use for compound prediction when skip_mode is equal to 1. + unsigned char SkipModeFrame1 : 4; + + // qp information - refer to section 6.8.11 of the AV1 specification Version 1.0.0 with Errata 1 + unsigned char base_qindex; // indicates the base frame qindex. Defined as base_q_idx in AV1 specification + char qp_y_dc_delta_q; // indicates the Y DC quantizer relative to base_q_idx. Defined as DeltaQYDc in AV1 + // specification + char qp_u_dc_delta_q; // indicates the U DC quantizer relative to base_q_idx. Defined as DeltaQUDc in AV1 + // specification + char qp_v_dc_delta_q; // indicates the V DC quantizer relative to base_q_idx. Defined as DeltaQVDc in AV1 + // specification + char qp_u_ac_delta_q; // indicates the U AC quantizer relative to base_q_idx. Defined as DeltaQUAc in AV1 + // specification + char qp_v_ac_delta_q; // indicates the V AC quantizer relative to base_q_idx. 
Defined as DeltaQVAc in AV1 + // specification + unsigned char qm_y; // specifies the level in the quantizer matrix that should be used for luma plane decoding + unsigned char qm_u; // specifies the level in the quantizer matrix that should be used for chroma U plane decoding + unsigned char qm_v; // specifies the level in the quantizer matrix that should be used for chroma V plane decoding + + // segmentation - refer to section 6.8.13 of the AV1 specification Version 1.0.0 with Errata 1 + unsigned char segmentation_enabled : 1; // 1 indicates that this frame makes use of the segmentation tool + unsigned char segmentation_update_map : 1; // 1 indicates that the segmentation map are updated during the decoding + // of this frame + unsigned char + segmentation_update_data : 1; // 1 indicates that new parameters are about to be specified for each segment + unsigned char segmentation_temporal_update : 1; // 1 indicates that the updates to the segmentation map are coded + // relative to the existing segmentation map + unsigned char reserved3_4bits : 4; // reserved bits; must be set to 0 + short segmentation_feature_data[8][8]; // specifies the feature data for a segment feature + unsigned char + segmentation_feature_mask[8]; // indicates that the corresponding feature is unused or feature value is coded + + // loopfilter - refer to section 6.8.10 of the AV1 specification Version 1.0.0 with Errata 1 + unsigned char loop_filter_level[2]; // contains loop filter strength values + unsigned char loop_filter_level_u; // loop filter strength value of U plane + unsigned char loop_filter_level_v; // loop filter strength value of V plane + unsigned char loop_filter_sharpness; // indicates the sharpness level + char loop_filter_ref_deltas[8]; // contains the adjustment needed for the filter level based on the chosen reference + // frame + char loop_filter_mode_deltas[2]; // contains the adjustment needed for the filter level based on the chosen mode + unsigned char 
loop_filter_delta_enabled : 1; // indicates that the filter level depends on the mode and reference + // frame used to predict a block + unsigned char loop_filter_delta_update : 1; // indicates that additional syntax elements are present that specify + // which mode and reference frame deltas are to be updated + unsigned char delta_lf_present : 1; // specifies whether loop filter delta values are present in the block level + unsigned char delta_lf_res : 2; // specifies the left shift to apply to the decoded loop filter values + unsigned char delta_lf_multi : 1; // separate loop filter deltas for Hy,Vy,U,V edges + unsigned char reserved4_2bits : 2; // reserved bits; must be set to 0 + + // restoration - refer to section 6.10.15 of the AV1 specification Version 1.0.0 with Errata 1 + unsigned char lr_unit_size[3]; // specifies the size of loop restoration units: 0: 32, 1: 64, 2: 128, 3: 256 + unsigned char lr_type[3]; // used to compute FrameRestorationType + + // reference frames + unsigned char primary_ref_frame; // specifies which reference frame contains the CDF values and other state that + // should be loaded at the start of the frame + unsigned char ref_frame_map[8]; // frames in dpb that can be used as reference for current or future frames + + unsigned char temporal_layer_id : 4; // temporal layer id + unsigned char spatial_layer_id : 4; // spatial layer id + + unsigned char reserved5_32bits[4]; // reserved bits; must be set to 0 + + // ref frame list + struct { + unsigned int width; + unsigned int height; + unsigned char index; + unsigned char reserved24Bits[3]; // reserved bits; must be set to 0 + } ref_frame[7]; // frames used as reference frame for current frame. 
+ + // global motion + struct { + unsigned char invalid : 1; + unsigned char wmtype : 2; // defined as GmType in AV1 specification + unsigned char reserved5Bits : 5; // reserved bits; must be set to 0 + char reserved24Bits[3]; // reserved bits; must be set to 0 + int wmmat[6]; // defined as gm_params[] in AV1 specification + } global_motion[7]; // global motion params for reference frames + + // film grain params - refer to section 6.8.20 of the AV1 specification Version 1.0.0 with Errata 1 + unsigned short apply_grain : 1; + unsigned short overlap_flag : 1; + unsigned short scaling_shift_minus8 : 2; + unsigned short chroma_scaling_from_luma : 1; + unsigned short ar_coeff_lag : 2; + unsigned short ar_coeff_shift_minus6 : 2; + unsigned short grain_scale_shift : 2; + unsigned short clip_to_restricted_range : 1; + unsigned short reserved6_4bits : 4; // reserved bits; must be set to 0 + unsigned char num_y_points; + unsigned char scaling_points_y[14][2]; + unsigned char num_cb_points; + unsigned char scaling_points_cb[10][2]; + unsigned char num_cr_points; + unsigned char scaling_points_cr[10][2]; + unsigned char reserved7_8bits; // reserved bits; must be set to 0 + unsigned short random_seed; + short ar_coeffs_y[24]; + short ar_coeffs_cb[25]; + short ar_coeffs_cr[25]; + unsigned char cb_mult; + unsigned char cb_luma_mult; + short cb_offset; + unsigned char cr_mult; + unsigned char cr_luma_mult; + short cr_offset; + + int reserved[7]; // reserved bits; must be set to 0 +} CUVIDAV1PICPARAMS; + +/******************************************************************************************/ +//! \struct CUVIDPICPARAMS +//! Picture parameters for decoding +//! This structure is used in cuvidDecodePicture API +//! 
IN for cuvidDecodePicture +/******************************************************************************************/ +typedef struct _CUVIDPICPARAMS { + int PicWidthInMbs; /**< IN: Coded frame width in macroblocks */ + int FrameHeightInMbs; /**< IN: Coded frame height in macroblocks */ + int CurrPicIdx; /**< IN: Output index of the current picture */ + int field_pic_flag; /**< IN: 0=frame picture, 1=field picture */ + int bottom_field_flag; /**< IN: 0=top field, 1=bottom field (ignored if field_pic_flag=0) */ + int second_field; /**< IN: Second field of a complementary field pair */ + // Bitstream data + unsigned int nBitstreamDataLen; /**< IN: Number of bytes in bitstream data buffer */ + const unsigned char *pBitstreamData; /**< IN: Ptr to bitstream data for this picture (slice-layer) */ + unsigned int nNumSlices; /**< IN: Number of slices in this picture */ + const unsigned int *pSliceDataOffsets; /**< IN: nNumSlices entries, contains offset of each slice within + the bitstream data buffer */ + int ref_pic_flag; /**< IN: This picture is a reference picture */ + int intra_pic_flag; /**< IN: This picture is entirely intra coded */ + unsigned int Reserved[30]; /**< Reserved for future use */ + // IN: Codec-specific data + union { + CUVIDMPEG2PICPARAMS mpeg2; /**< Also used for MPEG-1 */ + CUVIDH264PICPARAMS h264; + CUVIDVC1PICPARAMS vc1; + CUVIDMPEG4PICPARAMS mpeg4; + CUVIDJPEGPICPARAMS jpeg; + CUVIDHEVCPICPARAMS hevc; + CUVIDVP8PICPARAMS vp8; + CUVIDVP9PICPARAMS vp9; + CUVIDAV1PICPARAMS av1; + unsigned int CodecReserved[1024]; + } CodecSpecific; +} CUVIDPICPARAMS; + +/******************************************************/ +//! \struct CUVIDPROCPARAMS +//! Picture parameters for postprocessing +//! 
This structure is used in cuvidMapVideoFrame API +/******************************************************/ +typedef struct _CUVIDPROCPARAMS { + int progressive_frame; /**< IN: Input is progressive (deinterlace_mode will be ignored) */ + int second_field; /**< IN: Output the second field (ignored if deinterlace mode is Weave) */ + int top_field_first; /**< IN: Input frame is top field first (1st field is top, 2nd field is bottom) */ + int unpaired_field; /**< IN: Input only contains one field (2nd field is invalid) */ + // The fields below are used for raw YUV input + unsigned int reserved_flags; /**< Reserved for future use (set to zero) */ + unsigned int reserved_zero; /**< Reserved (set to zero) */ + unsigned long long raw_input_dptr; /**< IN: Input CUdeviceptr for raw YUV extensions */ + unsigned int raw_input_pitch; /**< IN: pitch in bytes of raw YUV input (should be aligned appropriately) */ + unsigned int raw_input_format; /**< IN: Input YUV format (cudaVideoCodec_enum) */ + unsigned long long raw_output_dptr; /**< IN: Output CUdeviceptr for raw YUV extensions */ + unsigned int raw_output_pitch; /**< IN: pitch in bytes of raw YUV output (should be aligned appropriately) */ + unsigned int Reserved1; /**< Reserved for future use (set to zero) */ + CUstream output_stream; /**< IN: stream object used by cuvidMapVideoFrame */ + unsigned int Reserved[46]; /**< Reserved for future use (set to zero) */ + unsigned long long *histogram_dptr; /**< OUT: Output CUdeviceptr for histogram extensions */ + void *Reserved2[1]; /**< Reserved for future use (set to zero) */ +} CUVIDPROCPARAMS; + +/*********************************************************************************************************/ +//! \struct CUVIDGETDECODESTATUS +//! Struct for reporting decode status. +//! This structure is used in cuvidGetDecodeStatus API. 
+/*********************************************************************************************************/ +typedef struct _CUVIDGETDECODESTATUS { + cuvidDecodeStatus decodeStatus; + unsigned int reserved[31]; + void *pReserved[8]; +} CUVIDGETDECODESTATUS; + +/****************************************************/ +//! \struct CUVIDRECONFIGUREDECODERINFO +//! Struct for decoder reset +//! This structure is used in cuvidReconfigureDecoder() API +/****************************************************/ +typedef struct _CUVIDRECONFIGUREDECODERINFO { + unsigned int + ulWidth; /**< IN: Coded sequence width in pixels, MUST be <= ulMaxWidth defined at CUVIDDECODECREATEINFO */ + unsigned int + ulHeight; /**< IN: Coded sequence height in pixels, MUST be <= ulMaxHeight defined at CUVIDDECODECREATEINFO */ + unsigned int ulTargetWidth; /**< IN: Post processed output width */ + unsigned int ulTargetHeight; /**< IN: Post Processed output height */ + unsigned int ulNumDecodeSurfaces; /**< IN: Maximum number of internal decode surfaces */ + unsigned int reserved1[12]; /**< Reserved for future use. Set to Zero */ + /** + * IN: Area of frame to be displayed. Use-case : Source Cropping + */ + struct { + short left; + short top; + short right; + short bottom; + } display_area; + /** + * IN: Target Rectangle in the OutputFrame. Use-case : Aspect ratio Conversion + */ + struct { + short left; + short top; + short right; + short bottom; + } target_rect; + unsigned int reserved2[11]; /**< Reserved for future use. Set to Zero */ +} CUVIDRECONFIGUREDECODERINFO; + +/***********************************************************************************************************/ +//! VIDEO_DECODER +//! +//! In order to minimize decode latencies, there should always be at least 2 pictures in the decode +//! queue at any time, in order to make sure that all decode engines are always busy. +//! +//! Overall data flow: +//! - cuvidGetDecoderCaps(...) +//! - cuvidCreateDecoder(...) +//! 
- For each picture: +//! + cuvidDecodePicture(N) +//! + cuvidMapVideoFrame(N-4) +//! + do some processing in cuda +//! + cuvidUnmapVideoFrame(N-4) +//! + cuvidDecodePicture(N+1) +//! + cuvidMapVideoFrame(N-3) +//! + ... +//! - cuvidDestroyDecoder(...) +//! +//! NOTE: +//! - When the cuda context is created from a D3D device, the D3D device must also be created +//! with the D3DCREATE_MULTITHREADED flag. +//! - There is a limit to how many pictures can be mapped simultaneously (ulNumOutputSurfaces) +//! - cuvidDecodePicture may block the calling thread if there are too many pictures pending +//! in the decode queue +/***********************************************************************************************************/ + +/**********************************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidGetDecoderCaps(CUVIDDECODECAPS *pdc) +//! Queries decode capabilities of NVDEC-HW based on CodecType, ChromaFormat and BitDepthMinus8 parameters. +//! 1. Application fills IN parameters CodecType, ChromaFormat and BitDepthMinus8 of CUVIDDECODECAPS structure +//! 2. On calling cuvidGetDecoderCaps, driver fills OUT parameters if the IN parameters are supported +//! If IN parameters passed to the driver are not supported by NVDEC-HW, then all OUT params are set to 0. +//! E.g. on Geforce GTX 960: +//! App fills - eCodecType = cudaVideoCodec_H264; eChromaFormat = cudaVideoChromaFormat_420; nBitDepthMinus8 = 0; +//! Given IN parameters are supported, hence driver fills: bIsSupported = 1; nMinWidth = 48; nMinHeight = 16; +//! nMaxWidth = 4096; nMaxHeight = 4096; nMaxMBCount = 65536; +//! 
CodedWidth*CodedHeight/256 must be less than or equal to nMaxMBCount +/**********************************************************************************************************************/ +extern CUresult CUDAAPI cuvidGetDecoderCaps(CUVIDDECODECAPS *pdc); + +/*****************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidCreateDecoder(CUvideodecoder *phDecoder, CUVIDDECODECREATEINFO *pdci) +//! Create the decoder object based on pdci. A handle to the created decoder is returned +/*****************************************************************************************************/ +extern CUresult CUDAAPI cuvidCreateDecoder(CUvideodecoder *phDecoder, CUVIDDECODECREATEINFO *pdci); + +/*****************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidDestroyDecoder(CUvideodecoder hDecoder) +//! Destroy the decoder object +/*****************************************************************************************************/ +extern CUresult CUDAAPI cuvidDestroyDecoder(CUvideodecoder hDecoder); + +/*****************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidDecodePicture(CUvideodecoder hDecoder, CUVIDPICPARAMS *pPicParams) +//! Decode a single picture (field or frame) +//! Kicks off HW decoding +/*****************************************************************************************************/ +extern CUresult CUDAAPI cuvidDecodePicture(CUvideodecoder hDecoder, CUVIDPICPARAMS *pPicParams); + +/************************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidGetDecodeStatus(CUvideodecoder hDecoder, int nPicIdx, CUVIDGETDECODESTATUS *pDecodeStatus); +//! Get the decode status for frame corresponding to nPicIdx +//! API is supported for Maxwell and above generation GPUs. +//! 
API is currently supported for HEVC, H264 and JPEG codecs. +//! API returns CUDA_ERROR_NOT_SUPPORTED error code for unsupported GPU or codec. +/************************************************************************************************************/ +extern CUresult CUDAAPI cuvidGetDecodeStatus(CUvideodecoder hDecoder, int nPicIdx, CUVIDGETDECODESTATUS *pDecodeStatus); + +/*********************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidReconfigureDecoder(CUvideodecoder hDecoder, CUVIDRECONFIGUREDECODERINFO *pDecReconfigParams) +//! Used to reuse a single decoder for multiple clips. Currently supports resolution change, resize params, +//! display area params and target area params change for the same codec. +//! Must be called during CUVIDPARSERPARAMS::pfnSequenceCallback +/*********************************************************************************************************/ +extern CUresult CUDAAPI cuvidReconfigureDecoder(CUvideodecoder hDecoder, + CUVIDRECONFIGUREDECODERINFO *pDecReconfigParams); + +#if !defined(__CUVID_DEVPTR64) || defined(__CUVID_INTERNAL) +/************************************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidMapVideoFrame(CUvideodecoder hDecoder, int nPicIdx, unsigned int *pDevPtr, +//! unsigned int *pPitch, CUVIDPROCPARAMS *pVPP); +//! Post-process and map video frame corresponding to nPicIdx for use in cuda. Returns cuda device pointer and +//! associated pitch of the video frame +/************************************************************************************************************************/ +extern CUresult CUDAAPI cuvidMapVideoFrame(CUvideodecoder hDecoder, int nPicIdx, unsigned int *pDevPtr, + unsigned int *pPitch, CUVIDPROCPARAMS *pVPP); + +/*****************************************************************************************************/ +//! 
\fn CUresult CUDAAPI cuvidUnmapVideoFrame(CUvideodecoder hDecoder, unsigned int DevPtr) +//! Unmap a previously mapped video frame +/*****************************************************************************************************/ +extern CUresult CUDAAPI cuvidUnmapVideoFrame(CUvideodecoder hDecoder, unsigned int DevPtr); +#endif + +/****************************************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidMapVideoFrame64(CUvideodecoder hDecoder, int nPicIdx, unsigned long long *pDevPtr, +//! unsigned int * pPitch, CUVIDPROCPARAMS *pVPP); +//! Post-process and map video frame corresponding to nPicIdx for use in cuda. Returns cuda device pointer and +//! associated pitch of the video frame +/****************************************************************************************************************************/ +extern CUresult CUDAAPI cuvidMapVideoFrame64(CUvideodecoder hDecoder, int nPicIdx, unsigned long long *pDevPtr, + unsigned int *pPitch, CUVIDPROCPARAMS *pVPP); + +/**************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidUnmapVideoFrame64(CUvideodecoder hDecoder, unsigned long long DevPtr); +//! Unmap a previously mapped video frame +/**************************************************************************************************/ +extern CUresult CUDAAPI cuvidUnmapVideoFrame64(CUvideodecoder hDecoder, unsigned long long DevPtr); + +#if defined(__CUVID_DEVPTR64) && !defined(__CUVID_INTERNAL) +#define cuvidMapVideoFrame cuvidMapVideoFrame64 +#define cuvidUnmapVideoFrame cuvidUnmapVideoFrame64 +#endif + +/********************************************************************************************************************/ +//! +//! Context-locking: to facilitate multi-threaded implementations, the following 4 functions +//! provide a simple mutex-style host synchronization. 
If a non-NULL context is specified +//! in CUVIDDECODECREATEINFO, the codec library will acquire the mutex associated with the given +//! context before making any cuda calls. +//! A multi-threaded application could create a lock associated with a context handle so that +//! multiple threads can safely share the same cuda context: +//! - use cuCtxPopCurrent immediately after context creation in order to create a 'floating' context +//! that can be passed to cuvidCtxLockCreate. +//! - When using a floating context, all cuda calls should only be made within a cuvidCtxLock/cuvidCtxUnlock section. +//! +//! NOTE: This is a safer alternative to cuCtxPushCurrent and cuCtxPopCurrent, and is not related to video +//! decoder in any way (implemented as a critical section associated with cuCtx{Push|Pop}Current calls). +/********************************************************************************************************************/ + +/********************************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidCtxLockCreate(CUvideoctxlock *pLock, CUcontext ctx) +//! This API is used to create CtxLock object +/********************************************************************************************************************/ +extern CUresult CUDAAPI cuvidCtxLockCreate(CUvideoctxlock *pLock, CUcontext ctx); + +/********************************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidCtxLockDestroy(CUvideoctxlock lck) +//! This API is used to free CtxLock object +/********************************************************************************************************************/ +extern CUresult CUDAAPI cuvidCtxLockDestroy(CUvideoctxlock lck); + +/********************************************************************************************************************/ +//! 
\fn CUresult CUDAAPI cuvidCtxLock(CUvideoctxlock lck, unsigned int reserved_flags) +//! This API is used to acquire ctxlock +/********************************************************************************************************************/ +extern CUresult CUDAAPI cuvidCtxLock(CUvideoctxlock lck, unsigned int reserved_flags); + +/********************************************************************************************************************/ +//! \fn CUresult CUDAAPI cuvidCtxUnlock(CUvideoctxlock lck, unsigned int reserved_flags) +//! This API is used to release ctxlock +/********************************************************************************************************************/ +extern CUresult CUDAAPI cuvidCtxUnlock(CUvideoctxlock lck, unsigned int reserved_flags); + +/**********************************************************************************************/ + +#if defined(__cplusplus) +} +// Auto-lock helper for C++ applications: calls cuvidCtxLock in the constructor and cuvidCtxUnlock in the destructor (return codes are ignored). Do not copy instances: every copy would call cuvidCtxUnlock on the same lock again. +class CCtxAutoLock { + private: + CUvideoctxlock m_ctx; + + public: + CCtxAutoLock(CUvideoctxlock ctx) : m_ctx(ctx) { cuvidCtxLock(m_ctx, 0); } + ~CCtxAutoLock() { cuvidCtxUnlock(m_ctx, 0); } +}; +#endif /* __cplusplus */ + +#endif // __CUDA_VIDEO_H__ diff --git a/third_party/Video_Codec_SDK/Interface/nvcuvid.h b/third_party/Video_Codec_SDK/Interface/nvcuvid.h new file mode 100644 index 000000000..d4691672c --- /dev/null +++ b/third_party/Video_Codec_SDK/Interface/nvcuvid.h @@ -0,0 +1,486 @@ +/* + * This copyright notice applies to this header file only: + * + * Copyright (c) 2010-2023 NVIDIA Corporation + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the software, and to permit persons to whom the + * software is furnished 
to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/********************************************************************************************************************/ +//! \file nvcuvid.h +//! NVDECODE API provides video decoding interface to NVIDIA GPU devices. +//! \date 2015-2022 +//! This file contains the interface constants, structure definitions and function prototypes. +/********************************************************************************************************************/ + +#if !defined(__NVCUVID_H__) +#define __NVCUVID_H__ + +#include "cuviddec.h" + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +#define MAX_CLOCK_TS 3 + +/***********************************************/ +//! +//! High-level helper APIs for video sources +//! +/***********************************************/ + +typedef void *CUvideosource; +typedef void *CUvideoparser; +typedef long long CUvideotimestamp; + +/************************************************************************/ +//! \enum cudaVideoState +//! Video source state enums +//! 
Used in cuvidSetVideoSourceState and cuvidGetVideoSourceState APIs +/************************************************************************/ +typedef enum { + cudaVideoState_Error = -1, /**< Error state (invalid source) */ + cudaVideoState_Stopped = 0, /**< Source is stopped (or reached end-of-stream) */ + cudaVideoState_Started = 1 /**< Source is running and delivering data */ +} cudaVideoState; + +/************************************************************************/ +//! \enum cudaAudioCodec +//! Audio compression enums +//! Used in CUAUDIOFORMAT structure +/************************************************************************/ +typedef enum { + cudaAudioCodec_MPEG1 = 0, /**< MPEG-1 Audio */ + cudaAudioCodec_MPEG2, /**< MPEG-2 Audio */ + cudaAudioCodec_MP3, /**< MPEG-1 Layer III Audio */ + cudaAudioCodec_AC3, /**< Dolby Digital (AC3) Audio */ + cudaAudioCodec_LPCM, /**< PCM Audio */ + cudaAudioCodec_AAC, /**< AAC Audio */ +} cudaAudioCodec; + +/************************************************************************/ +//! \ingroup STRUCTS +//! \struct HEVCTIMECODESET +//! Used to store Time code extracted from Time code SEI in HEVC codec +/************************************************************************/ +typedef struct _HEVCTIMECODESET { + unsigned int time_offset_value; + unsigned short n_frames; + unsigned char clock_timestamp_flag; + unsigned char units_field_based_flag; + unsigned char counting_type; + unsigned char full_timestamp_flag; + unsigned char discontinuity_flag; + unsigned char cnt_dropped_flag; + unsigned char seconds_value; + unsigned char minutes_value; + unsigned char hours_value; + unsigned char seconds_flag; + unsigned char minutes_flag; + unsigned char hours_flag; + unsigned char time_offset_length; + unsigned char reserved; +} HEVCTIMECODESET; + +/************************************************************************/ +//! \ingroup STRUCTS +//! \struct HEVCSEITIMECODE +//! 
Used to extract Time code SEI in HEVC codec +/************************************************************************/ +typedef struct _HEVCSEITIMECODE { + HEVCTIMECODESET time_code_set[MAX_CLOCK_TS]; + unsigned char num_clock_ts; +} HEVCSEITIMECODE; + +/**********************************************************************************/ +//! \ingroup STRUCTS +//! \struct CUSEIMESSAGE; +//! Used in CUVIDSEIMESSAGEINFO structure +/**********************************************************************************/ +typedef struct _CUSEIMESSAGE { + unsigned char sei_message_type; /**< OUT: SEI Message Type */ + unsigned char reserved[3]; + unsigned int sei_message_size; /**< OUT: SEI Message Size */ +} CUSEIMESSAGE; + +/************************************************************************************************/ +//! \ingroup STRUCTS +//! \struct CUVIDEOFORMAT +//! Video format +//! Used in cuvidGetSourceVideoFormat API +/************************************************************************************************/ +typedef struct { + cudaVideoCodec codec; /**< OUT: Compression format */ + /** + * OUT: frame rate = numerator / denominator (for example: 30000/1001) + */ + struct { + /**< OUT: frame rate numerator (0 = unspecified or variable frame rate) */ + unsigned int numerator; + /**< OUT: frame rate denominator (0 = unspecified or variable frame rate) */ + unsigned int denominator; + } frame_rate; + unsigned char progressive_sequence; /**< OUT: 0=interlaced, 1=progressive */ + unsigned char bit_depth_luma_minus8; /**< OUT: high bit depth luma. E.g, 2 for 10-bitdepth, 4 for 12-bitdepth */ + unsigned char bit_depth_chroma_minus8; /**< OUT: high bit depth chroma. E.g, 2 for 10-bitdepth, 4 for 12-bitdepth */ + unsigned char min_num_decode_surfaces; /**< OUT: Minimum number of decode surfaces to be allocated for correct + decoding. The client can send this value in ulNumDecodeSurfaces + (in CUVIDDECODECREATEINFO structure). 
+ This guarantees correct functionality and optimal video memory + usage but not necessarily the best performance, which depends on + the design of the overall application. The optimal number of + decode surfaces (in terms of performance and memory utilization) + should be decided by experimentation for each application, but it + cannot go below min_num_decode_surfaces. + If this value is used for ulNumDecodeSurfaces then it must be + returned to parser during sequence callback. */ + unsigned int coded_width; /**< OUT: coded frame width in pixels */ + unsigned int coded_height; /**< OUT: coded frame height in pixels */ + /** + * area of the frame that should be displayed + * typical example: + * coded_width = 1920, coded_height = 1088 + * display_area = { 0,0,1920,1080 } + */ + struct { + int left; /**< OUT: left position of display rect */ + int top; /**< OUT: top position of display rect */ + int right; /**< OUT: right position of display rect */ + int bottom; /**< OUT: bottom position of display rect */ + } display_area; + cudaVideoChromaFormat chroma_format; /**< OUT: Chroma format */ + unsigned int bitrate; /**< OUT: video bitrate (bps, 0=unknown) */ + /** + * OUT: Display Aspect Ratio = x:y (4:3, 16:9, etc) + */ + struct { + int x; + int y; + } display_aspect_ratio; + /** + * Video Signal Description + * Refer section E.2.1 (VUI parameters semantics) of H264 spec file + */ + struct { + unsigned char video_format : 3; /**< OUT: 0-Component, 1-PAL, 2-NTSC, 3-SECAM, 4-MAC, 5-Unspecified */ + unsigned char video_full_range_flag : 1; /**< OUT: indicates the black level and luma and chroma range */ + unsigned char reserved_zero_bits : 4; /**< Reserved bits */ + unsigned char color_primaries; /**< OUT: chromaticity coordinates of source primaries */ + unsigned char + transfer_characteristics; /**< OUT: opto-electronic transfer characteristic of the source picture */ + unsigned char matrix_coefficients; /**< OUT: used in deriving luma and chroma signals from RGB 
primaries */ + } video_signal_description; + unsigned int seqhdr_data_length; /**< OUT: Additional bytes following (CUVIDEOFORMATEX) */ +} CUVIDEOFORMAT; + +/****************************************************************/ +//! \ingroup STRUCTS +//! \struct CUVIDOPERATINGPOINTINFO +//! Operating point information of scalable bitstream +/****************************************************************/ +typedef struct { + cudaVideoCodec codec; + union { + struct { + unsigned char operating_points_cnt; + unsigned char reserved24_bits[3]; + unsigned short operating_points_idc[32]; + } av1; + unsigned char CodecReserved[1024]; + }; +} CUVIDOPERATINGPOINTINFO; + +/**********************************************************************************/ +//! \ingroup STRUCTS +//! \struct CUVIDSEIMESSAGEINFO +//! Used in cuvidParseVideoData API with PFNVIDSEIMSGCALLBACK pfnGetSEIMsg +/**********************************************************************************/ +typedef struct _CUVIDSEIMESSAGEINFO { + void *pSEIData; /**< OUT: SEI Message Data */ + CUSEIMESSAGE *pSEIMessage; /**< OUT: SEI Message Info */ + unsigned int sei_message_count; /**< OUT: SEI Message Count */ + unsigned int picIdx; /**< OUT: SEI Message Pic Index */ +} CUVIDSEIMESSAGEINFO; + +/****************************************************************/ +//! \ingroup STRUCTS +//! \struct CUVIDAV1SEQHDR +//! AV1 specific sequence header information +/****************************************************************/ +typedef struct { + unsigned int max_width; + unsigned int max_height; + unsigned char reserved[1016]; +} CUVIDAV1SEQHDR; + +/****************************************************************/ +//! \ingroup STRUCTS +//! \struct CUVIDEOFORMATEX +//! Video format including raw sequence header information +//! 
Used in cuvidGetSourceVideoFormat API +/****************************************************************/ +typedef struct { + CUVIDEOFORMAT format; /**< OUT: CUVIDEOFORMAT structure */ + union { + CUVIDAV1SEQHDR av1; + unsigned char raw_seqhdr_data[1024]; /**< OUT: Sequence header data */ + }; +} CUVIDEOFORMATEX; + +/****************************************************************/ +//! \ingroup STRUCTS +//! \struct CUAUDIOFORMAT +//! Audio formats +//! Used in cuvidGetSourceAudioFormat API +/****************************************************************/ +typedef struct { + cudaAudioCodec codec; /**< OUT: Compression format */ + unsigned int channels; /**< OUT: number of audio channels */ + unsigned int samplespersec; /**< OUT: sampling frequency */ + unsigned int bitrate; /**< OUT: For uncompressed, can also be used to determine bits per sample */ + unsigned int reserved1; /**< Reserved for future use */ + unsigned int reserved2; /**< Reserved for future use */ +} CUAUDIOFORMAT; + +/***************************************************************/ +//! \enum CUvideopacketflags +//! Data packet flags +//! Used in CUVIDSOURCEDATAPACKET structure +/***************************************************************/ +typedef enum { + CUVID_PKT_ENDOFSTREAM = 0x01, /**< Set when this is the last packet for this stream */ + CUVID_PKT_TIMESTAMP = 0x02, /**< Timestamp is valid */ + CUVID_PKT_DISCONTINUITY = 0x04, /**< Set when a discontinuity has to be signalled */ + CUVID_PKT_ENDOFPICTURE = 0x08, /**< Set when the packet contains exactly one frame or one field */ + CUVID_PKT_NOTIFY_EOS = 0x10, /**< If this flag is set along with CUVID_PKT_ENDOFSTREAM, an additional (dummy) + display callback will be invoked with null value of CUVIDPARSERDISPINFO which + should be interpreted as end of the stream. */ +} CUvideopacketflags; + +/*****************************************************************************/ +//! \ingroup STRUCTS +//! \struct CUVIDSOURCEDATAPACKET +//! 
Data Packet +//! Used in cuvidParseVideoData API +//! IN for cuvidParseVideoData +/*****************************************************************************/ +typedef struct _CUVIDSOURCEDATAPACKET { + unsigned long flags; /**< IN: Combination of CUVID_PKT_XXX flags */ + unsigned long payload_size; /**< IN: number of bytes in the payload (may be zero if EOS flag is set) */ + const unsigned char *payload; /**< IN: Pointer to packet payload data (may be NULL if EOS flag is set) */ + CUvideotimestamp timestamp; /**< IN: Presentation time stamp (10MHz clock), only valid if + CUVID_PKT_TIMESTAMP flag is set */ +} CUVIDSOURCEDATAPACKET; + +// Callback for packet delivery +typedef int(CUDAAPI *PFNVIDSOURCECALLBACK)(void *, CUVIDSOURCEDATAPACKET *); + +/**************************************************************************************************************************/ +//! \ingroup STRUCTS +//! \struct CUVIDSOURCEPARAMS +//! Describes parameters needed in cuvidCreateVideoSource API +//! NVDECODE API is intended for HW accelerated video decoding so CUvideosource doesn't have audio demuxer for all +//! supported containers. It's recommended to clients to use their own or third party demuxer if audio support is +//! needed. +/**************************************************************************************************************************/ +typedef struct _CUVIDSOURCEPARAMS { + unsigned int ulClockRate; /**< IN: Time stamp units in Hz (0=default=10000000Hz) */ + unsigned int bAnnexb : 1; /**< IN: AV1 annexB stream */ + unsigned int uReserved : 31; /**< Reserved for future use - set to zero */ + unsigned int uReserved1[6]; /**< Reserved for future use - set to zero */ + void *pUserData; /**< IN: User private data passed in to the data handlers */ + PFNVIDSOURCECALLBACK pfnVideoDataHandler; /**< IN: Called to deliver video packets */ + PFNVIDSOURCECALLBACK pfnAudioDataHandler; /**< IN: Called to deliver audio packets. 
*/ + void *pvReserved2[8]; /**< Reserved for future use - set to NULL */ +} CUVIDSOURCEPARAMS; + +/**********************************************/ +//! \ingroup ENUMS +//! \enum CUvideosourceformat_flags +//! CUvideosourceformat_flags +//! Used in cuvidGetSourceVideoFormat API +/**********************************************/ +typedef enum { + CUVID_FMT_EXTFORMATINFO = 0x100 /**< Return extended format structure (CUVIDEOFORMATEX) */ +} CUvideosourceformat_flags; + +#if !defined(__APPLE__) +/***************************************************************************************************************************/ +//! \ingroup FUNCTS +//! \fn CUresult CUDAAPI cuvidCreateVideoSource(CUvideosource *pObj, const char *pszFileName, CUVIDSOURCEPARAMS *pParams) +//! Create CUvideosource object. CUvideosource spawns demultiplexer thread that provides two callbacks: +//! pfnVideoDataHandler() and pfnAudioDataHandler() +//! NVDECODE API is intended for HW accelerated video decoding so CUvideosource doesn't have audio demuxer for all +//! supported containers. It's recommended to clients to use their own or third party demuxer if audio support is +//! needed. +/***************************************************************************************************************************/ +CUresult CUDAAPI cuvidCreateVideoSource(CUvideosource *pObj, const char *pszFileName, CUVIDSOURCEPARAMS *pParams); + +/***************************************************************************************************************************/ +//! \ingroup FUNCTS +//! \fn CUresult CUDAAPI cuvidCreateVideoSourceW(CUvideosource *pObj, const wchar_t *pwszFileName, CUVIDSOURCEPARAMS *pParams) +//! 
Create video source +/***************************************************************************************************************************/ +CUresult CUDAAPI cuvidCreateVideoSourceW(CUvideosource *pObj, const wchar_t *pwszFileName, CUVIDSOURCEPARAMS *pParams); + +/********************************************************************/ +//! \ingroup FUNCTS +//! \fn CUresult CUDAAPI cuvidDestroyVideoSource(CUvideosource obj) +//! Destroy video source +/********************************************************************/ +CUresult CUDAAPI cuvidDestroyVideoSource(CUvideosource obj); + +/******************************************************************************************/ +//! \ingroup FUNCTS +//! \fn CUresult CUDAAPI cuvidSetVideoSourceState(CUvideosource obj, cudaVideoState state) +//! Set video source state to: +//! cudaVideoState_Started - to signal the source to run and deliver data +//! cudaVideoState_Stopped - to stop the source from delivering the data +//! cudaVideoState_Error - invalid source +/******************************************************************************************/ +CUresult CUDAAPI cuvidSetVideoSourceState(CUvideosource obj, cudaVideoState state); + +/******************************************************************************************/ +//! \ingroup FUNCTS +//! \fn cudaVideoState CUDAAPI cuvidGetVideoSourceState(CUvideosource obj) +//! Get video source state +//! Returns: +//! cudaVideoState_Started - if Source is running and delivering data +//! cudaVideoState_Stopped - if Source is stopped or reached end-of-stream +//! cudaVideoState_Error - if Source is in error state +/******************************************************************************************/ +cudaVideoState CUDAAPI cuvidGetVideoSourceState(CUvideosource obj); + +/******************************************************************************************************************/ +//! \ingroup FUNCTS +//! 
\fn CUresult CUDAAPI cuvidGetSourceVideoFormat(CUvideosource obj, CUVIDEOFORMAT *pvidfmt, unsigned int flags) +//! Gets video source format in pvidfmt, flags is set to combination of CUvideosourceformat_flags as per requirement +/******************************************************************************************************************/ +CUresult CUDAAPI cuvidGetSourceVideoFormat(CUvideosource obj, CUVIDEOFORMAT *pvidfmt, unsigned int flags); + +/**************************************************************************************************************************/ +//! \ingroup FUNCTS +//! \fn CUresult CUDAAPI cuvidGetSourceAudioFormat(CUvideosource obj, CUAUDIOFORMAT *paudfmt, unsigned int flags) +//! Get audio source format +//! NVDECODE API is intended for HW accelerated video decoding so CUvideosource doesn't have audio demuxer for all +//! supported containers. It's recommended to clients to use their own or third party demuxer if audio support is +//! needed. +/**************************************************************************************************************************/ +CUresult CUDAAPI cuvidGetSourceAudioFormat(CUvideosource obj, CUAUDIOFORMAT *paudfmt, unsigned int flags); + +#endif +/**********************************************************************************/ +//! \ingroup STRUCTS +//! \struct CUVIDPARSERDISPINFO +//! 
Used in cuvidParseVideoData API with PFNVIDDISPLAYCALLBACK pfnDisplayPicture +/**********************************************************************************/ +typedef struct _CUVIDPARSERDISPINFO { + int picture_index; /**< OUT: Index of the current picture */ + int progressive_frame; /**< OUT: 1 if progressive frame; 0 otherwise */ + int top_field_first; /**< OUT: 1 if top field is displayed first; 0 otherwise */ + int repeat_first_field; /**< OUT: Number of additional fields (1=ivtc, 2=frame doubling, 4=frame tripling, + -1=unpaired field) */ + CUvideotimestamp timestamp; /**< OUT: Presentation time stamp */ +} CUVIDPARSERDISPINFO; + +/***********************************************************************************************************************/ +//! Parser callbacks +//! The parser will call these synchronously from within cuvidParseVideoData(), whenever there is sequence change or a +//! picture is ready to be decoded and/or displayed. First argument in functions is "void *pUserData" member of +//! structure CUVIDSOURCEPARAMS Return values from these callbacks are interpreted as below. If the callbacks return +//! failure, it will be propagated by cuvidParseVideoData() to the application. Parser picks default operating point as +//! 0 and outputAllLayers flag as 0 if PFNVIDOPPOINTCALLBACK is not set or return value is -1 or invalid operating +//! point. PFNVIDSEQUENCECALLBACK : 0: fail, 1: succeeded, > 1: override dpb size of parser (set by +//! CUVIDPARSERPARAMS::ulMaxNumDecodeSurfaces while creating parser) PFNVIDDECODECALLBACK : 0: fail, >=1: succeeded +//! PFNVIDDISPLAYCALLBACK : 0: fail, >=1: succeeded +//! PFNVIDOPPOINTCALLBACK : <0: fail, >=0: succeeded (bit 0-9: OperatingPoint, bit 10-10: outputAllLayers, bit 11-30: +//! 
reserved) PFNVIDSEIMSGCALLBACK : 0: fail, >=1: succeeded +/***********************************************************************************************************************/ +typedef int(CUDAAPI *PFNVIDSEQUENCECALLBACK)(void *, CUVIDEOFORMAT *); +typedef int(CUDAAPI *PFNVIDDECODECALLBACK)(void *, CUVIDPICPARAMS *); +typedef int(CUDAAPI *PFNVIDDISPLAYCALLBACK)(void *, CUVIDPARSERDISPINFO *); +typedef int(CUDAAPI *PFNVIDOPPOINTCALLBACK)(void *, CUVIDOPERATINGPOINTINFO *); +typedef int(CUDAAPI *PFNVIDSEIMSGCALLBACK)(void *, CUVIDSEIMESSAGEINFO *); + +/**************************************/ +//! \ingroup STRUCTS +//! \struct CUVIDPARSERPARAMS +//! Used in cuvidCreateVideoParser API +/**************************************/ +typedef struct _CUVIDPARSERPARAMS { + cudaVideoCodec CodecType; /**< IN: cudaVideoCodec_XXX */ + unsigned int ulMaxNumDecodeSurfaces; /**< IN: Max # of decode surfaces (parser will cycle through these) */ + unsigned int ulClockRate; /**< IN: Timestamp units in Hz (0=default=10000000Hz) */ + unsigned int ulErrorThreshold; /**< IN: % Error threshold (0-100) for calling pfnDecodePicture (100=always + IN: call pfnDecodePicture even if picture bitstream is fully corrupted) */ + unsigned int ulMaxDisplayDelay; /**< IN: Max display queue delay (improves pipelining of decode with display) + 0=no delay (recommended values: 2..4) */ + unsigned int bAnnexb : 1; /**< IN: AV1 annexB stream */ + unsigned int uReserved : 31; /**< Reserved for future use - set to zero */ + unsigned int uReserved1[4]; /**< IN: Reserved for future use - set to 0 */ + void *pUserData; /**< IN: User data for callbacks */ + PFNVIDSEQUENCECALLBACK + pfnSequenceCallback; /**< IN: Called before decoding frames and/or whenever there is a fmt change */ + PFNVIDDECODECALLBACK pfnDecodePicture; /**< IN: Called when a picture is ready to be decoded (decode order) */ + PFNVIDDISPLAYCALLBACK + pfnDisplayPicture; /**< IN: Called whenever a picture is ready to be displayed (display order) */ 
+ PFNVIDOPPOINTCALLBACK pfnGetOperatingPoint; /**< IN: Called from AV1 sequence header to get operating point of a AV1 + scalable bitstream */ + PFNVIDSEIMSGCALLBACK pfnGetSEIMsg; /**< IN: Called when all SEI messages are parsed for particular frame */ + void *pvReserved2[5]; /**< Reserved for future use - set to NULL */ + CUVIDEOFORMATEX *pExtVideoInfo; /**< IN: [Optional] sequence header data from system layer */ +} CUVIDPARSERPARAMS; + +/************************************************************************************************/ +//! \ingroup FUNCTS +//! \fn CUresult CUDAAPI cuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams) +//! Create video parser object and initialize +/************************************************************************************************/ +CUresult CUDAAPI cuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams); + +/************************************************************************************************/ +//! \ingroup FUNCTS +//! \fn CUresult CUDAAPI cuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket) +//! Parse the video data from source data packet in pPacket +//! Extracts parameter sets like SPS, PPS, bitstream etc. from pPacket and +//! calls back pfnDecodePicture with CUVIDPICPARAMS data for kicking of HW decoding +//! calls back pfnSequenceCallback with CUVIDEOFORMAT data for initial sequence header or when +//! the decoder encounters a video format change +//! calls back pfnDisplayPicture with CUVIDPARSERDISPINFO data to display a video frame +/************************************************************************************************/ +CUresult CUDAAPI cuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket); + +/************************************************************************************************/ +//! \ingroup FUNCTS +//! \fn CUresult CUDAAPI cuvidDestroyVideoParser(CUvideoparser obj) +//! 
Destroy the video parser +/************************************************************************************************/ +CUresult CUDAAPI cuvidDestroyVideoParser(CUvideoparser obj); + +/**********************************************************************************************/ + +#if defined(__cplusplus) +} +#endif /* __cplusplus */ + +#endif // __NVCUVID_H__ diff --git a/third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/libnvcuvid.so b/third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/libnvcuvid.so new file mode 100644 index 0000000000000000000000000000000000000000..f08a209545e076a835d11dcc24bd20d22088b1c5 GIT binary patch literal 3528 zcmd6qPiWLf6vt;1HBqCksam5^qWA|QEDE9$7Rf&~2kXBy7D8bA%kD49*6eQh{boZF zQhO;S^&sNGQ+g}n(MzF+P(t+(4}u2|J?z!uNvMbi8%@9OH}7pWlk5~hN*}zL+0XaQ z%$u3tZ+G^)a~IFo#9~UUR-IM!>CFa1+F4lG%nelS>Wo^4vR>6ivFn$mRjq|h21%_{ zO9W^RnnL)(Cc~wkhekFZl~lAuts0ac)2}}2rPS3II>Sq1;;y9Mc7*MRH^M)M$KiY7 zd*Ec-;2WcsDfXeFKD*$%;k5i__*S^fXkF{zk|XV@IdbVVDZF z7ige-MvZ0%^W)Lek_Zcl6L`-0dVYNIlPhJM7M&q8l-=yT9tzpuW3CFp-R^hM~uIrL_H2mkEQlhAKD z^a1E&4t*N>b%#C+J?YTrpdWPTCFu1IeG&Sr#p?UljEl|lD!SaSS}_J0-KEG{u%~2l zNBxxrd2e2_&ev4c#{K19MR&OW#1i2MeoM4Bs1hSyIHD4n$$U7OGqny>Li-b1CH#@p zQ1l&4x|hqQKT&eDUfLsGetF+y|LFcjCc4~BWuNKJOJj&G`zrM_7U*tDP3bTDOZQaK z&{T3xQp>rJM#jfxKA=3nOwLvA5mMhPN1BG@E%+$+m;CSWeM9Oqlw5^L{(|Uth31wp i`5zq3C&*|$qD6p<8GaDkDk0kT`u_pVP7(wF literal 0 HcmV?d00001 diff --git a/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.cpp b/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.cpp new file mode 100644 index 000000000..0fd61f447 --- /dev/null +++ b/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.cpp @@ -0,0 +1,709 @@ +/* + * This copyright notice applies to this header file only: + * + * Copyright (c) 2010-2023 NVIDIA Corporation + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * 
restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the software, and to permit persons to whom the + * software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include + +#include "../../../Interface/nvcuvid.h" +#include "NvDecoder/NvDecoder.h" + +std::map NvDecoder::sessionOverHead = {{0, 0}, {1, 0}}; + +/** + * @brief This function is used to get codec string from codec id + */ +const char *NvDecoder::GetCodecString(cudaVideoCodec eCodec) { return GetVideoCodecString(eCodec); } + +/* Called when the parser encounters sequence header for AV1 SVC content + * return value interpretation: + * < 0 : fail, >=0: succeeded (bit 0-9: currOperatingPoint, bit 10-10: bDispAllLayer, bit 11-30: reserved, must be + * set 0) + */ +int NvDecoder::GetOperatingPoint(CUVIDOPERATINGPOINTINFO *pOPInfo) { + if (pOPInfo->codec == cudaVideoCodec_AV1) { + if (pOPInfo->av1.operating_points_cnt > 1) { + // clip has SVC enabled + if (m_nOperatingPoint >= pOPInfo->av1.operating_points_cnt) + m_nOperatingPoint = 0; + + printf("AV1 SVC clip: operating point count %d ", pOPInfo->av1.operating_points_cnt); + printf("Selected operating point: %d, IDC 0x%x bOutputAllLayers %d\n", m_nOperatingPoint, + 
pOPInfo->av1.operating_points_idc[m_nOperatingPoint], m_bDispAllLayers); + return (m_nOperatingPoint | (m_bDispAllLayers << 10)); + } + } + return -1; +} + +/* Return value from HandleVideoSequence() are interpreted as : + * 0: fail, 1: succeeded, > 1: override dpb size of parser (set by CUVIDPARSERPARAMS::ulMaxNumDecodeSurfaces while + * creating parser) + */ +int NvDecoder::HandleVideoSequence(CUVIDEOFORMAT *pVideoFormat) { + START_TIMER + m_videoInfo.str(""); + m_videoInfo.clear(); + m_videoInfo << "Video Input Information" << std::endl + << "\tCodec : " << GetVideoCodecString(pVideoFormat->codec) << std::endl + << "\tFrame rate : " << pVideoFormat->frame_rate.numerator << "/" + << pVideoFormat->frame_rate.denominator << " = " + << 1.0 * pVideoFormat->frame_rate.numerator / pVideoFormat->frame_rate.denominator << " fps" + << std::endl + << "\tSequence : " << (pVideoFormat->progressive_sequence ? "Progressive" : "Interlaced") + << std::endl + << "\tCoded size : [" << pVideoFormat->coded_width << ", " << pVideoFormat->coded_height << "]" + << std::endl + << "\tDisplay area : [" << pVideoFormat->display_area.left << ", " << pVideoFormat->display_area.top + << ", " << pVideoFormat->display_area.right << ", " << pVideoFormat->display_area.bottom << "]" + << std::endl + << "\tChroma : " << GetVideoChromaFormatString(pVideoFormat->chroma_format) << std::endl + << "\tBit depth : " << pVideoFormat->bit_depth_luma_minus8 + 8; + m_videoInfo << std::endl; + + int nDecodeSurface = pVideoFormat->min_num_decode_surfaces; + + CUVIDDECODECAPS decodecaps; + memset(&decodecaps, 0, sizeof(decodecaps)); + + decodecaps.eCodecType = pVideoFormat->codec; + decodecaps.eChromaFormat = pVideoFormat->chroma_format; + decodecaps.nBitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8; + + CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext)); + NVDEC_API_CALL(cuvidGetDecoderCaps(&decodecaps)); + CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL)); + + if (!decodecaps.bIsSupported) { + NVDEC_THROW_ERROR("Codec 
not supported on this GPU", CUDA_ERROR_NOT_SUPPORTED); + return nDecodeSurface; + } + + if ((pVideoFormat->coded_width > decodecaps.nMaxWidth) || (pVideoFormat->coded_height > decodecaps.nMaxHeight)) { + + std::ostringstream errorString; + errorString << std::endl + << "Resolution : " << pVideoFormat->coded_width << "x" << pVideoFormat->coded_height + << std::endl + << "Max Supported (wxh) : " << decodecaps.nMaxWidth << "x" << decodecaps.nMaxHeight << std::endl + << "Resolution not supported on this GPU"; + + const std::string cErr = errorString.str(); + NVDEC_THROW_ERROR(cErr, CUDA_ERROR_NOT_SUPPORTED); + return nDecodeSurface; + } + + if ((pVideoFormat->coded_width >> 4) * (pVideoFormat->coded_height >> 4) > decodecaps.nMaxMBCount) { + + std::ostringstream errorString; + errorString << std::endl + << "MBCount : " << (pVideoFormat->coded_width >> 4) * (pVideoFormat->coded_height >> 4) + << std::endl + << "Max Supported mbcnt : " << decodecaps.nMaxMBCount << std::endl + << "MBCount not supported on this GPU"; + + const std::string cErr = errorString.str(); + NVDEC_THROW_ERROR(cErr, CUDA_ERROR_NOT_SUPPORTED); + return nDecodeSurface; + } + + if (m_nWidth && m_nLumaHeight && m_nChromaHeight) { + + // cuvidCreateDecoder() has been called before, and now there's possible config change + return ReconfigureDecoder(pVideoFormat); + } + + // eCodec has been set in the constructor (for parser). Here it's set again for potential correction + m_eCodec = pVideoFormat->codec; + m_eChromaFormat = pVideoFormat->chroma_format; + m_nBitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8; + m_nBPP = m_nBitDepthMinus8 > 0 ? 2 : 1; + + // Set the output surface format same as chroma format + if (m_eChromaFormat == cudaVideoChromaFormat_420 || cudaVideoChromaFormat_Monochrome) + m_eOutputFormat = + pVideoFormat->bit_depth_luma_minus8 ? 
cudaVideoSurfaceFormat_P016 : cudaVideoSurfaceFormat_NV12; + else if (m_eChromaFormat == cudaVideoChromaFormat_444) + m_eOutputFormat = + pVideoFormat->bit_depth_luma_minus8 ? cudaVideoSurfaceFormat_YUV444_16Bit : cudaVideoSurfaceFormat_YUV444; + else if (m_eChromaFormat == cudaVideoChromaFormat_422) + m_eOutputFormat = cudaVideoSurfaceFormat_NV12; // no 4:2:2 output format supported yet so make 420 default + + // Check if output format supported. If not, check falback options + if (!(decodecaps.nOutputFormatMask & (1 << m_eOutputFormat))) { + if (decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12)) + m_eOutputFormat = cudaVideoSurfaceFormat_NV12; + else if (decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_P016)) + m_eOutputFormat = cudaVideoSurfaceFormat_P016; + else if (decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444)) + m_eOutputFormat = cudaVideoSurfaceFormat_YUV444; + else if (decodecaps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444_16Bit)) + m_eOutputFormat = cudaVideoSurfaceFormat_YUV444_16Bit; + else + NVDEC_THROW_ERROR("No supported output format found", CUDA_ERROR_NOT_SUPPORTED); + } + m_videoFormat = *pVideoFormat; + + CUVIDDECODECREATEINFO videoDecodeCreateInfo = {0}; + videoDecodeCreateInfo.CodecType = pVideoFormat->codec; + videoDecodeCreateInfo.ChromaFormat = pVideoFormat->chroma_format; + videoDecodeCreateInfo.OutputFormat = m_eOutputFormat; + videoDecodeCreateInfo.bitDepthMinus8 = pVideoFormat->bit_depth_luma_minus8; + if (pVideoFormat->progressive_sequence) + videoDecodeCreateInfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Weave; + else + videoDecodeCreateInfo.DeinterlaceMode = cudaVideoDeinterlaceMode_Adaptive; + videoDecodeCreateInfo.ulNumOutputSurfaces = 2; + // With PreferCUVID, JPEG is still decoded by CUDA while video is decoded by NVDEC hardware + videoDecodeCreateInfo.ulCreationFlags = cudaVideoCreate_PreferCUVID; + videoDecodeCreateInfo.ulNumDecodeSurfaces = nDecodeSurface; + 
videoDecodeCreateInfo.vidLock = m_ctxLock; + videoDecodeCreateInfo.ulWidth = pVideoFormat->coded_width; + videoDecodeCreateInfo.ulHeight = pVideoFormat->coded_height; + // AV1 has max width/height of sequence in sequence header + if (pVideoFormat->codec == cudaVideoCodec_AV1 && pVideoFormat->seqhdr_data_length > 0) { + // dont overwrite if it is already set from cmdline or reconfig.txt + if (!(m_nMaxWidth > pVideoFormat->coded_width || m_nMaxHeight > pVideoFormat->coded_height)) { + CUVIDEOFORMATEX *vidFormatEx = (CUVIDEOFORMATEX *)pVideoFormat; + m_nMaxWidth = vidFormatEx->av1.max_width; + m_nMaxHeight = vidFormatEx->av1.max_height; + } + } + if (m_nMaxWidth < (int)pVideoFormat->coded_width) + m_nMaxWidth = pVideoFormat->coded_width; + if (m_nMaxHeight < (int)pVideoFormat->coded_height) + m_nMaxHeight = pVideoFormat->coded_height; + videoDecodeCreateInfo.ulMaxWidth = m_nMaxWidth; + videoDecodeCreateInfo.ulMaxHeight = m_nMaxHeight; + + if (!(m_cropRect.r && m_cropRect.b) && !(m_resizeDim.w && m_resizeDim.h)) { + m_nWidth = pVideoFormat->display_area.right - pVideoFormat->display_area.left; + m_nLumaHeight = pVideoFormat->display_area.bottom - pVideoFormat->display_area.top; + videoDecodeCreateInfo.ulTargetWidth = pVideoFormat->coded_width; + videoDecodeCreateInfo.ulTargetHeight = pVideoFormat->coded_height; + } else { + if (m_resizeDim.w && m_resizeDim.h) { + videoDecodeCreateInfo.display_area.left = pVideoFormat->display_area.left; + videoDecodeCreateInfo.display_area.top = pVideoFormat->display_area.top; + videoDecodeCreateInfo.display_area.right = pVideoFormat->display_area.right; + videoDecodeCreateInfo.display_area.bottom = pVideoFormat->display_area.bottom; + m_nWidth = m_resizeDim.w; + m_nLumaHeight = m_resizeDim.h; + } + + if (m_cropRect.r && m_cropRect.b) { + videoDecodeCreateInfo.display_area.left = m_cropRect.l; + videoDecodeCreateInfo.display_area.top = m_cropRect.t; + videoDecodeCreateInfo.display_area.right = m_cropRect.r; + 
videoDecodeCreateInfo.display_area.bottom = m_cropRect.b; + m_nWidth = m_cropRect.r - m_cropRect.l; + m_nLumaHeight = m_cropRect.b - m_cropRect.t; + } + videoDecodeCreateInfo.ulTargetWidth = m_nWidth; + videoDecodeCreateInfo.ulTargetHeight = m_nLumaHeight; + } + + m_nChromaHeight = (int)(ceil(m_nLumaHeight * GetChromaHeightFactor(m_eOutputFormat))); + m_nNumChromaPlanes = GetChromaPlaneCount(m_eOutputFormat); + m_nSurfaceHeight = videoDecodeCreateInfo.ulTargetHeight; + m_nSurfaceWidth = videoDecodeCreateInfo.ulTargetWidth; + m_displayRect.b = videoDecodeCreateInfo.display_area.bottom; + m_displayRect.t = videoDecodeCreateInfo.display_area.top; + m_displayRect.l = videoDecodeCreateInfo.display_area.left; + m_displayRect.r = videoDecodeCreateInfo.display_area.right; + + m_videoInfo << "Video Decoding Params:" << std::endl + << "\tNum Surfaces : " << videoDecodeCreateInfo.ulNumDecodeSurfaces << std::endl + << "\tCrop : [" << videoDecodeCreateInfo.display_area.left << ", " + << videoDecodeCreateInfo.display_area.top << ", " << videoDecodeCreateInfo.display_area.right << ", " + << videoDecodeCreateInfo.display_area.bottom << "]" << std::endl + << "\tResize : " << videoDecodeCreateInfo.ulTargetWidth << "x" + << videoDecodeCreateInfo.ulTargetHeight << std::endl + << "\tDeinterlace : " + << std::vector{"Weave", "Bob", "Adaptive"}[videoDecodeCreateInfo.DeinterlaceMode]; + m_videoInfo << std::endl; + + CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext)); + NVDEC_API_CALL(cuvidCreateDecoder(&m_hDecoder, &videoDecodeCreateInfo)); + CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL)); + STOP_TIMER("Session Initialization Time: "); + NvDecoder::addDecoderSessionOverHead(getDecoderSessionID(), elapsedTime); + return nDecodeSurface; +} + +int NvDecoder::ReconfigureDecoder(CUVIDEOFORMAT *pVideoFormat) { + if (pVideoFormat->bit_depth_luma_minus8 != m_videoFormat.bit_depth_luma_minus8 || + pVideoFormat->bit_depth_chroma_minus8 != m_videoFormat.bit_depth_chroma_minus8) { + + 
NVDEC_THROW_ERROR("Reconfigure Not supported for bit depth change", CUDA_ERROR_NOT_SUPPORTED); + } + + if (pVideoFormat->chroma_format != m_videoFormat.chroma_format) { + + NVDEC_THROW_ERROR("Reconfigure Not supported for chroma format change", CUDA_ERROR_NOT_SUPPORTED); + } + + bool bDecodeResChange = !(pVideoFormat->coded_width == m_videoFormat.coded_width && + pVideoFormat->coded_height == m_videoFormat.coded_height); + bool bDisplayRectChange = !(pVideoFormat->display_area.bottom == m_videoFormat.display_area.bottom && + pVideoFormat->display_area.top == m_videoFormat.display_area.top && + pVideoFormat->display_area.left == m_videoFormat.display_area.left && + pVideoFormat->display_area.right == m_videoFormat.display_area.right); + + int nDecodeSurface = pVideoFormat->min_num_decode_surfaces; + + if ((pVideoFormat->coded_width > m_nMaxWidth) || (pVideoFormat->coded_height > m_nMaxHeight)) { + // For VP9, let driver handle the change if new width/height > maxwidth/maxheight + if ((m_eCodec != cudaVideoCodec_VP9) || m_bReconfigExternal) { + NVDEC_THROW_ERROR("Reconfigure Not supported when width/height > maxwidth/maxheight", + CUDA_ERROR_NOT_SUPPORTED); + } + return 1; + } + + if (!bDecodeResChange && !m_bReconfigExtPPChange) { + // if the coded_width/coded_height hasn't changed but display resolution has changed, then need to update + // width/height for correct output without cropping. Example : 1920x1080 vs 1920x1088 + if (bDisplayRectChange) { + m_nWidth = pVideoFormat->display_area.right - pVideoFormat->display_area.left; + m_nLumaHeight = pVideoFormat->display_area.bottom - pVideoFormat->display_area.top; + m_nChromaHeight = (int)ceil(m_nLumaHeight * GetChromaHeightFactor(m_eOutputFormat)); + m_nNumChromaPlanes = GetChromaPlaneCount(m_eOutputFormat); + } + + // no need for reconfigureDecoder(). 
Just return + return 1; + } + + CUVIDRECONFIGUREDECODERINFO reconfigParams = {0}; + + reconfigParams.ulWidth = m_videoFormat.coded_width = pVideoFormat->coded_width; + reconfigParams.ulHeight = m_videoFormat.coded_height = pVideoFormat->coded_height; + + // Dont change display rect and get scaled output from decoder. This will help display app to present apps smoothly + reconfigParams.display_area.bottom = m_displayRect.b; + reconfigParams.display_area.top = m_displayRect.t; + reconfigParams.display_area.left = m_displayRect.l; + reconfigParams.display_area.right = m_displayRect.r; + reconfigParams.ulTargetWidth = m_nSurfaceWidth; + reconfigParams.ulTargetHeight = m_nSurfaceHeight; + + // If external reconfigure is called along with resolution change even if post processing params is not changed, + // do full reconfigure params update + if ((m_bReconfigExternal && bDecodeResChange) || m_bReconfigExtPPChange) { + // update display rect and target resolution if requested explicitly + m_bReconfigExternal = false; + m_bReconfigExtPPChange = false; + m_videoFormat = *pVideoFormat; + if (!(m_cropRect.r && m_cropRect.b) && !(m_resizeDim.w && m_resizeDim.h)) { + m_nWidth = pVideoFormat->display_area.right - pVideoFormat->display_area.left; + m_nLumaHeight = pVideoFormat->display_area.bottom - pVideoFormat->display_area.top; + reconfigParams.ulTargetWidth = pVideoFormat->coded_width; + reconfigParams.ulTargetHeight = pVideoFormat->coded_height; + } else { + if (m_resizeDim.w && m_resizeDim.h) { + reconfigParams.display_area.left = pVideoFormat->display_area.left; + reconfigParams.display_area.top = pVideoFormat->display_area.top; + reconfigParams.display_area.right = pVideoFormat->display_area.right; + reconfigParams.display_area.bottom = pVideoFormat->display_area.bottom; + m_nWidth = m_resizeDim.w; + m_nLumaHeight = m_resizeDim.h; + } + + if (m_cropRect.r && m_cropRect.b) { + reconfigParams.display_area.left = m_cropRect.l; + reconfigParams.display_area.top = 
m_cropRect.t; + reconfigParams.display_area.right = m_cropRect.r; + reconfigParams.display_area.bottom = m_cropRect.b; + m_nWidth = m_cropRect.r - m_cropRect.l; + m_nLumaHeight = m_cropRect.b - m_cropRect.t; + } + reconfigParams.ulTargetWidth = m_nWidth; + reconfigParams.ulTargetHeight = m_nLumaHeight; + } + + m_nChromaHeight = (int)ceil(m_nLumaHeight * GetChromaHeightFactor(m_eOutputFormat)); + m_nNumChromaPlanes = GetChromaPlaneCount(m_eOutputFormat); + m_nSurfaceHeight = reconfigParams.ulTargetHeight; + m_nSurfaceWidth = reconfigParams.ulTargetWidth; + m_displayRect.b = reconfigParams.display_area.bottom; + m_displayRect.t = reconfigParams.display_area.top; + m_displayRect.l = reconfigParams.display_area.left; + m_displayRect.r = reconfigParams.display_area.right; + } + + reconfigParams.ulNumDecodeSurfaces = nDecodeSurface; + + START_TIMER + CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext)); + NVDEC_API_CALL(cuvidReconfigureDecoder(m_hDecoder, &reconfigParams)); + CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL)); + STOP_TIMER("Session Reconfigure Time: "); + + return nDecodeSurface; +} + +int NvDecoder::setReconfigParams(const Rect *pCropRect, const Dim *pResizeDim) { + m_bReconfigExternal = true; + m_bReconfigExtPPChange = false; + if (pCropRect) { + if (!((pCropRect->t == m_cropRect.t) && (pCropRect->l == m_cropRect.l) && (pCropRect->b == m_cropRect.b) && + (pCropRect->r == m_cropRect.r))) { + m_bReconfigExtPPChange = true; + m_cropRect = *pCropRect; + } + } + if (pResizeDim) { + if (!((pResizeDim->w == m_resizeDim.w) && (pResizeDim->h == m_resizeDim.h))) { + m_bReconfigExtPPChange = true; + m_resizeDim = *pResizeDim; + } + } + + // Clear existing output buffers of different size + uint8_t *pFrame = NULL; + while (!m_vpFrame.empty()) { + pFrame = m_vpFrame.back(); + m_vpFrame.pop_back(); + if (m_bUseDeviceFrame) { + CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext)); + CUDA_DRVAPI_CALL(cuMemFree((CUdeviceptr)pFrame)); + CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL)); + } else { + 
delete pFrame; + } + } + + return 1; +} + +/* Return value from HandlePictureDecode() are interpreted as: + * 0: fail, >=1: succeeded + */ +int NvDecoder::HandlePictureDecode(CUVIDPICPARAMS *pPicParams) { + if (!m_hDecoder) { + NVDEC_THROW_ERROR("Decoder not initialized.", CUDA_ERROR_NOT_INITIALIZED); + return false; + } + m_nPicNumInDecodeOrder[pPicParams->CurrPicIdx] = m_nDecodePicCnt++; + CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext)); + NVDEC_API_CALL(cuvidDecodePicture(m_hDecoder, pPicParams)); + if (m_bForce_zero_latency && ((!pPicParams->field_pic_flag) || (pPicParams->second_field))) { + CUVIDPARSERDISPINFO dispInfo; + memset(&dispInfo, 0, sizeof(dispInfo)); + dispInfo.picture_index = pPicParams->CurrPicIdx; + dispInfo.progressive_frame = !pPicParams->field_pic_flag; + dispInfo.top_field_first = pPicParams->bottom_field_flag ^ 1; + HandlePictureDisplay(&dispInfo); + } + CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL)); + return 1; +} + +/* Return value from HandlePictureDisplay() are interpreted as: + * 0: fail, >=1: succeeded + */ +int NvDecoder::HandlePictureDisplay(CUVIDPARSERDISPINFO *pDispInfo) { + CUVIDPROCPARAMS videoProcessingParameters = {}; + videoProcessingParameters.progressive_frame = pDispInfo->progressive_frame; + videoProcessingParameters.second_field = pDispInfo->repeat_first_field + 1; + videoProcessingParameters.top_field_first = pDispInfo->top_field_first; + videoProcessingParameters.unpaired_field = pDispInfo->repeat_first_field < 0; + videoProcessingParameters.output_stream = m_cuvidStream; + + if (m_bExtractSEIMessage) { + if (m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIData) { + // Write SEI Message + uint8_t *seiBuffer = (uint8_t *)(m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIData); + uint32_t seiNumMessages = m_SEIMessagesDisplayOrder[pDispInfo->picture_index].sei_message_count; + CUSEIMESSAGE *seiMessagesInfo = m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIMessage; + if (m_fpSEI) { + for (uint32_t i = 0; 
i < seiNumMessages; i++) { + if (m_eCodec == cudaVideoCodec_H264 || cudaVideoCodec_H264_SVC || cudaVideoCodec_H264_MVC || + cudaVideoCodec_HEVC) { + switch (seiMessagesInfo[i].sei_message_type) { + case SEI_TYPE_TIME_CODE: { + HEVCSEITIMECODE *timecode = (HEVCSEITIMECODE *)seiBuffer; + fwrite(timecode, sizeof(HEVCSEITIMECODE), 1, m_fpSEI); + } break; + case SEI_TYPE_USER_DATA_UNREGISTERED: { + fwrite(seiBuffer, seiMessagesInfo[i].sei_message_size, 1, m_fpSEI); + } break; + } + } + if (m_eCodec == cudaVideoCodec_AV1) { + fwrite(seiBuffer, seiMessagesInfo[i].sei_message_size, 1, m_fpSEI); + } + seiBuffer += seiMessagesInfo[i].sei_message_size; + } + } + free(m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIData); + free(m_SEIMessagesDisplayOrder[pDispInfo->picture_index].pSEIMessage); + } + } + + CUdeviceptr dpSrcFrame = 0; + unsigned int nSrcPitch = 0; + CUDA_DRVAPI_CALL(cuCtxPushCurrent(m_cuContext)); + NVDEC_API_CALL( + cuvidMapVideoFrame(m_hDecoder, pDispInfo->picture_index, &dpSrcFrame, &nSrcPitch, &videoProcessingParameters)); + + CUVIDGETDECODESTATUS DecodeStatus; + memset(&DecodeStatus, 0, sizeof(DecodeStatus)); + CUresult result = cuvidGetDecodeStatus(m_hDecoder, pDispInfo->picture_index, &DecodeStatus); + if (result == CUDA_SUCCESS && (DecodeStatus.decodeStatus == cuvidDecodeStatus_Error || + DecodeStatus.decodeStatus == cuvidDecodeStatus_Error_Concealed)) { + printf("Decode Error occurred for picture %d\n", m_nPicNumInDecodeOrder[pDispInfo->picture_index]); + } + + uint8_t *pDecodedFrame = nullptr; + { + std::lock_guard lock(m_mtxVPFrame); + if ((unsigned)++m_nDecodedFrame > m_vpFrame.size()) { + // Not enough frames in stock + m_nFrameAlloc++; + uint8_t *pFrame = NULL; + if (m_bUseDeviceFrame) { + if (m_bDeviceFramePitched) { + CUDA_DRVAPI_CALL(cuMemAllocPitch((CUdeviceptr *)&pFrame, &m_nDeviceFramePitch, GetWidth() * m_nBPP, + m_nLumaHeight + (m_nChromaHeight * m_nNumChromaPlanes), 16)); + } else { + CUDA_DRVAPI_CALL(cuMemAlloc((CUdeviceptr 
*)&pFrame, GetFrameSize())); + } + } else { + pFrame = new uint8_t[GetFrameSize()]; + } + m_vpFrame.push_back(pFrame); + } + pDecodedFrame = m_vpFrame[m_nDecodedFrame - 1]; + } + + // Copy luma plane + CUDA_MEMCPY2D m = {0}; + m.srcMemoryType = CU_MEMORYTYPE_DEVICE; + m.srcDevice = dpSrcFrame; + m.srcPitch = nSrcPitch; + m.dstMemoryType = m_bUseDeviceFrame ? CU_MEMORYTYPE_DEVICE : CU_MEMORYTYPE_HOST; + m.dstDevice = (CUdeviceptr)(m.dstHost = pDecodedFrame); + m.dstPitch = m_nDeviceFramePitch ? m_nDeviceFramePitch : GetWidth() * m_nBPP; + m.WidthInBytes = GetWidth() * m_nBPP; + m.Height = m_nLumaHeight; + CUDA_DRVAPI_CALL(cuMemcpy2DAsync(&m, m_cuvidStream)); + + // Copy chroma plane + // NVDEC output has luma height aligned by 2. Adjust chroma offset by aligning height + m.srcDevice = (CUdeviceptr)((uint8_t *)dpSrcFrame + m.srcPitch * ((m_nSurfaceHeight + 1) & ~1)); + m.dstDevice = (CUdeviceptr)(m.dstHost = pDecodedFrame + m.dstPitch * m_nLumaHeight); + m.Height = m_nChromaHeight; + CUDA_DRVAPI_CALL(cuMemcpy2DAsync(&m, m_cuvidStream)); + + if (m_nNumChromaPlanes == 2) { + m.srcDevice = (CUdeviceptr)((uint8_t *)dpSrcFrame + m.srcPitch * ((m_nSurfaceHeight + 1) & ~1) * 2); + m.dstDevice = (CUdeviceptr)(m.dstHost = pDecodedFrame + m.dstPitch * m_nLumaHeight * 2); + m.Height = m_nChromaHeight; + CUDA_DRVAPI_CALL(cuMemcpy2DAsync(&m, m_cuvidStream)); + } + CUDA_DRVAPI_CALL(cuStreamSynchronize(m_cuvidStream)); + CUDA_DRVAPI_CALL(cuCtxPopCurrent(NULL)); + + if ((int)m_vTimestamp.size() < m_nDecodedFrame) { + m_vTimestamp.resize(m_vpFrame.size()); + } + m_vTimestamp[m_nDecodedFrame - 1] = pDispInfo->timestamp; + + NVDEC_API_CALL(cuvidUnmapVideoFrame(m_hDecoder, dpSrcFrame)); + return 1; +} + +int NvDecoder::GetSEIMessage(CUVIDSEIMESSAGEINFO *pSEIMessageInfo) { + uint32_t seiNumMessages = pSEIMessageInfo->sei_message_count; + CUSEIMESSAGE *seiMessagesInfo = pSEIMessageInfo->pSEIMessage; + size_t totalSEIBufferSize = 0; + if ((pSEIMessageInfo->picIdx < 0) || 
(pSEIMessageInfo->picIdx >= MAX_FRM_CNT)) { + printf("Invalid picture index (%d)\n", pSEIMessageInfo->picIdx); + return 0; + } + for (uint32_t i = 0; i < seiNumMessages; i++) { + totalSEIBufferSize += seiMessagesInfo[i].sei_message_size; + } + if (!m_pCurrSEIMessage) { + printf("Out of Memory, Allocation failed for m_pCurrSEIMessage\n"); + return 0; + } + m_pCurrSEIMessage->pSEIData = malloc(totalSEIBufferSize); + if (!m_pCurrSEIMessage->pSEIData) { + printf("Out of Memory, Allocation failed for SEI Buffer\n"); + return 0; + } + memcpy(m_pCurrSEIMessage->pSEIData, pSEIMessageInfo->pSEIData, totalSEIBufferSize); + m_pCurrSEIMessage->pSEIMessage = (CUSEIMESSAGE *)malloc(sizeof(CUSEIMESSAGE) * seiNumMessages); + if (!m_pCurrSEIMessage->pSEIMessage) { + free(m_pCurrSEIMessage->pSEIData); + m_pCurrSEIMessage->pSEIData = NULL; + return 0; + } + memcpy(m_pCurrSEIMessage->pSEIMessage, pSEIMessageInfo->pSEIMessage, sizeof(CUSEIMESSAGE) * seiNumMessages); + m_pCurrSEIMessage->sei_message_count = pSEIMessageInfo->sei_message_count; + m_SEIMessagesDisplayOrder[pSEIMessageInfo->picIdx] = *m_pCurrSEIMessage; + return 1; +} + +NvDecoder::NvDecoder(CUcontext cuContext, bool bUseDeviceFrame, cudaVideoCodec eCodec, bool bLowLatency, + bool bDeviceFramePitched, const Rect *pCropRect, const Dim *pResizeDim, + bool extract_user_SEI_Message, int maxWidth, int maxHeight, unsigned int clkRate, + bool force_zero_latency) + : m_cuContext(cuContext), m_bUseDeviceFrame(bUseDeviceFrame), m_eCodec(eCodec), + m_bDeviceFramePitched(bDeviceFramePitched), m_bExtractSEIMessage(extract_user_SEI_Message), m_nMaxWidth(maxWidth), + m_nMaxHeight(maxHeight), m_bForce_zero_latency(force_zero_latency) { + if (pCropRect) + m_cropRect = *pCropRect; + if (pResizeDim) + m_resizeDim = *pResizeDim; + + NVDEC_API_CALL(cuvidCtxLockCreate(&m_ctxLock, cuContext)); + + ck(cuStreamCreate(&m_cuvidStream, CU_STREAM_DEFAULT)); + + decoderSessionID = 0; + + if (m_bExtractSEIMessage) { + m_fpSEI = fopen("sei_message.txt", 
"wb"); + m_pCurrSEIMessage = new CUVIDSEIMESSAGEINFO; + memset(&m_SEIMessagesDisplayOrder, 0, sizeof(m_SEIMessagesDisplayOrder)); + } + CUVIDPARSERPARAMS videoParserParameters = {}; + videoParserParameters.CodecType = eCodec; + videoParserParameters.ulMaxNumDecodeSurfaces = 1; + videoParserParameters.ulClockRate = clkRate; + videoParserParameters.ulMaxDisplayDelay = bLowLatency ? 0 : 1; + videoParserParameters.pUserData = this; + videoParserParameters.pfnSequenceCallback = HandleVideoSequenceProc; + videoParserParameters.pfnDecodePicture = HandlePictureDecodeProc; + videoParserParameters.pfnDisplayPicture = m_bForce_zero_latency ? NULL : HandlePictureDisplayProc; + videoParserParameters.pfnGetOperatingPoint = HandleOperatingPointProc; + videoParserParameters.pfnGetSEIMsg = m_bExtractSEIMessage ? HandleSEIMessagesProc : NULL; + NVDEC_API_CALL(cuvidCreateVideoParser(&m_hParser, &videoParserParameters)); +} + +NvDecoder::~NvDecoder() { + + START_TIMER + + if (m_pCurrSEIMessage) { + delete m_pCurrSEIMessage; + m_pCurrSEIMessage = NULL; + } + + if (m_fpSEI) { + fclose(m_fpSEI); + m_fpSEI = NULL; + } + + if (m_hParser) { + cuvidDestroyVideoParser(m_hParser); + } + cuCtxPushCurrent(m_cuContext); + if (m_hDecoder) { + cuvidDestroyDecoder(m_hDecoder); + } + + std::lock_guard lock(m_mtxVPFrame); + + for (uint8_t *pFrame : m_vpFrame) { + if (m_bUseDeviceFrame) { + cuMemFree((CUdeviceptr)pFrame); + } else { + delete[] pFrame; + } + } + cuCtxPopCurrent(NULL); + + cuvidCtxLockDestroy(m_ctxLock); + + STOP_TIMER("Session Deinitialization Time: "); + + NvDecoder::addDecoderSessionOverHead(getDecoderSessionID(), elapsedTime); +} + +int NvDecoder::Decode(const uint8_t *pData, int nSize, int nFlags, int64_t nTimestamp) { + m_nDecodedFrame = 0; + m_nDecodedFrameReturned = 0; + CUVIDSOURCEDATAPACKET packet = {0}; + packet.payload = pData; + packet.payload_size = nSize; + packet.flags = nFlags | CUVID_PKT_TIMESTAMP; + packet.timestamp = nTimestamp; + if (!pData || nSize == 0) { + 
packet.flags |= CUVID_PKT_ENDOFSTREAM; + } + NVDEC_API_CALL(cuvidParseVideoData(m_hParser, &packet)); + + return m_nDecodedFrame; +} + +uint8_t *NvDecoder::GetFrame(int64_t *pTimestamp) { + if (m_nDecodedFrame > 0) { + std::lock_guard lock(m_mtxVPFrame); + m_nDecodedFrame--; + if (pTimestamp) + *pTimestamp = m_vTimestamp[m_nDecodedFrameReturned]; + return m_vpFrame[m_nDecodedFrameReturned++]; + } + + return NULL; +} + +uint8_t *NvDecoder::GetLockedFrame(int64_t *pTimestamp) { + uint8_t *pFrame; + uint64_t timestamp; + if (m_nDecodedFrame > 0) { + std::lock_guard lock(m_mtxVPFrame); + m_nDecodedFrame--; + pFrame = m_vpFrame[0]; + m_vpFrame.erase(m_vpFrame.begin(), m_vpFrame.begin() + 1); + + timestamp = m_vTimestamp[0]; + m_vTimestamp.erase(m_vTimestamp.begin(), m_vTimestamp.begin() + 1); + + if (pTimestamp) + *pTimestamp = timestamp; + + return pFrame; + } + + return NULL; +} + +void NvDecoder::UnlockFrame(uint8_t **pFrame) { + std::lock_guard lock(m_mtxVPFrame); + m_vpFrame.insert(m_vpFrame.end(), &pFrame[0], &pFrame[1]); + + // add a dummy entry for timestamp + uint64_t timestamp[2] = {0}; + m_vTimestamp.insert(m_vTimestamp.end(), ×tamp[0], ×tamp[1]); +} diff --git a/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.h b/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.h new file mode 100644 index 000000000..886202bf7 --- /dev/null +++ b/third_party/Video_Codec_SDK/Samples/NvCodec/NvDecoder/NvDecoder.h @@ -0,0 +1,528 @@ +/* + * This copyright notice applies to this header file only: + * + * Copyright (c) 2010-2023 NVIDIA Corporation + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the software, and to permit persons to whom the + * software is 
furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "../../../Interface/nvcuvid.h"
+#include "../Utils/NvCodecUtils.h"
+#include <assert.h>
+#include <iostream>
+#include <map>
+#include <mutex>
+#include <sstream>
+#include <stdint.h>
+#include <string.h>
+#include <string>
+#include <vector>
+
+#define MAX_FRM_CNT 32
+
+typedef enum { SEI_TYPE_TIME_CODE = 136, SEI_TYPE_USER_DATA_UNREGISTERED = 5 } SEI_H264_HEVC_PAYLOAD_TYPE;
+
+/**
+ * @brief Exception class for error reporting from the decode API.
+ */ +class NVDECException : public std::exception { + public: + NVDECException(const std::string &errorStr, const CUresult errorCode) + : m_errorString(errorStr), m_errorCode(errorCode) {} + + virtual ~NVDECException() throw() {} + virtual const char *what() const throw() { return m_errorString.c_str(); } + CUresult getErrorCode() const { return m_errorCode; } + const std::string &getErrorString() const { return m_errorString; } + static NVDECException makeNVDECException(const std::string &errorStr, const CUresult errorCode, + const std::string &functionName, const std::string &fileName, int lineNo); + + private: + std::string m_errorString; + CUresult m_errorCode; +}; + +inline NVDECException NVDECException::makeNVDECException(const std::string &errorStr, const CUresult errorCode, + const std::string &functionName, const std::string &fileName, + int lineNo) { + std::ostringstream errorLog; + errorLog << functionName << " : " << errorStr << " at " << fileName << ":" << lineNo << std::endl; + NVDECException exception(errorLog.str(), errorCode); + return exception; +} + +#define NVDEC_THROW_ERROR(errorStr, errorCode) \ + do { \ + throw NVDECException::makeNVDECException(errorStr, errorCode, __FUNCTION__, __FILE__, __LINE__); \ + } while (0) + +#define NVDEC_API_CALL(cuvidAPI) \ + do { \ + CUresult errorCode = cuvidAPI; \ + if (errorCode != CUDA_SUCCESS) { \ + std::ostringstream errorLog; \ + errorLog << #cuvidAPI << " returned error " << errorCode; \ + throw NVDECException::makeNVDECException(errorLog.str(), errorCode, __FUNCTION__, __FILE__, __LINE__); \ + } \ + } while (0) + +struct Rect { + int l, t, r, b; +}; + +struct Dim { + int w, h; +}; + +#define START_TIMER auto start = std::chrono::high_resolution_clock::now(); + +#define STOP_TIMER(print_message) \ + int64_t elapsedTime = \ + std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start) \ + .count(); \ + std::cout << print_message << elapsedTime << " ms " << std::endl; + +#define 
CUDA_DRVAPI_CALL(call) \ + do { \ + CUresult err__ = call; \ + if (err__ != CUDA_SUCCESS) { \ + const char *szErrName = NULL; \ + cuGetErrorName(err__, &szErrName); \ + std::ostringstream errorLog; \ + errorLog << "CUDA driver API error " << szErrName; \ + throw NVDECException::makeNVDECException(errorLog.str(), err__, __FUNCTION__, __FILE__, __LINE__); \ + } \ + } while (0) + +static const char *GetVideoCodecString(cudaVideoCodec eCodec) { + static struct { + cudaVideoCodec eCodec; + const char *name; + } aCodecName[] = { + {cudaVideoCodec_MPEG1, "MPEG-1"}, + {cudaVideoCodec_MPEG2, "MPEG-2"}, + {cudaVideoCodec_MPEG4, "MPEG-4 (ASP)"}, + {cudaVideoCodec_VC1, "VC-1/WMV"}, + {cudaVideoCodec_H264, "AVC/H.264"}, + {cudaVideoCodec_JPEG, "M-JPEG"}, + {cudaVideoCodec_H264_SVC, "H.264/SVC"}, + {cudaVideoCodec_H264_MVC, "H.264/MVC"}, + {cudaVideoCodec_HEVC, "H.265/HEVC"}, + {cudaVideoCodec_VP8, "VP8"}, + {cudaVideoCodec_VP9, "VP9"}, + {cudaVideoCodec_AV1, "AV1"}, + {cudaVideoCodec_NumCodecs, "Invalid"}, + {cudaVideoCodec_YUV420, "YUV 4:2:0"}, + {cudaVideoCodec_YV12, "YV12 4:2:0"}, + {cudaVideoCodec_NV12, "NV12 4:2:0"}, + {cudaVideoCodec_YUYV, "YUYV 4:2:2"}, + {cudaVideoCodec_UYVY, "UYVY 4:2:2"}, + }; + + if (eCodec >= 0 && eCodec <= cudaVideoCodec_NumCodecs) { + return aCodecName[eCodec].name; + } + for (int i = cudaVideoCodec_NumCodecs + 1; i < sizeof(aCodecName) / sizeof(aCodecName[0]); i++) { + if (eCodec == aCodecName[i].eCodec) { + return aCodecName[eCodec].name; + } + } + return "Unknown"; +} + +static const char *GetVideoChromaFormatString(cudaVideoChromaFormat eChromaFormat) { + static struct { + cudaVideoChromaFormat eChromaFormat; + const char *name; + } aChromaFormatName[] = { + {cudaVideoChromaFormat_Monochrome, "YUV 400 (Monochrome)"}, + {cudaVideoChromaFormat_420, "YUV 420"}, + {cudaVideoChromaFormat_422, "YUV 422"}, + {cudaVideoChromaFormat_444, "YUV 444"}, + }; + + if (eChromaFormat >= 0 && eChromaFormat < sizeof(aChromaFormatName) / 
sizeof(aChromaFormatName[0])) { + return aChromaFormatName[eChromaFormat].name; + } + return "Unknown"; +} + +static float GetChromaHeightFactor(cudaVideoSurfaceFormat eSurfaceFormat) { + float factor = 0.5; + switch (eSurfaceFormat) { + case cudaVideoSurfaceFormat_NV12: + case cudaVideoSurfaceFormat_P016: + factor = 0.5; + break; + case cudaVideoSurfaceFormat_YUV444: + case cudaVideoSurfaceFormat_YUV444_16Bit: + factor = 1.0; + break; + } + + return factor; +} + +static int GetChromaPlaneCount(cudaVideoSurfaceFormat eSurfaceFormat) { + int numPlane = 1; + switch (eSurfaceFormat) { + case cudaVideoSurfaceFormat_NV12: + case cudaVideoSurfaceFormat_P016: + numPlane = 1; + break; + case cudaVideoSurfaceFormat_YUV444: + case cudaVideoSurfaceFormat_YUV444_16Bit: + numPlane = 2; + break; + } + + return numPlane; +} + +/** + * @brief Base class for decoder interface. + */ +class NvDecoder { + + public: + NvDecoder() {} + /** + * @brief This function is used to initialize the decoder session. + * Application must call this function to initialize the decoder, before + * starting to decode any frames. + */ + NvDecoder(CUcontext cuContext, bool bUseDeviceFrame, cudaVideoCodec eCodec, bool bLowLatency = false, + bool bDeviceFramePitched = false, const Rect *pCropRect = NULL, const Dim *pResizeDim = NULL, + bool extract_user_SEI_Message = false, int maxWidth = 0, int maxHeight = 0, unsigned int clkRate = 1000, + bool force_zero_latency = false); + ~NvDecoder(); + + /** + * @brief This function is used to get the current CUDA context. + */ + CUcontext GetContext() { return m_cuContext; } + + /** + * @brief This function is used to get the output frame width. + * NV12/P016 output format width is 2 byte aligned because of U and V interleave + */ + int GetWidth() { + assert(m_nWidth); + return (m_eOutputFormat == cudaVideoSurfaceFormat_NV12 || m_eOutputFormat == cudaVideoSurfaceFormat_P016) + ? 
(m_nWidth + 1) & ~1 + : m_nWidth; + } + + /** + * @brief This function is used to get the actual decode width + */ + int GetDecodeWidth() { + assert(m_nWidth); + return m_nWidth; + } + + /** + * @brief This function is used to get the output frame height (Luma height). + */ + int GetHeight() { + assert(m_nLumaHeight); + return m_nLumaHeight; + } + + /** + * @brief This function is used to get the current chroma height. + */ + int GetChromaHeight() { + assert(m_nChromaHeight); + return m_nChromaHeight; + } + + /** + * @brief This function is used to get the number of chroma planes. + */ + int GetNumChromaPlanes() { + assert(m_nNumChromaPlanes); + return m_nNumChromaPlanes; + } + + /** + * @brief This function is used to get the current frame size based on pixel format. + */ + int GetFrameSize() { + assert(m_nWidth); + return GetWidth() * (m_nLumaHeight + (m_nChromaHeight * m_nNumChromaPlanes)) * m_nBPP; + } + + /** + * @brief This function is used to get the current frame Luma plane size. + */ + int GetLumaPlaneSize() { + assert(m_nWidth); + return GetWidth() * m_nLumaHeight * m_nBPP; + } + + /** + * @brief This function is used to get the current frame chroma plane size. + */ + int GetChromaPlaneSize() { + assert(m_nWidth); + return GetWidth() * (m_nChromaHeight * m_nNumChromaPlanes) * m_nBPP; + } + + /** + * @brief This function is used to get the pitch of the device buffer holding the decoded frame. + */ + int GetDeviceFramePitch() { + assert(m_nWidth); + return m_nDeviceFramePitch ? (int)m_nDeviceFramePitch : GetWidth() * m_nBPP; + } + + /** + * @brief This function is used to get the bit depth associated with the pixel format. + */ + int GetBitDepth() { + assert(m_nWidth); + return m_nBitDepthMinus8 + 8; + } + + /** + * @brief This function is used to get the bytes used per pixel. 
+ */ + int GetBPP() { + assert(m_nWidth); + return m_nBPP; + } + + /** + * @brief This function is used to get the YUV chroma format + */ + cudaVideoSurfaceFormat GetOutputFormat() { return m_eOutputFormat; } + + /** + * @brief This function is used to get information about the video stream (codec, display parameters etc) + */ + CUVIDEOFORMAT GetVideoFormatInfo() { + assert(m_nWidth); + return m_videoFormat; + } + + /** + * @brief This function is used to get codec string from codec id + */ + const char *GetCodecString(cudaVideoCodec eCodec); + + /** + * @brief This function is used to print information about the video stream + */ + std::string GetVideoInfo() const { return m_videoInfo.str(); } + + /** + * @brief This function decodes a frame and returns the number of frames that are available for + * display. All frames that are available for display should be read before making a subsequent decode call. + * @param pData - pointer to the data buffer that is to be decoded + * @param nSize - size of the data buffer in bytes + * @param nFlags - CUvideopacketflags for setting decode options + * @param nTimestamp - presentation timestamp + */ + int Decode(const uint8_t *pData, int nSize, int nFlags = 0, int64_t nTimestamp = 0); + + /** + * @brief This function returns a decoded frame and timestamp. This function should be called in a loop for + * fetching all the frames that are available for display. + */ + uint8_t *GetFrame(int64_t *pTimestamp = nullptr); + + /** + * @brief This function decodes a frame and returns the locked frame buffers + * This makes the buffers available for use by the application without the buffers + * getting overwritten, even if subsequent decode calls are made. 
The frame buffers
+     *   remain locked, until UnlockFrame() is called
+     */
+    uint8_t *GetLockedFrame(int64_t *pTimestamp = nullptr);
+
+    /**
+     *   @brief  This function unlocks the frame buffer and makes the frame buffers available for write again
+     *   @param  pFrame - pointer to the frame pointer that is to be unlocked
+     *   Only the first entry (pFrame[0]) is returned to the frame pool.
+     */
+    void UnlockFrame(uint8_t **pFrame);
+
+    /**
+     *   @brief  This function allows app to set decoder reconfig params
+     *   @param  pCropRect - cropping rectangle coordinates
+     *   @param  pResizeDim - width and height of resized output
+     */
+    int setReconfigParams(const Rect *pCropRect, const Dim *pResizeDim);
+
+    /**
+     *   @brief  This function allows app to set operating point for AV1 SVC clips
+     *   @param  opPoint - operating point of an AV1 scalable bitstream
+     *   @param  bDispAllLayers - Output all decoded frames of an AV1 scalable bitstream
+     */
+    void SetOperatingPoint(const uint32_t opPoint, const bool bDispAllLayers) {
+        m_nOperatingPoint = opPoint;
+        m_bDispAllLayers = bDispAllLayers;
+    }
+
+    // start a timer
+    void startTimer() { m_stDecode_time.Start(); }
+
+    // stop the timer
+    double stopTimer() { return m_stDecode_time.Stop(); }
+
+    void setDecoderSessionID(int sessionID) { decoderSessionID = sessionID; }
+    int getDecoderSessionID() { return decoderSessionID; }
+
+    // Session overhead refers to decoder initialization and deinitialization time
+    static void addDecoderSessionOverHead(int sessionID, int64_t duration) { sessionOverHead[sessionID] += duration; }
+    static int64_t getDecoderSessionOverHead(int sessionID) { return sessionOverHead[sessionID]; }
+
+  protected:
+    int decoderSessionID; // Decoder session identifier. Used to gather session level stats.
+    static std::map<int, int64_t> sessionOverHead; // Records session overhead of initialization+deinitialization time.
+ // Format is (thread id, duration) + + /** + * @brief Callback function to be registered for getting a callback when decoding of sequence starts + */ + static int CUDAAPI HandleVideoSequenceProc(void *pUserData, CUVIDEOFORMAT *pVideoFormat) { + return ((NvDecoder *)pUserData)->HandleVideoSequence(pVideoFormat); + } + + /** + * @brief Callback function to be registered for getting a callback when a decoded frame is ready to be decoded + */ + static int CUDAAPI HandlePictureDecodeProc(void *pUserData, CUVIDPICPARAMS *pPicParams) { + return ((NvDecoder *)pUserData)->HandlePictureDecode(pPicParams); + } + + /** + * @brief Callback function to be registered for getting a callback when a decoded frame is available for display + */ + static int CUDAAPI HandlePictureDisplayProc(void *pUserData, CUVIDPARSERDISPINFO *pDispInfo) { + return ((NvDecoder *)pUserData)->HandlePictureDisplay(pDispInfo); + } + + /** + * @brief Callback function to be registered for getting a callback to get operating point when AV1 SVC sequence + * header start. + */ + static int CUDAAPI HandleOperatingPointProc(void *pUserData, CUVIDOPERATINGPOINTINFO *pOPInfo) { + return ((NvDecoder *)pUserData)->GetOperatingPoint(pOPInfo); + } + + /** + * @brief Callback function to be registered for getting a callback when all the unregistered user SEI Messages + * are parsed for a frame. + */ + static int CUDAAPI HandleSEIMessagesProc(void *pUserData, CUVIDSEIMESSAGEINFO *pSEIMessageInfo) { + return ((NvDecoder *)pUserData)->GetSEIMessage(pSEIMessageInfo); + } + + /** + * @brief This function gets called when a sequence is ready to be decoded. The function also gets called + when there is format change + */ + int HandleVideoSequence(CUVIDEOFORMAT *pVideoFormat); + + /** + * @brief This function gets called when a picture is ready to be decoded. 
cuvidDecodePicture is called from this
+     *   function to decode the picture
+     */
+    int HandlePictureDecode(CUVIDPICPARAMS *pPicParams);
+
+    /**
+     *   @brief  This function gets called after a picture is decoded and available for display. Frames are fetched and
+     stored in internal buffer
+     */
+    int HandlePictureDisplay(CUVIDPARSERDISPINFO *pDispInfo);
+
+    /**
+     *   @brief  This function gets called when AV1 sequence encounter more than one operating points
+     */
+    int GetOperatingPoint(CUVIDOPERATINGPOINTINFO *pOPInfo);
+
+    /**
+     *   @brief  This function gets called when all unregistered user SEI messages are parsed for a frame
+     */
+    int GetSEIMessage(CUVIDSEIMESSAGEINFO *pSEIMessageInfo);
+
+    /**
+     *   @brief  This function reconfigure decoder if there is a change in sequence params.
+     */
+    int ReconfigureDecoder(CUVIDEOFORMAT *pVideoFormat);
+
+  public:
+    CUcontext m_cuContext = NULL;
+    CUvideoctxlock m_ctxLock;
+    CUvideoparser m_hParser = NULL;
+    CUvideodecoder m_hDecoder = NULL;
+    bool m_bUseDeviceFrame;
+    // dimension of the output
+    unsigned int m_nWidth = 0, m_nLumaHeight = 0, m_nChromaHeight = 0;
+    unsigned int m_nNumChromaPlanes = 0;
+    // height of the mapped surface
+    int m_nSurfaceHeight = 0;
+    int m_nSurfaceWidth = 0;
+    cudaVideoCodec m_eCodec = cudaVideoCodec_NumCodecs;
+    cudaVideoChromaFormat m_eChromaFormat = cudaVideoChromaFormat_420;
+    cudaVideoSurfaceFormat m_eOutputFormat = cudaVideoSurfaceFormat_NV12;
+    int m_nBitDepthMinus8 = 0;
+    int m_nBPP = 1;
+    CUVIDEOFORMAT m_videoFormat = {};
+    Rect m_displayRect = {};
+    // stock of frames
+    std::vector<uint8_t *> m_vpFrame;
+    // timestamps of decoded frames
+    std::vector<int64_t> m_vTimestamp;
+    int m_nDecodedFrame = 0, m_nDecodedFrameReturned = 0;
+    int m_nDecodePicCnt = 0, m_nPicNumInDecodeOrder[MAX_FRM_CNT];
+    CUVIDSEIMESSAGEINFO *m_pCurrSEIMessage = NULL;
+    CUVIDSEIMESSAGEINFO m_SEIMessagesDisplayOrder[MAX_FRM_CNT];
+    FILE *m_fpSEI = NULL;
+    bool m_bEndDecodeDone = false;
+    std::mutex m_mtxVPFrame; // taken by the frame-pool accessors (GetFrame, GetLockedFrame, UnlockFrame)
+    int m_nFrameAlloc = 0;
+
CUstream m_cuvidStream = 0; + bool m_bDeviceFramePitched = false; + size_t m_nDeviceFramePitch = 0; + Rect m_cropRect = {}; + Dim m_resizeDim = {}; + + std::ostringstream m_videoInfo; + unsigned int m_nMaxWidth = 0, m_nMaxHeight = 0; + bool m_bReconfigExternal = false; + bool m_bReconfigExtPPChange = false; + StopWatch m_stDecode_time; + + unsigned int m_nOperatingPoint = 0; + bool m_bDispAllLayers = false; + // In H.264, there is an inherent display latency for video contents + // which do not have num_reorder_frames=0 in the VUI. This applies to + // All-Intra and IPPP sequences as well. If the user wants zero display + // latency for All-Intra and IPPP sequences, the below flag will enable + // the display callback immediately after the decode callback. + bool m_bForce_zero_latency = false; + bool m_bExtractSEIMessage = false; +}; diff --git a/third_party/Video_Codec_SDK/Samples/Utils/FFmpegDemuxer.h b/third_party/Video_Codec_SDK/Samples/Utils/FFmpegDemuxer.h new file mode 100644 index 000000000..bd1881dbc --- /dev/null +++ b/third_party/Video_Codec_SDK/Samples/Utils/FFmpegDemuxer.h @@ -0,0 +1,379 @@ +/* + * This copyright notice applies to this header file only: + * + * Copyright (c) 2010-2023 NVIDIA Corporation + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the software, and to permit persons to whom the + * software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#pragma once
+
+extern "C" {
+#include <libavcodec/avcodec.h>
+#include <libavformat/avformat.h>
+#include <libavformat/avio.h>
+/* Explicitly include bsf.h when building against FFmpeg 4.3 (libavcodec 58.45.100) or later for backward compatibility
+ */
+#if LIBAVCODEC_VERSION_INT >= 3824484
+#include <libavcodec/bsf.h>
+#endif
+}
+#include "NvCodecUtils.h"
+#include "nvcuvid.h"
+
+//---------------------------------------------------------------------------
+//! \file FFmpegDemuxer.h
+//! \brief Provides functionality for stream demuxing
+//!
+//! This header file is used by Decode/Transcode apps to demux input video clips before decoding frames from it.
+//---------------------------------------------------------------------------
+
+/**
+ * @brief libavformat wrapper class. Retrieves the elementary encoded stream from the container format.
+ */ +class FFmpegDemuxer { + private: + AVFormatContext *fmtc = NULL; + AVIOContext *avioc = NULL; + AVPacket *pkt = NULL; /*!< AVPacket stores compressed data typically exported by demuxers and then passed as input + to decoders */ + AVPacket *pktFiltered = NULL; + AVBSFContext *bsfc = NULL; + + int iVideoStream; + bool bMp4H264, bMp4HEVC, bMp4MPEG4; + AVCodecID eVideoCodec; + AVPixelFormat eChromaFormat; + int nWidth, nHeight, nBitDepth, nBPP, nChromaHeight; + double timeBase = 0.0; + int64_t userTimeScale = 0; + + uint8_t *pDataWithHeader = NULL; + + unsigned int frameCount = 0; + + public: + class DataProvider { + public: + virtual ~DataProvider() {} + virtual int GetData(uint8_t *pBuf, int nBuf) = 0; + }; + + private: + /** + * @brief Private constructor to initialize libavformat resources. + * @param fmtc - Pointer to AVFormatContext allocated inside avformat_open_input() + */ + FFmpegDemuxer(AVFormatContext *fmtc, int64_t timeScale = 1000 /*Hz*/) : fmtc(fmtc) { + if (!fmtc) { + LOG(ERROR) << "No AVFormatContext provided."; + return; + } + + // Allocate the AVPackets and initialize to default values + pkt = av_packet_alloc(); + pktFiltered = av_packet_alloc(); + if (!pkt || !pktFiltered) { + LOG(ERROR) << "AVPacket allocation failed"; + return; + } + + LOG(INFO) << "Media format: " << fmtc->iformat->long_name << " (" << fmtc->iformat->name << ")"; + + ck(avformat_find_stream_info(fmtc, NULL)); + iVideoStream = av_find_best_stream(fmtc, AVMEDIA_TYPE_VIDEO, -1, -1, NULL, 0); + if (iVideoStream < 0) { + LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__ << " " + << "Could not find stream in input file"; + av_packet_free(&pkt); + av_packet_free(&pktFiltered); + return; + } + + // fmtc->streams[iVideoStream]->need_parsing = AVSTREAM_PARSE_NONE; + eVideoCodec = fmtc->streams[iVideoStream]->codecpar->codec_id; + nWidth = fmtc->streams[iVideoStream]->codecpar->width; + nHeight = fmtc->streams[iVideoStream]->codecpar->height; + eChromaFormat = 
(AVPixelFormat)fmtc->streams[iVideoStream]->codecpar->format; + AVRational rTimeBase = fmtc->streams[iVideoStream]->time_base; + timeBase = av_q2d(rTimeBase); + userTimeScale = timeScale; + + // Set bit depth, chroma height, bits per pixel based on eChromaFormat of input + switch (eChromaFormat) { + case AV_PIX_FMT_YUV420P10LE: + case AV_PIX_FMT_GRAY10LE: // monochrome is treated as 420 with chroma filled with 0x0 + nBitDepth = 10; + nChromaHeight = (nHeight + 1) >> 1; + nBPP = 2; + break; + case AV_PIX_FMT_YUV420P12LE: + nBitDepth = 12; + nChromaHeight = (nHeight + 1) >> 1; + nBPP = 2; + break; + case AV_PIX_FMT_YUV444P10LE: + nBitDepth = 10; + nChromaHeight = nHeight << 1; + nBPP = 2; + break; + case AV_PIX_FMT_YUV444P12LE: + nBitDepth = 12; + nChromaHeight = nHeight << 1; + nBPP = 2; + break; + case AV_PIX_FMT_YUV444P: + nBitDepth = 8; + nChromaHeight = nHeight << 1; + nBPP = 1; + break; + case AV_PIX_FMT_YUV420P: + case AV_PIX_FMT_YUVJ420P: + case AV_PIX_FMT_YUVJ422P: // jpeg decoder output is subsampled to NV12 for 422/444 so treat it as 420 + case AV_PIX_FMT_YUVJ444P: // jpeg decoder output is subsampled to NV12 for 422/444 so treat it as 420 + case AV_PIX_FMT_GRAY8: // monochrome is treated as 420 with chroma filled with 0x0 + nBitDepth = 8; + nChromaHeight = (nHeight + 1) >> 1; + nBPP = 1; + break; + default: + LOG(WARNING) << "ChromaFormat not recognized. 
Assuming 420"; + eChromaFormat = AV_PIX_FMT_YUV420P; + nBitDepth = 8; + nChromaHeight = (nHeight + 1) >> 1; + nBPP = 1; + } + + bMp4H264 = eVideoCodec == AV_CODEC_ID_H264 && (!strcmp(fmtc->iformat->long_name, "QuickTime / MOV") || + !strcmp(fmtc->iformat->long_name, "FLV (Flash Video)") || + !strcmp(fmtc->iformat->long_name, "Matroska / WebM")); + bMp4HEVC = eVideoCodec == AV_CODEC_ID_HEVC && (!strcmp(fmtc->iformat->long_name, "QuickTime / MOV") || + !strcmp(fmtc->iformat->long_name, "FLV (Flash Video)") || + !strcmp(fmtc->iformat->long_name, "Matroska / WebM")); + + bMp4MPEG4 = eVideoCodec == AV_CODEC_ID_MPEG4 && (!strcmp(fmtc->iformat->long_name, "QuickTime / MOV") || + !strcmp(fmtc->iformat->long_name, "FLV (Flash Video)") || + !strcmp(fmtc->iformat->long_name, "Matroska / WebM")); + + // Initialize bitstream filter and its required resources + if (bMp4H264) { + const AVBitStreamFilter *bsf = av_bsf_get_by_name("h264_mp4toannexb"); + if (!bsf) { + LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__ << " " + << "av_bsf_get_by_name() failed"; + av_packet_free(&pkt); + av_packet_free(&pktFiltered); + return; + } + ck(av_bsf_alloc(bsf, &bsfc)); + avcodec_parameters_copy(bsfc->par_in, fmtc->streams[iVideoStream]->codecpar); + ck(av_bsf_init(bsfc)); + } + if (bMp4HEVC) { + const AVBitStreamFilter *bsf = av_bsf_get_by_name("hevc_mp4toannexb"); + if (!bsf) { + LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__ << " " + << "av_bsf_get_by_name() failed"; + av_packet_free(&pkt); + av_packet_free(&pktFiltered); + return; + } + ck(av_bsf_alloc(bsf, &bsfc)); + avcodec_parameters_copy(bsfc->par_in, fmtc->streams[iVideoStream]->codecpar); + ck(av_bsf_init(bsfc)); + } + } + + AVFormatContext *CreateFormatContext(DataProvider *pDataProvider) { + + AVFormatContext *ctx = NULL; + if (!(ctx = avformat_alloc_context())) { + LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__; + return NULL; + } + + uint8_t *avioc_buffer = NULL; + int avioc_buffer_size = 
8 * 1024 * 1024; + avioc_buffer = (uint8_t *)av_malloc(avioc_buffer_size); + if (!avioc_buffer) { + LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__; + return NULL; + } + avioc = avio_alloc_context(avioc_buffer, avioc_buffer_size, 0, pDataProvider, &ReadPacket, NULL, NULL); + if (!avioc) { + LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__; + return NULL; + } + ctx->pb = avioc; + + ck(avformat_open_input(&ctx, NULL, NULL, NULL)); + return ctx; + } + + /** + * @brief Allocate and return AVFormatContext*. + * @param szFilePath - Filepath pointing to input stream. + * @return Pointer to AVFormatContext + */ + AVFormatContext *CreateFormatContext(const char *szFilePath) { + avformat_network_init(); + + AVFormatContext *ctx = NULL; + ck(avformat_open_input(&ctx, szFilePath, NULL, NULL)); + return ctx; + } + + public: + FFmpegDemuxer(const char *szFilePath, int64_t timescale = 1000 /*Hz*/) + : FFmpegDemuxer(CreateFormatContext(szFilePath), timescale) {} + FFmpegDemuxer(DataProvider *pDataProvider) : FFmpegDemuxer(CreateFormatContext(pDataProvider)) { avioc = fmtc->pb; } + ~FFmpegDemuxer() { + + if (!fmtc) { + return; + } + + if (pkt) { + av_packet_free(&pkt); + } + if (pktFiltered) { + av_packet_free(&pktFiltered); + } + + if (bsfc) { + av_bsf_free(&bsfc); + } + + avformat_close_input(&fmtc); + + if (avioc) { + av_freep(&avioc->buffer); + av_freep(&avioc); + } + + if (pDataWithHeader) { + av_free(pDataWithHeader); + } + } + AVCodecID GetVideoCodec() { return eVideoCodec; } + AVPixelFormat GetChromaFormat() { return eChromaFormat; } + int GetWidth() { return nWidth; } + int GetHeight() { return nHeight; } + int GetBitDepth() { return nBitDepth; } + int GetFrameSize() { return nWidth * (nHeight + nChromaHeight) * nBPP; } + bool Demux(uint8_t **ppVideo, int *pnVideoBytes, int64_t *pts = NULL) { + if (!fmtc) { + return false; + } + + *pnVideoBytes = 0; + + if (pkt->data) { + av_packet_unref(pkt); + } + + int e = 0; + while ((e = av_read_frame(fmtc, 
pkt)) >= 0 && pkt->stream_index != iVideoStream) { + av_packet_unref(pkt); + } + if (e < 0) { + return false; + } + + if (bMp4H264 || bMp4HEVC) { + if (pktFiltered->data) { + av_packet_unref(pktFiltered); + } + ck(av_bsf_send_packet(bsfc, pkt)); + ck(av_bsf_receive_packet(bsfc, pktFiltered)); + *ppVideo = pktFiltered->data; + *pnVideoBytes = pktFiltered->size; + if (pts) + *pts = (int64_t)(pktFiltered->pts * userTimeScale * timeBase); + } else { + + if (bMp4MPEG4 && (frameCount == 0)) { + + int extraDataSize = fmtc->streams[iVideoStream]->codecpar->extradata_size; + + if (extraDataSize > 0) { + + // extradata contains start codes 00 00 01. Subtract its size + pDataWithHeader = (uint8_t *)av_malloc(extraDataSize + pkt->size - 3 * sizeof(uint8_t)); + + if (!pDataWithHeader) { + LOG(ERROR) << "FFmpeg error: " << __FILE__ << " " << __LINE__; + return false; + } + + memcpy(pDataWithHeader, fmtc->streams[iVideoStream]->codecpar->extradata, extraDataSize); + memcpy(pDataWithHeader + extraDataSize, pkt->data + 3, pkt->size - 3 * sizeof(uint8_t)); + + *ppVideo = pDataWithHeader; + *pnVideoBytes = extraDataSize + pkt->size - 3 * sizeof(uint8_t); + } + + } else { + *ppVideo = pkt->data; + *pnVideoBytes = pkt->size; + } + + if (pts) + *pts = (int64_t)(pkt->pts * userTimeScale * timeBase); + } + + frameCount++; + + return true; + } + + static int ReadPacket(void *opaque, uint8_t *pBuf, int nBuf) { + return ((DataProvider *)opaque)->GetData(pBuf, nBuf); + } +}; + +inline cudaVideoCodec FFmpeg2NvCodecId(AVCodecID id) { + switch (id) { + case AV_CODEC_ID_MPEG1VIDEO: + return cudaVideoCodec_MPEG1; + case AV_CODEC_ID_MPEG2VIDEO: + return cudaVideoCodec_MPEG2; + case AV_CODEC_ID_MPEG4: + return cudaVideoCodec_MPEG4; + case AV_CODEC_ID_WMV3: + case AV_CODEC_ID_VC1: + return cudaVideoCodec_VC1; + case AV_CODEC_ID_H264: + return cudaVideoCodec_H264; + case AV_CODEC_ID_HEVC: + return cudaVideoCodec_HEVC; + case AV_CODEC_ID_VP8: + return cudaVideoCodec_VP8; + case AV_CODEC_ID_VP9: + 
return cudaVideoCodec_VP9; + case AV_CODEC_ID_MJPEG: + return cudaVideoCodec_JPEG; + case AV_CODEC_ID_AV1: + return cudaVideoCodec_AV1; + default: + return cudaVideoCodec_NumCodecs; + } +} diff --git a/third_party/Video_Codec_SDK/Samples/Utils/FFmpegStreamer.h b/third_party/Video_Codec_SDK/Samples/Utils/FFmpegStreamer.h new file mode 100644 index 000000000..08e43e603 --- /dev/null +++ b/third_party/Video_Codec_SDK/Samples/Utils/FFmpegStreamer.h @@ -0,0 +1,148 @@ +/* + * This copyright notice applies to this header file only: + * + * Copyright (c) 2010-2023 NVIDIA Corporation + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the software, and to permit persons to whom the + * software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#pragma once + +#include +#include +extern "C" { +#include +#include +#include +}; +#include "Logger.h" + +using namespace std; + +extern simplelogger::Logger *logger; + +static string AvErrorToString(int av_error_code) { + const auto buf_size = 1024U; + char *err_string = (char *)calloc(buf_size, sizeof(*err_string)); + if (!err_string) { + return string(); + } + + if (0 != av_strerror(av_error_code, err_string, buf_size - 1)) { + free(err_string); + stringstream ss; + ss << "Unknown error with code " << av_error_code; + return ss.str(); + } + + string str(err_string); + free(err_string); + return str; +} + +class FFmpegStreamer { + private: + AVFormatContext *oc = NULL; + AVStream *vs = NULL; + int nFps = 0; + + public: + FFmpegStreamer(AVCodecID eCodecId, int nWidth, int nHeight, int nFps, const char *szInFilePath) : nFps(nFps) { + avformat_network_init(); + + int ret = 0; + + if ((eCodecId == AV_CODEC_ID_H264) || (eCodecId == AV_CODEC_ID_HEVC)) + ret = avformat_alloc_output_context2(&oc, NULL, "mpegts", NULL); + else if (eCodecId == AV_CODEC_ID_AV1) + ret = avformat_alloc_output_context2(&oc, NULL, "ivf", NULL); + + if (ret < 0) { + LOG(ERROR) << "FFmpeg: failed to allocate an AVFormatContext. Error message: " << AvErrorToString(ret); + return; + } + + oc->url = av_strdup(szInFilePath); + LOG(INFO) << "Streaming destination: " << oc->url; + + // Add video stream to oc + vs = avformat_new_stream(oc, NULL); + if (!vs) { + LOG(ERROR) << "FFMPEG: Could not alloc video stream"; + return; + } + vs->id = 0; + + // Set video parameters + AVCodecParameters *vpar = vs->codecpar; + vpar->codec_id = eCodecId; + vpar->codec_type = AVMEDIA_TYPE_VIDEO; + vpar->width = nWidth; + vpar->height = nHeight; + + // Everything is ready. Now open the output stream. 
+ if (avio_open(&oc->pb, oc->url, AVIO_FLAG_WRITE) < 0) { + LOG(ERROR) << "FFMPEG: Could not open " << oc->url; + return; + } + + // Write the container header + if (avformat_write_header(oc, NULL)) { + LOG(ERROR) << "FFMPEG: avformat_write_header error!"; + return; + } + } + ~FFmpegStreamer() { + if (oc) { + av_write_trailer(oc); + avio_close(oc->pb); + avformat_free_context(oc); + } + } + + bool Stream(uint8_t *pData, int nBytes, int nPts) { + AVPacket *pkt = av_packet_alloc(); + if (!pkt) { + LOG(ERROR) << "AVPacket allocation failed !"; + return false; + } + pkt->pts = av_rescale_q(nPts++, AVRational{1, nFps}, vs->time_base); + // No B-frames + pkt->dts = pkt->pts; + pkt->stream_index = vs->index; + pkt->data = pData; + pkt->size = nBytes; + + if (!memcmp(pData, "\x00\x00\x00\x01\x67", 5)) { + pkt->flags |= AV_PKT_FLAG_KEY; + } + + // Write the compressed frame into the output + int ret = av_write_frame(oc, pkt); + av_write_frame(oc, NULL); + if (ret < 0) { + LOG(ERROR) << "FFMPEG: Error while writing video frame"; + } + + av_packet_free(&pkt); + return true; + } +}; diff --git a/third_party/Video_Codec_SDK/Samples/Utils/Logger.h b/third_party/Video_Codec_SDK/Samples/Utils/Logger.h new file mode 100644 index 000000000..5d2f069cf --- /dev/null +++ b/third_party/Video_Codec_SDK/Samples/Utils/Logger.h @@ -0,0 +1,235 @@ +/* + * This copyright notice applies to this header file only: + * + * Copyright (c) 2010-2023 NVIDIA Corporation + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the software, and to permit persons to whom the + * software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + 
* included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#include + +#pragma comment(lib, "ws2_32.lib") +#undef ERROR +#else +#include +#include +#include +#include +#define SOCKET int +#define INVALID_SOCKET -1 +#endif + +enum LogLevel { TRACE, INFO, WARNING, ERROR, FATAL }; + +namespace simplelogger { +class Logger { + public: + Logger(LogLevel level, bool bPrintTimeStamp) : level(level), bPrintTimeStamp(bPrintTimeStamp) {} + virtual ~Logger() {} + virtual std::ostream &GetStream() = 0; + virtual void FlushStream() {} + bool ShouldLogFor(LogLevel l) { return l >= level; } + char *GetLead(LogLevel l, const char *szFile, int nLine, const char *szFunc) { + if (l < TRACE || l > FATAL) { + sprintf(szLead, "[?????] 
"); + return szLead; + } + const char *szLevels[] = {"TRACE", "INFO", "WARN", "ERROR", "FATAL"}; + if (bPrintTimeStamp) { + time_t t = time(NULL); + struct tm *ptm = localtime(&t); + sprintf(szLead, "[%-5s][%02d:%02d:%02d] ", szLevels[l], ptm->tm_hour, ptm->tm_min, ptm->tm_sec); + } else { + sprintf(szLead, "[%-5s] ", szLevels[l]); + } + return szLead; + } + void EnterCriticalSection() { mtx.lock(); } + void LeaveCriticalSection() { mtx.unlock(); } + + private: + LogLevel level; + char szLead[80]; + bool bPrintTimeStamp; + std::mutex mtx; +}; + +class LoggerFactory { + public: + static Logger *CreateFileLogger(std::string strFilePath, LogLevel level = INFO, bool bPrintTimeStamp = true) { + return new FileLogger(strFilePath, level, bPrintTimeStamp); + } + static Logger *CreateConsoleLogger(LogLevel level = INFO, bool bPrintTimeStamp = true) { + return new ConsoleLogger(level, bPrintTimeStamp); + } + static Logger *CreateUdpLogger(char *szHost, unsigned uPort, LogLevel level = INFO, bool bPrintTimeStamp = true) { + return new UdpLogger(szHost, uPort, level, bPrintTimeStamp); + } + + private: + LoggerFactory() {} + + class FileLogger : public Logger { + public: + FileLogger(std::string strFilePath, LogLevel level, bool bPrintTimeStamp) : Logger(level, bPrintTimeStamp) { + pFileOut = new std::ofstream(); + pFileOut->open(strFilePath.c_str()); + } + ~FileLogger() { pFileOut->close(); } + std::ostream &GetStream() { return *pFileOut; } + + private: + std::ofstream *pFileOut; + }; + + class ConsoleLogger : public Logger { + public: + ConsoleLogger(LogLevel level, bool bPrintTimeStamp) : Logger(level, bPrintTimeStamp) {} + std::ostream &GetStream() { return std::cout; } + }; + + class UdpLogger : public Logger { + private: + class UdpOstream : public std::ostream { + public: + UdpOstream(char *szHost, unsigned short uPort) : std::ostream(&sb), socket(INVALID_SOCKET) { +#ifdef _WIN32 + WSADATA w; + if (WSAStartup(0x0101, &w) != 0) { + fprintf(stderr, "WSAStartup() 
failed.\n"); + return; + } +#endif + socket = ::socket(AF_INET, SOCK_DGRAM, 0); + if (socket == INVALID_SOCKET) { +#ifdef _WIN32 + WSACleanup(); +#endif + fprintf(stderr, "socket() failed.\n"); + return; + } +#ifdef _WIN32 + unsigned int b1, b2, b3, b4; + sscanf(szHost, "%u.%u.%u.%u", &b1, &b2, &b3, &b4); + struct in_addr addr = {(unsigned char)b1, (unsigned char)b2, (unsigned char)b3, (unsigned char)b4}; +#else + struct in_addr addr = {inet_addr(szHost)}; +#endif + struct sockaddr_in s = {AF_INET, htons(uPort), addr}; + server = s; + } + ~UdpOstream() throw() { + if (socket == INVALID_SOCKET) { + return; + } +#ifdef _WIN32 + closesocket(socket); + WSACleanup(); +#else + close(socket); +#endif + } + void Flush() { + if (sendto(socket, sb.str().c_str(), (int)sb.str().length() + 1, 0, (struct sockaddr *)&server, + (int)sizeof(sockaddr_in)) == -1) { + fprintf(stderr, "sendto() failed.\n"); + } + sb.str(""); + } + + private: + std::stringbuf sb; + SOCKET socket; + struct sockaddr_in server; + }; + + public: + UdpLogger(char *szHost, unsigned uPort, LogLevel level, bool bPrintTimeStamp) + : Logger(level, bPrintTimeStamp), udpOut(szHost, (unsigned short)uPort) {} + UdpOstream &GetStream() { return udpOut; } + virtual void FlushStream() { udpOut.Flush(); } + + private: + UdpOstream udpOut; + }; +}; + +class LogTransaction { + public: + LogTransaction(Logger *pLogger, LogLevel level, const char *szFile, const int nLine, const char *szFunc) + : pLogger(pLogger), level(level) { + if (!pLogger) { + std::cout << "[-----] "; + return; + } + if (!pLogger->ShouldLogFor(level)) { + return; + } + pLogger->EnterCriticalSection(); + pLogger->GetStream() << pLogger->GetLead(level, szFile, nLine, szFunc); + } + ~LogTransaction() { + if (!pLogger) { + std::cout << std::endl; + return; + } + if (!pLogger->ShouldLogFor(level)) { + return; + } + pLogger->GetStream() << std::endl; + pLogger->FlushStream(); + pLogger->LeaveCriticalSection(); + if (level == FATAL) { + exit(1); + } + } + 
std::ostream &GetStream() { + if (!pLogger) { + return std::cout; + } + if (!pLogger->ShouldLogFor(level)) { + return ossNull; + } + return pLogger->GetStream(); + } + + private: + Logger *pLogger; + LogLevel level; + std::ostringstream ossNull; +}; + +} // namespace simplelogger + +extern simplelogger::Logger *logger; +#define LOG(level) simplelogger::LogTransaction(logger, level, __FILE__, __LINE__, __FUNCTION__).GetStream() diff --git a/third_party/Video_Codec_SDK/Samples/Utils/NvCodecUtils.h b/third_party/Video_Codec_SDK/Samples/Utils/NvCodecUtils.h new file mode 100644 index 000000000..065a7cd9b --- /dev/null +++ b/third_party/Video_Codec_SDK/Samples/Utils/NvCodecUtils.h @@ -0,0 +1,547 @@ +/* + * This copyright notice applies to this header file only: + * + * Copyright (c) 2010-2023 NVIDIA Corporation + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the software, and to permit persons to whom the + * software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +//--------------------------------------------------------------------------- +//! \file NvCodecUtils.h +//! \brief Miscellaneous classes and error checking functions. +//! +//! Used by Transcode/Encode samples apps for reading input files, mutithreading, performance measurement or colorspace +//! conversion while decoding. +//--------------------------------------------------------------------------- + +#pragma once +#include "Logger.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern simplelogger::Logger *logger; + +#ifdef __cuda_cuda_h__ +inline bool check(CUresult e, int iLine, const char *szFile) { + if (e != CUDA_SUCCESS) { + const char *szErrName = NULL; + cuGetErrorName(e, &szErrName); + LOG(FATAL) << "CUDA driver API error " << szErrName << " at line " << iLine << " in file " << szFile; + return false; + } + return true; +} +#endif + +#ifdef __CUDA_RUNTIME_H__ +inline bool check(cudaError_t e, int iLine, const char *szFile) { + if (e != cudaSuccess) { + LOG(FATAL) << "CUDA runtime API error " << cudaGetErrorName(e) << " at line " << iLine << " in file " << szFile; + return false; + } + return true; +} +#endif + +#ifdef _NV_ENCODEAPI_H_ +inline bool check(NVENCSTATUS e, int iLine, const char *szFile) { + const char *aszErrName[] = { + "NV_ENC_SUCCESS", + "NV_ENC_ERR_NO_ENCODE_DEVICE", + "NV_ENC_ERR_UNSUPPORTED_DEVICE", + "NV_ENC_ERR_INVALID_ENCODERDEVICE", + "NV_ENC_ERR_INVALID_DEVICE", + "NV_ENC_ERR_DEVICE_NOT_EXIST", + "NV_ENC_ERR_INVALID_PTR", + "NV_ENC_ERR_INVALID_EVENT", + "NV_ENC_ERR_INVALID_PARAM", + "NV_ENC_ERR_INVALID_CALL", + "NV_ENC_ERR_OUT_OF_MEMORY", + "NV_ENC_ERR_ENCODER_NOT_INITIALIZED", + "NV_ENC_ERR_UNSUPPORTED_PARAM", + "NV_ENC_ERR_LOCK_BUSY", + "NV_ENC_ERR_NOT_ENOUGH_BUFFER", + "NV_ENC_ERR_INVALID_VERSION", + "NV_ENC_ERR_MAP_FAILED", + "NV_ENC_ERR_NEED_MORE_INPUT", + "NV_ENC_ERR_ENCODER_BUSY", + "NV_ENC_ERR_EVENT_NOT_REGISTERED", + 
"NV_ENC_ERR_GENERIC", + "NV_ENC_ERR_INCOMPATIBLE_CLIENT_KEY", + "NV_ENC_ERR_UNIMPLEMENTED", + "NV_ENC_ERR_RESOURCE_REGISTER_FAILED", + "NV_ENC_ERR_RESOURCE_NOT_REGISTERED", + "NV_ENC_ERR_RESOURCE_NOT_MAPPED", + }; + if (e != NV_ENC_SUCCESS) { + LOG(FATAL) << "NVENC error " << aszErrName[e] << " at line " << iLine << " in file " << szFile; + return false; + } + return true; +} +#endif + +#ifdef _WINERROR_ +inline bool check(HRESULT e, int iLine, const char *szFile) { + if (e != S_OK) { + std::stringstream stream; + stream << std::hex << std::uppercase << e; + LOG(FATAL) << "HRESULT error 0x" << stream.str() << " at line " << iLine << " in file " << szFile; + return false; + } + return true; +} +#endif + +#if defined(__gl_h_) || defined(__GL_H__) +inline bool check(GLenum e, int iLine, const char *szFile) { + if (e != 0) { + LOG(ERROR) << "GLenum error " << e << " at line " << iLine << " in file " << szFile; + return false; + } + return true; +} +#endif + +inline bool check(int e, int iLine, const char *szFile) { + if (e < 0) { + LOG(ERROR) << "General error " << e << " at line " << iLine << " in file " << szFile; + return false; + } + return true; +} + +#define ck(call) check(call, __LINE__, __FILE__) +#define MAKE_FOURCC(ch0, ch1, ch2, ch3) \ + ((uint32_t)(uint8_t)(ch0) | ((uint32_t)(uint8_t)(ch1) << 8) | ((uint32_t)(uint8_t)(ch2) << 16) | \ + ((uint32_t)(uint8_t)(ch3) << 24)) + +/** + * @brief Wrapper class around std::thread + */ +class NvThread { + public: + NvThread() = default; + NvThread(const NvThread &) = delete; + NvThread &operator=(const NvThread &other) = delete; + + NvThread(std::thread &&thread) : t(std::move(thread)) {} + + NvThread(NvThread &&thread) : t(std::move(thread.t)) {} + + NvThread &operator=(NvThread &&other) { + t = std::move(other.t); + return *this; + } + + ~NvThread() { join(); } + + void join() { + if (t.joinable()) { + t.join(); + } + } + + private: + std::thread t; +}; + +#ifndef _WIN32 +#define _stricmp strcasecmp +#define _stat64 
stat64 +#endif + +/** + * @brief Utility class to allocate buffer memory. Helps avoid I/O during the encode/decode loop in case of performance + * tests. + */ +class BufferedFileReader { + public: + /** + * @brief Constructor function to allocate appropriate memory and copy file contents into it + */ + BufferedFileReader(const char *szFileName, bool bPartial = false) { + struct _stat64 st; + + if (_stat64(szFileName, &st) != 0) { + return; + } + + nSize = st.st_size; + while (nSize) { + try { + pBuf = new uint8_t[(size_t)nSize]; + if (nSize != st.st_size) { + LOG(WARNING) << "File is too large - only " << std::setprecision(4) << 100.0 * nSize / st.st_size + << "% is loaded"; + } + break; + } catch (std::bad_alloc) { + if (!bPartial) { + LOG(ERROR) << "Failed to allocate memory in BufferedReader"; + return; + } + nSize = (uint32_t)(nSize * 0.9); + } + } + + std::ifstream fpIn(szFileName, std::ifstream::in | std::ifstream::binary); + if (!fpIn) { + LOG(ERROR) << "Unable to open input file: " << szFileName; + return; + } + + std::streamsize nRead = fpIn.read(reinterpret_cast(pBuf), nSize).gcount(); + fpIn.close(); + + assert(nRead == nSize); + } + ~BufferedFileReader() { + if (pBuf) { + delete[] pBuf; + } + } + bool GetBuffer(uint8_t **ppBuf, uint64_t *pnSize) { + if (!pBuf) { + return false; + } + + *ppBuf = pBuf; + *pnSize = nSize; + return true; + } + + private: + uint8_t *pBuf = NULL; + uint64_t nSize = 0; +}; + +/** + * @brief Template class to facilitate color space conversion + */ +template class YuvConverter { + public: + YuvConverter(int nWidth, int nHeight) : nWidth(nWidth), nHeight(nHeight) { + pQuad = new T[((nWidth + 1) / 2) * ((nHeight + 1) / 2)]; + } + ~YuvConverter() { delete[] pQuad; } + void PlanarToUVInterleaved(T *pFrame, int nPitch = 0) { + if (nPitch == 0) { + nPitch = nWidth; + } + + // sizes of source surface plane + int nSizePlaneY = nPitch * nHeight; + int nSizePlaneU = ((nPitch + 1) / 2) * ((nHeight + 1) / 2); + int nSizePlaneV = 
nSizePlaneU; + + T *puv = pFrame + nSizePlaneY; + if (nPitch == nWidth) { + memcpy(pQuad, puv, nSizePlaneU * sizeof(T)); + } else { + for (int i = 0; i < (nHeight + 1) / 2; i++) { + memcpy(pQuad + ((nWidth + 1) / 2) * i, puv + ((nPitch + 1) / 2) * i, ((nWidth + 1) / 2) * sizeof(T)); + } + } + T *pv = puv + nSizePlaneU; + for (int y = 0; y < (nHeight + 1) / 2; y++) { + for (int x = 0; x < (nWidth + 1) / 2; x++) { + puv[y * nPitch + x * 2] = pQuad[y * ((nWidth + 1) / 2) + x]; + puv[y * nPitch + x * 2 + 1] = pv[y * ((nPitch + 1) / 2) + x]; + } + } + } + void UVInterleavedToPlanar(T *pFrame, int nPitch = 0) { + if (nPitch == 0) { + nPitch = nWidth; + } + + // sizes of source surface plane + int nSizePlaneY = nPitch * nHeight; + int nSizePlaneU = ((nPitch + 1) / 2) * ((nHeight + 1) / 2); + int nSizePlaneV = nSizePlaneU; + + T *puv = pFrame + nSizePlaneY, *pu = puv, *pv = puv + nSizePlaneU; + + // split chroma from interleave to planar + for (int y = 0; y < (nHeight + 1) / 2; y++) { + for (int x = 0; x < (nWidth + 1) / 2; x++) { + pu[y * ((nPitch + 1) / 2) + x] = puv[y * nPitch + x * 2]; + pQuad[y * ((nWidth + 1) / 2) + x] = puv[y * nPitch + x * 2 + 1]; + } + } + if (nPitch == nWidth) { + memcpy(pv, pQuad, nSizePlaneV * sizeof(T)); + } else { + for (int i = 0; i < (nHeight + 1) / 2; i++) { + memcpy(pv + ((nPitch + 1) / 2) * i, pQuad + ((nWidth + 1) / 2) * i, ((nWidth + 1) / 2) * sizeof(T)); + } + } + } + + private: + T *pQuad; + int nWidth, nHeight; +}; + +/** + * @brief Class for writing IVF format header for AV1 codec + */ +class IVFUtils { + public: + void WriteFileHeader(std::vector &vPacket, uint32_t nFourCC, uint32_t nWidth, uint32_t nHeight, + uint32_t nFrameRateNum, uint32_t nFrameRateDen, uint32_t nFrameCnt) { + char header[32]; + + header[0] = 'D'; + header[1] = 'K'; + header[2] = 'I'; + header[3] = 'F'; + mem_put_le16(header + 4, 0); // version + mem_put_le16(header + 6, 32); // header size + mem_put_le32(header + 8, nFourCC); // fourcc + mem_put_le16(header + 
12, nWidth); // width + mem_put_le16(header + 14, nHeight); // height + mem_put_le32(header + 16, nFrameRateNum); // rate + mem_put_le32(header + 20, nFrameRateDen); // scale + mem_put_le32(header + 24, nFrameCnt); // length + mem_put_le32(header + 28, 0); // unused + + vPacket.insert(vPacket.end(), &header[0], &header[32]); + } + + void WriteFrameHeader(std::vector &vPacket, size_t nFrameSize, int64_t pts) { + char header[12]; + mem_put_le32(header, (int)nFrameSize); + mem_put_le32(header + 4, (int)(pts & 0xFFFFFFFF)); + mem_put_le32(header + 8, (int)(pts >> 32)); + + vPacket.insert(vPacket.end(), &header[0], &header[12]); + } + + private: + static inline void mem_put_le32(void *vmem, int val) { + unsigned char *mem = (unsigned char *)vmem; + mem[0] = (unsigned char)((val >> 0) & 0xff); + mem[1] = (unsigned char)((val >> 8) & 0xff); + mem[2] = (unsigned char)((val >> 16) & 0xff); + mem[3] = (unsigned char)((val >> 24) & 0xff); + } + + static inline void mem_put_le16(void *vmem, int val) { + unsigned char *mem = (unsigned char *)vmem; + mem[0] = (unsigned char)((val >> 0) & 0xff); + mem[1] = (unsigned char)((val >> 8) & 0xff); + } +}; + +/** + * @brief Utility class to measure elapsed time in seconds between the block of executed code + */ +class StopWatch { + public: + void Start() { t0 = std::chrono::high_resolution_clock::now(); } + double Stop() { + return std::chrono::duration_cast( + std::chrono::high_resolution_clock::now().time_since_epoch() - t0.time_since_epoch()) + .count() / + 1.0e9; + } + + private: + std::chrono::high_resolution_clock::time_point t0; +}; + +template class ConcurrentQueue { + public: + ConcurrentQueue() {} + ConcurrentQueue(size_t size) : maxSize(size) {} + ConcurrentQueue(const ConcurrentQueue &) = delete; + ConcurrentQueue &operator=(const ConcurrentQueue &) = delete; + + void setSize(size_t s) { maxSize = s; } + + void push_back(const T &value) { + // Do not use a std::lock_guard here. 
We will need to explicitly + // unlock before notify_one as the other waiting thread will + // automatically try to acquire mutex once it wakes up + // (which will happen on notify_one) + std::unique_lock lock(m_mutex); + auto wasEmpty = m_List.empty(); + + while (full()) { + m_cond.wait(lock); + } + + m_List.push_back(value); + if (wasEmpty && !m_List.empty()) { + lock.unlock(); + m_cond.notify_one(); + } + } + + T pop_front() { + std::unique_lock lock(m_mutex); + + while (m_List.empty()) { + m_cond.wait(lock); + } + auto wasFull = full(); + T data = std::move(m_List.front()); + m_List.pop_front(); + + if (wasFull && !full()) { + lock.unlock(); + m_cond.notify_one(); + } + + return data; + } + + T front() { + std::unique_lock lock(m_mutex); + + while (m_List.empty()) { + m_cond.wait(lock); + } + + return m_List.front(); + } + + size_t size() { + std::unique_lock lock(m_mutex); + return m_List.size(); + } + + bool empty() { + std::unique_lock lock(m_mutex); + return m_List.empty(); + } + void clear() { + std::unique_lock lock(m_mutex); + m_List.clear(); + } + + private: + bool full() { + if (maxSize > 0 && m_List.size() == maxSize) + return true; + return false; + } + + private: + std::list m_List; + std::mutex m_mutex; + std::condition_variable m_cond; + size_t maxSize; +}; + +inline void CheckInputFile(const char *szInFilePath) { + std::ifstream fpIn(szInFilePath, std::ios::in | std::ios::binary); + if (fpIn.fail()) { + std::ostringstream err; + err << "Unable to open input file: " << szInFilePath << std::endl; + throw std::invalid_argument(err.str()); + } +} + +inline void ValidateResolution(int nWidth, int nHeight) { + + if (nWidth <= 0 || nHeight <= 0) { + std::ostringstream err; + err << "Please specify positive non zero resolution as -s WxH. 
Current resolution is " << nWidth << "x" + << nHeight << std::endl; + throw std::invalid_argument(err.str()); + } +} + +template +void Nv12ToColor32(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, + int iMatrix = 0); +template +void Nv12ToColor64(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, + int iMatrix = 0); + +template +void P016ToColor32(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, + int iMatrix = 4); +template +void P016ToColor64(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, + int iMatrix = 4); + +template +void YUV444ToColor32(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, + int iMatrix = 0); +template +void YUV444ToColor64(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, + int iMatrix = 0); + +template +void YUV444P16ToColor32(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, + int iMatrix = 4); +template +void YUV444P16ToColor64(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, + int iMatrix = 4); + +template +void Nv12ToColorPlanar(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight, + int iMatrix = 0); +template +void P016ToColorPlanar(uint8_t *dpP016, int nP016Pitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight, + int iMatrix = 4); + +template +void YUV444ToColorPlanar(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight, + int iMatrix = 0); +template +void YUV444P16ToColorPlanar(uint8_t *dpYUV444, int nPitch, uint8_t *dpBgrp, int nBgrpPitch, int nWidth, int nHeight, + int iMatrix = 4); + +void Bgra64ToP016(uint8_t *dpBgra, int nBgraPitch, uint8_t *dpP016, int nP016Pitch, int nWidth, int nHeight, + int iMatrix = 4); + +void 
ConvertUInt8ToUInt16(uint8_t *dpUInt8, uint16_t *dpUInt16, int nSrcPitch, int nDestPitch, int nWidth, int nHeight); +void ConvertUInt16ToUInt8(uint16_t *dpUInt16, uint8_t *dpUInt8, int nSrcPitch, int nDestPitch, int nWidth, int nHeight); + +void ResizeNv12(unsigned char *dpDstNv12, int nDstPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrcNv12, + int nSrcPitch, int nSrcWidth, int nSrcHeight, unsigned char *dpDstNv12UV = nullptr); +void ResizeP016(unsigned char *dpDstP016, int nDstPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrcP016, + int nSrcPitch, int nSrcWidth, int nSrcHeight, unsigned char *dpDstP016UV = nullptr); + +void ScaleYUV420(unsigned char *dpDstY, unsigned char *dpDstU, unsigned char *dpDstV, int nDstPitch, + int nDstChromaPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrcY, unsigned char *dpSrcU, + unsigned char *dpSrcV, int nSrcPitch, int nSrcChromaPitch, int nSrcWidth, int nSrcHeight, + bool bSemiplanar); + +#ifdef __cuda_cuda_h__ +void ComputeCRC(uint8_t *pBuffer, uint32_t *crcValue, CUstream_st *outputCUStream); +#endif From d246bab430adeb461072918a551b2e2b68c9bce5 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Mon, 23 Oct 2023 11:21:17 +0800 Subject: [PATCH 29/33] Dockerfile - update mlc version into 3.10 for cuda and rocm dockerfiles (#562) **Description** Update mlc version into 3.10 for cuda and rocm dockerfiles to be consistent with cuda12 dockerfile Co-authored-by: yukirora --- dockerfile/cuda11.1.1.dockerfile | 4 ++-- dockerfile/rocm5.0.x.dockerfile | 4 ++-- dockerfile/rocm5.1.x.dockerfile | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile index d7feb2baa..6b3a2acb2 100644 --- a/dockerfile/cuda11.1.1.dockerfile +++ b/dockerfile/cuda11.1.1.dockerfile @@ -13,7 +13,7 @@ FROM nvcr.io/nvidia/pytorch:20.12-py3 # - HPC-X: v2.8.3 # - NCCL RDMA SHARP plugins: 7cccbc1 # Intel: -# - mlc: v3.9a +# - mlc: v3.10 LABEL 
maintainer="SuperBench" @@ -111,7 +111,7 @@ RUN cd /tmp && \ # Install Intel MLC RUN cd /tmp && \ - wget -q https://downloadmirror.intel.com/736634/mlc_v3.9a.tgz -O mlc.tgz && \ + wget -q https://downloadmirror.intel.com/763324/mlc_v3.10.tgz -O mlc.tgz && \ tar xzf mlc.tgz Linux/mlc && \ cp ./Linux/mlc /usr/local/bin/ && \ rm -rf ./Linux mlc.tgz diff --git a/dockerfile/rocm5.0.x.dockerfile b/dockerfile/rocm5.0.x.dockerfile index 6830263ce..02b33c3f9 100644 --- a/dockerfile/rocm5.0.x.dockerfile +++ b/dockerfile/rocm5.0.x.dockerfile @@ -17,7 +17,7 @@ FROM ${BASE_IMAGE} # Mellanox: # - OFED: 5.2-2.2.3.0 # Intel: -# - mlc: v3.9a +# - mlc: v3.10 LABEL maintainer="SuperBench" @@ -97,7 +97,7 @@ RUN cd /tmp && \ # Install Intel MLC RUN cd /tmp && \ - wget -q https://downloadmirror.intel.com/736634/mlc_v3.9a.tgz -O mlc.tgz && \ + wget -q https://downloadmirror.intel.com/763324/mlc_v3.10.tgz -O mlc.tgz && \ tar xzf mlc.tgz Linux/mlc && \ cp ./Linux/mlc /usr/local/bin/ && \ rm -rf ./Linux mlc.tgz diff --git a/dockerfile/rocm5.1.x.dockerfile b/dockerfile/rocm5.1.x.dockerfile index 5e4b118e0..292293a3e 100644 --- a/dockerfile/rocm5.1.x.dockerfile +++ b/dockerfile/rocm5.1.x.dockerfile @@ -16,7 +16,7 @@ FROM ${BASE_IMAGE} # Mellanox: # - OFED: 5.2-2.2.3.0 # Intel: -# - mlc: v3.9a +# - mlc: v3.10 LABEL maintainer="SuperBench" @@ -109,7 +109,7 @@ RUN cd /tmp && \ # Install Intel MLC RUN cd /tmp && \ - wget -q https://downloadmirror.intel.com/736634/mlc_v3.9a.tgz -O mlc.tgz && \ + wget -q https://downloadmirror.intel.com/763324/mlc_v3.10.tgz -O mlc.tgz && \ tar xzf mlc.tgz Linux/mlc && \ cp ./Linux/mlc /usr/local/bin/ && \ rm -rf ./Linux mlc.tgz From 07477c3baea7c8acf4f65c93c7d7d1069f4f7081 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 5 Nov 2023 11:35:49 +0000 Subject: [PATCH 30/33] Bump postcss from 8.3.5 to 8.4.31 in /website (#564) Bumps [postcss](https://github.com/postcss/postcss) from 8.3.5 to 8.4.31. 
- [Release notes](https://github.com/postcss/postcss/releases) - [Changelog](https://github.com/postcss/postcss/blob/main/CHANGELOG.md) - [Commits](postcss/postcss@8.3.5...8.4.31) --- updated-dependencies: - dependency-name: postcss dependency-type: indirect ... Signed-off-by: dependabot[bot] --- website/package-lock.json | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/website/package-lock.json b/website/package-lock.json index a2e3b219d..b7eee3fe0 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -7291,11 +7291,6 @@ "integrity": "sha512-2ZTgtl0nJsO0KQCjEpxcIr5D+Yv90plTitZt9JBfQvVJDS5seMl3FOvsh3+9CoYWXf/1l5OaZzzF6nDm4cagaQ==", "optional": true }, - "nanoid": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.2.0.tgz", - "integrity": "sha512-fmsZYa9lpn69Ad5eDn7FMcnnSR+8R34W9qJEijxYhTbfOWzr22n1QxCMzXLK+ODyW2973V3Fux959iQoUxzUIA==" - }, "nanomatch": { "version": "1.2.13", "resolved": "https://registry.npmjs.org/nanomatch/-/nanomatch-1.2.13.tgz", @@ -7765,6 +7760,11 @@ "resolved": "https://registry.npmjs.org/path-type/-/path-type-4.0.0.tgz", "integrity": "sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==" }, + "picocolors": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.0.0.tgz", + "integrity": "sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==" + }, "picomatch": { "version": "2.3.0", "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.0.tgz", @@ -7870,13 +7870,25 @@ "integrity": "sha1-AerA/jta9xoqbAL+q7jB/vfgDqs=" }, "postcss": { - "version": "8.3.5", - "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.3.5.tgz", - "integrity": "sha512-NxTuJocUhYGsMiMFHDUkmjSKT3EdH4/WbGF6GCi1NDGk+vbcUTun4fpbOqaPtD8IIsztA2ilZm2DhYCuyN58gA==", + "version": "8.4.31", + "resolved": 
"https://registry.npmjs.org/postcss/-/postcss-8.4.31.tgz", + "integrity": "sha512-PS08Iboia9mts/2ygV3eLpY5ghnUcfLV/EXTOW1E2qYxJKGGBUtNjN76FYHnMs36RmARn41bC0AZmn+rR0OVpQ==", "requires": { - "colorette": "^1.2.2", - "nanoid": "^3.1.23", - "source-map-js": "^0.6.2" + "nanoid": "^3.3.6", + "picocolors": "^1.0.0", + "source-map-js": "^1.0.2" + }, + "dependencies": { + "nanoid": { + "version": "3.3.6", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.6.tgz", + "integrity": "sha512-BGcqMMJuToF7i1rt+2PWSNVnWIkGCU78jBG3RxO/bZlnZPK2Cmi2QaffxGO/2RvWi9sL+FAiRiXMgsyxQ1DIDA==" + }, + "source-map-js": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.0.2.tgz", + "integrity": "sha512-R0XvVJ9WusLiqTCEiGCmICCMplcCkIwwR11mOSD9CR5u+IXYdiseeEuXCVAjS54zqwkLcPNnmU4OeJ6tUrWhDw==" + } } }, "postcss-calc": { @@ -9677,11 +9689,6 @@ "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.5.7.tgz", "integrity": "sha1-igOdLRAh0i0eoUyA2OpGi6LvP8w=" }, - "source-map-js": { - "version": "0.6.2", - "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-0.6.2.tgz", - "integrity": "sha512-/3GptzWzu0+0MBQFrDKzw/DvvMTUORvgY6k6jd/VS6iCR4RDTKWH6v6WPwQoUO8667uQEf9Oe38DxAYWY5F/Ug==" - }, "source-map-resolve": { "version": "0.5.3", "resolved": "https://registry.npmjs.org/source-map-resolve/-/source-map-resolve-0.5.3.tgz", From ce3737f98b27543c5d7ceb88259685a5736bc896 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 7 Nov 2023 10:36:42 +0800 Subject: [PATCH 31/33] Bump @babel/traverse from 7.14.5 to 7.23.2 in /website (#566) Bumps [@babel/traverse](https://github.com/babel/babel/tree/HEAD/packages/babel-traverse) from 7.14.5 to 7.23.2. 
- [Release notes](https://github.com/babel/babel/releases) - [Changelog](https://github.com/babel/babel/blob/main/CHANGELOG.md) - [Commits](https://github.com/babel/babel/commits/v7.23.2/packages/babel-traverse) --- updated-dependencies: - dependency-name: "@babel/traverse" dependency-type: indirect ... Signed-off-by: dependabot[bot] --- website/package-lock.json | 139 +++++++++++++++++++++++++++++++++++--- 1 file changed, 128 insertions(+), 11 deletions(-) diff --git a/website/package-lock.json b/website/package-lock.json index b7eee3fe0..7bf8c5310 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -271,6 +271,11 @@ } } }, + "@babel/helper-environment-visitor": { + "version": "7.22.20", + "resolved": "https://registry.npmjs.org/@babel/helper-environment-visitor/-/helper-environment-visitor-7.22.20.tgz", + "integrity": "sha512-zfedSIzFhat/gFhWfHtgWvlec0nqB9YEIVrpuwjruLlXfUSnA8cJB0miHKwqDnQ7d32aKo2xt88/xZptwxbfhA==" + }, "@babel/helper-explode-assignable-expression": { "version": "7.14.5", "resolved": "https://registry.npmjs.org/@babel/helper-explode-assignable-expression/-/helper-explode-assignable-expression-7.14.5.tgz", @@ -394,6 +399,11 @@ "@babel/types": "^7.14.5" } }, + "@babel/helper-string-parser": { + "version": "7.22.5", + "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.22.5.tgz", + "integrity": "sha512-mM4COjgZox8U+JcXQwPijIZLElkgEpO5rsERVDJTc2qfCDfERyob6k5WegS14SX18IIjv+XD+GrqNumY5JRCDw==" + }, "@babel/helper-validator-identifier": { "version": "7.14.5", "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.14.5.tgz", @@ -1268,19 +1278,126 @@ } }, "@babel/traverse": { - "version": "7.14.5", - "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.14.5.tgz", - "integrity": "sha512-G3BiS15vevepdmFqmUc9X+64y0viZYygubAMO8SvBmKARuF6CPSZtH4Ng9vi/lrWlZFGe3FWdXNy835akH8Glg==", - "requires": { - "@babel/code-frame": "^7.14.5", - 
"@babel/generator": "^7.14.5", - "@babel/helper-function-name": "^7.14.5", - "@babel/helper-hoist-variables": "^7.14.5", - "@babel/helper-split-export-declaration": "^7.14.5", - "@babel/parser": "^7.14.5", - "@babel/types": "^7.14.5", + "version": "7.23.2", + "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.23.2.tgz", + "integrity": "sha512-azpe59SQ48qG6nu2CzcMLbxUudtN+dOM9kDbUqGq3HXUJRlo7i8fvPoxQUzYgLZ4cMVmuZgm8vvBpNeRhd6XSw==", + "requires": { + "@babel/code-frame": "^7.22.13", + "@babel/generator": "^7.23.0", + "@babel/helper-environment-visitor": "^7.22.20", + "@babel/helper-function-name": "^7.23.0", + "@babel/helper-hoist-variables": "^7.22.5", + "@babel/helper-split-export-declaration": "^7.22.6", + "@babel/parser": "^7.23.0", + "@babel/types": "^7.23.0", "debug": "^4.1.0", "globals": "^11.1.0" + }, + "dependencies": { + "@babel/code-frame": { + "version": "7.22.13", + "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.22.13.tgz", + "integrity": "sha512-XktuhWlJ5g+3TJXc5upd9Ks1HutSArik6jf2eAjYFyIOf4ej3RN+184cZbzDvbPnuTJIUhPKKJE3cIsYTiAT3w==", + "requires": { + "@babel/highlight": "^7.22.13", + "chalk": "^2.4.2" + } + }, + "@babel/generator": { + "version": "7.23.0", + "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.23.0.tgz", + "integrity": "sha512-lN85QRR+5IbYrMWM6Y4pE/noaQtg4pNiqeNGX60eqOfo6gtEj6uw/JagelB8vVztSd7R6M5n1+PQkDbHbBRU4g==", + "requires": { + "@babel/types": "^7.23.0", + "@jridgewell/gen-mapping": "^0.3.2", + "@jridgewell/trace-mapping": "^0.3.17", + "jsesc": "^2.5.1" + } + }, + "@babel/helper-function-name": { + "version": "7.23.0", + "resolved": "https://registry.npmjs.org/@babel/helper-function-name/-/helper-function-name-7.23.0.tgz", + "integrity": "sha512-OErEqsrxjZTJciZ4Oo+eoZqeW9UIiOcuYKRJA4ZAgV9myA+pOXhhmpfNCKjEH/auVfEYVFJ6y1Tc4r0eIApqiw==", + "requires": { + "@babel/template": "^7.22.15", + "@babel/types": "^7.23.0" + } + }, + "@babel/helper-hoist-variables": { + 
"version": "7.22.5", + "resolved": "https://registry.npmjs.org/@babel/helper-hoist-variables/-/helper-hoist-variables-7.22.5.tgz", + "integrity": "sha512-wGjk9QZVzvknA6yKIUURb8zY3grXCcOZt+/7Wcy8O2uctxhplmUPkOdlgoNhmdVee2c92JXbf1xpMtVNbfoxRw==", + "requires": { + "@babel/types": "^7.22.5" + } + }, + "@babel/helper-split-export-declaration": { + "version": "7.22.6", + "resolved": "https://registry.npmjs.org/@babel/helper-split-export-declaration/-/helper-split-export-declaration-7.22.6.tgz", + "integrity": "sha512-AsUnxuLhRYsisFiaJwvp1QF+I3KjD5FOxut14q/GzovUe6orHLesW2C7d754kRm53h5gqrz6sFl6sxc4BVtE/g==", + "requires": { + "@babel/types": "^7.22.5" + } + }, + "@babel/helper-validator-identifier": { + "version": "7.22.20", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.22.20.tgz", + "integrity": "sha512-Y4OZ+ytlatR8AI+8KZfKuL5urKp7qey08ha31L8b3BwewJAoJamTzyvxPR/5D+KkdJCGPq/+8TukHBlY10FX9A==" + }, + "@babel/highlight": { + "version": "7.22.20", + "resolved": "https://registry.npmjs.org/@babel/highlight/-/highlight-7.22.20.tgz", + "integrity": "sha512-dkdMCN3py0+ksCgYmGG8jKeGA/8Tk+gJwSYYlFGxG5lmhfKNoAy004YpLxpS1W2J8m/EK2Ew+yOs9pVRwO89mg==", + "requires": { + "@babel/helper-validator-identifier": "^7.22.20", + "chalk": "^2.4.2", + "js-tokens": "^4.0.0" + } + }, + "@babel/parser": { + "version": "7.23.0", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.23.0.tgz", + "integrity": "sha512-vvPKKdMemU85V9WE/l5wZEmImpCtLqbnTvqDS2U1fJ96KrxoW7KrXhNsNCblQlg8Ck4b85yxdTyelsMUgFUXiw==" + }, + "@babel/template": { + "version": "7.22.15", + "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.22.15.tgz", + "integrity": "sha512-QPErUVm4uyJa60rkI73qneDacvdvzxshT3kksGqlGWYdOTIUOwJ7RDUL8sGqslY1uXWSL6xMFKEXDS3ox2uF0w==", + "requires": { + "@babel/code-frame": "^7.22.13", + "@babel/parser": "^7.22.15", + "@babel/types": "^7.22.15" + } + }, + "@babel/types": { + "version": "7.23.0", + "resolved": 
"https://registry.npmjs.org/@babel/types/-/types-7.23.0.tgz", + "integrity": "sha512-0oIyUfKoI3mSqMvsxBdclDwxXKXAUA8v/apZbc+iSyARYou1o8ZGDxbUYyLFoW2arqS2jDGqJuZvv1d/io1axg==", + "requires": { + "@babel/helper-string-parser": "^7.22.5", + "@babel/helper-validator-identifier": "^7.22.20", + "to-fast-properties": "^2.0.0" + } + }, + "@jridgewell/trace-mapping": { + "version": "0.3.20", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.20.tgz", + "integrity": "sha512-R8LcPeWZol2zR8mmH3JeKQ6QRCFb7XgUhV9ZlGhHLGyg4wpPiPZNQOOWhFZhxKw8u//yTbNGI42Bx/3paXEQ+Q==", + "requires": { + "@jridgewell/resolve-uri": "^3.1.0", + "@jridgewell/sourcemap-codec": "^1.4.14" + } + }, + "chalk": { + "version": "2.4.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.2.tgz", + "integrity": "sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==", + "requires": { + "ansi-styles": "^3.2.1", + "escape-string-regexp": "^1.0.5", + "supports-color": "^5.3.0" + } + } } }, "@babel/types": { From c7800bb8e038baa103b8f6a14572238061b410f7 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Tue, 14 Nov 2023 11:52:56 +0800 Subject: [PATCH 32/33] Bug Fix - remove cp ptx file command in gpu burn test (#567) **Description** remove cp ptx file in gpu burn test since the command is run inside self.args.bin_dir dir. 
https://github.com/microsoft/superbenchmark/blob/d246bab430adeb461072918a551b2e2b68c9bce5/superbench/benchmarks/micro_benchmarks/micro_base.py#L183 --- superbench/benchmarks/micro_benchmarks/gpu_burn_test.py | 6 +----- tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py | 4 ---- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py b/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py index c5ef05eae..fba4ad2b3 100644 --- a/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py +++ b/superbench/benchmarks/micro_benchmarks/gpu_burn_test.py @@ -66,12 +66,8 @@ def _preprocess(self): if self._args.tensor_core: command += ' -tc' command += ' {} '.format(self._args.time) - # copy compare.ptx which needs to be in the working directory - compare_copy = 'cp ' + self._args.bin_dir + '/compare.ptx ./' - # remove compare.ptx from working directory - compare_rm = 'rm ' + 'compare.ptx' - self._commands.append(compare_copy + ' && ' + command + ' && ' + compare_rm) + self._commands.append(command) return True diff --git a/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py b/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py index eff5af202..3ec352c4d 100644 --- a/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py +++ b/tests/benchmarks/micro_benchmarks/test_gpu_burn_test.py @@ -46,14 +46,10 @@ def test_gpu_burn(self, results): assert (benchmark._args.tensor_core) # Check command - compare_copy = 'cp ' + benchmark._args.bin_dir + '/compare.ptx ./' - compare_rm = 'rm ' + 'compare.ptx' assert (1 == len(benchmark._commands)) - assert (benchmark._commands[0].startswith(compare_copy)) assert ('-d' in benchmark._commands[0]) assert ('-tc' in benchmark._commands[0]) assert (str(time) in benchmark._commands[0]) - assert (compare_rm in benchmark._commands[0]) # Check results assert (benchmark._process_raw_result(0, results)) From f53d941a22fc0746e98ef3560a6799422be8fa47 Mon Sep 17 00:00:00 2001 From: 
Yuting Jiang Date: Mon, 20 Nov 2023 11:21:20 +0800 Subject: [PATCH 33/33] Benchmarks: micro benchmarks - add int8 support for cublaslt function (#574) **Description** add int8 support for cublaslt function. --- superbench/benchmarks/micro_benchmarks/cublaslt_function.py | 2 +- .../micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu | 5 +++++ .../micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc | 2 ++ tests/benchmarks/micro_benchmarks/test_cublaslt_function.py | 6 +++--- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/superbench/benchmarks/micro_benchmarks/cublaslt_function.py b/superbench/benchmarks/micro_benchmarks/cublaslt_function.py index 59733ea10..9bf3d99f3 100644 --- a/superbench/benchmarks/micro_benchmarks/cublaslt_function.py +++ b/superbench/benchmarks/micro_benchmarks/cublaslt_function.py @@ -23,7 +23,7 @@ def __init__(self, name, parameters=''): super().__init__(name, parameters) self._bin_name = 'cublaslt_gemm' - self._in_types = ['fp64', 'fp32', 'fp16', 'bf16', 'fp8e4m3', 'fp8e5m2'] + self._in_types = ['fp64', 'fp32', 'fp16', 'bf16', 'fp8e4m3', 'fp8e5m2', 'int8'] def mrange(self, start, stop=-1, multiplication_factor=2): """Range constructor with multiplication factor. 
diff --git a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu index 788b1989d..002b06447 100644 --- a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu +++ b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu @@ -16,6 +16,7 @@ using fp16 = half; using bf16 = nv_bfloat16; using fp8e4m3 = __nv_fp8_e4m3; using fp8e5m2 = __nv_fp8_e5m2; +using int8 = int8_t; struct Args { int m = 16; @@ -84,6 +85,8 @@ template cudaDataType_t get_datatype() { return CUDA_R_8F_E4M3; if (std::is_same::value) return CUDA_R_8F_E5M2; + if (std::is_same::value) + return CUDA_R_8I; throw std::invalid_argument("Unknown type"); } @@ -162,6 +165,8 @@ int main(int argc, char **argv) { run(&args); else if (args.in_type == "fp8e5m2") run(&args); + else if (args.in_type == "int8") + run(&args); else throw std::invalid_argument("Unknown type " + args.in_type); diff --git a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc index 4842c22d1..6ec5a101e 100644 --- a/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc +++ b/superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc @@ -62,6 +62,8 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l gemm_compute_type = CUBLAS_COMPUTE_32F; if (a_type == CUDA_R_64F || b_type == CUDA_R_64F) gemm_compute_type = CUBLAS_COMPUTE_64F; + if (a_type == CUDA_R_8I) + gemm_compute_type = CUBLAS_COMPUTE_32I; cublasLtMatmulDesc_t op_desc = nullptr; CUBLAS_CHECK(cublasLtMatmulDescCreate(&op_desc, gemm_compute_type, CUDA_R_32F)); diff --git a/tests/benchmarks/micro_benchmarks/test_cublaslt_function.py b/tests/benchmarks/micro_benchmarks/test_cublaslt_function.py index b504062a2..a6fae8f0e 100644 --- a/tests/benchmarks/micro_benchmarks/test_cublaslt_function.py +++ 
b/tests/benchmarks/micro_benchmarks/test_cublaslt_function.py @@ -63,15 +63,15 @@ def test_cublaslt_gemm_command_generation(self): (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA) benchmark = benchmark_cls( self.benchmark_name, - parameters='--batch 2:16:2 --shapes 2:4,4:8,8:32 32:128:4,128,128 --in_types fp16 fp32 fp64', + parameters='--batch 2:16:2 --shapes 2:4,4:8,8:32 32:128:4,128,128 --in_types fp16 fp32 fp64 int8', ) self.assertTrue(benchmark._preprocess()) - self.assertEqual(4 * (2 * 2 * 3 + 2) * 3, len(benchmark._commands)) + self.assertEqual(4 * (2 * 2 * 3 + 2) * len(benchmark._args.in_types), len(benchmark._commands)) def cmd(t, b, m, n, k): return f'{benchmark._CublasLtBenchmark__bin_path} -m {m} -n {n} -k {k} -b {b} -w 20 -i 50 -t {t}' - for _t in ['fp16', 'fp32', 'fp64']: + for _t in ['fp16', 'fp32', 'fp64', 'int8']: for _b in [2, 4, 8, 16]: for _m in [2, 4]: for _n in [4, 8]: