diff --git a/.azure-pipelines/ansible-integration-test.yml b/.azure-pipelines/ansible-integration-test.yml
index f5b34dd60..9c8550966 100644
--- a/.azure-pipelines/ansible-integration-test.yml
+++ b/.azure-pipelines/ansible-integration-test.yml
@@ -7,6 +7,7 @@ trigger:
 
 pool:
   name: SuperBench CI
+  demands: ansible-agent
   vmImage: ubuntu-latest
 
 container:
diff --git a/.azure-pipelines/cpu-unit-test.yml b/.azure-pipelines/cpu-unit-test.yml
index 7fc698f4f..1de67824f 100644
--- a/.azure-pipelines/cpu-unit-test.yml
+++ b/.azure-pipelines/cpu-unit-test.yml
@@ -7,12 +7,12 @@ trigger:
 
 strategy:
   matrix:
-    python-3.6:
-      imageTag: '3.6'
     python-3.7:
       imageTag: '3.7'
     python-3.8:
       imageTag: '3.8'
+    python-3.10:
+      imageTag: '3.10'
     # TODO
     #python-latest:
     #  imageTag: '3'
diff --git a/.azure-pipelines/cuda-unit-test.yml b/.azure-pipelines/cuda-unit-test.yml
index e0a69fc0d..36f03d242 100644
--- a/.azure-pipelines/cuda-unit-test.yml
+++ b/.azure-pipelines/cuda-unit-test.yml
@@ -7,22 +7,26 @@ trigger:
 
 pool:
   name: SuperBench CI
+  demands: cuda-agent
   vmImage: ubuntu-latest
 
 container:
-  image: nvcr.io/nvidia/pytorch:20.12-py3
-  options: '-v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker -v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/:/usr/lib/sudo/'
+  image: nvcr.io/nvidia/pytorch:24.03-py3
+  options: '--name cuda-ci -v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker:ro'
 
 steps:
 - script: |
     echo "##vso[task.prependpath]$HOME/.local/bin"
   displayName: Export path
 - script: |
+    docker exec -t -u root -e DEBIAN_FRONTEND=noninteractive cuda-ci bash -c \
+      "apt-get update -y -q && \
+      yes '' | apt-get install -y -q sudo && \
+      apt-get install -y -q \
+      ffmpeg libavcodec-dev libavformat-dev libavutil-dev libboost-program-options-dev libswresample-dev"
    python3 -m pip install --upgrade pip setuptools==65.7
    python3 -m pip install .[test,nvworker]
    make postinstall
-    sudo DEBIAN_FRONTEND=noninteractive apt-get update
-    sudo DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev
   displayName: Install dependencies
 - script: |
     python3 setup.py lint
diff --git a/.codecov.yml b/.codecov.yml
index 81d50f8bc..8f9f5de87 100644
--- a/.codecov.yml
+++ b/.codecov.yml
@@ -14,8 +14,9 @@ coverage:
       target: 80%
       threshold: 1%
       flags:
-        - cpu-python3.6-unit-test
         - cpu-python3.7-unit-test
+        - cpu-python3.8-unit-test
+        - cpu-python3.10-unit-test
         - cuda-unit-test
         - directx-unit-test
     patch:
@@ -23,7 +24,8 @@ coverage:
      target: 80%
      threshold: 1%
      flags:
-        - cpu-python3.6-unit-test
        - cpu-python3.7-unit-test
+        - cpu-python3.8-unit-test
+        - cpu-python3.10-unit-test
        - cuda-unit-test
        - directx-unit-test
diff --git a/docs/getting-started/installation.mdx b/docs/getting-started/installation.mdx
index 30fdee829..0a582e92f 100644
--- a/docs/getting-started/installation.mdx
+++ b/docs/getting-started/installation.mdx
@@ -26,7 +26,7 @@ Here're the system requirements for control node.
 ### Requirements
 
 * Latest version of Linux, you're highly encouraged to use Ubuntu 18.04 or later.
-* [Python](https://www.python.org/) version 3.6 or later (which can be checked by running `python3 --version`).
+* [Python](https://www.python.org/) version 3.7 or later (which can be checked by running `python3 --version`).
 * [Pip](https://pip.pypa.io/en/stable/installing/) version 18.0 or later (which can be checked by running `python3 -m pip --version`).
 
 :::note
diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
index 388bfa119..77f5de85b 100644
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -384,6 +384,82 @@ with topology distance of 2, 4, 6, respectively.
 | ib-traffic/ib\_write\_bw\_${msg_size}\_${direction}\_${line}\_${pair}:${server}\_${client} | bandwidth (GB/s) | The max bandwidth of perftest (ib_write_bw, ib_send_bw, ib_read_bw) using ${msg_size} with ${direction}('cpu-to-cpu'/'gpu-to-gpu'/'gpu-to-cpu'/'cpu-to-gpu') run between the ${pair}th node pair in the ${line}th line of the config, ${server} and ${client} are the hostname of server and client. |
 | ib-traffic/ib\_write\_lat\_${msg_size}\_${direction}\_${line}\_${pair}:${server}\_${client} | time (us) | The max latency of perftest (ib_write_lat, ib_send_lat, ib_read_lat) using ${msg_size} with ${direction}('cpu-to-cpu'/'gpu-to-gpu'/'gpu-to-cpu'/'cpu-to-gpu') run between the ${pair}th node pair in the ${line}th line of the config, ${server} and ${client} are the hostname of server and client. |
 
+### `nvbandwidth`
+
+#### Introduction
+
+Measures bandwidth and latency for various memcpy patterns across different links using copy engine or kernel copy methods,
+performed by [nvbandwidth](https://github.com/NVIDIA/nvbandwidth).
+
+#### Metrics
+
+| Metrics | Unit | Description |
+|---------|------|-------------|
+| host_to_device_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | Host to device CE memcpy using cuMemcpyAsync |
+| host_to_device_memcpy_ce_sum_bw | GB/s | Sum of the output matrix |
+| device_to_host_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | Device to host CE memcpy using cuMemcpyAsync |
+| device_to_host_memcpy_ce_sum_bw | GB/s | Sum of the output matrix |
+| host_to_device_bidirectional_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | A host to device copy is measured while a device to host copy is run simultaneously. Only the host to device copy bandwidth is reported. |
+| host_to_device_bidirectional_memcpy_ce_sum_bw | GB/s | Sum of the output matrix |
+| device_to_host_bidirectional_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | A device to host copy is measured while a host to device copy is run simultaneously. Only the device to host copy bandwidth is reported. |
+| device_to_host_bidirectional_memcpy_ce_sum_bw | GB/s | Sum of the output matrix |
+| device_to_device_memcpy_read_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of cuMemcpyAsync between each pair of accessible peers. Read tests launch a copy from the peer device to the target using the target's context. |
+| device_to_device_memcpy_read_ce_sum_bw | GB/s | Sum of the output matrix |
+| device_to_device_memcpy_write_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of cuMemcpyAsync between each pair of accessible peers. Write tests launch a copy from the target device to the peer using the target's context. |
+| device_to_device_memcpy_write_ce_sum_bw | GB/s | Sum of the output matrix |
+| device_to_device_bidirectional_memcpy_read_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of cuMemcpyAsync between each pair of accessible peers. A copy in the opposite direction of the measured copy is run simultaneously but not measured. Read tests launch a copy from the peer device to the target using the target's context. |
+| device_to_device_bidirectional_memcpy_read_ce_sum_bw | GB/s | Sum of the output matrix |
+| device_to_device_bidirectional_memcpy_write_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of cuMemcpyAsync between each pair of accessible peers. A copy in the opposite direction of the measured copy is run simultaneously but not measured. Write tests launch a copy from the target device to the peer using the target's context. |
+| device_to_device_bidirectional_memcpy_write_ce_sum_bw | GB/s | Sum of the output matrix |
+| all_to_host_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of cuMemcpyAsync between a single device and the host while simultaneously running copies from all other devices to the host. |
+| all_to_host_memcpy_ce_sum_bw | GB/s | Sum of the output matrix |
+| all_to_host_bidirectional_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | A device to host copy is measured while a host to device copy is run simultaneously. Only the device to host copy bandwidth is reported. All other devices generate simultaneous host to device and device to host interfering traffic. |
+| all_to_host_bidirectional_memcpy_ce_sum_bw | GB/s | Sum of the output matrix |
+| host_to_all_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of cuMemcpyAsync from the host to a single device while simultaneously running copies from the host to all other devices. |
+| host_to_all_memcpy_ce_sum_bw | GB/s | Sum of the output matrix |
+| host_to_all_bidirectional_memcpy_ce_cpu[0-9]_gpu[0-9]_bw | GB/s | A host to device copy is measured while a device to host copy is run simultaneously. Only the host to device copy bandwidth is reported. All other devices generate simultaneous host to device and device to host interfering traffic. |
+| host_to_all_bidirectional_memcpy_ce_sum_bw | GB/s | Sum of the output matrix |
+| all_to_one_write_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from all accessible peers to a single device, for each device. Bandwidth is reported as the total inbound bandwidth for each device. Write tests launch a copy from the target device to the peer using the target's context. |
+| all_to_one_write_ce_sum_bw | GB/s | Sum of the output matrix |
+| all_to_one_read_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from all accessible peers to a single device, for each device. Bandwidth is reported as the total outbound bandwidth for each device. Read tests launch a copy from the peer device to the target using the target's context. |
+| all_to_one_read_ce_sum_bw | GB/s | Sum of the output matrix |
+| one_to_all_write_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from a single device to all accessible peers, for each device. Bandwidth is reported as the total outbound bandwidth for each device. Write tests launch a copy from the target device to the peer using the target's context. |
+| one_to_all_write_ce_sum_bw | GB/s | Sum of the output matrix |
+| one_to_all_read_ce_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from a single device to all accessible peers, for each device. Bandwidth is reported as the total inbound bandwidth for each device. Read tests launch a copy from the peer device to the target using the target's context. |
+| one_to_all_read_ce_sum_bw | GB/s | Sum of the output matrix |
+| host_to_device_memcpy_sm_cpu[0-9]_gpu[0-9]_bw | GB/s | Host to device SM memcpy using a copy kernel |
+| host_to_device_memcpy_sm_sum_bw | GB/s | Sum of the output matrix |
+| device_to_host_memcpy_sm_cpu[0-9]_gpu[0-9]_bw | GB/s | Device to host SM memcpy using a copy kernel |
+| device_to_host_memcpy_sm_sum_bw | GB/s | Sum of the output matrix |
+| device_to_device_memcpy_read_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of a copy kernel between each pair of accessible peers. Read tests launch a copy from the peer device to the target using the target's context. |
+| device_to_device_memcpy_read_sm_sum_bw | GB/s | Sum of the output matrix |
+| device_to_device_memcpy_write_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of a copy kernel between each pair of accessible peers. Write tests launch a copy from the target device to the peer using the target's context. |
+| device_to_device_memcpy_write_sm_sum_bw | GB/s | Sum of the output matrix |
+| device_to_device_bidirectional_memcpy_read_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of a copy kernel between each pair of accessible peers. Copies are run in both directions between each pair, and the sum is reported. Read tests launch a copy from the peer device to the target using the target's context. |
+| device_to_device_bidirectional_memcpy_read_sm_sum_bw | GB/s | Sum of the output matrix |
+| device_to_device_bidirectional_memcpy_write_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of a copy kernel between each pair of accessible peers. Copies are run in both directions between each pair, and the sum is reported. Write tests launch a copy from the target device to the peer using the target's context. |
+| device_to_device_bidirectional_memcpy_write_sm_sum_bw | GB/s | Sum of the output matrix |
+| all_to_host_memcpy_sm_cpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of a copy kernel between a single device and the host while simultaneously running copies from all other devices to the host. |
+| all_to_host_memcpy_sm_sum_bw | GB/s | Sum of the output matrix |
+| all_to_host_bidirectional_memcpy_sm_cpu[0-9]_gpu[0-9]_bw | GB/s | A device to host bandwidth of a copy kernel is measured while a host to device copy is run simultaneously. Only the device to host copy bandwidth is reported. All other devices generate simultaneous host to device and device to host interfering traffic using copy kernels. |
+| all_to_host_bidirectional_memcpy_sm_sum_bw | GB/s | Sum of the output matrix |
+| host_to_all_memcpy_sm_cpu[0-9]_gpu[0-9]_bw | GB/s | Measures bandwidth of a copy kernel from the host to a single device while simultaneously running copies from the host to all other devices. |
+| host_to_all_memcpy_sm_sum_bw | GB/s | Sum of the output matrix |
+| host_to_all_bidirectional_memcpy_sm_cpu[0-9]_gpu[0-9]_bw | GB/s | A host to device bandwidth of a copy kernel is measured while a device to host copy is run simultaneously. Only the host to device copy bandwidth is reported. All other devices generate simultaneous host to device and device to host interfering traffic using copy kernels. |
+| host_to_all_bidirectional_memcpy_sm_sum_bw | GB/s | Sum of the output matrix |
+| all_to_one_write_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from all accessible peers to a single device, for each device. Bandwidth is reported as the total inbound bandwidth for each device. Write tests launch a copy from the target device to the peer using the target's context. |
+| all_to_one_write_sm_sum_bw | GB/s | Sum of the output matrix |
+| all_to_one_read_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from all accessible peers to a single device, for each device. Bandwidth is reported as the total outbound bandwidth for each device. Read tests launch a copy from the peer device to the target using the target's context. |
+| all_to_one_read_sm_sum_bw | GB/s | Sum of the output matrix |
+| one_to_all_write_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from a single device to all accessible peers, for each device. Bandwidth is reported as the total outbound bandwidth for each device. Write tests launch a copy from the target device to the peer using the target's context. |
+| one_to_all_write_sm_sum_bw | GB/s | Sum of the output matrix |
+| one_to_all_read_sm_gpu[0-9]_gpu[0-9]_bw | GB/s | Measures the total bandwidth of copies from a single device to all accessible peers, for each device. Bandwidth is reported as the total inbound bandwidth for each device. Read tests launch a copy from the peer device to the target using the target's context. |
+| one_to_all_read_sm_sum_bw | GB/s | Sum of the output matrix |
+| host_device_latency_sm_cpu[0-9]_gpu[0-9]_lat | µs | Host - device SM copy latency using a ptr chase kernel |
+| host_device_latency_sm_sum_lat | µs | Sum of the output matrix |
+| device_to_device_latency_sm_gpu[0-9]_gpu[0-9]_lat | µs | Measures latency of a pointer dereference operation between each pair of accessible peers. Memory is allocated on a GPU and is accessed by the peer GPU to determine latency. |
+| device_to_device_latency_sm_sum_lat | µs | Sum of the output matrix |
+
 ## Computation-communication Benchmarks
 
diff --git a/examples/benchmarks/nvbandwidth.py b/examples/benchmarks/nvbandwidth.py
new file mode 100644
index 000000000..45b836734
--- /dev/null
+++ b/examples/benchmarks/nvbandwidth.py
@@ -0,0 +1,33 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Micro benchmark example for nvbandwidth benchmark.
+
+Commands to run:
+    python3 examples/benchmarks/nvbandwidth.py
+"""
+
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.common.utils import logger
+
+if __name__ == '__main__':
+    context = BenchmarkRegistry.create_benchmark_context(
+        'nvbandwidth',
+        platform=Platform.CUDA,
+        parameters=(
+            '--buffer_size 128 '
+            '--test_cases 0,1,19,20 '
+            '--skip_verification '
+            '--disable_affinity '
+            '--use_mean '
+            '--num_loops 10'
+        )
+    )
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+    if benchmark:
+        logger.info(
+            'benchmark: {}, return code: {}, result: {}'.format(
+                benchmark.name, benchmark.return_code, benchmark.result
+            )
+        )
diff --git a/setup.py b/setup.py
index 738095889..cf9779a08 100644
--- a/setup.py
+++ b/setup.py
@@ -131,17 +131,17 @@ def run(self):
         'Operating System :: POSIX',
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3 :: Only',
-        'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: 3.9',
+        'Programming Language :: Python :: 3.10',
         'Topic :: System :: Benchmark',
         'Topic :: System :: Clustering',
         'Topic :: System :: Hardware',
     ],
     keywords='benchmark, AI systems',
     packages=find_packages(exclude=['tests']),
-    python_requires='>=3.6, <4',
+    python_requires='>=3.7, <4',
     use_scm_version={
         'local_scheme': 'node-and-date',
         'version_scheme': lambda _: superbench.__version__,
diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py
index 86c6b6d15..014103744 100644
--- a/superbench/benchmarks/base.py
+++ b/superbench/benchmarks/base.py
@@ -48,6 +48,8 @@
             allow_abbrev=False,
             formatter_class=SortedMetavarTypeHelpFormatter,
         )
+        # Fix optionals title in Python 3.10
+        self._parser._optionals.title = 'optional arguments'
         self._args = None
         self._curr_run_index = 0
         self._result = None
diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py
index 94a0d17c1..1a4c6d1bb 100644
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -37,6 +37,7 @@
 from superbench.benchmarks.micro_benchmarks.directx_gpu_copy_performance import DirectXGPUCopyBw
 from superbench.benchmarks.micro_benchmarks.directx_mem_bw_performance import DirectXGPUMemBw
 from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops
+from superbench.benchmarks.micro_benchmarks.nvbandwidth import NvBandwidthBenchmark
 
 __all__ = [
     'BlasLtBaseBenchmark',
@@ -73,4 +74,5 @@
     'DirectXGPUCopyBw',
     'DirectXGPUMemBw',
     'DirectXGPUCoreFlops',
+    'NvBandwidthBenchmark',
 ]
diff --git a/superbench/benchmarks/micro_benchmarks/nvbandwidth.py b/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
new file mode 100644
index 000000000..81a032195
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbandwidth.py
@@ -0,0 +1,225 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module of the NV Bandwidth Test."""
+
+import os
+import re
+
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+
+
+class NvBandwidthBenchmark(MicroBenchmarkWithInvoke):
+    """The NV Bandwidth Test benchmark class."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+
+        self._bin_name = 'nvbandwidth'
+
+    def add_parser_arguments(self):
+        """Add the specified arguments."""
+        super().add_parser_arguments()
+
+        self._parser.add_argument(
+            '--buffer_size',
+            type=int,
+            default=64,
+            required=False,
+            help='Memcpy buffer size in MiB. Default is 64.',
+        )
+
+        self._parser.add_argument(
+            '--test_cases',
+            type=str,
+            default='',
+            required=False,
+            help=(
+                'Specify the test case(s) to run, either by name or index. By default, all test cases are executed. '
+                'Example: --test_cases 0,1,2,19,20'
+            ),
+        )
+
+        self._parser.add_argument(
+            '--skip_verification',
+            action='store_true',
+            help='Skips data verification after copy. Default is False.',
+        )
+
+        self._parser.add_argument(
+            '--disable_affinity',
+            action='store_true',
+            help='Disable automatic CPU affinity control. Default is False.',
+        )
+
+        self._parser.add_argument(
+            '--use_mean',
+            action='store_true',
+            help='Use mean instead of median for results. Default is False.',
+        )
+
+        self._parser.add_argument(
+            '--num_loops',
+            type=int,
+            default=3,
+            required=False,
+            help='Iterations of the benchmark. Default is 3.',
+        )
+
+    def _preprocess(self):
+        """Preprocess/preparation operations before the benchmarking.
+
+        Return:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        if not self._set_binary_path():
+            return False
+
+        # Construct the command for nvbandwidth
+        command = os.path.join(self._args.bin_dir, self._bin_name)
+
+        if self._args.buffer_size:
+            command += f' --bufferSize {self._args.buffer_size}'
+
+        if self._args.test_cases:
+            command += ' --testcase ' + ' '.join([testcase.strip() for testcase in self._args.test_cases.split(',')])
+
+        if self._args.skip_verification:
+            command += ' --skipVerification'
+
+        if self._args.disable_affinity:
+            command += ' --disableAffinity'
+
+        if self._args.use_mean:
+            command += ' --useMean'
+
+        if self._args.num_loops:
+            command += f' --testSamples {self._args.num_loops}'
+
+        self._commands.append(command)
+
+        return True
+
+    def _process_raw_line(self, line, parse_status):
+        """Process a single line of raw output from the nvbandwidth benchmark.
+
+        This function updates the `parse_status` dictionary with parsed results from the given `line`.
+        It detects the start of a test, parses matrix headers and rows, and extracts summary results.
+
+        Args:
+            line (str): A single line of raw output from the benchmark.
+            parse_status (dict): A dictionary to maintain the current parsing state and results. It should contain:
+                - 'test_name' (str): The name of the current test being parsed.
+                - 'benchmark_type' (str): 'bw' or 'lat'. It also indicates whether matrix data is being parsed.
+                - 'matrix_header' (list): The header of the matrix being parsed.
+                - 'results' (dict): A dictionary to store the parsed results.
+
+        Return:
+            None
+        """
+        # Regular expressions for summary line and matrix header detection
+        block_start_pattern = re.compile(r'^Running\s+(.+)$')
+        summary_pattern = re.compile(r'SUM (\S+) (\d+\.\d+)')
+        matrix_header_line = re.compile(r'^(memcpy|memory latency)')
+        matrix_row_pattern = re.compile(r'^\s*\d')
+
+        line = line.strip()
+
+        # Detect the start of a test
+        if block_start_pattern.match(line):
+            parse_status['test_name'] = block_start_pattern.match(line).group(1).lower()[:-1]
+            return
+
+        # Detect the start of matrix data
+        if parse_status['test_name'] and matrix_header_line.match(line):
+            parse_status['benchmark_type'] = 'bw' if 'bandwidth' in line else 'lat'
+            return
+
+        # Parse the matrix header
+        if (
+            parse_status['test_name'] and parse_status['benchmark_type'] and not parse_status['matrix_header']
+            and matrix_row_pattern.match(line)
+        ):
+            parse_status['matrix_header'] = line.split()
+            return
+
+        # Parse matrix rows
+        if parse_status['test_name'] and parse_status['benchmark_type'] and matrix_row_pattern.match(line):
+            row_data = line.split()
+            row_index = row_data[0]
+            for col_index, value in enumerate(row_data[1:], start=1):
+                col_header = parse_status['matrix_header'][col_index - 1]
+                test_name = parse_status['test_name']
+                benchmark_type = parse_status['benchmark_type']
+                metric_name = f'{test_name}_cpu{row_index}_gpu{col_header}_{benchmark_type}'
+                parse_status['results'][metric_name] = float(value)
+            return
+
+        # Parse summary results
+        summary_match = summary_pattern.search(line)
+        if summary_match:
+            value = float(summary_match.group(2))
+            test_name = parse_status['test_name']
+            benchmark_type = parse_status['benchmark_type']
+            parse_status['results'][f'{test_name}_sum_{benchmark_type}'] = value
+
+            # Reset parsing state for next test
+            parse_status['test_name'] = ''
+            parse_status['benchmark_type'] = None
+            parse_status['matrix_header'].clear()
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
+
+        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+ """ + try: + self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data) + content = raw_output.splitlines() + parsing_status = { + 'results': {}, + 'benchmark_type': None, + 'matrix_header': [], + 'test_name': '', + } + + for line in content: + self._process_raw_line(line, parsing_status) + + if not parsing_status['results']: + self._result.add_raw_data('nvbandwidth', 'No valid results found', self._args.log_raw_data) + return False + + # Store parsed results + for metric, value in parsing_status['results'].items(): + self._result.add_result(metric, value) + + return True + except Exception as e: + logger.error( + 'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format( + self._curr_run_index, self._name, raw_output, str(e) + ) + ) + self._result.add_result('abort', 1) + return False + + +BenchmarkRegistry.register_benchmark('nvbandwidth', NvBandwidthBenchmark, platform=Platform.CUDA) diff --git a/tests/analyzer/test_summaryop.py b/tests/analyzer/test_summaryop.py index 3b1054444..889ebc1e8 100644 --- a/tests/analyzer/test_summaryop.py +++ b/tests/analyzer/test_summaryop.py @@ -4,7 +4,7 @@ """Tests for SummaryOp module.""" import unittest -from numpy import NaN, float64 +from numpy import nan, float64 import pandas as pd @@ -55,7 +55,7 @@ def test_rule_op(self): # Test - std result = SummaryOp.std(raw_data_df) print(result) - expectedResult = pd.Series([3.0, 3.0, 2.1213203435596424, NaN], index=['a', 'b', 'c', 'd'], dtype=float64) + expectedResult = pd.Series([3.0, 3.0, 2.1213203435596424, nan], index=['a', 'b', 'c', 'd'], dtype=float64) pd.testing.assert_series_equal(result, expectedResult) # Test - count result = SummaryOp.count(raw_data_df) diff --git a/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py b/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py new file mode 100644 index 000000000..f6c82a030 --- /dev/null +++ b/tests/benchmarks/micro_benchmarks/test_nvbandwidth.py @@ -0,0 +1,80 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +"""Tests for nvbandwidth benchmark.""" + +import unittest + +from tests.helper import decorator +from tests.helper.testcase import BenchmarkTestCase +from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform + + +class TestNvBandwidthBenchmark(BenchmarkTestCase, unittest.TestCase): + """Test class for NV Bandwidth benchmark.""" + @classmethod + def setUpClass(cls): + """Hook method for setting up class fixture before running tests in the class.""" + super().setUpClass() + cls.createMockEnvs(cls) + cls.createMockFiles(cls, ['bin/nvbandwidth']) + + def test_nvbandwidth_preprocess(self): + """Test NV Bandwidth benchmark preprocess.""" + benchmark_name = 'nvbandwidth' + (benchmark_class, + predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA) + assert (benchmark_class) + + # Test preprocess with default parameters + benchmark = benchmark_class(benchmark_name, parameters='') + assert benchmark._preprocess() + assert benchmark.return_code == ReturnCode.SUCCESS + + # Test preprocess with specified parameters + parameters = ( + '--buffer_size 256 ' + '--test_cases 0,1,2,19,20 ' + '--skip_verification ' + '--disable_affinity ' + '--use_mean ' + '--num_loops 100' + ) + benchmark = benchmark_class(benchmark_name, parameters=parameters) + assert benchmark._preprocess() + assert benchmark.return_code == ReturnCode.SUCCESS + + # Check command + assert (1 == len(benchmark._commands)) + assert ('--bufferSize 256' in benchmark._commands[0]) + assert ('--testcase 0 1 2 19 20' in benchmark._commands[0]) + assert ('--skipVerification' in benchmark._commands[0]) + assert ('--disableAffinity' in benchmark._commands[0]) + assert ('--useMean' in benchmark._commands[0]) + assert ('--testSamples 100' in benchmark._commands[0]) + + @decorator.load_data('tests/data/nvbandwidth_results.log') + def test_nvbandwidth_result_parsing_real_output(self, results): + """Test NV Bandwidth benchmark result parsing.""" + benchmark_name = 'nvbandwidth' + (benchmark_class, + predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA) + assert (benchmark_class) + + benchmark = benchmark_class(benchmark_name, parameters='') + + # Preprocess and validate command + assert benchmark._preprocess() + + # Parse the provided raw output + assert benchmark._process_raw_result(0, results) + assert benchmark.return_code == ReturnCode.SUCCESS + + # Validate parsed results + assert benchmark.result['host_to_device_memcpy_ce_cpu0_gpu0_bw'][0] == 369.36 + assert benchmark.result['host_to_device_memcpy_ce_cpu0_gpu1_bw'][0] == 269.33 + assert benchmark.result['host_to_device_memcpy_ce_sum_bw'][0] == 1985.60 + assert benchmark.result['device_to_host_memcpy_ce_cpu0_gpu1_bw'][0] == 312.11 + assert benchmark.result['device_to_host_memcpy_ce_sum_bw'][0] == 607.26 + assert benchmark.result['host_device_latency_sm_cpu0_gpu0_lat'][0] == 772.58 + assert benchmark.result['host_device_latency_sm_sum_lat'][0] == 772.58 diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_base.py b/tests/benchmarks/model_benchmarks/test_pytorch_base.py index d92cd187b..96e1718a0 100644 --- a/tests/benchmarks/model_benchmarks/test_pytorch_base.py +++ b/tests/benchmarks/model_benchmarks/test_pytorch_base.py @@ -250,16 +250,35 @@ def test_pytorch_empty_cache(): # Register mnist benchmark. 
     BenchmarkRegistry.register_benchmark('pytorch-mnist', PytorchMNIST)
 
+    # Get initial memory reserved
+    init_res_memory = torch.cuda.memory_reserved()
+
     # Test cache empty by manually calling torch.cuda.empty_cache().
     parameters = '--batch_size 32 --num_warmup 8 --num_steps 64 --model_action train'
     benchmark = PytorchMNIST('pytorch-mnist', parameters=parameters)
+    assert (benchmark)
     assert (benchmark._preprocess())
     assert (benchmark._benchmark())
     del benchmark
-    assert (torch.cuda.memory_stats()['reserved_bytes.all.current'] > 0)
+
+    # Get current reserved memory after benchmark
+    post_bm_res_memory = torch.cuda.memory_reserved()
+
+    # Assert that reserved memory has not decreased after running the benchmark
+    assert (post_bm_res_memory >= init_res_memory)
+
+    # Manually empty cache and get reserved memory
+    # Calling empty_cache() releases all unused cached memory from PyTorch so that those can be used by
+    # other GPU applications. However, the occupied GPU memory by tensors will not be freed so it can not
+    # increase the amount of GPU memory available for PyTorch.
+    # https://pytorch.org/docs/stable/notes/cuda.html#cuda-memory-management
     torch.cuda.empty_cache()
-    assert (torch.cuda.memory_stats()['reserved_bytes.all.current'] == 0)
+    post_empty_cache_res_memory = torch.cuda.memory_reserved()
+
+    # Assert that some memory is released after manually empty cache. The cache is not guaranteed to be reset
+    # back to the init_res_memory due to some tensors not being released.
+    assert (post_empty_cache_res_memory <= post_bm_res_memory)
 
     # Test automatic cache empty.
     context = BenchmarkRegistry.create_benchmark_context(
@@ -268,4 +287,4 @@ def test_pytorch_empty_cache():
 
     benchmark = BenchmarkRegistry.launch_benchmark(context)
     assert (benchmark)
-    assert (torch.cuda.memory_stats()['reserved_bytes.all.current'] == 0)
+    assert (torch.cuda.memory_reserved() == post_empty_cache_res_memory)
diff --git a/tests/data/nvbandwidth_results.log b/tests/data/nvbandwidth_results.log
new file mode 100644
index 000000000..4c7618e95
--- /dev/null
+++ b/tests/data/nvbandwidth_results.log
@@ -0,0 +1,134 @@
+nvbandwidth Version: v0.6
+Built from Git version: v0.6
+
+CUDA Runtime Version: 12040
+CUDA Driver Version: 12040
+Driver Version: 550.54.15
+
+Device 0: NVIDIA GH200 480GB (00000009:01:00)
+
+Running host_to_device_memcpy_ce.
+memcpy CE CPU(row) -> GPU(column) bandwidth (GB/s)
+          0         1         2
+0    369.36    269.33    412.11
+1    323.36    299.33    312.11
+
+SUM host_to_device_memcpy_ce 1985.60
+
+Running device_to_host_memcpy_ce.
+memcpy CE CPU(row) <- GPU(column) bandwidth (GB/s)
+          0         1
+0    295.15    312.11
+
+SUM device_to_host_memcpy_ce 607.26
+
+Running host_to_device_bidirectional_memcpy_ce.
+memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)
+          0
+0    176.92
+
+SUM host_to_device_bidirectional_memcpy_ce 176.92
+
+Running device_to_host_bidirectional_memcpy_ce.
+memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)
+          0
+0    187.26
+
+SUM device_to_host_bidirectional_memcpy_ce 187.26
+
+Waived:
+Waived:
+Waived:
+Waived:
+Running all_to_host_memcpy_ce.
+memcpy CE CPU(row) <- GPU(column) bandwidth (GB/s)
+          0
+0    295.15
+
+SUM all_to_host_memcpy_ce 295.15
+
+Running all_to_host_bidirectional_memcpy_ce.
+memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)
+          0
+0    187.00
+
+SUM all_to_host_bidirectional_memcpy_ce 187.00
+
+Running host_to_all_memcpy_ce.
+memcpy CE CPU(row) -> GPU(column) bandwidth (GB/s)
+          0
+0    370.13
+
+SUM host_to_all_memcpy_ce 370.13
+
+Running host_to_all_bidirectional_memcpy_ce.
+memcpy CE CPU(row) <-> GPU(column) bandwidth (GB/s)
+          0
+0    176.86
+
+SUM host_to_all_bidirectional_memcpy_ce 176.86
+
+Waived:
+Waived:
+Waived:
+Waived:
+Running host_to_device_memcpy_sm.
+memcpy SM CPU(row) -> GPU(column) bandwidth (GB/s)
+          0
+0    372.33
+
+SUM host_to_device_memcpy_sm 372.33
+
+Running device_to_host_memcpy_sm.
+memcpy SM CPU(row) <- GPU(column) bandwidth (GB/s)
+          0
+0    351.93
+
+SUM device_to_host_memcpy_sm 351.93
+
+Waived:
+Waived:
+Waived:
+Waived:
+Running all_to_host_memcpy_sm.
+memcpy SM CPU(row) <- GPU(column) bandwidth (GB/s)
+          0
+0    352.98
+
+SUM all_to_host_memcpy_sm 352.98
+
+Running all_to_host_bidirectional_memcpy_sm.
+memcpy SM CPU(row) <-> GPU(column) bandwidth (GB/s)
+          0
+0    156.53
+
+SUM all_to_host_bidirectional_memcpy_sm 156.53
+
+Running host_to_all_memcpy_sm.
+memcpy SM CPU(row) -> GPU(column) bandwidth (GB/s)
+          0
+0    360.93
+
+SUM host_to_all_memcpy_sm 360.93
+
+Running host_to_all_bidirectional_memcpy_sm.
+memcpy SM CPU(row) <-> GPU(column) bandwidth (GB/s)
+          0
+0    247.56
+
+SUM host_to_all_bidirectional_memcpy_sm 247.56
+
+Waived:
+Waived:
+Waived:
+Waived:
+Running host_device_latency_sm.
+memory latency SM CPU(row) <-> GPU(column) (ns)
+          0
+0    772.58
+
+SUM host_device_latency_sm 772.58
+
+Waived:
+NOTE: The reported results may not reflect the full capabilities of the platform.
+Performance can vary with software drivers, hardware clocks, and system topology.
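
Reviewer note (not part of the patch): the metric naming that _process_raw_line derives from nvbandwidth's matrix output can be hard to see from the parser alone, so below is a minimal, standalone Python sketch that replays that naming scheme on one row taken from tests/data/nvbandwidth_results.log. The variable names here are illustrative only and do not appear in the patch.

# Illustrative sketch only: mirrors the cpu{row}_gpu{col} naming used by
# NvBandwidthBenchmark._process_raw_line for a bandwidth matrix.
header = '          0         1         2'.split()    # GPU column indices from the matrix header line
row = '0    369.36    269.33    412.11'.split()       # CPU row index followed by bandwidths (GB/s)
test_name = 'host_to_device_memcpy_ce'                # from the "Running host_to_device_memcpy_ce." line
benchmark_type = 'bw'                                 # "bandwidth" in the matrix header maps to 'bw'

for col, value in zip(header, row[1:]):
    # Same f-string pattern as the parser: {test}_cpu{row}_gpu{col}_{type}
    print(f'{test_name}_cpu{row[0]}_gpu{col}_{benchmark_type}', float(value))
# Prints:
#   host_to_device_memcpy_ce_cpu0_gpu0_bw 369.36
#   host_to_device_memcpy_ce_cpu0_gpu1_bw 269.33
#   host_to_device_memcpy_ce_cpu0_gpu2_bw 412.11

These are the same values asserted in tests/benchmarks/micro_benchmarks/test_nvbandwidth.py.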