From 61de0171760f02a24f6f091631d6c3120305c880 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Wed, 25 Sep 2024 16:18:44 -0700 Subject: [PATCH 01/43] Skip failing newly added tests in accelerate (#6574) Adding the new tests in https://github.com/huggingface/accelerate/pull/3097 caused the nv-accelerate-v100 tests to fail. Due to other CI issues we didn't notice this at first. This just skips the problematic test for now. cc: @stas00 / @muellerzr --- .github/workflows/nv-accelerate-v100.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml index 915493bb3183c..76b8c93fba3f9 100644 --- a/.github/workflows/nv-accelerate-v100.yml +++ b/.github/workflows/nv-accelerate-v100.yml @@ -55,4 +55,4 @@ jobs: # tmp fix: force newer datasets version #pip install "datasets>=2.0.0" pip list - pytest $PYTEST_OPTS --color=yes --durations=0 --verbose tests/deepspeed + pytest $PYTEST_OPTS --color=yes --durations=0 --verbose tests/deepspeed -k "not test_prepare_multiple_models_zero3_inference" From 7622cd9e68756d6e2a65e654f2b4ca678d55c251 Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Date: Wed, 25 Sep 2024 17:34:38 -0700 Subject: [PATCH 02/43] Use msgpack for p2p comm (#6547) Use msgpack for P2P communication in pipeline engine. Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepspeed/runtime/pipe/p2p.py | 8 ++++---- requirements/requirements.txt | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/deepspeed/runtime/pipe/p2p.py b/deepspeed/runtime/pipe/p2p.py index 2b12a9573c4b9..ed6d80b8d4fbf 100644 --- a/deepspeed/runtime/pipe/p2p.py +++ b/deepspeed/runtime/pipe/p2p.py @@ -3,7 +3,7 @@ # DeepSpeed Team -import pickle +import msgpack import typing import torch @@ -96,7 +96,7 @@ def wait(): def send_obj(msg: typing.Any, dest: int): """Send an arbitrary python object to ``dest``. - Note: ``msg`` must be pickleable. + Note: ``msg`` must be serializable by msgpack. WARN: This incurs a CPU -> GPU transfer and should be used sparingly for performance reasons. @@ -106,7 +106,7 @@ def send_obj(msg: typing.Any, dest: int): dest (int): Destination rank. """ # serialize the message - msg = pickle.dumps(msg) + msg = msgpack.packb(msg) # construct a tensor to send msg = torch.ByteTensor(torch.ByteStorage.from_buffer(msg)).to(get_accelerator().device_name()) @@ -133,7 +133,7 @@ def recv_obj(sender: int) -> typing.Any: msg = torch.empty(length.item(), dtype=torch.uint8).to(get_accelerator().device_name()) dist.recv(msg, src=sender) - msg = pickle.loads(msg.cpu().numpy().tobytes()) + msg = msgpack.unpackb(msg.cpu().numpy().tobytes()) def _to(x): """Recursively move to the current device.""" diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 70c94a745435e..296398f680cc2 100755 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,4 +1,5 @@ hjson +msgpack ninja numpy packaging>=20.0 From a5400974df59d99d58dabd173bb8d89180bbd773 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Thu, 26 Sep 2024 09:07:19 -0400 Subject: [PATCH 03/43] DeepNVMe perf tuning (#6560) Add performance tuning utilities: `ds_nvme_tune` and `ds_io`. Update tutorial with tuning section. 
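For reference, the new `ds_nvme_tune` launcher added under `bin/` is a thin wrapper over the `deepspeed.nvme` module introduced in this patch. A minimal sketch of the equivalent programmatic use (the NVMe directory in the comment is a placeholder):

```python
# Sketch of what bin/ds_nvme_tune does in this patch: sweep I/O configurations
# on the target NVMe directory, then report the best-performing "aio" settings.
from deepspeed.nvme import parse_sweep_arguments, sweep_main, generate_main

args = parse_sweep_arguments()   # e.g. invoked as: ds_nvme_tune --nvme_dir /mnt/nvme0 --gpu
sweep_main(args)                 # run the read/write performance sweep
generate_main(args.log_dir)      # print the recommended aio configuration
```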
--------- Co-authored-by: Ubuntu Co-authored-by: Joe Mayer <114769929+jomayeri@users.noreply.github.com> --- bin/ds_io | 6 + bin/ds_nvme_tune | 9 + deepspeed/nvme/__init__.py | 8 + deepspeed/nvme/ds_aio_args.py | 175 ++++++++++++++ deepspeed/nvme/ds_aio_basic.py | 134 +++++++++++ deepspeed/nvme/ds_aio_handle.py | 222 ++++++++++++++++++ deepspeed/nvme/ds_aio_job.py | 50 ++++ deepspeed/nvme/parse_nvme_stats.py | 148 ++++++++++++ deepspeed/nvme/perf_generate_param.py | 97 ++++++++ deepspeed/nvme/perf_run_sweep.py | 320 ++++++++++++++++++++++++++ deepspeed/nvme/perf_sweep_utils.py | 13 ++ deepspeed/nvme/test_ds_aio.py | 25 ++ deepspeed/nvme/test_ds_aio_utils.py | 81 +++++++ deepspeed/nvme/validate_async_io.py | 10 + docs/_tutorials/deepnvme.md | 52 ++++- setup.py | 2 +- 16 files changed, 1350 insertions(+), 2 deletions(-) create mode 100644 bin/ds_io create mode 100644 bin/ds_nvme_tune create mode 100644 deepspeed/nvme/__init__.py create mode 100644 deepspeed/nvme/ds_aio_args.py create mode 100755 deepspeed/nvme/ds_aio_basic.py create mode 100755 deepspeed/nvme/ds_aio_handle.py create mode 100644 deepspeed/nvme/ds_aio_job.py create mode 100755 deepspeed/nvme/parse_nvme_stats.py create mode 100644 deepspeed/nvme/perf_generate_param.py create mode 100644 deepspeed/nvme/perf_run_sweep.py create mode 100644 deepspeed/nvme/perf_sweep_utils.py create mode 100755 deepspeed/nvme/test_ds_aio.py create mode 100755 deepspeed/nvme/test_ds_aio_utils.py create mode 100644 deepspeed/nvme/validate_async_io.py diff --git a/bin/ds_io b/bin/ds_io new file mode 100644 index 0000000000000..681fd634764c4 --- /dev/null +++ b/bin/ds_io @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from deepspeed.nvme import ds_io_main + +if __name__ == '__main__': + ds_io_main() diff --git a/bin/ds_nvme_tune b/bin/ds_nvme_tune new file mode 100644 index 0000000000000..117adfba22c08 --- /dev/null +++ b/bin/ds_nvme_tune @@ -0,0 +1,9 @@ +#!/usr/bin/env python3 + +from deepspeed.nvme import sweep_main, generate_main, parse_sweep_arguments + +if __name__ == '__main__': + args = parse_sweep_arguments() + print(f"Running DeepNVMe performance tuning on {args.nvme_dir}") + sweep_main(args) + generate_main(args.log_dir) diff --git a/deepspeed/nvme/__init__.py b/deepspeed/nvme/__init__.py new file mode 100644 index 0000000000000..6d0de857cbd34 --- /dev/null +++ b/deepspeed/nvme/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .perf_run_sweep import sweep_main, parse_sweep_arguments +from .perf_generate_param import generate_main +from .test_ds_aio import ds_io_main diff --git a/deepspeed/nvme/ds_aio_args.py b/deepspeed/nvme/ds_aio_args.py new file mode 100644 index 0000000000000..9ed71c34a74d5 --- /dev/null +++ b/deepspeed/nvme/ds_aio_args.py @@ -0,0 +1,175 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
+""" + +import argparse +import os +from .test_ds_aio_utils import refine_integer_value +from deepspeed.accelerator import get_accelerator + +MAPPING_DELIMITER = ':' + + +def refine_args(args): + if args.io_size and type(args.io_size) == str: + args.io_size = refine_integer_value(args.io_size) + + if args.block_size and type(args.block_size) == str: + args.block_size = refine_integer_value(args.block_size) + + return args + + +def _get_mapping_dict(args): + if args.folder is not None: + d = {i: args.folder for i in range(args.multi_process)} + else: + d = {} + for m in args.folder_to_device_mapping: + fields = m.split(MAPPING_DELIMITER) + d[fields[1]] = fields[0] + + return d + + +def _validate_folder_mapping(args): + no_error = True + error_messages = [] + invalid_mappings = [m for m in args.folder_to_device_mapping if MAPPING_DELIMITER not in m] + if len(invalid_mappings) > 0: + error_messages.append( + f'Missing delimiter ({MAPPING_DELIMITER}) in folder_to_device_mapping {invalid_mappings}') + no_error = False + + folder_list = [m.split(MAPPING_DELIMITER)[0] for m in args.folder_to_device_mapping] + invalid_folders = [d for d in folder_list if not os.path.exists(d)] + if len(invalid_folders) > 0: + error_messages.append(f'Invalid folders in folder_to_device_mapping: {invalid_folders}') + no_error = False + + if args.gpu: + device_list = [int(m.split(MAPPING_DELIMITER)[1]) for m in args.folder_to_device_mapping] + invalid_device_list = [dev_id for dev_id in device_list if not dev_id < get_accelerator().device_count()] + if len(invalid_device_list) > 0: + error_messages.append(f'Invalid device ids in folder_to_device_mapping: {invalid_device_list}') + no_error = False + + return no_error, error_messages + + +def validate_args(args): + no_error = True + error_messages = [] + + if args.folder is not None and len(args.folder_to_device_mapping) > 0: + error_messages.append(f'--folder and --folder_to_device_mapping cannot be specified together.') + no_error = False + elif args.folder is None and len(args.folder_to_device_mapping) == 0: + error_messages.append(f'At least one of --folder or --folder_to_device_mapping must be specified.') + no_error = False + + # Validate --folder + if args.folder is not None and not os.path.exists(args.folder): + no_error = False + error_messages.append(f'Invalid folder in --folder: {args.folder} ') + + # Validate --folder_mapping_to_device + if len(args.folder_to_device_mapping) > 0: + no_mapping_error, mapping_error_messages = _validate_folder_mapping(args) + no_error = no_error and no_mapping_error + error_messages += mapping_error_messages + + # Validate --gpu, --use_gds + if args.use_gds and not args.gpu: + error_messages.append(f'--gpu must be set to transfer with --use_gds') + no_error = False + + if not no_error: + print(f'Found {len(error_messages)} validation errors') + for i, msg in enumerate(error_messages): + print(f'{i+1}: {msg}') + + return no_error + + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument('--folder', default=None, type=str, help='Folder to use for I/O.') + + parser.add_argument('--folder_to_device_mapping', + default=[], + nargs='+', + help='Specification of mapping of folder to (gpu) device id, (ignored for cpu accesses).' + 'Can be specified multiple times for multi-process runs,' + 'e.g. 
--folder_to_device_mapping /mnt/nvme0:0 --folder_to_device_mapping /mnt/nvme1:15 --gpu' + 'means access /mnt/nvme0 with gpu 0 and /mnt/nvme1 with gpu 15') + + parser.add_argument('--io_size', type=str, default=None, required=True, help='Number of bytes to read or write.') + + parser.add_argument('--read', action='store_true', help='Perform read I/O (default is write)') + + parser.add_argument('--multi_process', + type=int, + default=1, + help='Number of parallel processes doing I/O (default 1).') + + parser.add_argument('--block_size', + type=str, + default='1M', + help='I/O block size. Can use K, M, or G suffix (default 1M for 1 megabytes).') + + parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth (default 32).') + + parser.add_argument('--single_submit', + action='store_true', + help='Submit I/O requests in singles (default is submit queue_depth amount at once.).') + + parser.add_argument( + '--sequential_requests', + action='store_true', + help= + 'Delay I/O request submission until completion of prior requests (default is overlap I/O submission and completion requests.).' + ) + + parser.add_argument('--validate', action='store_true', help='Perform validation of I/O transfer in library.') + + parser.add_argument('--handle', action='store_true', help='Use AIO handle.') + + parser.add_argument('--loops', type=int, default=3, help='Count of operation repetitions') + + parser.add_argument('--io_parallel', type=int, default=None, help='Per iop parallelism') + + parser.add_argument('--gpu', action='store_true', help='Use GPU memory') + + parser.add_argument('--use_gds', action='store_true', help='Enable GDS AIO') + + parser.add_argument('--slow_bounce_buffer', + action='store_true', + help='For GPU memory transfers, measure impact of bounce buffer pinning on critical path.') + + args = parser.parse_args() + print(f'args = {args}') + return args + + +def get_validated_args(): + args = parse_arguments() + args = refine_args(args) + if not validate_args(args): + quit() + print(f'Successful validation of command line arguments') + + peer_tag = 'gpu' if args.gpu else 'process' + args.mapping_dict = _get_mapping_dict(args) + args.mapping_list = [(device_id, folder) for device_id, folder in args.mapping_dict.items()] + assert len(args.mapping_dict) == len(args.mapping_list) + print(f'Configuring {len(args.mapping_list)} {peer_tag} to folder mapping') + for i, (device_id, folder) in enumerate(args.mapping_list): + print(f'[{i}]: {peer_tag} {device_id} <----> {folder}') + + return args diff --git a/deepspeed/nvme/ds_aio_basic.py b/deepspeed/nvme/ds_aio_basic.py new file mode 100755 index 0000000000000..b346fe9bbfeb8 --- /dev/null +++ b/deepspeed/nvme/ds_aio_basic.py @@ -0,0 +1,134 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
+""" + +import torch +import os +import time +from deepspeed.ops.aio import AsyncIOBuilder +from multiprocessing import Pool, Barrier +from .test_ds_aio_utils import report_results, task_log, task_barrier + + +def pre_basic(args, tid, read_op): + io_string = "Read" if read_op else "Write" + num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size + file = args.read_file if read_op else f'{args.write_file}.{tid}' + + task_log(tid, f'Allocate tensor of size {num_bytes} bytes') + buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory() + task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}') + + ctxt = {} + ctxt['file'] = file + ctxt['num_bytes'] = num_bytes + ctxt['buffer'] = buffer + ctxt['elapsed_sec'] = 0 + + return ctxt + + +def pre_basic_read(pool_params): + args, tid = pool_params + ctxt = pre_basic(args, tid, True) + return ctxt + + +def pre_basic_write(pool_params): + args, tid = pool_params + ctxt = pre_basic(args, tid, False) + return ctxt + + +def post_basic(pool_params): + _, _, ctxt = pool_params + ctxt["buffer"].detach() + ctxt["buffer"] = None + return ctxt + + +def main_basic_read(pool_params): + args, tid, ctxt = pool_params + start_time = time.time() + AsyncIOBuilder().load().aio_read(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth, + args.single_submit, not args.sequential_requests, args.validate) + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def main_basic_write(pool_params): + args, tid, ctxt = pool_params + start_time = time.time() + AsyncIOBuilder().load().aio_write(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth, + args.single_submit, not args.sequential_requests, args.validate) + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def get_schedule(args, read_op): + schedule = {} + if read_op: + schedule['pre'] = pre_basic_read + schedule['post'] = post_basic + schedule['main'] = main_basic_read + else: + schedule['pre'] = pre_basic_write + schedule['post'] = post_basic + schedule['main'] = main_basic_write + + return schedule + + +def _aio_handle_tasklet(pool_params): + args, tid, read_op = pool_params + num_processes = len(args.mapping_dict) + + # Create schedule + schedule = get_schedule(args, read_op) + task_log(tid, f'schedule = {schedule}') + task_barrier(aio_barrier, num_processes) + + # Run pre task + task_log(tid, f'running pre-task') + ctxt = schedule["pre"]((args, tid)) + task_barrier(aio_barrier, num_processes) + + # Run main tasks in a loop + ctxt["main_task_sec"] = 0 + for i in range(args.loops): + task_log(tid, f'running main task {i}') + start_time = time.time() + ctxt = schedule["main"]((args, tid, ctxt)) + task_barrier(aio_barrier, num_processes) + stop_time = time.time() + ctxt["main_task_sec"] += stop_time - start_time + + # Run post task + task_log(tid, f'running post-task') + ctxt = schedule["post"]((args, tid, ctxt)) + task_barrier(aio_barrier, num_processes) + + return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops + + +def _init_tasklet(b): + global aio_barrier + aio_barrier = b + + +def aio_basic_multiprocessing(args, read_op): + num_processes = len(args.mapping_dict) + b = Barrier(num_processes) + pool_params = [(args, p, read_op) for p in range(num_processes)] + with Pool(processes=num_processes, initializer=_init_tasklet, initargs=(b, )) as p: + pool_results = p.map(_aio_handle_tasklet, pool_params) + + 
report_results(args, read_op, pool_results) diff --git a/deepspeed/nvme/ds_aio_handle.py b/deepspeed/nvme/ds_aio_handle.py new file mode 100755 index 0000000000000..47c0cd709ec5a --- /dev/null +++ b/deepspeed/nvme/ds_aio_handle.py @@ -0,0 +1,222 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" + +import torch +import os +import time +from multiprocessing import Pool, Barrier +from deepspeed.ops.aio import AsyncIOBuilder +from deepspeed.ops.op_builder import GDSBuilder +from deepspeed.accelerator import get_accelerator +from .test_ds_aio_utils import report_results, task_log, task_barrier, create_filename, create_file + +BUFFER = 'buffer' +BOUNCE_BUFFER = 'bounce_buffer' + + +def pre_handle(args, tid, read_op): + io_string = "Read" if read_op else "Write" + gds = True if args.use_gds else False + device_id, folder = args.mapping_list[tid] + filename = create_filename(folder, args.read, args.io_size, tid) + if args.read and not (os.path.isfile(filename) and os.path.getsize(filename) == args.io_size): + create_file(filename, args.io_size) + + task_log(tid, f'Allocate tensor of size {args.io_size} bytes') + bounce_buffer = None + if args.gpu: + device_name = get_accelerator().device_name(device_id) + buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device=device_name) + if not (args.slow_bounce_buffer or gds): + bounce_buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, + device='cpu').pin_memory() + else: + buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device='cpu').pin_memory() + task_log(tid, + f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}', + force=True) + + io_parallel = args.io_parallel if args.io_parallel else 1 + if gds: + handle = GDSBuilder().load().gds_handle(args.block_size, args.queue_depth, args.single_submit, + not args.sequential_requests, io_parallel) + handle.pin_device_tensor(buffer) + else: + handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, + not args.sequential_requests, io_parallel) + task_log(tid, f'created deepspeed aio handle') + + ctxt = {} + ctxt['file'] = filename + ctxt['num_bytes'] = args.io_size + ctxt['handle'] = handle + ctxt['gds'] = gds + ctxt[BUFFER] = buffer + ctxt[BOUNCE_BUFFER] = bounce_buffer + ctxt['elapsed_sec'] = 0 + + return ctxt + + +def pre_handle_read(pool_params): + args, tid = pool_params + ctxt = pre_handle(args, tid, True) + return ctxt + + +def pre_handle_write(pool_params): + args, tid = pool_params + ctxt = pre_handle(args, tid, False) + return ctxt + + +def post_handle(pool_params): + _, _, ctxt = pool_params + for buf in [BUFFER, BOUNCE_BUFFER]: + if ctxt[buf] is not None: + if ctxt['gds']: + ctxt['handle'].unpin_device_tensor(ctxt[buf]) + ctxt[buf].detach() + ctxt[buf] = None + return ctxt + + +def main_parallel_read(pool_params): + args, tid, ctxt = pool_params + handle = ctxt['handle'] + + start_time = time.time() + dest_buffer = BOUNCE_BUFFER if ctxt[BOUNCE_BUFFER] is not None else BUFFER + ret = handle.pread(ctxt[dest_buffer], ctxt['file'], args.validate, True) + assert ret != -1 + handle.wait() + if dest_buffer == BOUNCE_BUFFER: + ctxt[BUFFER].data.copy_(ctxt[BOUNCE_BUFFER].data) + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + return ctxt + + +def main_parallel_write(pool_params): + args, tid, 
ctxt = pool_params + # Avoid overwriting existing files as it could be artificially faster + if os.path.isfile(ctxt['file']): + os.remove(ctxt['file']) + + handle = ctxt['handle'] + start_time = time.time() + if ctxt[BOUNCE_BUFFER] is not None: + source_buffer = BOUNCE_BUFFER + ctxt[BOUNCE_BUFFER].data.copy_(ctxt[BUFFER].data) + else: + source_buffer = BUFFER + ret = handle.pwrite(ctxt[source_buffer], ctxt['file'], args.validate, True) + assert ret != -1 + handle.wait() + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def main_handle_read(pool_parms): + args, tid, ctxt = pool_parms + handle = ctxt['handle'] + + start_time = time.time() + dest_buffer = BOUNCE_BUFFER if ctxt[BOUNCE_BUFFER] is not None else BUFFER + ret = handle.read(ctxt[dest_buffer], ctxt['file'], args.validate) + assert ret != -1 + if dest_buffer == BOUNCE_BUFFER: + ctxt[BUFFER].data.copy_(ctxt[BOUNCE_BUFFER].data) + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def main_handle_write(pool_parms): + args, tid, ctxt = pool_parms + # Avoid overwriting existing files as it could be artificially faster + if os.path.isfile(ctxt['file']): + os.remove(ctxt['file']) + + handle = ctxt['handle'] + start_time = time.time() + if ctxt[BOUNCE_BUFFER] is not None: + source_buffer = BOUNCE_BUFFER + ctxt[BOUNCE_BUFFER].data.copy_(ctxt[BUFFER].data) + else: + source_buffer = BUFFER + ret = handle.write(ctxt[source_buffer], ctxt['file'], args.validate) + assert ret != -1 + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def get_schedule(args, read_op): + schedule = {} + if read_op: + schedule['pre'] = pre_handle_read + schedule['post'] = post_handle + schedule['main'] = main_parallel_read + else: + schedule['pre'] = pre_handle_write + schedule['post'] = post_handle + schedule['main'] = main_parallel_write + + return schedule + + +def _aio_handle_tasklet(pool_params): + args, tid, read_op = pool_params + num_processes = len(args.mapping_dict) + + # Create schedule + schedule = get_schedule(args, read_op) + task_log(tid, f'schedule = {schedule}') + task_barrier(aio_barrier, num_processes) + + # Run pre task + task_log(tid, f'running pre-task') + ctxt = schedule["pre"]((args, tid)) + task_barrier(aio_barrier, num_processes) + + # Run main tasks in a loop + ctxt["main_task_sec"] = 0 + for i in range(args.loops): + task_log(tid, f'running main task {i}') + start_time = time.time() + ctxt = schedule["main"]((args, tid, ctxt)) + task_barrier(aio_barrier, num_processes) + stop_time = time.time() + ctxt["main_task_sec"] += stop_time - start_time + + # Run post task + task_log(tid, f'running post-task') + ctxt = schedule["post"]((args, tid, ctxt)) + task_barrier(aio_barrier, num_processes) + + return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops + + +def _init_tasklet(b): + global aio_barrier + aio_barrier = b + + +def aio_handle_multiprocessing(args, read_op): + num_processes = len(args.mapping_dict) + b = Barrier(num_processes) + pool_params = [(args, p, read_op) for p in range(num_processes)] + with Pool(processes=num_processes, initializer=_init_tasklet, initargs=(b, )) as p: + pool_results = p.map(_aio_handle_tasklet, pool_params) + + report_results(args, read_op, pool_results) diff --git a/deepspeed/nvme/ds_aio_job.py b/deepspeed/nvme/ds_aio_job.py new file mode 100644 index 0000000000000..0f9c8b5f1bcc5 --- /dev/null +++ b/deepspeed/nvme/ds_aio_job.py @@ -0,0 +1,50 @@ +# Copyright (c) 
Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping tensors to/from (NVMe) storage devices. +""" +import subprocess +import shlex + + +class Job(object): + + def __init__(self, cmd_line, output_file=None, work_dir=None): + self.cmd_line = cmd_line + self.output_file = output_file + self.work_dir = work_dir + self.output_fd = None + + def cmd(self): + return self.cmd_line + + def get_stdout(self): + return self.output_fd + + def get_stderr(self): + return self.output_fd + + def get_cwd(self): + return self.work_dir + + def open_output_file(self): + if self.output_file is not None: + self.output_fd = open(self.output_file, 'w') + + def close_output_file(self): + if self.output_fd is not None: + self.output_fd.close() + self.output_fd = None + + +def run_job(job, verbose=False): + args = shlex.split(' '.join(job.cmd())) + if verbose: + print(f'args = {args}') + job.open_output_file() + proc = subprocess.run(args=args, stdout=job.get_stdout(), stderr=job.get_stderr(), cwd=job.get_cwd()) + job.close_output_file() + assert proc.returncode == 0, \ + f"This command failed: {job.cmd()}" diff --git a/deepspeed/nvme/parse_nvme_stats.py b/deepspeed/nvme/parse_nvme_stats.py new file mode 100755 index 0000000000000..09c79ada5b369 --- /dev/null +++ b/deepspeed/nvme/parse_nvme_stats.py @@ -0,0 +1,148 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" + +import os +import argparse + +READ_SPEED = 'read_speed' +WRITE_SPEED = 'write_speed' + +PERF_METRICS = [READ_SPEED, WRITE_SPEED] + +METRIC_SEARCH = {READ_SPEED: 'E2E Read Speed', WRITE_SPEED: 'E2E Write Speed'} + + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument('--log_dir', type=str, required=True, help='Folder of statistics logs') + + parser.add_argument('--metric', + type=str, + required=True, + help='Performance metric to report: [read_speed|write_speed]') + + args = parser.parse_args() + print(f'args = {args}') + + return args + + +def extract_value(key, file): + INVALID_PREFIXES = ["ds"] + for p in INVALID_PREFIXES: + if key.startswith(p): + return key + try: + if key[0] in ['t', 'd', 'p']: + return int(key[1:]) + if key.startswith("bs"): + if key.endswith('K'): + v = key[2:].split('K') + return int(v[0]) * 1024 + elif key.endswith('M'): + v = key[2:].split('M') + return int(v[0]) * 1024 * 1024 + else: + return int(key[2:]) + except: + print(f"{file}: extract_value fails on {key}") + return None + + return key + + +def get_file_key(file): + f, _ = os.path.splitext(os.path.basename(file)) + fields = f.split('_') + values = [extract_value(k, file) for k in fields] + return tuple(values) + + +def get_thread_count(file): + f, _ = os.path.splitext(os.path.basename(file)) + fields = f.split('_') + for key in fields: + if key[0] == 't': + return int(key[1:]) + return 1 + + +""" +Extract performance metric from log file. 
+Sample file lines are: +Task Read Latency = 0.031647682189941406 sec +Task Read Speed = 12.342926020792527 GB/sec +E2E Read Latency = 0.031697988510131836 sec +E2E Read Speed = 12.323337169333062 GB/sec + +For the above sample, -metric = "read_speed" corresponds to "E2E Read Speed", and 12.32 will be returned +""" + + +def get_metric(file, metric): + thread_count = get_thread_count(file) + with open(file) as f: + for line in f.readlines(): + if line.startswith(METRIC_SEARCH[metric]): + if metric in [READ_SPEED, WRITE_SPEED]: + fields = line.split() + return float(fields[-2]) + else: + fields = line.split('=') + return float(fields[-1]) + + return None + + +def validate_args(args): + if not args.metric in PERF_METRICS: + print(f'{args.metric} is not a valid performance metrics') + return False + + if not os.path.isdir(args.log_dir): + print(f'{args.log_dir} folder is not existent') + return False + + return True + + +def get_results(log_files, metric): + results = {} + for f in log_files: + file_key = get_file_key(f) + value = get_metric(f, metric) + results[file_key] = value + + return results + + +def get_sorted_results(log_dir, metric): + log_files = [f for f in os.listdir(log_dir) if os.path.isfile(os.path.join(log_dir, f))] + + log_files_path = [os.path.join(log_dir, f) for f in log_files] + results = get_results(log_files_path, metric) + result_keys = list(results.keys()) + sorted_keys = sorted(result_keys) + return sorted_keys, results + + +def main(): + print("Parsing aio statistics") + args = parse_arguments() + + if not validate_args(args): + quit() + + sorted_keys, results = get_sorted_results(args.log_dir, args.metric) + for k in sorted_keys: + print(f'{k} = {results[k]}') + + +if __name__ == "__main__": + main() diff --git a/deepspeed/nvme/perf_generate_param.py b/deepspeed/nvme/perf_generate_param.py new file mode 100644 index 0000000000000..d0313d728ad59 --- /dev/null +++ b/deepspeed/nvme/perf_generate_param.py @@ -0,0 +1,97 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" +import os +import argparse +import json +from .parse_nvme_stats import READ_SPEED, WRITE_SPEED, get_sorted_results +from .perf_sweep_utils import BENCH_LOG_DIR, READ_LOG_DIR, WRITE_LOG_DIR + + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument('--log_dir', + type=str, + default=BENCH_LOG_DIR, + help=f'Folder of performance sweep logs. 
Default is {os.path.join(".", BENCH_LOG_DIR)}') + parser.add_argument('--verbose', action='store_true', help='Print debugging information.') + + args = parser.parse_args() + if args.verbose: + print(f'args = {args}') + + return args + + +def validate_args(args): + for d in [READ_LOG_DIR, WRITE_LOG_DIR]: + log_dir = os.path.join(args.log_dir, d) + if not os.path.isdir(log_dir): + print(f'{log_dir} folder is not existent') + return False + + return True + + +def convert_to_param(key): + assert len(key) == 6 + return { + "single_submit": "true" if key[0] == "single" else "false", + "overlap_events": "true" if key[1] == "overlap" else "false", + "num_threads": int(key[5]), + "queue_depth": int(key[3]), + "block_size": int(key[4]) + } + + +def generate_aio_param(read_log_dir, write_log_dir): + _, read_results = get_sorted_results(read_log_dir, READ_SPEED) + _, write_results = get_sorted_results(write_log_dir, WRITE_SPEED) + combined_perf = {key[1:]: value for key, value in read_results.items()} + + for key, value in write_results.items(): + new_key = key[1:] + if new_key in combined_perf: + combined_perf[new_key] += value + else: + combined_perf[new_key] = 0 + + optimal_key = None + optimal_perf = 0.0 + for key, value in combined_perf.items(): + if value > optimal_perf: + optimal_perf = value + optimal_key = key + + aio_param = {"aio": convert_to_param(optimal_key)} + + read_perf_keys = {key[1:]: key for key in read_results.keys()} + write_perf_keys = {key[1:]: key for key in write_results.keys()} + optimal_config_read = read_results.get(read_perf_keys[optimal_key], None) + optimal_config_write = write_results.get(write_perf_keys[optimal_key], None) + + print(f'Best performance (GB/sec): read = {optimal_config_read:5.2f}, write = {optimal_config_write:5.2f}') + print(json.dumps(aio_param, indent=3)) + + +def generate_main(log_dir): + read_log_dir = os.path.join(log_dir, READ_LOG_DIR) + write_log_dir = os.path.join(log_dir, WRITE_LOG_DIR) + generate_aio_param(read_log_dir, write_log_dir) + + +def main(): + args = parse_arguments() + if not validate_args(args): + quit() + print(f'Generate DeepNVMe configuration from {args.log_dir} logs') + generate_main(args.log_dir) + + +if __name__ == "__main__": + generate_main() diff --git a/deepspeed/nvme/perf_run_sweep.py b/deepspeed/nvme/perf_run_sweep.py new file mode 100644 index 0000000000000..0155a4d46caee --- /dev/null +++ b/deepspeed/nvme/perf_run_sweep.py @@ -0,0 +1,320 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
+""" +import os +import sys +import argparse +import json +import itertools +import shutil + +from deepspeed.ops.op_builder import AsyncIOBuilder +from deepspeed.ops.op_builder import GDSBuilder +from .ds_aio_job import Job, run_job +from .perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \ + READ_LOG_DIR, WRITE_LOG_DIR + +OTHER_OPTIONS = '--handle' +PERF_SCRIPT = 'ds_io' +DEFAULT_SWEEP_CONFIG = { + "block_size": ["1M", "8M"], + "queue_depth": [32, 128], + "sequential_requests": [False], + "single_submit": [False], + "io_parallel": [1, 8], +} + + +class SweepConfig(object): + + def __init__(self, args): + self.folder_to_device_mapping = get_ftd_map(args.nvme_dir) + self.search_space = get_sweep_config_dict(args.sweep_config) + self.search_space.update(self.folder_to_device_mapping) + self.read = not args.no_read + self.write = not args.no_write + self.flush_cache = args.flush_page_cache + self.log_dir = args.log_dir + self.verbose = args.verbose + self.other_options = f'{OTHER_OPTIONS} --loops {args.loops} --io_size {args.io_size}' + if args.gpu: + self.other_options += ' --gpu' + if args.gds: + self.other_options += ' --use_gds' + + +def validate_arguments(args): + if not async_io_setup(): + error_msg = """ + Failing because environment is not properly configured for deepspeed async i/o module. + Possible fix: apt install libaio-dev. + """ + print(error_msg) + quit() + + if args.gds and not gds_io_setup(): + error_msg = """ + Failing because environment is not properly configured for deepspeed GDS I/O operator. + """ + print(error_msg) + quit() + + +def parse_sweep_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument('--nvme_dir', + nargs='+', + required=True, + help='Directory in which to perform I/O tests. A writeable directory on a NVMe device.') + + parser.add_argument('--sweep_config', type=str, default=None, help='Performance sweep configuration json file.') + + parser.add_argument('--no_read', action='store_true', help='Disable read performance measurements.') + + parser.add_argument('--no_write', action='store_true', help='Disable write performance measurements.') + + parser.add_argument('--io_size', + type=str, + default="400M", + help='Number of I/O bytes to read/write for performance measurements.') + + parser.add_argument('--gpu', action='store_true', help='Test tensor transfers between GPU device and NVME device.') + + parser.add_argument('--gds', action='store_true', help='Run the sweep over NVIDIA GPUDirectStorage operator') + + parser.add_argument( + '--flush_page_cache', + action='store_true', + help= + 'Page cache will not be flushed and reported read speeds may be higher than actual ***Requires sudo access***.' + ) + + parser.add_argument( + '--log_dir', + type=str, + default=BENCH_LOG_DIR, + help=f'Output directory for performance log files. 
Default is {os.path.join(".", BENCH_LOG_DIR)}') + + parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions') + + parser.add_argument('--verbose', action='store_true', help='Print debugging information.') + + args = parser.parse_args() + if args.verbose: + print(f'args = {args}') + validate_arguments(args) + + return args + + +def dump_cmd_lines(cmd_lines): + print(f'cmd line count = {len(cmd_lines)}') + for i, cmd in enumerate(cmd_lines): + print(f'{i}: {cmd}') + + +def get_ftd_map(nvme_dir_list): + ftd_list = [f'{dir}:{dev}' for dev, dir in enumerate(nvme_dir_list)] + ftd_arg = [' '.join(ftd for ftd in ftd_list)] + return {'folder_to_device_mapping': ftd_arg} + + +def get_sweep_config_dict(sweep_config_json): + if sweep_config_json is None: + return DEFAULT_SWEEP_CONFIG + + with open(sweep_config_json) as fp: + sweep_config = json.load(fp) + return sweep_config + + +def get_sweep_cmd_lines(sweep_config_dict): + + def flatten_options(key, value_list): + flat_list = [] + for v in value_list: + if not type(v) is bool: + flat_list.append(f'--{key} {v}') + elif v: + flat_list.append(f'--{key}') + else: + flat_list.append(' ') + + return flat_list + + flat_list = [flatten_options(key, value) for key, value in sweep_config_dict.items()] + cmd_list = list(itertools.product(*flat_list)) + cmd_list = [list(cmd) for cmd in cmd_list] + #dump_cmd_lines(cmd_list) + return cmd_list + + +def launch_sweep(sweep_jobs, sync_job, flush_cache_job, verbose): + for perf_job in sweep_jobs: + if flush_cache_job is not None: + run_job(sync_job, verbose) + run_job(flush_cache_job, verbose) + + run_job(perf_job, verbose) + + run_job(sync_job, verbose) + + +def create_cmd_tags(cmd_line): + tags = {} + for param_value in cmd_line: + fields = param_value.split() + if len(fields) == 1: + tags[fields[0]] = None + elif len(fields) == 2: + if fields[0] == '--folder_to_device_mapping': + tags[fields[0]] = len(fields[1:]) + else: + tags[fields[0]] = fields[1] + elif len(fields) > 2: + tags[fields[0]] = len(fields[1:]) + return tags + + +def get_log_file(io_op_desc, cmd_line): + QUEUE_DEPTH = "--queue_depth" + BLOCK_SIZE = "--block_size" + SINGLE_SUBMIT = "--single_submit" + SEQUENTIAL_REQUESTS = "--sequential_requests" + FTD_MAP = "--folder_to_device_mapping" + IO_PARALLEL = "--io_parallel" + + tag_map = { + QUEUE_DEPTH: "d", + BLOCK_SIZE: "bs", + SINGLE_SUBMIT: "single", + SEQUENTIAL_REQUESTS: "sequential", + FTD_MAP: "ftd", + IO_PARALLEL: "p" + } + + tag_default = { + QUEUE_DEPTH: 1, + BLOCK_SIZE: "1M", + SINGLE_SUBMIT: "block", + SEQUENTIAL_REQUESTS: "overlap", + FTD_MAP: 1, + IO_PARALLEL: 1 + } + + def get_default_value(tag): + value = tag_default[tag] + if tag in [SINGLE_SUBMIT, SEQUENTIAL_REQUESTS]: + return value + return f'{tag_map[tag]}{value}' + + def get_config_value(tag, value): + tag_key = tag_map[tag] + if value is None: + return tag_key + return f'{tag_key}{value}' + + tag_list = [SINGLE_SUBMIT, SEQUENTIAL_REQUESTS, FTD_MAP, QUEUE_DEPTH, BLOCK_SIZE, IO_PARALLEL] + log_tags = [io_op_desc] + cmd_tags = create_cmd_tags(cmd_line) + for tag in tag_list: + if tag in cmd_tags: + log_tags.append(get_config_value(tag, cmd_tags[tag])) + else: + log_tags.append(get_default_value(tag)) + + log_file = '_'.join(log_tags) + log_file += '.txt' + return log_file + + +def create_perf_jobs(io_op_desc, log_dir, cmd_lines): + py_cmd = [os.path.join(script_path(), PERF_SCRIPT)] + + perf_jobs = [] + for cmd in cmd_lines: + log_file = os.path.join(log_dir, get_log_file(io_op_desc, cmd)) + job = 
Job(cmd_line=py_cmd + cmd, output_file=log_file) + perf_jobs.append(job) + + return perf_jobs + + +def script_path(): + return os.path.dirname(os.path.realpath(sys.argv[0])) + + +def async_io_setup(): + return AsyncIOBuilder().is_compatible() + + +def gds_io_setup(): + return GDSBuilder().is_compatible() + + +def remove_folder(folder): + assert os.path.isdir(folder), f"Error: cannot remove {folder} - folder not found" + shutil.rmtree(folder) + + +def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines): + read_cmd_lines = [[f'--read {sweep_config.other_options}'] + cmd for cmd in cmd_lines] + # dump_cmd_lines(cmd_lines) + + log_folder = os.path.join(sweep_config.log_dir, f'{READ_LOG_DIR}') + os.makedirs(log_folder, exist_ok=True) + + perf_jobs = create_perf_jobs(io_op_desc=READ_OP_DESC, log_dir=log_folder, cmd_lines=read_cmd_lines) + + launch_sweep(sweep_jobs=perf_jobs, + sync_job=sync_job, + flush_cache_job=flush_cache_job, + verbose=sweep_config.verbose) + + +def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines): + write_cmd_lines = [[f'{sweep_config.other_options}'] + cmd for cmd in cmd_lines] + # dump_cmd_lines(write_cmd_lines) + + log_folder = os.path.join(sweep_config.log_dir, f'{WRITE_LOG_DIR}') + os.makedirs(log_folder, exist_ok=True) + + perf_jobs = create_perf_jobs(io_op_desc=WRITE_OP_DESC, log_dir=log_folder, cmd_lines=write_cmd_lines) + + launch_sweep(sweep_jobs=perf_jobs, + sync_job=sync_job, + flush_cache_job=flush_cache_job, + verbose=sweep_config.verbose) + + +def sweep_main(args): + sweep_config = SweepConfig(args) + cmd_lines = get_sweep_cmd_lines(sweep_config.search_space) + + if sweep_config.flush_cache: + flush_cache_job = Job(cmd_line=['sudo', 'bash -c', "'echo 1 > /proc/sys/vm/drop_caches'"]) + else: + flush_cache_job = None + + sync_job = Job(cmd_line=['sync']) + + if sweep_config.read: + run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines) + + if sweep_config.write: + run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines) + + +def main(): + args = parse_sweep_arguments() + print(f"Running DeepNVMe performance sweep on {args.nvme_dir}") + sweep_main(args) + + +if __name__ == "__main__": + sweep_main() diff --git a/deepspeed/nvme/perf_sweep_utils.py b/deepspeed/nvme/perf_sweep_utils.py new file mode 100644 index 0000000000000..e6832c1baa492 --- /dev/null +++ b/deepspeed/nvme/perf_sweep_utils.py @@ -0,0 +1,13 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +SCRIPT_PREFIX = '_aio_bench' +WRITE_OP_DESC = 'write' +READ_OP_DESC = 'read' +READ_IO_DIR = f'{SCRIPT_PREFIX}_{READ_OP_DESC}_io' +WRITE_IO_DIR = f'{SCRIPT_PREFIX}_{WRITE_OP_DESC}_io' +BENCH_LOG_DIR = f'{SCRIPT_PREFIX}_logs' +READ_LOG_DIR = f'{SCRIPT_PREFIX}_{READ_OP_DESC}_logs' +WRITE_LOG_DIR = f'{SCRIPT_PREFIX}_{WRITE_OP_DESC}_logs' diff --git a/deepspeed/nvme/test_ds_aio.py b/deepspeed/nvme/test_ds_aio.py new file mode 100755 index 0000000000000..a17350414739c --- /dev/null +++ b/deepspeed/nvme/test_ds_aio.py @@ -0,0 +1,25 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
+""" + +import multiprocessing as mp +from .ds_aio_basic import aio_basic_multiprocessing +from .ds_aio_handle import aio_handle_multiprocessing +from .ds_aio_args import get_validated_args + + +def ds_io_main(): + print(f'Testing deepspeed_aio python frontend') + + args = get_validated_args() + mp.set_start_method('spawn') + multiprocess_function = aio_handle_multiprocessing if args.handle else aio_basic_multiprocessing + multiprocess_function(args, args.read) + + +if __name__ == "__main__": + ds_io_main() diff --git a/deepspeed/nvme/test_ds_aio_utils.py b/deepspeed/nvme/test_ds_aio_utils.py new file mode 100755 index 0000000000000..cf167f6474603 --- /dev/null +++ b/deepspeed/nvme/test_ds_aio_utils.py @@ -0,0 +1,81 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" + +import os +from .ds_aio_job import Job, run_job + +BYTES_PER_GB = 1024**3 +BYTES_PER_MB = 1024**2 +BYTES_PER_KB = 1024 +LOG_TIDS = [0] + + +def task_log(tid, msg, force=False): + if force or tid in LOG_TIDS: + print(f'tid {tid}: {msg}') + + +def task_barrier(barrier, num_parties): + assert barrier.parties == num_parties + barrier.wait() + assert barrier.broken == False + + +def report_results(args, read_op, pool_results): + #print(f'pool_results = {pool_results}') + io_string = 'Read' if read_op else 'Write' + if None in pool_results: + print(f'Failure in one of {args.threads} {io_string} processes') + return + + total_bytes = sum([num_bytes for _, _, num_bytes in pool_results]) + + task_latency_sec = max([sec for _, sec, _ in pool_results]) + task_speed_GB = 0 if task_latency_sec == 0 else total_bytes / task_latency_sec / BYTES_PER_GB + print(f'Task {io_string} Latency = {task_latency_sec} sec') + print(f'Task {io_string} Speed = {task_speed_GB} GB/sec') + + e2e_latency_sec = max([sec for sec, _, _ in pool_results]) + e2e_speed_GB = 0 if e2e_latency_sec == 0 else total_bytes / e2e_latency_sec / BYTES_PER_GB + print(f'E2E {io_string} Latency = {e2e_latency_sec} sec') + print(f'E2E {io_string} Speed = {e2e_speed_GB} GB/sec') + + +def get_block_size_and_count(io_bytes): + if io_bytes > BYTES_PER_MB and io_bytes % BYTES_PER_MB == 0: + block_size = BYTES_PER_MB + block_size_string = '1M' + else: + assert io_bytes % BYTES_PER_KB == 0 + block_size = BYTES_PER_KB + block_size_string = '1K' + block_count = io_bytes / block_size + + return block_size_string, int(block_count) + + +def refine_integer_value(value): + unit_dict = {'K': 1024, 'M': 1024**2, 'G': 1024**3} + + if value[-1] in list(unit_dict.keys()): + int_value = int(value[:-1]) * unit_dict[value[-1]] + return int_value + return int(value) + + +def create_filename(folder, read_op, size, tid): + io_string = "read" if read_op else "write" + return os.path.join(folder, f'_aio_{io_string}_{size}.pt.{tid}') + + +def create_file(filename, num_bytes): + block_size, block_count = get_block_size_and_count(num_bytes) + dd_job = Job(cmd_line=[f'dd if=/dev/urandom of={filename} bs={block_size} count={block_count}']) + print(f'[Start] Create {filename} of {num_bytes} bytes by running {dd_job.cmd()} ....') + run_job(dd_job) + print(f'[Done] Create read file of {num_bytes} bytes by running {dd_job.cmd()} ....') diff --git a/deepspeed/nvme/validate_async_io.py b/deepspeed/nvme/validate_async_io.py new file mode 100644 index 0000000000000..10fb638347bcd --- /dev/null +++ b/deepspeed/nvme/validate_async_io.py @@ -0,0 +1,10 @@ +# Copyright (c) Microsoft 
Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" +from deepspeed.ops.op_builder import AsyncIOBuilder +assert AsyncIOBuilder().is_compatible() +assert AsyncIOBuilder().load() diff --git a/docs/_tutorials/deepnvme.md b/docs/_tutorials/deepnvme.md index 480bcf2d95dfe..70c6ac0979635 100644 --- a/docs/_tutorials/deepnvme.md +++ b/docs/_tutorials/deepnvme.md @@ -188,7 +188,7 @@ This tutorial has been significantly improved by feedback from [Guanhua Wang](ht ## Appendix ### Advanced Handle Creation -Achieving peak I/O performance with DeepNVMe requires careful configuration of handle creation. In particular, the parameters of `aio_handle` and `gds_handle` constructors are performance-critical because they determine how efficiently DeepNVMe interacts with the underlying storage subsystem (i.e., `libaio`, GDS, and SSD). For convenience we make it possible to create handles using default parameter values which will provide decent performance in most scenarios. However, squeezing out every available performance in your environment will likely require tuning the constructor parameters, namely `block_size`, `queue_depth`, `single_submit`, `overlap_events`, and `num_threads`. The `aio_handle` constructor parameters and default values are illustrated below: +Achieving peak I/O performance with DeepNVMe requires careful configuration of handle creation. In particular, the parameters of `aio_handle` and `gds_handle` constructors are performance-critical because they determine how efficiently DeepNVMe interacts with the underlying storage subsystem (i.e., `libaio`, GDS, PCIe, and SSD). For convenience we make it possible to create handles using default parameter values which will provide decent performance in most scenarios. However, squeezing out every available performance in your environment will likely require tuning the constructor parameters, namely `block_size`, `queue_depth`, `single_submit`, `overlap_events`, and `num_threads`. The `aio_handle` constructor parameters and default values are illustrated below: ```bash >>> from deepspeed.ops.op_builder import AsyncIOBuilder >>> help(AsyncIOBuilder().load().aio_handle()) @@ -208,6 +208,56 @@ class aio_handle(pybind11_builtins.pybind11_object) | AIO handle constructor ``` +### Performance Tuning +As discussed [earlier](#advanced-handle-creation), achieving peak DeepNVMe performance for a target workload or environment requires using optimally configured `aio_handle` or `gds_handle` handles. For configuration convenience, we provide a utility called `ds_nvme_tune` to automate the discovery of optimal DeepNVMe configurations. `ds_nvme_tune` automatically explores a user-specified or default configuration space and recommends the option that provides the best read and write performance. Below is an example usage of `ds_nvme_tune` to tune `aio_handle` data transfers between GPU memory and a local NVVMe SSD mounted on `/local_nvme`. This example used the default configuration space of `ds_nvme_tune` for tuning. 
+ +```bash +$ ds_nvme_tune --nvme_dir /local_nvme --gpu +Running DeepNVMe performance tuning on ['/local_nvme/'] +Best performance (GB/sec): read = 3.69, write = 3.18 +{ + "aio": { + "single_submit": "false", + "overlap_events": "true", + "num_threads": 8, + "queue_depth": 32, + "block_size": 1048576 + } +} +``` +The above tuning was executed on a Lambda workstation equipped with two NVIDIA A6000-48GB GPUs, 252GB of DRAM, and a [CS3040 NVMe 2TB SDD](https://www.pny.com/CS3040-M2-NVMe-SSD?sku=M280CS3040-2TB-RB) with peak read and write speeds of 5.6 GB/s and 4.3 GB/s respectively. The tuning required about four and half minutes. Based on the results, one can expect to achieve read and write transfer speeds of 3.69 GB/sec and 3.18 GB/sec respectively by using an `aio_handle` configured as below. + +```python +>>> from deepspeed.ops.op_builder import AsyncIOBuilder +>>> h = AsyncIOBuilder().load().aio_handle(block_size=1048576, + queue_depth=32, + single_submit=False, + overlap_events=True, + num_threads=8) +``` + + +The full command line options of `ds_nvme_tune` can be obtained via the normal `-h` or `--help`. +```bash +usage: ds_nvme_tune [-h] --nvme_dir NVME_DIR [NVME_DIR ...] [--sweep_config SWEEP_CONFIG] [--no_read] [--no_write] [--io_size IO_SIZE] [--gpu] [--gds] [--flush_page_cache] [--log_dir LOG_DIR] [--loops LOOPS] [--verbose] + +options: + -h, --help show this help message and exit + --nvme_dir NVME_DIR [NVME_DIR ...] + Directory in which to perform I/O tests. A writeable directory on a NVMe device. + --sweep_config SWEEP_CONFIG + Performance sweep configuration json file. + --no_read Disable read performance measurements. + --no_write Disable write performance measurements. + --io_size IO_SIZE Number of I/O bytes to read/write for performance measurements. + --gpu Test tensor transfers between GPU device and NVME device. + --gds Run the sweep over NVIDIA GPUDirectStorage operator + --flush_page_cache Page cache will not be flushed and reported read speeds may be higher than actual ***Requires sudo access***. + --log_dir LOG_DIR Output directory for performance log files. Default is ./_aio_bench_logs + --loops LOOPS Count of operation repetitions + --verbose Print debugging information. +``` + ### DeepNVMe APIs For convenience, we provide listing and brief descriptions of the DeepNVMe APIs. diff --git a/setup.py b/setup.py index 1b6768d1b2c3d..e39d8c7e05a3c 100755 --- a/setup.py +++ b/setup.py @@ -298,7 +298,7 @@ def op_enabled(op_name): else: scripts = [ 'bin/deepspeed', 'bin/deepspeed.pt', 'bin/ds', 'bin/ds_ssh', 'bin/ds_report', 'bin/ds_bench', 'bin/dsr', - 'bin/ds_elastic' + 'bin/ds_elastic', 'bin/ds_nvme_tune', 'bin/ds_io' ] start_time = time.time() From 0fbe96a5022d30470b8cb53f65237389ba26c57a Mon Sep 17 00:00:00 2001 From: andyG <135115931+Andy666G@users.noreply.github.com> Date: Thu, 26 Sep 2024 21:10:52 +0800 Subject: [PATCH 04/43] [Accelerator] Cambricon MLU support (#6472) ### Description This PR includes Cambricon MLU accelerator support. With this PR, DeepSpeed supports MLU as backend for training and inference tasks. 
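For illustration, a minimal sketch of how the new backend is selected at runtime (assumes a machine with `torch_mlu` installed; `DS_ACCELERATOR=mlu` can also be exported to force the choice):

```python
# Sketch: the MLU accelerator is auto-detected when torch_mlu is importable,
# or forced via the DS_ACCELERATOR=mlu environment variable.
from deepspeed.accelerator import get_accelerator

acc = get_accelerator()
print(acc.device_name(0))                # 'mlu:0' when MLU is selected
print(acc.communication_backend_name())  # 'cncl'
print(acc.is_available(), acc.device_count())
```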
--------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- accelerator/mlu_accelerator.py | 300 ++++++++++++++++++++++++++++++++ accelerator/real_accelerator.py | 18 +- op_builder/mlu/__init__.py | 12 ++ op_builder/mlu/builder.py | 35 ++++ op_builder/mlu/cpu_adagrad.py | 24 +++ op_builder/mlu/cpu_adam.py | 28 +++ op_builder/mlu/fused_adam.py | 39 +++++ op_builder/mlu/no_impl.py | 34 ++++ 8 files changed, 489 insertions(+), 1 deletion(-) create mode 100644 accelerator/mlu_accelerator.py create mode 100644 op_builder/mlu/__init__.py create mode 100644 op_builder/mlu/builder.py create mode 100644 op_builder/mlu/cpu_adagrad.py create mode 100644 op_builder/mlu/cpu_adam.py create mode 100644 op_builder/mlu/fused_adam.py create mode 100644 op_builder/mlu/no_impl.py diff --git a/accelerator/mlu_accelerator.py b/accelerator/mlu_accelerator.py new file mode 100644 index 0000000000000..bef716f0ee4e4 --- /dev/null +++ b/accelerator/mlu_accelerator.py @@ -0,0 +1,300 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import importlib +import inspect +import functools + +from .abstract_accelerator import DeepSpeedAccelerator +import torch +# During setup stage torch may not be installed, pass on no torch will +# allow op builder related API to be executed. + + +class MLU_Accelerator(DeepSpeedAccelerator): + + def __init__(self): + self._name = 'mlu' + self._communication_backend_name = 'cncl' + self._compile_backend = "inductor" + self.class_dict = None + + def is_synchronized_device(self): + return False + + def use_host_timers(self): + return self.is_synchronized_device() + + def resolves_data_dependency(self): + return self.is_synchronized_device() + + def handles_memory_backpressure(self): + return self.is_synchronized_device() + + # Device APIs + def device_name(self, device_index=None): + if device_index == None: + return 'mlu' + return 'mlu:{}'.format(device_index) + + def device(self, device_index=None): + return torch.mlu.device(device_index) + + def set_device(self, device_index): + torch.mlu.set_device(device_index) + + def current_device(self): + return torch.mlu.current_device() + + def current_device_name(self): + return 'mlu:{}'.format(torch.mlu.current_device()) + + def device_count(self): + return torch.mlu.device_count() + + def synchronize(self, device_index=None): + return torch.mlu.synchronize(device_index) + + # RNG APIs + def random(self): + return torch.random + + def set_rng_state(self, new_state, device_index=None): + if device_index is None: + return torch.mlu.set_rng_state(new_state) + + return torch.mlu.set_rng_state(new_state, device_index) + + def get_rng_state(self, device_index=None): + if device_index is None: + return torch.mlu.get_rng_state() + + return torch.mlu.get_rng_state(device_index) + + def manual_seed(self, seed): + return torch.mlu.manual_seed(seed) + + def manual_seed_all(self, seed): + return torch.mlu.manual_seed_all(seed) + + def initial_seed(self, seed): + return torch.mlu.initial_seed(seed) + + def default_generator(self, device_index): + return torch.mlu.default_generators[device_index] + + # Streams/Events + @property + def Stream(self): + return torch.mlu.Stream + + def stream(self, stream): + return torch.mlu.stream(stream) + + def current_stream(self, device_index=None): + return torch.mlu.current_stream(device_index) + + def default_stream(self, device_index=None): + return torch.mlu.default_stream(device_index) + + @property + def Event(self): + return 
torch.mlu.Event + + # Memory management + def empty_cache(self): + return torch.mlu.empty_cache() + + def memory_allocated(self, device_index=None): + return torch.mlu.memory_allocated(device_index) + + def max_memory_allocated(self, device_index=None): + return torch.mlu.max_memory_allocated(device_index) + + def reset_max_memory_allocated(self, device_index=None): + return torch.mlu.reset_max_memory_allocated(device_index) + + def memory_cached(self, device_index=None): + return torch.mlu.memory_cached(device_index) + + def max_memory_cached(self, device_index=None): + return torch.mlu.max_memory_cached(device_index) + + def reset_max_memory_cached(self, device_index=None): + return torch.mlu.reset_max_memory_cached(device_index) + + def memory_stats(self, device_index=None): + if hasattr(torch.mlu, 'memory_stats'): + return torch.mlu.memory_stats(device_index) + + def reset_peak_memory_stats(self, device_index=None): + if hasattr(torch.mlu, 'reset_peak_memory_stats'): + return torch.mlu.reset_peak_memory_stats(device_index) + + def memory_reserved(self, device_index=None): + if hasattr(torch.mlu, 'memory_reserved'): + return torch.mlu.memory_reserved(device_index) + + def max_memory_reserved(self, device_index=None): + if hasattr(torch.mlu, 'max_memory_reserved'): + return torch.mlu.max_memory_reserved(device_index) + + def total_memory(self, device_index=None): + return torch.mlu.get_device_properties(device_index).total_memory + + def available_memory(self, device_index=None): + return self.total_memory(device_index) - self.memory_allocated(device_index) + + # Data types + def is_bf16_supported(self): + return torch.mlu.is_bf16_supported() + + def is_fp16_supported(self): + return True + + def supported_dtypes(self): + supported_dtypes = [torch.float] + if self.is_fp16_supported(): + supported_dtypes.append(torch.half) + if self.is_bf16_supported(): + supported_dtypes.append(torch.bfloat16) + return supported_dtypes + + # Misc + def amp(self): + if hasattr(torch.mlu, 'amp'): + return torch.mlu.amp + return None + + def is_available(self): + return torch.mlu.is_available() + + def range_push(self, msg): + if hasattr(torch.mlu.cnpx, 'range_push'): + return torch.mlu.cnpx.range_push(msg) + + def range_pop(self): + if hasattr(torch.mlu.cnpx, 'range_pop'): + return torch.mlu.cnpx.range_pop() + + def lazy_call(self, callback): + return torch.mlu._lazy_call(callback) + + def communication_backend_name(self): + return self._communication_backend_name + + def is_triton_supported(self): + return True + + # Graph operations + def create_graph(self): + torch.mlu.MLUGraph() + + def capture_to_graph(self, graph, pool=None, stream=None): + return torch.mlu.graph(graph, pool, stream) + + def replay_graph(self, graph): + graph.replay() + return + + # Tensor operations + + @property + def BFloat16Tensor(self): + return functools.partial(torch.tensor, dtype=torch.bfloat16, device='mlu') + + @property + def ByteTensor(self): + return functools.partial(torch.tensor, dtype=torch.uint8, device='mlu') + + @property + def DoubleTensor(self): + return functools.partial(torch.tensor, dtype=torch.double, device='mlu') + + @property + def FloatTensor(self): + return functools.partial(torch.tensor, dtype=torch.float, device='mlu') + + @property + def HalfTensor(self): + return functools.partial(torch.tensor, dtype=torch.half, device='mlu') + + @property + def IntTensor(self): + return functools.partial(torch.tensor, dtype=torch.int, device='mlu') + + @property + def LongTensor(self): + return 
functools.partial(torch.tensor, dtype=torch.long, device='mlu') + + def pin_memory(self, tensor): + return tensor.pin_memory() + + def is_pinned(self, tensor): + return tensor.is_pinned() + + def on_accelerator(self, tensor): + device_str = str(tensor.device) + if device_str.startswith('mlu:'): + return True + else: + return False + + def op_builder_dir(self): + try: + # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 # type: ignore + return "op_builder.mlu" + except ImportError: + return "deepspeed.ops.op_builder.mlu" + + def _lazy_init_class_dict(self): + if self.class_dict: + return + + op_builder_module = importlib.import_module(self.op_builder_dir()) + + # get op builder class from op_builder/mlu/__init__.py + self.class_dict = {} + for class_name, class_obj in inspect.getmembers(op_builder_module, inspect.isclass): + self.class_dict[class_name] = class_obj + + # create an instance of op builder and return, name specified by class_name + def create_op_builder(self, class_name): + builder_class = self.get_op_builder(class_name) + return builder_class() + + # return an op builder class, name specified by class_name + def get_op_builder(self, class_name): + self._lazy_init_class_dict() + if class_name in self.class_dict: + return self.class_dict[class_name] + else: + return self.class_dict['NotImplementedBuilder'] + + def build_extension(self): + from torch.utils.cpp_extension import BuildExtension + return BuildExtension + + def export_envs(self): + return ['NEUWARE_HOME', 'CNCL', 'LD_LIBRARY', 'PATH'] + + def visible_devices_envs(self): + return ['MLU_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) + + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends }") diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py index 037162e867ecc..69e96d285bb8a 100644 --- a/accelerator/real_accelerator.py +++ b/accelerator/real_accelerator.py @@ -20,7 +20,7 @@ except ImportError as e: dsa2 = None -SUPPORTED_ACCELERATOR_LIST = ['cuda', 'cpu', 'xpu', 'xpu.external', 'npu', 'mps', 'hpu'] +SUPPORTED_ACCELERATOR_LIST = ['cuda', 'cpu', 'xpu', 'xpu.external', 'npu', 'mps', 'hpu', 'mlu'] ds_accelerator = None @@ -94,6 +94,11 @@ def get_accelerator(): except ImportError as e: raise ValueError( f"HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.") + elif accelerator_name == "mlu": + try: + import torch_mlu # noqa: F401 + except ImportError as e: + raise ValueError(f"MLU_Accelerator requires torch_mlu, which is not installed on this system.") elif accelerator_name not in SUPPORTED_ACCELERATOR_LIST: raise ValueError(f'DS_ACCELERATOR must be one of {SUPPORTED_ACCELERATOR_LIST}. 
' f'Value "{accelerator_name}" is not supported') @@ -149,6 +154,13 @@ def get_accelerator(): accelerator_name = "hpu" except ImportError as e: pass + if accelerator_name is None: + try: + import torch_mlu # noqa: F401,F811 + + accelerator_name = "mlu" + except ImportError as e: + pass if accelerator_name is None: # borrow this log from PR#5084 try: @@ -198,6 +210,10 @@ def get_accelerator(): from .hpu_accelerator import HPU_Accelerator ds_accelerator = HPU_Accelerator() + elif accelerator_name == 'mlu': + from .mlu_accelerator import MLU_Accelerator + + ds_accelerator = MLU_Accelerator() _validate_accelerator(ds_accelerator) if accel_logger is not None: accel_logger.info(f"Setting ds_accelerator to {ds_accelerator._name} ({ds_set_method})") diff --git a/op_builder/mlu/__init__.py b/op_builder/mlu/__init__.py new file mode 100644 index 0000000000000..db12afbbf20ec --- /dev/null +++ b/op_builder/mlu/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) Microsoft Corporation. +# Copyright (c) 2024 Cambricon Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +'''Copyright The Microsoft DeepSpeed Team''' + +# MLU related operators will be added in the future. +from .no_impl import NotImplementedBuilder +from .cpu_adagrad import CPUAdagradBuilder +from .cpu_adam import CPUAdamBuilder +from .fused_adam import FusedAdamBuilder diff --git a/op_builder/mlu/builder.py b/op_builder/mlu/builder.py new file mode 100644 index 0000000000000..17b9723ffcc1e --- /dev/null +++ b/op_builder/mlu/builder.py @@ -0,0 +1,35 @@ +# Copyright (c) Microsoft Corporation. +# Copyright (c) 2024 Cambricon Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +try: + # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 # type: ignore + from op_builder.builder import OpBuilder +except ImportError: + from deepspeed.ops.op_builder.builder import OpBuilder + + +class MLUOpBuilder(OpBuilder): + + def builder(self): + from torch.utils.cpp_extension import CppExtension as ExtensionBuilder + + compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} + + cpp_ext = ExtensionBuilder(name=self.absolute_name(), + sources=self.strip_empty_entries(self.sources()), + include_dirs=self.strip_empty_entries(self.include_paths()), + libraries=self.strip_empty_entries(self.libraries_args()), + extra_compile_args=compile_args) + + return cpp_ext + + def cxx_args(self): + return ['-O3', '-g', '-Wno-reorder'] + + def libraries_args(self): + return [] diff --git a/op_builder/mlu/cpu_adagrad.py b/op_builder/mlu/cpu_adagrad.py new file mode 100644 index 0000000000000..68b7bbe514eea --- /dev/null +++ b/op_builder/mlu/cpu_adagrad.py @@ -0,0 +1,24 @@ +# Copyright (c) Microsoft Corporation. +# Copyright (c) 2024 Cambricon Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import MLUOpBuilder + + +class CPUAdagradBuilder(MLUOpBuilder): + BUILD_VAR = "DS_BUILD_CPU_ADAGRAD" + NAME = "cpu_adagrad" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adagrad.{self.NAME}_op' + + def sources(self): + return ['csrc/adagrad/cpu_adagrad.cpp'] + + def include_paths(self): + return ['csrc/includes'] diff --git a/op_builder/mlu/cpu_adam.py b/op_builder/mlu/cpu_adam.py new file mode 100644 index 0000000000000..b3c8e476bf396 --- /dev/null +++ b/op_builder/mlu/cpu_adam.py @@ -0,0 +1,28 @@ +# Copyright (c) Microsoft Corporation. +# Copyright (c) 2024 Cambricon Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import MLUOpBuilder + + +class CPUAdamBuilder(MLUOpBuilder): + BUILD_VAR = "DS_BUILD_CPU_ADAM" + NAME = "cpu_adam" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adam.{self.NAME}_op' + + def sources(self): + return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/cpu_adam_impl.cpp'] + + def libraries_args(self): + args = super().libraries_args() + return args + + def include_paths(self): + return ['csrc/includes'] diff --git a/op_builder/mlu/fused_adam.py b/op_builder/mlu/fused_adam.py new file mode 100644 index 0000000000000..0198db7cb2768 --- /dev/null +++ b/op_builder/mlu/fused_adam.py @@ -0,0 +1,39 @@ +# Copyright (c) Microsoft Corporation. +# Copyright (c) 2024 Cambricon Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import MLUOpBuilder +import torch + + +class MLUFusedAdam: + + @staticmethod + def multi_tensor_adam(chunk_size, noop_flag_buffer, tensor_lists, lr, beta1, beta2, epsilon, step, adam_w_mode, + bias_correction, weight_decay, *args): + + torch.ops.torch_mlu.fused_adam(noop_flag_buffer, tensor_lists[0], tensor_lists[1], tensor_lists[2], + tensor_lists[3], lr, beta1, beta2, epsilon, step, adam_w_mode, bias_correction, + weight_decay) + + +class FusedAdamBuilder(MLUOpBuilder): + BUILD_VAR = "DS_BUILD_FUSED_ADAM" + NAME = "fused_adam" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adam.{self.NAME}_op' + + def sources(self): + return [] + + def include_paths(self): + return [] + + def load(self, verbose=True): + return MLUFusedAdam diff --git a/op_builder/mlu/no_impl.py b/op_builder/mlu/no_impl.py new file mode 100644 index 0000000000000..375c148b4a5e0 --- /dev/null +++ b/op_builder/mlu/no_impl.py @@ -0,0 +1,34 @@ +# Copyright (c) Microsoft Corporation. +# Copyright (c) 2024 Cambricon Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import MLUOpBuilder + + +class NotImplementedBuilder(MLUOpBuilder): + BUILD_VAR = "DS_BUILD_NOT_IMPLEMENTED" + NAME = "deepspeed_not_implemented" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) + + def absolute_name(self): + return f'deepspeed.ops.comm.{self.NAME}_op' + + def load(self, verbose=True): + raise ValueError("This op had not been implemented on MLU backend.") + + def sources(self): + return [] + + def cxx_args(self): + return [] + + def extra_ldflags(self): + return [] + + def include_paths(self): + return [] From c85c8703bc49eefb78d8d64b5f0027f7b8acf9ff Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Date: Thu, 26 Sep 2024 06:11:24 -0700 Subject: [PATCH 05/43] Fix gradient accumulation for Z2+offload (#6550) The ZeRO 1/2 optimizer performs incorrect gradient accumulation in the path for ZeRO2 + Offloading. This issue is caused by two main reasons: 1) The micro_step_id in the ZeRO 1/2 optimizer is: - Initialized to 0 in the constructor. - Reset to -1 during the backward pass. For example, given a gradient accumulation step of 4, the micro_step_id changes as follows: - For the first global step: 1, 2, 3, 4. - Subsequently: 0, 1, 2, 3. 2) Gradients are copied to the buffer on the first micro step and accumulated in the buffer during the following micro steps. However, the current code incorrectly copies gradients at steps that are not at the accumulation boundary. This PR aligns the micro_step_id initialization in both the constructor and the backward pass, and corrects the condition for copying and accumulating gradients. Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepspeed/runtime/zero/stage_1_and_2.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index 83cf996ca0192..df7a2f83e3bcc 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -39,6 +39,7 @@ OPTIMIZER_GRADIENTS_TIMER = 'optimizer_gradients' OPTIMIZER_STEP_TIMER = 'optimizer_step' OPTIMIZER_TIMERS = [OPTIMIZER_ALLGATHER_TIMER, OPTIMIZER_GRADIENTS_TIMER, OPTIMIZER_STEP_TIMER] +INITIAL_MICRO_STEP_ID = -1 def input(msg): @@ -224,7 +225,7 @@ def __init__(self, self.gradient_predivide_factor = gradient_predivide_factor self.postscale_gradients = postscale_gradients self.gradient_accumulation_steps = gradient_accumulation_steps - self.micro_step_id = 0 + self.micro_step_id = INITIAL_MICRO_STEP_ID self.ignore_unused_parameters = ignore_unused_parameters self.round_robin_gradients = round_robin_gradients @@ -1231,9 +1232,7 @@ def copy_gradients_to_cpu(): if self.micro_step_id > 0: accumulate_gradients() - - # at the boundary we will send 32bit directly - if not self.is_gradient_accumulation_boundary: + else: copy_gradients_to_cpu() def set_norm_for_param_grad(self, param): @@ -1824,7 +1823,7 @@ def step(self, closure=None): """ Not supporting closure. 
""" - self.micro_step_id = -1 + self.micro_step_id = INITIAL_MICRO_STEP_ID see_memory_usage(f"In step before checking overflow") From ba58682a138760ee44b1366165fdbe4d87522323 Mon Sep 17 00:00:00 2001 From: Nir Sonnenschein Date: Thu, 26 Sep 2024 17:55:12 +0300 Subject: [PATCH 06/43] fix errors when setting zero3 leaf modules with torch.compile (#6564) When setting zero3 leaf modules to a higher level module and running with torch.compile, there are a few errors from ZeROOrderedDict. First it doesn't support Deep copy for not having a constructor with no parameters. Second, it doesn't check the existence of ds_status attr on param before accessing the attr. change contributed by Haifeng Chen Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepspeed/runtime/zero/parameter_offload.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepspeed/runtime/zero/parameter_offload.py b/deepspeed/runtime/zero/parameter_offload.py index 92702edb733b1..1ce2414a1e17c 100644 --- a/deepspeed/runtime/zero/parameter_offload.py +++ b/deepspeed/runtime/zero/parameter_offload.py @@ -38,7 +38,7 @@ def _apply_forward_and_backward_to_tensors_only(module, forward_function, backwa class ZeROOrderedDict(OrderedDict): - def __init__(self, parent_module, *args, **kwargs): + def __init__(self, parent_module=None, *args, **kwargs): """A replacement for ``collections.OrderedDict`` to detect external ZeRO params. Args: @@ -56,7 +56,7 @@ def __getitem__(self, key): if param is None: return param - if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: + if hasattr(param, "ds_status") and param.ds_status == ZeroParamStatus.NOT_AVAILABLE: if self._parent_module._parameters._in_forward: register_external_parameter(FWD_MODULE_STACK[-1], param) param.all_gather() From d45cfd34551537ce6f8317504bd520d7a2a1a588 Mon Sep 17 00:00:00 2001 From: Liangliang Ma Date: Fri, 27 Sep 2024 04:39:59 +0800 Subject: [PATCH 07/43] [XPU] Support DeepNVMe new code structure (#6532) In DeepNVMe GDS update, many functions are changed into a more abstract way. Also added some files. These change break zero-infinity on XPU. To bring this feature back, we have this PR: 1. modify the aio opbuilder for new files. 2. Add custom cpu_op_desc_t for xpu users. (XPU don't handle buffer aligned here) --------- Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- csrc/xpu/aio/deepspeed_cpu_op.cpp | 51 +++++++++++++++++++++++++++++++ op_builder/xpu/async_io.py | 17 ++++++++--- 2 files changed, 63 insertions(+), 5 deletions(-) create mode 100644 csrc/xpu/aio/deepspeed_cpu_op.cpp diff --git a/csrc/xpu/aio/deepspeed_cpu_op.cpp b/csrc/xpu/aio/deepspeed_cpu_op.cpp new file mode 100644 index 0000000000000..ee98c2d5cac2f --- /dev/null +++ b/csrc/xpu/aio/deepspeed_cpu_op.cpp @@ -0,0 +1,51 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include "deepspeed_cpu_op.h" + +using namespace std; + +cpu_op_desc_t::cpu_op_desc_t(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int file_num_bytes, + const int num_threads, + const bool validate) + : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate), + _cpu_buffer(buffer) +{ + // XPU don't handle buffer here. See XPU Accelerator pin_memory. 
+ _contiguous_buffer = _cpu_buffer.contiguous(); +} + +char* cpu_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); } + +void cpu_op_desc_t::finish() +{ + if (_read_op && _buffer.is_xpu()) { _buffer.copy_(_cpu_buffer.to(torch::kXPU)); } +} + +void cpu_op_desc_t::validate() +{ + validate_aio_operation(_read_op, _filename.c_str(), data_ptr(), _file_num_bytes); +} + +void cpu_op_desc_t::run(const int tid, + std::unique_ptr& aio_ctxt, + deepspeed_aio_config_t* aio_config) +{ + assert(tid < _num_threads); + const auto base_offset = _num_bytes_per_thread * tid; + + std::unique_ptr xfer_ctxt( + new io_xfer_ctxt(_fd, base_offset, _num_bytes_per_thread, data_ptr())); + + if (aio_config->_overlap_events) { + do_aio_operation_overlap(_read_op, aio_ctxt, xfer_ctxt, aio_config, nullptr); + } else { + do_aio_operation_sequential(_read_op, aio_ctxt, xfer_ctxt, aio_config, nullptr); + } +} diff --git a/op_builder/xpu/async_io.py b/op_builder/xpu/async_io.py index 7ed527e016fa7..6a6798eaeb9cf 100644 --- a/op_builder/xpu/async_io.py +++ b/op_builder/xpu/async_io.py @@ -21,11 +21,18 @@ def absolute_name(self): def sources(self): return [ - 'csrc/aio/py_lib/deepspeed_py_copy.cpp', 'csrc/aio/py_lib/py_ds_aio.cpp', - 'csrc/aio/py_lib/deepspeed_py_aio.cpp', 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', - 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', 'csrc/aio/common/deepspeed_aio_utils.cpp', - 'csrc/aio/common/deepspeed_aio_common.cpp', 'csrc/aio/common/deepspeed_aio_types.cpp', - 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp' + 'csrc/aio/py_lib/deepspeed_py_copy.cpp', + 'csrc/aio/py_lib/py_ds_aio.cpp', + 'csrc/aio/py_lib/deepspeed_py_aio.cpp', + 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', + 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', + 'csrc/aio/common/deepspeed_aio_utils.cpp', + 'csrc/aio/common/deepspeed_aio_common.cpp', + 'csrc/aio/common/deepspeed_aio_types.cpp', + 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp', + 'csrc/aio/py_lib/deepspeed_py_io_handle.cpp', + 'csrc/xpu/aio/deepspeed_cpu_op.cpp', + 'csrc/aio/py_lib/deepspeed_aio_op_desc.cpp', ] def include_paths(self): From 047bcf6af6a3721cfac31a13a1ab07c6b5482fb9 Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Date: Thu, 26 Sep 2024 22:37:32 -0700 Subject: [PATCH 08/43] Add APIs to offload states of model, optimizer, and engine (#6011) This PR adds the following APIs to offload model, optimizer, and engine states. ```pytyon def offload_states(self, include: Container[OffloadStateTypeEnum] = None, device: OffloadDeviceEnum = OffloadDeviceEnum.cpu, pin_memory: bool = True, non_blocking: bool = False) -> None: """Move the ZeRO optimizer buffers to the specified device. Arguments: include: Optional. The set of states to offload. If not provided, all states are offloaded. device: Optional. The device to move the ZeRO optimizer buffers to. pin_memory: Optional. Whether to pin the memory of the offloaded states. non_blocking: Optional. Whether to offload the states asynchronously. ... def offload_states_back(self, non_blocking: bool = False) -> None: ``` Here is the typical usage. ```python # Offload after forward, backward, and step model.offload_states() # Do something requiring a lot of device memory ... # Load states back to device memory model.offload_states_back() ``` You can selectively offload states to balance the offloading overhead and memory saving. 
```python model.offload_states(include=set([OffloadStateTypeEnum.hp_params, OffloadStateTypeEnum.opt_states], device=OffloadDeviceEnum.cpu) ``` Performance (4.3B parameters / 4x A100) - Environment (4x A100, [benchmark script](https://gist.github.com/tohtana/05d5faba5068cf839abfc7b1e38b85e4)) - Average Device to Host transfer time: 2.45 GB/s, aggregated: 9.79 GB/s - Average Host to Device transfer: 11.05 GB/s, aggregated: 44.19 GB/s - Mem (allocated by PyTorch) - Before offload 18.2GB - After offloading 17.7MB - Time ([benchmark script](https://github.com/microsoft/DeepSpeedExamples/tree/tohtana/offload_states/training/offload_states), offloading time/loading time) python output_table.py | |pin_memory=0 non_blocking=0|pin_memory=0 non_blocking=1|pin_memory=1 non_blocking=0|pin_memory=1 non_blocking=1| |--:|---------------------------|---------------------------|---------------------------|---------------------------| | 1|4.34 / 3.42 |4.99 / 2.37 |6.5 / 2.42 |6.0 / 2.39 | | 2|9.9 / 3.28 |5.1 / 2.34 |6.21 / 2.42 |6.25 / 2.45 | | 3|9.92 / 3.19 |6.71 / 2.35 |6.33 / 2.38 |5.93 / 2.42 | | 4|9.55 / 2.82 |7.11 / 2.39 |6.9 / 2.38 |6.5 / 2.43 | | 5|4.4 / 3.35 |6.04 / 2.41 |6.26 / 2.41 |6.32 / 2.47 | | 6|4.4 / 3.57 |6.58 / 2.42 |6.88 / 2.4 |6.35 / 2.43 | | 7|9.51 / 3.12 |6.9 / 2.39 |6.9 / 2.39 |6.46 / 2.4 | | 8|4.77 / 3.64 |6.69 / 2.39 |7.39 / 2.42 |6.56 / 2.46 | | 9|9.5 / 3.07 |7.18 / 2.42 |6.67 / 2.39 |7.38 / 2.46 | TODO: - Enable offloading to a NVMe storage -> NVMe support is non-trivial. I suggest adding the support in another PR - [DONE] Discard buffer (and recreate it) instead of offloading. We don't need to restore the contiguous buffer for reduce. - [DONE] Check pin_memory improves performance or not --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Olatunji Ruwase --- deepspeed/runtime/engine.py | 41 +++- deepspeed/runtime/utils.py | 36 ++++ deepspeed/runtime/zero/offload_config.py | 9 + deepspeed/runtime/zero/stage3.py | 189 ++++++++++++++++-- deepspeed/runtime/zero/utils.py | 15 +- docs/code-docs/source/zero3.rst | 53 +++++ .../unit/runtime/zero/test_offload_states.py | 125 ++++++++++++ 7 files changed, 443 insertions(+), 25 deletions(-) create mode 100644 tests/unit/runtime/zero/test_offload_states.py diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 61e6da2663cf8..b590ea432658c 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -18,13 +18,13 @@ from torch.optim.lr_scheduler import _LRScheduler from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from typing import Callable, Dict, Union, Iterable +from typing import Callable, Dict, Union, Iterable, Container import deepspeed from deepspeed import comm as dist from deepspeed.runtime.utils import see_memory_usage, DummyOptim -from .zero.offload_config import OffloadDeviceEnum +from .zero.offload_config import OffloadDeviceEnum, OffloadStateTypeEnum from deepspeed.runtime.zero.stage_1_and_2 import DeepSpeedZeroOptimizer from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus from deepspeed.runtime.zero.utils import is_zero_supported_optimizer, ZeRORuntimeException @@ -3681,3 +3681,40 @@ def compile(self, backend=get_accelerator().get_compile_backend(), compile_kwarg @property def is_compiled(self) -> bool: return self._is_compiled + + def offload_states(self, + include: Container[OffloadStateTypeEnum] = None, + device: OffloadDeviceEnum = OffloadDeviceEnum.cpu, + pin_memory: bool = True, + 
non_blocking: bool = False) -> None: + """Offload the engine's states to the specified device. + + Arguments: + include: Optional. The set of states to offload. If not provided, all states are offloaded. + device: Optional. The device to move the ZeRO optimizer buffers to. Currently only `OffloadDeviceEnum.cpu` is supported. + pin_memory: Optional. Whether to pin the memory of the offloaded states. + non_blocking: Optional. Whether to offload the states asynchronously. + """ + assert self.zero_optimization_stage( + ) == ZeroStageEnum.weights, "Moving buffers across devices is supported only for ZeRO stage 3." + + assert not self.zero_offload_param(), "Moving states across devices is not supported for offloaded parameters." + + if device == OffloadDeviceEnum.none: + logger.warning("No device specified for offloading states.") + return + + if device == OffloadDeviceEnum.nvme: + raise ValueError("NVMe offload is not supported for offloading states.") + + self.optimizer.offload_states(include=include, device=device, pin_memory=pin_memory, non_blocking=non_blocking) + + def reload_states(self, non_blocking: bool = False) -> None: + """Reload the engine states to the original device. + + Arguments: + non_blocking: Optional. Whether to offload the states asynchronously. + """ + assert self.zero_optimization_stage( + ) == ZeroStageEnum.weights, "Moving buffers back is supported only for ZeRO stage 3." + self.optimizer.reload_states(non_blocking=non_blocking) diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index 2c01c3475a70a..adcadd349803b 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -1065,3 +1065,39 @@ def to_tensor(v): total_norm = -1 return total_norm + + +def _make_offload_state_key(key): + return f"{key}_offload_buffer" + + +def offload_adam_states(optimizer, device, pin_memory: bool = False, non_blocking: bool = False): + """Move optimizer states to device. Note that this assumes the state structure of DeepSpeed Adam.""" + + def move_key(state, key): + offload_buf_key = _make_offload_state_key(key) + if offload_buf_key not in state: + state[offload_buf_key] = torch.empty_like(state[key], device=device) + if pin_memory: + state[offload_buf_key] = get_accelerator().pin_memory(state[offload_buf_key]) + state[offload_buf_key].copy_(state[key], non_blocking=non_blocking) + state[key].data = state[offload_buf_key] + + for _, state in optimizer.state.items(): + if "exp_avg" in state: + move_key(state, "exp_avg") + if "exp_avg_sq" in state: + move_key(state, "exp_avg_sq") + + +def reload_adam_states(optimizer, device, non_blocking: bool = False): + """Move optimizer states to device. 
Note that this assumes the state structure of DeepSpeed Adam.""" + + def move_back_key(state, key): + state[key].data = state[_make_offload_state_key(key)].to(device, non_blocking=non_blocking) + + for _, state in optimizer.state.items(): + if "exp_avg" in state: + move_back_key(state, "exp_avg") + if "exp_avg_sq" in state: + move_back_key(state, "exp_avg_sq") diff --git a/deepspeed/runtime/zero/offload_config.py b/deepspeed/runtime/zero/offload_config.py index 74a5673bc1bc7..ca35d7a7d1691 100644 --- a/deepspeed/runtime/zero/offload_config.py +++ b/deepspeed/runtime/zero/offload_config.py @@ -98,3 +98,12 @@ def set_pipeline(self): pipeline = self.pipeline_read or self.pipeline_write self.__dict__["pipeline"] = pipeline return self + + +class OffloadStateTypeEnum(str, Enum): + """ Enum for internal buffer types """ + optim_states = "optim_states" + hp_params = "hp_params" + lp_params = "lp_params" + lp_grads = "lp_grads" + contiguous_grad_buffer = "contiguous_grad_buffer" diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 796957a4c6e5a..fb75d2bcebd58 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -6,22 +6,24 @@ import sys import gc import collections -from typing import Deque, Dict, Tuple +import itertools +from typing import Deque, Dict, Set, Tuple, Container from contextlib import contextmanager + from deepspeed import comm as dist -from deepspeed.utils import groups +from deepspeed.utils import groups, z3_leaf_parameter from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from deepspeed.runtime.base_optimizer import ZeROOptimizer from deepspeed.utils import logger from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced, all_to_all_quant_reduce -from deepspeed.runtime.utils import inf, is_model_parallel_parameter, get_only_unique_item +from deepspeed.runtime.utils import inf, is_model_parallel_parameter, get_only_unique_item, offload_adam_states, reload_adam_states from deepspeed.runtime.zero.partition_parameters import * from deepspeed.runtime.zero.config import ZeroStageEnum -from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum +from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum, OffloadStateTypeEnum from deepspeed.runtime.zero.parameter_offload import DeepSpeedZeRoOffload -from deepspeed.runtime.zero.utils import apply_to_tensors_only +from deepspeed.runtime.zero.utils import apply_to_tensors_only, get_mapping_to_flat_buffer from deepspeed.ops.adam import DeepSpeedCPUAdam from deepspeed.runtime.swap_tensor.partitioned_param_swapper import PartitionedParamStatus from deepspeed.runtime.swap_tensor.optimizer_utils import OptimizerSwapper @@ -29,7 +31,6 @@ from deepspeed.runtime.swap_tensor.pipelined_optimizer_swapper import PipelinedOptimizerSwapper from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT, FP32_FLAT_GROUPS, PARTITION_COUNT, ZERO_STAGE, LOSS_SCALER from deepspeed.accelerator import get_accelerator -from deepspeed.utils import z3_leaf_parameter # Toggle this to true to enable correctness test # with gradient partitioning and without @@ -425,6 +426,8 @@ def __init__( self._link_all_hp_params() + self.offloaded_states: Set(OffloadDeviceEnum) = set() + if dist.get_rank(group=self.dp_process_group) == 0: see_memory_usage(f"After initializing ZeRO optimizer", force=True) @@ -563,21 +566,15 @@ def defragment(tensors: List[Tensor]) -> Tensor: cpu_buffer = 
torch.empty(sum(p.numel() for p in tensors), dtype=get_only_unique_item(t.dtype for t in tensors), device="cpu") - tensor_infos: List[Tuple[Tensor, int, int]] = [] + tensor_infos: List[Tuple[Tensor, int, int]] = get_mapping_to_flat_buffer(tensors) orig_device = get_only_unique_item(t.device for t in tensors) offset = 0 - for tensor in tensors: - tensor_numel = tensor.numel() + for tensor, offset, tensor_numel in tensor_infos: # move the tensor from device memory to host memory cpu_buffer.narrow(0, offset, tensor_numel).copy_(tensor) tensor.data = torch.empty(0, dtype=tensor.dtype, device=tensor.device) - # record some data so we can restore the device tensor later - tensor_infos.append((tensor, offset, tensor_numel)) - - offset += tensor_numel - gc.collect() get_accelerator().empty_cache() @@ -725,15 +722,11 @@ def _create_fp16_partitions_with_defragmentation(self, fp16_param_groups): for sub_group in self.fp16_groups: for param in sub_group: parameter_partitions.append(param.ds_tensor) - device_buffer = __class__.defragment(parameter_partitions) - # setup flat buffers per subgroup, these are each just sections of the - # contiguous flat buffer for all parameters that we created earlier - offset = 0 - for sub_group in self.fp16_groups: - sub_group_numel = sum(param.partition_numel() for param in sub_group) - self.fp16_partitioned_groups_flat.append(device_buffer.narrow(0, offset, sub_group_numel)) - offset += sub_group_numel + # We need to keep the reference to this buffer to make sure you can free it in `offload_states` + self.lp_param_buffer = __class__.defragment(parameter_partitions) + self._set_fp16_partitioned_groups_flat() + else: # partitioned params offloaded to CPU when not in use # create a flat CPU memory allocation for each param group self._create_param_groups_fp16_flat_cpu_memory() @@ -1008,6 +1001,15 @@ def _partitioned_params_swap_out(self, i): swap_fp16_params[0].nvme_swapper.swap_out_partitioned_params(dst_fp16_params=swap_fp16_params, src_fp32_params=swap_fp32_params) + def _set_fp16_partitioned_groups_flat(self): + # setup flat buffers per subgroup, these are each just sections of the + # contiguous flat buffer for all parameters that we created earlier + offset = 0 + for sub_group in self.fp16_groups: + sub_group_numel = sum(param.partition_numel() for param in sub_group) + self.fp16_partitioned_groups_flat.append(self.lp_param_buffer.narrow(0, offset, sub_group_numel)) + offset += sub_group_numel + def initialize_optimizer_states(self): num_subgroups = len(self.fp16_groups) @@ -2782,6 +2784,149 @@ def checkpoint_event_epilogue(self): def empty_partition_cache(self): self.parameter_offload.empty_partition_cache() + def offload_states(self, + include: Container[OffloadStateTypeEnum] = None, + device: OffloadDeviceEnum = OffloadDeviceEnum.cpu, + pin_memory: bool = True, + non_blocking: bool = False): + device = device.value + + self.empty_partition_cache() + + assert self.optimizer.__class__ == deepspeed.ops.adam.fused_adam.FusedAdam, f"Offloading is supported only for DeepSpeed FusedAdam." 
+ + def needs_offload(target): + # return True + return target not in self.offloaded_states and (include == None or target in include) + + # HP param + if needs_offload(OffloadStateTypeEnum.hp_params): + if pin_memory: + if not hasattr(self, "hp_params_pin_buffers"): + self.hp_params_pin_buffers = [ + get_accelerator().pin_memory(torch.empty_like(t, device=device)) + for t in self.fp32_partitioned_groups_flat + ] + + for src_tensor, dest_buf in zip(self.fp32_partitioned_groups_flat, self.hp_params_pin_buffers): + dest_buf.copy_(src_tensor, non_blocking=non_blocking) + src_tensor.data = dest_buf + else: + for buf in self.fp32_partitioned_groups_flat: + buf.data = buf.data.to(device, non_blocking=non_blocking) + self.offloaded_states.add(OffloadStateTypeEnum.hp_params) + + # LP param + if needs_offload(OffloadStateTypeEnum.lp_params): + if pin_memory: + if not hasattr(self, "lp_param_contiguous_pin_buffer"): + self.lp_param_contiguous_pin_buffer = get_accelerator().pin_memory( + torch.empty_like(self.lp_param_buffer, device=device)) + self.lp_param_contiguous_pin_buffer.copy_(self.lp_param_buffer, non_blocking=non_blocking) + cpu_buffer = self.lp_param_contiguous_pin_buffer + else: + cpu_buffer = self.lp_param_buffer.to(device, non_blocking=non_blocking) + + self.lp_param_buffer.data = cpu_buffer + for tensor, offset, tensor_numel in get_mapping_to_flat_buffer( + [p.ds_tensor for p in self.module.parameters()]): + tensor.data = cpu_buffer.narrow(0, offset, tensor_numel) + + self.fp16_partitioned_groups_flat.clear() + self.offloaded_states.add(OffloadStateTypeEnum.lp_params) + + # LP grad + if needs_offload(OffloadStateTypeEnum.lp_grads): + if pin_memory: + if not hasattr(self, "lp_grad_partitions_flat_pin_buffers"): + self.lp_grad_partitions_flat_pin_buffers = get_accelerator().pin_memory( + torch.empty_like(self.grad_partitions_flat_buffer, device=device)) + self.lp_grad_partitions_flat_pin_buffers.copy_(self.grad_partitions_flat_buffer, + non_blocking=non_blocking) + self.grad_partitions_flat_buffer.data = self.lp_grad_partitions_flat_pin_buffers + else: + self.grad_partitions_flat_buffer.data = self.grad_partitions_flat_buffer.data.to(device) + self.averaged_gradients = {} + + self.__param_id_to_grad_partition = {} + + self.offloaded_states.add(OffloadStateTypeEnum.lp_grads) + + # contiguous bucket + if needs_offload(OffloadStateTypeEnum.contiguous_grad_buffer): + if hasattr(self, "_DeepSpeedZeroOptimizer_Stage3__ipg_bucket_flat_buffer"): + # Record properties like shape, strides, etc. 
as a meta tensor + self.grad_buffer_meta = self.__ipg_bucket_flat_buffer.to("meta") + self.__ipg_bucket_flat_buffer = None + self.offloaded_states.add(OffloadStateTypeEnum.contiguous_grad_buffer) + + # Adam + if needs_offload(OffloadStateTypeEnum.optim_states): + offload_adam_states(self.optimizer, device, pin_memory=pin_memory, non_blocking=non_blocking) + self.offloaded_states.add(OffloadStateTypeEnum.optim_states) + + gc.collect() + get_accelerator().empty_cache() + + def reload_states(self, non_blocking: bool = False): + + device = get_accelerator().current_device_name() + + # HP param + if OffloadStateTypeEnum.hp_params in self.offloaded_states: + if hasattr(self, "hp_params_pin_buffers"): + for src, dest in zip(self.hp_params_pin_buffers, self.fp32_partitioned_groups_flat): + dest.data = src.to(device, non_blocking=non_blocking) + else: + for buf in self.fp32_partitioned_groups_flat: + buf.data = buf.data.to(device, non_blocking=non_blocking) + self.offloaded_states.remove(OffloadStateTypeEnum.hp_params) + + # LP Param + if OffloadStateTypeEnum.lp_params in self.offloaded_states: + cpu_buffer = self.lp_param_contiguous_pin_buffer if hasattr( + self, "lp_param_contiguous_pin_buffer") else self.lp_param_buffer + self.lp_param_buffer.data = cpu_buffer.data.to(device, non_blocking=non_blocking) + self._set_fp16_partitioned_groups_flat() + + for tensor, offset, tensor_numel in get_mapping_to_flat_buffer( + [p.ds_tensor for p in self.module.parameters()]): + tensor.data = self.lp_param_buffer.narrow(0, offset, tensor_numel) + self.offloaded_states.remove(OffloadStateTypeEnum.lp_params) + + # LP grad + if OffloadStateTypeEnum.lp_grads in self.offloaded_states: + if hasattr(self, "lp_grad_partitions_flat_pin_buffers"): + self.grad_partitions_flat_buffer.data = self.lp_grad_partitions_flat_pin_buffers.to( + device, non_blocking=non_blocking) + else: + self.grad_partitions_flat_buffer.data = self.grad_partitions_flat_buffer.data.to( + device, non_blocking=non_blocking) + self.averaged_gradients = {} + + offset = 0 + all_params = list(itertools.chain.from_iterable(self.fp16_groups)) + for param in all_params: + self.__param_id_to_grad_partition[param.ds_id] = self.grad_partitions_flat_buffer.narrow( + 0, offset, param.partition_numel()) + offset += param.partition_numel() + + self.offloaded_states.remove(OffloadStateTypeEnum.lp_grads) + + # contiguous bucket + if OffloadStateTypeEnum.contiguous_grad_buffer in self.offloaded_states: + self.__ipg_bucket_flat_buffer = torch.empty_like(self.grad_buffer_meta, device=device) + # self.__ipg_bucket_flat_buffer.data = self.__ipg_bucket_flat_buffer.data.to(device) + self.offloaded_states.remove(OffloadStateTypeEnum.contiguous_grad_buffer) + + # Adam + if OffloadStateTypeEnum.optim_states in self.offloaded_states: + reload_adam_states(self.optimizer, device, non_blocking=non_blocking) + self.offloaded_states.remove(OffloadStateTypeEnum.optim_states) + + if non_blocking: + get_accelerator().synchronize() + def _handle_overflow(cpu_sum, x, i): import math diff --git a/deepspeed/runtime/zero/utils.py b/deepspeed/runtime/zero/utils.py index 8f913d065934a..2d1cf17962d8a 100755 --- a/deepspeed/runtime/zero/utils.py +++ b/deepspeed/runtime/zero/utils.py @@ -4,7 +4,7 @@ # DeepSpeed Team import os -from typing import List +from typing import List, Tuple import torch from deepspeed import comm as dist @@ -160,3 +160,16 @@ def apply_to_tensors_only(function, value, warning_msg_fn=None): logger.warning(warning_msg_fn(value)) warned = True return value + + +def 
get_mapping_to_flat_buffer(tensors: List[torch.Tensor]) -> List[Tuple[torch.Tensor, int, int]]: + tensor_infos: List[Tuple[torch.Tensor, int, int]] = [] + + offset = 0 + for tensor in tensors: + tensor_numel = tensor.numel() + # record some data so we can restore the device tensor later + tensor_infos.append((tensor, offset, tensor_numel)) + offset += tensor_numel + + return tensor_infos diff --git a/docs/code-docs/source/zero3.rst b/docs/code-docs/source/zero3.rst index 2a6a48ca91db8..f0974c08c9f3b 100644 --- a/docs/code-docs/source/zero3.rst +++ b/docs/code-docs/source/zero3.rst @@ -456,3 +456,56 @@ The following code snippet illustrates this functionality. # Free GPU memory consumed by model parameters ds_engine.empty_partition_cache() + + +Offload States +-------------- + +The DeepSpeed engine maintains a set of states in device memory (e.g., CUDA memory). The following API allows you to offload these states to a different device (currently, only CPU memory is supported), reducing the memory footprint on the device. + +.. code-block:: python + + def offload_states(self, + include: Container[OffloadStateTypeEnum] = None, + device: OffloadDeviceEnum = OffloadDeviceEnum.cpu, + pin_memory: bool = True, + non_blocking: bool = False) -> None: + """Offload the engine's states to the specified device. + + Arguments: + include: Optional. The set of states to offload. If not provided, all states are offloaded. + device: Optional. The device to move the ZeRO optimizer buffers to. Currently only `OffloadDeviceEnum.cpu` is supported. + pin_memory: Optional. Whether to pin the memory of the offloaded states. + non_blocking: Optional. Whether to offload the states asynchronously. + """ + +You can selectively offload specific states by specifying the ``OffloadStateTypeEnum`` in the include argument. ``OffloadStateTypeEnum`` is an enum that defines the states that can be offloaded. The following states are supported: + +* ``OffloadStateTypeEnum.optim_states``: Optimizer states. Currently, only states of DeepSpeed's FusedAdam optimizer are supported. +* ``OffloadStateTypeEnum.hp_params``: FP32 parameters. +* ``OffloadStateTypeEnum.lp_params``: BF16/FP16 parameters. +* ``OffloadStateTypeEnum.lp_grads``: BF16/FP16 gradients. +* ``OffloadStateTypeEnum.contiguous_grad_buffer``: The contiguous gradient buffer for reduce operations. + +Note that offloading states comes with a trade-off between memory savings and computational overhead. This API allows states to be reloaded back into device memory when needed. + +.. code-block:: python + + def reload_states(self, non_blocking: bool = False) -> None: + """Reload the engine states to the original device. + + Arguments: + non_blocking: Optional. Whether to offload the states asynchronously. + """ + +Below is an example code snippet demonstrating how to offload FP32 parameters and optimizer states to CPU memory: + +.. code-block:: python + + # Offload after forward, backward, and step + ds_engine.offload_states(include=[OffloadStateTypeEnum.hp_params, OffloadStateTypeEnum.optim_states]) + + # Do something requiring a lot of device memory + ... + # Load states back to device memory + ds_engine.reload_states() diff --git a/tests/unit/runtime/zero/test_offload_states.py b/tests/unit/runtime/zero/test_offload_states.py new file mode 100644 index 0000000000000..cc60908d3c337 --- /dev/null +++ b/tests/unit/runtime/zero/test_offload_states.py @@ -0,0 +1,125 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest + +import deepspeed.comm as dist +from deepspeed.accelerator import get_accelerator +import torch + +from unit.common import DistributedTest +from unit.simple_model import random_dataloader, SimpleModel + +import deepspeed +from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum, OffloadStateTypeEnum +from deepspeed.utils import safe_get_local_fp32_param, safe_get_local_optimizer_state + + +def validate_device(model, device: torch.device, include) -> None: + # Make sure the model parameters are offloaded + if include is None or OffloadStateTypeEnum.hp_params in include: + assert all(safe_get_local_fp32_param(p).device == device for p in model.parameters()) + if include is None or OffloadStateTypeEnum.lp_params in include: + assert all(p.ds_tensor.device == device for p in model.parameters()) + if include is None or OffloadStateTypeEnum.lp_grads in include: + assert model.optimizer.grad_partitions_flat_buffer.device == device + if include is None or OffloadStateTypeEnum.optim_states in include: + assert all(safe_get_local_optimizer_state(p, "exp_avg").device == device for p in model.parameters()) + assert all(safe_get_local_optimizer_state(p, "exp_avg_sq").device == device for p in model.parameters()) + + +def run_model(model, config_dict, hidden_dim, dtype, include, pin_memory, non_blocking): + # Currently we only support OffloadDeviceEnum.cpu + offload_device = OffloadDeviceEnum.cpu + + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + data_loader = random_dataloader(model=model, + total_samples=10, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) + dist.barrier() + for batch in data_loader: + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + hp_params_expected = [safe_get_local_fp32_param(p).clone() for p in model.parameters()] + lp_params_expected = [p.ds_tensor.clone() for p in model.parameters()] + lp_grads_expected = model.optimizer.grad_partitions_flat_buffer.clone() + adam_exp_avg_expected = [safe_get_local_optimizer_state(p, "exp_avg").clone() for p in model.parameters()] + adam_exp_avg_sq = [safe_get_local_optimizer_state(p, "exp_avg_sq").clone() for p in model.parameters()] + + # Start offloading + alloc_before_offload = get_accelerator().memory_allocated() + model.offload_states(include=include, device=offload_device, pin_memory=pin_memory, non_blocking=non_blocking) + alloc_after_offload = get_accelerator().memory_allocated() + assert alloc_after_offload < alloc_before_offload, f"Allocated memory should decrease after offload" + + validate_device(model, torch.device(offload_device.value), include) + + # Reload states + model.reload_states() + assert alloc_after_offload < get_accelerator().memory_allocated( + ), f"Allocated memory should increase after offload back" + + # Verify restored states + hp_param_restored = [safe_get_local_fp32_param(p) for p in model.parameters()] + for hp_param_expected, hp_param_restored in zip(hp_params_expected, hp_param_restored): + assert torch.equal(hp_param_expected, hp_param_restored) + + lp_param_restored = [p.ds_tensor for p in model.parameters()] + + for lp_param_expected, lp_param_restored in zip(lp_params_expected, lp_param_restored): + assert torch.equal(lp_param_expected, lp_param_restored) + + assert torch.equal(lp_grads_expected, model.optimizer.grad_partitions_flat_buffer) + + adam_exp_avg_restored = [safe_get_local_optimizer_state(p, "exp_avg") for 
p in model.parameters()] + for adam_exp_avg_expected, adam_exp_avg_restored in zip(adam_exp_avg_expected, adam_exp_avg_restored): + assert torch.equal(adam_exp_avg_expected, adam_exp_avg_restored) + + adam_exp_avg_sq_restored = [safe_get_local_optimizer_state(p, "exp_avg_sq") for p in model.parameters()] + for adam_exp_avg_sq_expected, adam_exp_avg_sq_restored in zip(adam_exp_avg_sq, adam_exp_avg_sq_restored): + assert torch.equal(adam_exp_avg_sq_expected, adam_exp_avg_sq_restored) + + validate_device(model, torch.device(get_accelerator().current_device_name()), include) + + # Needed in ZeRO 3. Not doing so can give memory leak + model.destroy() + + +@pytest.mark.parametrize("included_state", [ + OffloadStateTypeEnum.hp_params, OffloadStateTypeEnum.lp_params, OffloadStateTypeEnum.optim_states, + OffloadStateTypeEnum.lp_grads, OffloadStateTypeEnum.contiguous_grad_buffer, None +]) +@pytest.mark.parametrize("pin_memory", [False, True]) +@pytest.mark.parametrize("non_blocking", [False, True]) +class TestOffloadStates(DistributedTest): + # Need multiple gpus to test possible hanging + world_size = 2 + + def test_offload_states(self, included_state, pin_memory, non_blocking): + hidden_dim = 1024 + + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-6 + } + }, + "zero_optimization": { + "stage": 3, + } + } + config_dict["bf16"] = {"enabled": True} + + with deepspeed.zero.Init(config_dict_or_path=config_dict): + model = SimpleModel(hidden_dim, nlayers=4) + + include = None if included_state is None else [included_state] + run_model(model, config_dict, hidden_dim, torch.bfloat16, include, pin_memory, non_blocking) From 1caf6e8107689f5ea9611ac2d6bbbf3a3e6e9731 Mon Sep 17 00:00:00 2001 From: Nadav Elyahu <88962733+nelyahu@users.noreply.github.com> Date: Fri, 27 Sep 2024 09:11:06 +0300 Subject: [PATCH 09/43] add bfloat16 to inference support dtypes (#6528) to allow running inference tasks using bfloat16 --------- Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Logan Adams --- deepspeed/inference/engine.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/deepspeed/inference/engine.py b/deepspeed/inference/engine.py index 74fc7f74f6bdd..68836ceb523cc 100755 --- a/deepspeed/inference/engine.py +++ b/deepspeed/inference/engine.py @@ -73,8 +73,9 @@ def __init__(self, model, config): if hasattr(self.module, "config"): TransformerPolicy.hf_model_config = self.module.config - if config.dtype == torch.half and not get_accelerator().is_fp16_supported(): - raise ValueError("Type fp16 is not supported.") + if config.dtype not in get_accelerator().supported_dtypes(): + raise ValueError( + f"Data type {config.dtype} is not supported by {get_accelerator().device_name()} accelerator") # todo: keep this self.injection_dict because we don't use to change config.injection_policy API # todo: this will get changed when Molly's PR on auto injection dict is merged @@ -324,7 +325,7 @@ def _validate_args(self, mpu, replace_with_kernel_inject): if self._config.checkpoint is not None and not isinstance(self._config.checkpoint, (str, dict)): raise ValueError(f"checkpoint must be None, str or dict, got {type(self._config.checkpoint)}") - supported_dtypes = [None, torch.half, torch.int8, torch.float] + supported_dtypes = [None, torch.half, torch.int8, torch.float, torch.bfloat16] if self._config.dtype not in supported_dtypes: raise ValueError(f"{self._config.dtype} not 
supported, valid dtype: {supported_dtypes}") From d4e189507659aca7970185d33b84115fbb11b490 Mon Sep 17 00:00:00 2001 From: Yizhou Wang Date: Fri, 27 Sep 2024 14:45:42 +0800 Subject: [PATCH 10/43] [COMPILE] workflow for deepspeed + torch.compile (#6570) We use simple model + deepspeed zero 3 + torch.compile and count graph break numbers to demonstrate current status of combing deepspeed + torch.compile. --------- Co-authored-by: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> --- .github/workflows/xpu-compile.yml | 59 +++++++++++++++++ tests/torch_compile/ds_config.json | 41 ++++++++++++ tests/torch_compile/test_compile.py | 99 +++++++++++++++++++++++++++++ 3 files changed, 199 insertions(+) create mode 100644 .github/workflows/xpu-compile.yml create mode 100644 tests/torch_compile/ds_config.json create mode 100644 tests/torch_compile/test_compile.py diff --git a/.github/workflows/xpu-compile.yml b/.github/workflows/xpu-compile.yml new file mode 100644 index 0000000000000..c2392091012f3 --- /dev/null +++ b/.github/workflows/xpu-compile.yml @@ -0,0 +1,59 @@ +name: xpu-compile + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - ".github/workflows/xpu-compile.yml" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + issues: write + +jobs: + compile-tests: + runs-on: [self-hosted, intel, xpu] + container: + image: intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 + ports: + - 80 + options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL + + steps: + - uses: actions/checkout@v4 + - name: Install prerequisite + run: | + apt-get update + apt-get install clinfo libaio-dev python3-pip -y + pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torch/ + pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/intel-extension-for-pytorch/ + pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/oneccl-bind-pt/ + pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torchvision/ + pip install https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v3.0.0b2/triton_xpu-3.0.0b2-cp310-cp310-linux_x86_64.whl + pip install py-cpuinfo numpy + pip install .[dev,autotuning] + + - name: Check container state + run: | + ldd --version + ds_report + python3 -c "import torch; print('torch:', torch.__version__, torch)" + python3 -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())" + python3 -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)" + pip list + + - name: Compile Status + shell: bash + run: | + export FI_HMEM=system + ulimit -n 1048575 + cd tests/torch_compile + export ZE_AFFINITY_MASK=0,1 + deepspeed test_compile.py --deepspeed_config ds_config.json 2>&1 | tee log.txt + cat log.txt | grep "'graph_breaks'" | sed 's/,/ /g' | awk '{print $2}' >> $GITHUB_STEP_SUMMARY diff --git a/tests/torch_compile/ds_config.json b/tests/torch_compile/ds_config.json new file mode 100644 index 0000000000000..361bc115eaeeb --- /dev/null +++ b/tests/torch_compile/ds_config.json @@ -0,0 +1,41 @@ +{ + "train_batch_size": 8, + "steps_per_print": 2000, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [ + 0.8, + 0.999 + ], 
+ "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 0.001, + "warmup_num_steps": 1000 + } + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "bf16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 15 + }, + "wall_clock_breakdown": false, + "zero_optimization": { + "stage": 3, + "reduce_scatter": true, + "overlap_comm": false, + "contiguous_gradients": false + } +} diff --git a/tests/torch_compile/test_compile.py b/tests/torch_compile/test_compile.py new file mode 100644 index 0000000000000..529ca56ae0a8e --- /dev/null +++ b/tests/torch_compile/test_compile.py @@ -0,0 +1,99 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import argparse +import deepspeed +from deepspeed.accelerator import get_accelerator +from deepspeed import comm + +import torch +import intel_extension_for_pytorch # noqa: F401 # type: ignore +from torch.utils.data import Dataset, DataLoader + +torch._dynamo.config.cache_size_limit = 100 + +import collections + + +def get_dynamo_stats(): + # TODO: consider deepcopy'ing the entire counters struct and + # adding a helper to do subtraction on it + return collections.Counter({ + "calls_captured": torch._dynamo.utils.counters["stats"]["calls_captured"], + "unique_graphs": torch._dynamo.utils.counters["stats"]["unique_graphs"], + "graph_breaks": sum(torch._dynamo.utils.counters["graph_break"].values()), + # NB: The plus removes zero counts + "unique_graph_breaks": len(+torch._dynamo.utils.counters["graph_break"]), + "autograd_captures": torch._dynamo.utils.counters["compiled_autograd"]["captures"], + "autograd_compiles": torch._dynamo.utils.counters["compiled_autograd"]["compiles"], + "cudagraph_skips": torch._dynamo.utils.counters["inductor"]["cudagraph_skips"], + }) + + +class RandomDataset(Dataset): + + def __init__(self, size, length): + self.len = length + self.data = torch.randn(length, size).to(torch.bfloat16) + + def __getitem__(self, index): + return self.data[index] + + def __len__(self): + return self.len + + +data_size = 1024 +data_length = 100 +rand_loader = DataLoader(dataset=RandomDataset(data_size, data_length), batch_size=1, shuffle=False) + + +class MyModule(torch.nn.Module): + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.fc0 = torch.nn.Linear(1024, 256, bias=False) + self.fc1 = torch.nn.Linear(256, 256, bias=False) + self.dropout = torch.nn.Dropout(0.5) + + def forward(self, data, residual): + output = residual + self.fc1(self.fc0(self.dropout(data))) * 0.5 + return output + + +model = MyModule() +params = model.parameters() + +parser = argparse.ArgumentParser() +parser.add_argument('--local_rank', type=int, default=-1, help='local rank passed from distributed launcher') +parser.add_argument('--deepspeed_config', + type=str, + default='ds_config.json', + help='path to DeepSpeed configuration file') +cmd_args = parser.parse_args() + +# initialize the DeepSpeed engine +model_engine, optimizer, _, _ = deepspeed.initialize(args=cmd_args, model=model, model_parameters=params) +model_engine.compile() + +residual = torch.rand(256, 256, dtype=torch.float).to(get_accelerator().current_device_name()) + +start_stats = get_dynamo_stats() + +for step, batch in enumerate(rand_loader): + if step % 10 == 0 and comm.get_rank() == 0: + print(f'step={step}') + # forward() method + loss = 
model_engine(batch.to(get_accelerator().current_device_name()), residual).sum() + # runs backpropagation + model_engine.backward(loss) + # weight update + model_engine.step() + +dynamo_stats = get_dynamo_stats() +dynamo_stats.subtract(start_stats) + +if comm.get_rank() == 0: + print(dynamo_stats) From 828ddfbbda2482412fffc89f5fcd3b0d0eba9a62 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Fri, 27 Sep 2024 09:22:13 -0700 Subject: [PATCH 11/43] Fixes on the accelerate side mean we do not need to skip this test (#6583) HF accelerate implemented fixes here: https://github.com/huggingface/accelerate/pull/3131 This means we can revert the changes from #6574 --- .github/workflows/nv-accelerate-v100.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml index 76b8c93fba3f9..915493bb3183c 100644 --- a/.github/workflows/nv-accelerate-v100.yml +++ b/.github/workflows/nv-accelerate-v100.yml @@ -55,4 +55,4 @@ jobs: # tmp fix: force newer datasets version #pip install "datasets>=2.0.0" pip list - pytest $PYTEST_OPTS --color=yes --durations=0 --verbose tests/deepspeed -k "not test_prepare_multiple_models_zero3_inference" + pytest $PYTEST_OPTS --color=yes --durations=0 --verbose tests/deepspeed From 8cded575a94e296fee751072e862304676c95316 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Fri, 27 Sep 2024 13:32:48 -0700 Subject: [PATCH 12/43] Fix torch include in `op_builder/mlu/fused_adam.py` and update no-torch workflow triggers (#6584) Changes from #6472 caused the no-torch workflow that is an example of how we build the DeepSpeed release package to fail (so we caught this before a release, see more in #6402). These changes also copy the style used to include torch in other accelerator op_builder implementations, such as npu [here](https://github.com/microsoft/DeepSpeed/blob/master/op_builder/npu/fused_adam.py#L8) and hpu [here](https://github.com/microsoft/DeepSpeed/blob/828ddfbbda2482412fffc89f5fcd3b0d0eba9a62/op_builder/hpu/fused_adam.py#L15). This also updates the no-torch workflow to run on all changes to the op_builder directory. The test runs quickly and shouldn't add any additional testing burden there. Resolves: #6576 --- .github/workflows/no-torch.yml | 1 + op_builder/mlu/fused_adam.py | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/no-torch.yml b/.github/workflows/no-torch.yml index 794c02b3c4dbf..eb3ac9b03161a 100644 --- a/.github/workflows/no-torch.yml +++ b/.github/workflows/no-torch.yml @@ -5,6 +5,7 @@ on: pull_request: paths: - '.github/workflows/no-torch.yml' + - 'op_builder/**' schedule: - cron: "0 0 * * *" diff --git a/op_builder/mlu/fused_adam.py b/op_builder/mlu/fused_adam.py index 0198db7cb2768..2343e55568c48 100644 --- a/op_builder/mlu/fused_adam.py +++ b/op_builder/mlu/fused_adam.py @@ -5,7 +5,11 @@ # DeepSpeed Team from .builder import MLUOpBuilder -import torch + +try: + import torch +except ImportError as e: + pass class MLUFusedAdam: From b93c7a20c8b50258935d8598e53a6e388762fee9 Mon Sep 17 00:00:00 2001 From: Jagadish Krishnamoorthy Date: Fri, 4 Oct 2024 14:31:25 -0700 Subject: [PATCH 13/43] [ROCm] Fix subprocess error (#6587) Fixes https://github.com/microsoft/DeepSpeed/issues/6585 Use shell=True for subprocess.check_output() in case of ROCm commands. Do not use shlex.split() since command string has wildcard expansion. 
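For context, a minimal sketch (not part of the patch) of why the shell is needed, assuming a ROCm machine with `rocminfo` on PATH; the command string mirrors the one built in `op_builder/builder.py`:

```python
import shlex
import subprocess

cmd = "rocminfo | grep -o -m 1 'gfx.*'"

# Tokenizing only: '|', 'grep', '-o', ... are handed to rocminfo as literal
# arguments, so the pipe and the grep pattern are never interpreted.
# subprocess.check_output(shlex.split(cmd))  # not what we want here

# Running the whole string through the shell lets the pipe/pattern work:
arch = subprocess.check_output(cmd, shell=True).decode("utf-8").strip()
print(arch)  # e.g. "gfx90a" (illustrative value)
```
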
Signed-off-by: Jagadish Krishnamoorthy --- op_builder/builder.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/op_builder/builder.py b/op_builder/builder.py index e935a179f6afd..f95341f137b4b 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -253,8 +253,7 @@ def get_rocm_gpu_arch(): rocm_info = Path("rocminfo") rocm_gpu_arch_cmd = str(rocm_info) + " | grep -o -m 1 'gfx.*'" try: - safe_cmd = shlex.split(rocm_gpu_arch_cmd) - result = subprocess.check_output(safe_cmd) + result = subprocess.check_output(rocm_gpu_arch_cmd, shell=True) rocm_gpu_arch = result.decode('utf-8').strip() except subprocess.CalledProcessError: rocm_gpu_arch = "" @@ -272,8 +271,7 @@ def get_rocm_wavefront_size(): rocm_wavefront_size_cmd = str( rocm_info) + " | grep -Eo -m1 'Wavefront Size:[[:space:]]+[0-9]+' | grep -Eo '[0-9]+'" try: - safe_cmd = shlex.split(rocm_wavefront_size_cmd) - result = subprocess.check_output(rocm_wavefront_size_cmd) + result = subprocess.check_output(rocm_wavefront_size_cmd, shell=True) rocm_wavefront_size = result.decode('utf-8').strip() except subprocess.CalledProcessError: rocm_wavefront_size = "32" From 239b83a77e952533439104fced9a72456a010a75 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Mon, 7 Oct 2024 10:01:53 -0700 Subject: [PATCH 14/43] Cleanup CODEOWNERS file to be valid (#6603) --- CODEOWNERS | 76 +++++++++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index 2c16aef39a1be..c0fc85cb8b898 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -7,50 +7,50 @@ # top-level repo folders -/.github/ @mrwyattii @loadams -/azure/ @mrwyattii @awan-10 -/benchmarks/ @awan-10 @mrwyattii -/bin/ @mrwyattii -/csrc/ @awan-10 @mrwyattii @cmikeh2 @arashb -/deepspeed/ @mrwyattii -/docker/ @mrwyattii @awan-10 -/docs/ @mrwyattii -/examples/ @awan-10 @mrwyattii -/op_builder/ @mrwyattii @cmikeh2 -/release/ @loadams @mrwyattii -/requirements/ @loadams @mrwyattii -/scripts/ @mrwyattii @awan-10 -/tests/ @mrwyattii @tjruwase @loadams +/.github/ @loadams +/azure/ @awan-10 +/benchmarks/ @awan-10 @tjruwase +/bin/ @loadams +/csrc/ @awan-10 +/deepspeed/ @loadams @tjruwase +/docker/ @awan-10 +/docs/ @loadams @tjruwase +/examples/ @awan-10 @tohtana +/op_builder/ @loadams @tjruwase @jomayeri +/release/ @loadams +/requirements/ @loadams +/scripts/ @awan-10 +/tests/ @tjruwase @loadams @tohtana # deepspeed -/deepspeed/autotuning/ @mrwyattii +/deepspeed/autotuning/ @loadams /deepspeed/checkpoint/ @tjruwase /deepspeed/comm/ @awan-10 -/deepspeed/compression/ @minjiaz @xiaoxiawu-microsoft @conglongli -/deepspeed/elasticity/ @mrwyattii @awan-10 -/deepspeed/launcher/ @mrwyattii @awan-10 -/deepspeed/module_inject/ @mrwyattii @awan-10 @cmikeh2 @arashb -/deepspeed/moe/ @awan-10 -/deepspeed/monitor/ @awan-10 @mrwyattii -/deepspeed/nebula/ @tjruwase @mrwyattii -/deepspeed/ops/ @mrwyattii @awan-10 @cmikeh2 @arashb -/deepspeed/pipe/ @ShadenSmith @duli2012 -/deepspeed/profiling/ @ShijieZZZZ -/deepspeed/utils/ @mrwyattii @tjruwase @awan-10 +/deepspeed/compression/ @tjruwase +/deepspeed/elasticity/ @awan-10 +/deepspeed/launcher/ @loadams +/deepspeed/module_inject/ @awan-10 +/deepspeed/moe/ @tohtana +/deepspeed/monitor/ @awan-10 +/deepspeed/nebula/ @tjruwase +/deepspeed/ops/ @tohtana +/deepspeed/pipe/ @tohtana @loadams +/deepspeed/profiling/ @loadams +/deepspeed/utils/ @tjruwase @awan-10 # inference -/deepspeed/inference/ @mrwyattii @awan-10 @cmikeh2 @arashb 
-/deepspeed/model_implementations/ @mrwyattii @awan-10 @cmikeh2 @arashb +/deepspeed/inference/ @awan-10 +/deepspeed/model_implementations/ @awan-10 # training -/deepspeed/runtime/ @mrwyattii @tjruwase -/deepspeed/runtime/activation_checkpointing/ @mrwyattii @tjruwase -/deepspeed/runtime/checkpoint_engine/ @tjruwase @mrwyattii +/deepspeed/runtime/ @tjruwase @tohtana +/deepspeed/runtime/activation_checkpointing/ @tjruwase +/deepspeed/runtime/checkpoint_engine/ @tjruwase /deepspeed/runtime/comm/ @awan-10 -/deepspeed/runtime/compression/ @awan-10 @conglongli -/deepspeed/runtime/data_pipeline/ @conglongli -/deepspeed/runtime/fp16/ @mrwyattii @tjruwase -/deepspeed/runtime/fp16/onebit/ @conglongli @awan-10 -/deepspeed/runtime/pipe/ @ShadenSmith @duli2012 -/deepspeed/runtime/swap_tensor/ @tjruwase @mrwyattii -/deepspeed/runtime/zero/ @tjruwase @mrwyattii +/deepspeed/runtime/compression/ @awan-10 +/deepspeed/runtime/data_pipeline/ @tjruwase +/deepspeed/runtime/fp16/ @tjruwase +/deepspeed/runtime/fp16/onebit/ @awan-10 +/deepspeed/runtime/pipe/ @loadams +/deepspeed/runtime/swap_tensor/ @tjruwase +/deepspeed/runtime/zero/ @tjruwase From 940887ded1000121cc11b746a6a2eb9b53e3e6fc Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Mon, 7 Oct 2024 11:22:05 -0700 Subject: [PATCH 15/43] Add SSF Best practices badge (#6604) Work in progress to ensure we meet SSF best practices: https://www.bestpractices.dev/en/projects/9530 --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 2f6661ef58603..86711c4374f83 100755 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ [![PyPI version](https://badge.fury.io/py/deepspeed.svg)](https://pypi.org/project/deepspeed/) [![Downloads](https://static.pepy.tech/badge/deepspeed)](https://pepy.tech/project/deepspeed) [![Build](https://badgen.net/badge/build/check-status/blue)](#build-pipeline-status) +[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/9530/badge)](https://www.bestpractices.dev/projects/9530) [![Twitter](https://img.shields.io/twitter/follow/MSFTDeepSpeed)](https://twitter.com/intent/follow?screen_name=MSFTDeepSpeed) [![Japanese Twitter](https://img.shields.io/badge/%E6%97%A5%E6%9C%AC%E8%AA%9ETwitter-%40MSFTDeepSpeedJP-blue)](https://twitter.com/MSFTDeepSpeedJP) [![Chinese Zhihu](https://img.shields.io/badge/%E7%9F%A5%E4%B9%8E-%E5%BE%AE%E8%BD%AFDeepSpeed-blue)](https://www.zhihu.com/people/deepspeed) From 20695b39b19b64bf0ae0ef3e590bc29bccff36c7 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Mon, 7 Oct 2024 21:06:51 -0700 Subject: [PATCH 16/43] Move V100 workflows from cuda 11.1/11.7 to 12.1 (#6607) --- .github/workflows/nv-accelerate-v100.yml | 4 ++-- .github/workflows/nv-ds-chat.yml | 4 ++-- .github/workflows/nv-inference.yml | 10 +++++----- .github/workflows/nv-lightning-v100.yml | 4 ++-- .github/workflows/nv-mii.yml | 4 ++-- .github/workflows/nv-nightly.yml | 6 +++--- .github/workflows/nv-torch-latest-v100.yml | 8 ++++---- .github/workflows/nv-torch-nightly-v100.yml | 4 ++-- .github/workflows/nv-transformers-v100.yml | 4 ++-- 9 files changed, 24 insertions(+), 24 deletions(-) diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml index 915493bb3183c..346055e2685f4 100644 --- a/.github/workflows/nv-accelerate-v100.yml +++ b/.github/workflows/nv-accelerate-v100.yml @@ -19,7 +19,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, 
nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -29,7 +29,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml index c823c194e83cb..2ad336cac4edc 100644 --- a/.github/workflows/nv-ds-chat.yml +++ b/.github/workflows/nv-ds-chat.yml @@ -26,7 +26,7 @@ permissions: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -36,7 +36,7 @@ jobs: - name: Install pytorch run: | - pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118 + pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml index f863226bfb954..c0a4275bd2b6b 100644 --- a/.github/workflows/nv-inference.yml +++ b/.github/workflows/nv-inference.yml @@ -22,7 +22,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -32,7 +32,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -58,8 +58,8 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - #pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="2.1" --cuda_ver="11.8" - pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="2.1" --cuda_ver="11.8" - pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="2.1" --cuda_ver="11.8" + #pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="2.1" --cuda_ver="12.1" + pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="2.1" --cuda_ver="12.1" + pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="2.1" --cuda_ver="12.1" # run ds_report again to check updated op list ds_report diff --git a/.github/workflows/nv-lightning-v100.yml b/.github/workflows/nv-lightning-v100.yml index 6af228f7fb2f8..a9a26b7ce816c 100644 --- a/.github/workflows/nv-lightning-v100.yml +++ b/.github/workflows/nv-lightning-v100.yml @@ -19,7 +19,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu111, v100] + runs-on: [self-hosted, nvidia, cu121, v100] env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions @@ -31,7 +31,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', 
torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml index d394b7e24bd60..a576e5933b08d 100644 --- a/.github/workflows/nv-mii.yml +++ b/.github/workflows/nv-mii.yml @@ -27,7 +27,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -37,7 +37,7 @@ jobs: - name: Install pytorch run: | - pip3 install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 + pip3 install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml index 8658ff5d2348a..e6032941ebb30 100644 --- a/.github/workflows/nv-nightly.yml +++ b/.github/workflows/nv-nightly.yml @@ -18,7 +18,7 @@ permissions: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -28,7 +28,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -58,7 +58,7 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="2.4" --cuda_ver="11.8" + pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="2.4" --cuda_ver="12.1" - name: Open GitHub issue if nightly CI fails if: ${{ failure() && (github.event_name == 'schedule') }} diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index ebef2d35c2780..e888c472638fd 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -19,7 +19,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -29,7 +29,7 @@ jobs: - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -55,5 +55,5 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.4" --cuda_ver="11.8" - pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.4" --cuda_ver="11.8" + pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.4" --cuda_ver="12.1" + pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.4" --cuda_ver="12.1" diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml index 2570404390053..74495812add5f 100644 --- a/.github/workflows/nv-torch-nightly-v100.yml +++ 
b/.github/workflows/nv-torch-nightly-v100.yml @@ -15,7 +15,7 @@ permissions: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -25,7 +25,7 @@ jobs: - name: Install pytorch run: | - pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu118 + pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml index cfed6d6583e6c..18c5e2c98bc6a 100644 --- a/.github/workflows/nv-transformers-v100.yml +++ b/.github/workflows/nv-transformers-v100.yml @@ -18,7 +18,7 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu117, v100] + runs-on: [self-hosted, nvidia, cu121, v100] steps: - uses: actions/checkout@v4 @@ -29,7 +29,7 @@ jobs: - name: Install pytorch run: | # use the same pytorch version as transformers CI - pip install -U --cache-dir $TORCH_CACHE torch==2.0.1+cu118 --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch==2.0.1+cu121 --index-url https://download.pytorch.org/whl/cu121 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" From 00c4b98ba0786d3ce99bde46ee60811f07f01dff Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Tue, 8 Oct 2024 10:42:22 -0700 Subject: [PATCH 17/43] Fix SD workflow (#6609) SD workflow needed updates when we moved to pydantic 2 support that was never added before. 
Passing nv-sd workflow [here](https://github.com/microsoft/DeepSpeed/actions/runs/11239699283) --- .github/workflows/nv-sd.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/nv-sd.yml b/.github/workflows/nv-sd.yml index b348d5ff931f5..0344c80451a68 100644 --- a/.github/workflows/nv-sd.yml +++ b/.github/workflows/nv-sd.yml @@ -53,6 +53,8 @@ jobs: pip install image-similarity-measures python -m pip install opencv-python==4.6.* --force-reinstall python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja + # Update packages included in the container that do not support pydantic 2+ to versions that do + python -m pip install thinc spacy confection --upgrade python -m pip install .[dev,1bit,autotuning,sd] ds_report - name: Python environment From 745dd48b90260bff3c65c7620d993df9ca6c529b Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Tue, 8 Oct 2024 11:15:46 -0700 Subject: [PATCH 18/43] Pin accelerate to fix CI failures/issues (#6610) --- .github/workflows/nv-accelerate-v100.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml index 346055e2685f4..3bddc329e3dd1 100644 --- a/.github/workflows/nv-accelerate-v100.yml +++ b/.github/workflows/nv-accelerate-v100.yml @@ -47,6 +47,7 @@ jobs: unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch git clone https://github.com/huggingface/accelerate cd accelerate + git checkout ae9cb6e4db6f81fd18148c2cc67d72b903d81a46 git rev-parse --short HEAD # installing dependencies pip install .[testing] From e97b453645a03bc6901d74ec13be1c5d7f1a1fec Mon Sep 17 00:00:00 2001 From: Yejing-Lai Date: Wed, 9 Oct 2024 02:16:04 +0800 Subject: [PATCH 19/43] Add llama3.2 vision autotp (#6577) Llama3.2-11b and llama3.2-90b including vision model and text model, these two models have different num_kv_heads, so we need to set num_kv_heads dynamically. Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepspeed/module_inject/auto_tp.py | 3 ++- deepspeed/module_inject/replace_module.py | 10 +++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index e6eea2183de5a..52d7c95ec9d88 100644 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -495,7 +495,8 @@ def get_model_num_kv_heads(self, config): num_kv_heads = None # multi_query_group_num is for chatglm2 & chatglm3 kv_head_names = [ - 'multi_query_group_num', 'num_kv_heads', 'num_key_value_heads', 'num_attention_heads', 'n_heads' + 'multi_query_group_num', 'num_kv_heads', 'num_key_value_heads', 'num_attention_heads', 'n_heads', + 'attention_heads' ] for name in kv_head_names: if hasattr(config, name): diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 64dc5479940c3..cf70c4530c822 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -274,7 +274,13 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None): _autotp.set_tensor_parallel_config(config.tensor_parallel.tp_size, config.tensor_parallel.tp_group) # 3. 
Try to get num_key_heads from model_config.num_key_value_heads - num_kv_heads = _autotp.get_model_num_kv_heads(model_config) + if hasattr(model_config, "vision_config"): + if "MllamaVisionEncoderLayer" in str(module): + num_kv_heads = _autotp.get_model_num_kv_heads(model_config.vision_config) + else: + num_kv_heads = _autotp.get_model_num_kv_heads(model_config.text_config) + else: + num_kv_heads = _autotp.get_model_num_kv_heads(model_config) # 4. When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division set_num_kv_heads(num_kv_heads) @@ -339,6 +345,8 @@ def set_lm_head(module): "weight") and not module.embed_out.weight.is_meta and isinstance( module.embed_out, torch.nn.Linear): module = replace_wo_policy(module, ("embed_out", ), 0, "embed_out") + elif hasattr(module.language_model, "lm_head"): + module = replace_wo_policy(module.language_model, ("lm_head", ), 0, "lm_head") return module def conv2d_parallel_shard_weights(model, rank, world_size): From f74ea69abf09294834eba13b003568a440e3d22b Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 8 Oct 2024 14:38:51 -0400 Subject: [PATCH 20/43] Improve DS logging control (#6602) Disable `steps_per_print` by default. --- deepspeed/runtime/constants.py | 2 +- deepspeed/runtime/engine.py | 8 ++++---- deepspeed/utils/timer.py | 9 +++++++-- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/deepspeed/runtime/constants.py b/deepspeed/runtime/constants.py index 679230ca7d4c3..55cfa8f59c91a 100755 --- a/deepspeed/runtime/constants.py +++ b/deepspeed/runtime/constants.py @@ -77,7 +77,7 @@ # Steps STEPS_PER_PRINT = "steps_per_print" -STEPS_PER_PRINT_DEFAULT = 10 +STEPS_PER_PRINT_DEFAULT = None ######################################### # Training micro batch size per GPU diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index b590ea432658c..0371c5663a2d1 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -2145,8 +2145,6 @@ def _take_model_step(self, lr_kwargs, block_eigenvalue={}): else: self.zero_grad() - report_progress = self.global_rank == 0 if self.global_rank else True - # Check overflow here since in DS fp16 optimizer, the overflow is updated in above step() function. 
overflow = False if hasattr(self.optimizer, "overflow"): @@ -2166,8 +2164,10 @@ def _take_model_step(self, lr_kwargs, block_eigenvalue={}): # pipe_engine.train_batch() self.lr_scheduler.step(self.train_batch_size()) - if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0: - self._report_progress(self.global_steps + 1) + if self.steps_per_print() is not None: + report_progress = self.global_rank == 0 if self.global_rank else True + if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0: + self._report_progress(self.global_steps + 1) self.losses = None self.global_steps += 1 diff --git a/deepspeed/utils/timer.py b/deepspeed/utils/timer.py index 00f17dea709c9..64ae8ac0e5b4f 100755 --- a/deepspeed/utils/timer.py +++ b/deepspeed/utils/timer.py @@ -198,7 +198,7 @@ def get_mean(self, names, normalizer=1.0, reset=True): class ThroughputTimer: - def __init__(self, config, batch_size, start_step=2, steps_per_output=50, monitor_memory=False, logging_fn=None): + def __init__(self, config, batch_size, start_step=2, steps_per_output=None, monitor_memory=False, logging_fn=None): from deepspeed.utils import logger self.config = config self.start_time = 0 @@ -238,6 +238,11 @@ def start(self): get_accelerator().synchronize() self.start_time = time.time() + def _is_report_boundary(self): + if self.steps_per_output is None: + return False + return self.global_step_count % self.steps_per_output == 0 + def stop(self, global_step=False, report_speed=True): if not self.config.enabled or not self.started: return @@ -255,7 +260,7 @@ def stop(self, global_step=False, report_speed=True): self.step_elapsed_time += duration if global_step: - if report_speed and self.global_step_count % self.steps_per_output == 0: + if report_speed and self._is_report_boundary(): self.logging( "epoch={}/micro_step={}/global_step={}, RunningAvgSamplesPerSec={}, CurrSamplesPerSec={}, " "MemAllocated={}GB, MaxMemAllocated={}GB".format( From 5cbbff40bd2eceb4340376d7e3db764c69a2527a Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Date: Tue, 8 Oct 2024 13:41:44 -0700 Subject: [PATCH 21/43] Fix device selection using CUDA_VISIBLE_DEVICES (#6530) This PR addresses #5818. Instead of contiguous numbers based on the device count, this PR uses device indices in `--include`. 
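As a usage sketch (illustration only, built on the `parse_node_config_list` helper added in the diff below): the slot numbers given to `--include` are now kept verbatim as device indices instead of being remapped to `0..count-1`.

```python
# Hypothetical example; parse_node_config_list is the helper added below in
# deepspeed/launcher/runner.py.
from deepspeed.launcher.runner import parse_node_config_list

# --include="worker-0:1,3@worker-1:0,2" keeps the literal indices 1 and 3 on
# worker-0 rather than renumbering them to 0 and 1 based on the device count.
print(parse_node_config_list("worker-0:1,3@worker-1:0,2"))
# {'worker-0': [1, 3], 'worker-1': [0, 2]}
```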
--------- Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepspeed/launcher/runner.py | 44 ++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py index 076f65b201b34..5f926834dda22 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py @@ -20,6 +20,8 @@ from copy import deepcopy import signal import time +from typing import Tuple, List, Dict +from collections import defaultdict import shlex from .multinode_runner import PDSHRunner, OpenMPIRunner, MVAPICHRunner, SlurmRunner, MPICHRunner, IMPIRunner @@ -263,6 +265,31 @@ def _stable_remove_duplicates(data): return new_list +def parse_node_config(node_config: str) -> Tuple[str, List[int]]: + SLOT_LIST_START = ':' + SLOT_SEP = ',' + + if SLOT_LIST_START not in node_config: + return node_config, [] + + hostname, slots = node_config.split(SLOT_LIST_START) + slots = [int(x) for x in slots.split(SLOT_SEP)] + + return hostname, slots + + +def parse_node_config_list(node_config_list: List[str]) -> Dict[str, List[int]]: + NODE_SEP = '@' + + node_configs = defaultdict(list) + + for node_config in node_config_list.split(NODE_SEP): + hostname, slots = parse_node_config(node_config) + node_configs[hostname] += slots + + return {k: sorted(list(set(v))) for k, v in node_configs.items()} + + def parse_resource_filter(host_info, include_str="", exclude_str=""): '''Parse an inclusion or exclusion string and filter a hostfile dictionary. @@ -277,11 +304,6 @@ def parse_resource_filter(host_info, include_str="", exclude_str=""): slot 0 on worker-1. ''' - # Constants that define our syntax - NODE_SEP = '@' - SLOT_LIST_START = ':' - SLOT_SEP = ',' - # Ensure include/exclude are mutually exclusive if (include_str != "") and (exclude_str != ""): raise ValueError('include_str and exclude_str are mutually exclusive.') @@ -299,12 +321,9 @@ def parse_resource_filter(host_info, include_str="", exclude_str=""): parse_str = exclude_str # foreach node in the list - for node_config in parse_str.split(NODE_SEP): + for hostname, slots in parse_node_config_list(parse_str).items(): # Node can either be alone or node:slot,slot,slot - if SLOT_LIST_START in node_config: - hostname, slots = node_config.split(SLOT_LIST_START) - slots = [int(x) for x in slots.split(SLOT_SEP)] - + if len(slots) > 0: # sanity checks if hostname not in host_info: raise ValueError(f"Hostname '{hostname}' not found in hostfile") @@ -322,7 +341,6 @@ def parse_resource_filter(host_info, include_str="", exclude_str=""): # User just specified the whole node else: - hostname = node_config # sanity check hostname if hostname not in host_info: raise ValueError(f"Hostname '{hostname}' not found in hostfile") @@ -355,8 +373,10 @@ def parse_resource_filter(host_info, include_str="", exclude_str=""): def parse_inclusion_exclusion(resource_pool, inclusion, exclusion): active_resources = collections.OrderedDict() + node_configs = parse_node_config_list(inclusion) + for hostname, slots in resource_pool.items(): - active_resources[hostname] = list(range(slots)) + active_resources[hostname] = node_configs[hostname] if hostname in node_configs else list(range(slots)) return parse_resource_filter(active_resources, include_str=inclusion, exclude_str=exclusion) From ca8b1fe945ce8100dd1ac164d7416f3b8f6be8b7 Mon Sep 17 00:00:00 2001 From: Yichen Yan Date: Wed, 9 Oct 2024 07:38:43 +0800 Subject: [PATCH 22/43] Handle when `backend` is also in 
compile_kwargs (#6502) cc @tohtana Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Olatunji Ruwase Co-authored-by: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> --- deepspeed/runtime/engine.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 0371c5663a2d1..ba0e0f8756253 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -3675,7 +3675,11 @@ def compile(self, backend=get_accelerator().get_compile_backend(), compile_kwarg if self.is_compiled: return - self.module.compile(backend=backend, **compile_kwargs) + if 'backend' in compile_kwargs: + logger.warning("The `backend` in `compile_kwargs` will be overridden. Use the `backend` argument instead.") + + # create new dict to avoid modifying original dict + self.module.compile(**{**compile_kwargs, 'backend': backend}) self._is_compiled = True @property From 645639bcf892e35cd39e7baccdab0e69abb8ce33 Mon Sep 17 00:00:00 2001 From: Omar Elayan <142979319+oelayan7@users.noreply.github.com> Date: Wed, 9 Oct 2024 04:22:28 +0300 Subject: [PATCH 23/43] Rearrange inference OPS and stop using builder.load (#5490) This PR mainly handles all places where InferenceBuilder is used to access any op or a specific implementation for an op. Instead an op is defined, and its proper implementation is picked inside and the usage will be transparent to the user. What was done in the PR: 1) Added missing ops (added a py file with fallback mechanism) 2) Added missing fallback implementations for existing ops 3) removed all usages for builder.load and replaced them with ops instead. 4) added workspace op and inferenceContext which contains all workspace related functions and inferenceContext is the python fallback of inferenceContext in CUDA 5) a small change to softmax_context signature to fit the fallback signature. 
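A condensed sketch of the op-binding pattern this introduces (illustration only; the real classes such as `BiasAddOp` are in the new `op_binding` files below): each op tries to bind the compiled kernel and, if the builder does not expose it, falls back to a plain PyTorch implementation.

```python
import torch


class BiasAddOpSketch:
    """Simplified stand-in for the new op_binding classes (not the real code).
    `inference_module` is whatever the accelerator's InferenceBuilder loaded;
    it may be missing a given kernel entirely."""

    def __init__(self, inference_module, dtype=torch.float16):
        try:
            # Prefer the compiled kernel when the builder exposes it.
            self.bias_add_func = (inference_module.bias_add_fp16
                                  if dtype == torch.float16 else inference_module.bias_add_fp32)
        except AttributeError:
            # Otherwise fall back to plain PyTorch so the op still works.
            self.bias_add_func = self.bias_add_fallback

    @staticmethod
    def bias_add_fallback(activation, bias):
        return torch.add(activation, bias)

    def __call__(self, activation, bias):
        return self.bias_add_func(activation, bias)
```

In the real code the selection is driven by `DeepSpeedInferenceConfig` and the shared `BaseOp`, as in the new files further down; callers construct the op once and invoke it, so the dtype and fallback choice stay out of the model code.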
--------- Co-authored-by: Joe Mayer <114769929+jomayeri@users.noreply.github.com> Co-authored-by: Lev Kurilenko <113481193+lekurile@users.noreply.github.com> Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Olatunji Ruwase --- .../transformer/inference/csrc/pt_binding.cpp | 9 +- deepspeed/inference/engine.py | 17 +- .../transformers/ds_llama2.py | 15 +- .../transformers/ds_transformer.py | 54 ++--- deepspeed/ops/transformer/inference/config.py | 1 - .../inference/diffusers_attention.py | 52 ++-- .../inference/diffusers_transformer_block.py | 33 +-- .../ops/transformer/inference/ds_attention.py | 21 +- .../transformer/inference/moe_inference.py | 59 ++--- .../inference/op_binding/bias_add.py | 31 +++ .../inference/op_binding/bias_gelu.py | 33 +++ .../inference/op_binding/bias_relu.py | 33 +++ .../inference/op_binding/bias_residual.py | 29 +++ .../inference/op_binding/einsum_sec_sm_ecm.py | 29 +++ .../inference/op_binding/gated_activation.py | 40 ++++ .../inference/op_binding/gelu_gemm.py | 11 +- .../inference/op_binding/layer_norm.py | 60 +++++ .../inference/op_binding/mlp_gemm.py | 38 ++- .../inference/op_binding/moe_res_matmul.py | 29 +++ .../inference/op_binding/pad_transform.py | 26 ++ .../inference/op_binding/pre_rms_norm.py | 31 +++ .../inference/op_binding/qkv_gemm.py | 30 ++- .../inference/op_binding/residual_add.py | 65 ++--- .../inference/op_binding/rms_norm.py | 33 +++ .../inference/op_binding/softmax.py | 54 +++-- .../inference/op_binding/softmax_context.py | 105 ++++++++- .../inference/op_binding/vector_add.py | 28 +++ .../inference/op_binding/vector_matmul.py | 8 +- .../inference/op_binding/workspace.py | 222 ++++++++++++++++++ .../transformer/inference/triton/attention.py | 18 +- .../ops/transformer/inference/triton/ops.py | 20 +- deepspeed/runtime/hybrid_engine.py | 18 +- op_builder/hpu/__init__.py | 1 + op_builder/hpu/transformer_inference.py | 39 +++ .../transformer/inference/test_bias_add.py | 14 +- .../transformer/inference/test_bias_geglu.py | 14 +- .../transformer/inference/test_bias_gelu.py | 14 +- .../transformer/inference/test_bias_relu.py | 14 +- .../ops/transformer/inference/test_gelu.py | 12 +- .../transformer/inference/test_layer_norm.py | 18 +- .../inference/test_moe_res_matmult.py | 8 +- .../inference/test_residual_add.py | 25 +- .../transformer/inference/test_rms_norm.py | 14 +- .../ops/transformer/inference/test_softmax.py | 2 +- 44 files changed, 1064 insertions(+), 363 deletions(-) create mode 100644 deepspeed/ops/transformer/inference/op_binding/bias_add.py create mode 100644 deepspeed/ops/transformer/inference/op_binding/bias_gelu.py create mode 100644 deepspeed/ops/transformer/inference/op_binding/bias_relu.py create mode 100644 deepspeed/ops/transformer/inference/op_binding/bias_residual.py create mode 100644 deepspeed/ops/transformer/inference/op_binding/einsum_sec_sm_ecm.py create mode 100644 deepspeed/ops/transformer/inference/op_binding/gated_activation.py create mode 100644 deepspeed/ops/transformer/inference/op_binding/layer_norm.py create mode 100644 deepspeed/ops/transformer/inference/op_binding/moe_res_matmul.py create mode 100644 deepspeed/ops/transformer/inference/op_binding/pad_transform.py create mode 100644 deepspeed/ops/transformer/inference/op_binding/pre_rms_norm.py create mode 100644 deepspeed/ops/transformer/inference/op_binding/rms_norm.py create mode 100644 deepspeed/ops/transformer/inference/op_binding/vector_add.py create mode 100644 
deepspeed/ops/transformer/inference/op_binding/workspace.py create mode 100644 op_builder/hpu/transformer_inference.py diff --git a/csrc/transformer/inference/csrc/pt_binding.cpp b/csrc/transformer/inference/csrc/pt_binding.cpp index 2d5332578edc5..19dbe73726f74 100644 --- a/csrc/transformer/inference/csrc/pt_binding.cpp +++ b/csrc/transformer/inference/csrc/pt_binding.cpp @@ -452,14 +452,17 @@ std::vector ds_softmax_context(at::Tensor& query_key_value, unsigned layer_id, unsigned num_layers, at::Tensor& alibi, - float rope_theta) + float rope_theta, + bool is_prompt, + std::optional token_idx, + std::optional position_ids) { unsigned bsz = query_key_value.size(0); unsigned seq_len = query_key_value.size(1); int k = query_key_value.size(2) / (heads + 2 * (num_kv > 0 ? num_kv : heads)); unsigned hidden_dim = heads * k; - bool is_prompt = (seq_len > 1); + is_prompt = (seq_len > 1); if (is_prompt) InferenceContext::Instance().reset_tokens(seq_len); unsigned soft_len = InferenceContext::Instance().current_tokens(); @@ -2031,7 +2034,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "DeepSpeed memory allocation for GPT inference with " #_name " (CUDA)"); \ m.def("dequantize_" #_name, \ &ds_dequantize<_dtype>, \ - "DeepSpeed dequantize with " #_name " (CUDA)") + "DeepSpeed dequantize with " #_name " (CUDA)"); DEF_OPS(fp32, float); DEF_OPS(fp16, __half); diff --git a/deepspeed/inference/engine.py b/deepspeed/inference/engine.py index 68836ceb523cc..6574d49fb1323 100755 --- a/deepspeed/inference/engine.py +++ b/deepspeed/inference/engine.py @@ -53,12 +53,7 @@ def __init__(self, model, config): DS_INFERENCE_ENABLED = True super().__init__() - - # Have to import here because inference_module is a global, but python - # globals only work at the module level and will not be updated unless - # we import it each time we init a new inference engine. - from ..model_implementations.transformers.ds_transformer import inference_module - if inference_module is not None: + if DeepSpeedTransformerInference.workspace is not None: self.destroy() self.module = model @@ -191,15 +186,11 @@ def __init__(self, model, config): self._is_compiled = False def destroy(self): - # Have to import here because inference_module is a global, but python - # globals only work at the module level and will not be updated unless - # we import it each time we init a new inference engine. 
- from ..model_implementations.transformers.ds_transformer import inference_module DeepSpeedTransformerInference.layer_id = 0 DeepSpeedSelfAttention.num_layers = 0 - if inference_module is not None: - inference_module.release_workspace() - inference_module = None + if DeepSpeedTransformerInference.workspace.is_allocated(): + DeepSpeedTransformerInference.workspace.release_workspace() + DeepSpeedTransformerInference.workspace = None def profile_model_time(self, use_cuda_events=True): if not self.model_profile_enabled and not self._config.enable_cuda_graph: diff --git a/deepspeed/model_implementations/transformers/ds_llama2.py b/deepspeed/model_implementations/transformers/ds_llama2.py index 7d9eb4113a8a5..325bfb4f7e181 100644 --- a/deepspeed/model_implementations/transformers/ds_llama2.py +++ b/deepspeed/model_implementations/transformers/ds_llama2.py @@ -4,11 +4,8 @@ # DeepSpeed Team import torch -from deepspeed import comm as dist from deepspeed.model_implementations.transformers.ds_transformer import DeepSpeedTransformerInference -inference_module = None - class DeepSpeedLlama2Inference(DeepSpeedTransformerInference): """Initialize the DeepSpeed OPT Transformer Layer. @@ -27,18 +24,10 @@ def forward(self, *args, **kwargs): input = args[0] input_mask = None - # Allocate memory only on first layer forward - if self.config.layer_id == 0 and self._alloc_workspace: - self.allocate_workspace(self.config.hidden_size, self.config.heads, - input.size()[1], - input.size()[0], DeepSpeedTransformerInference.layer_id, self.config.mp_size, - self.config.bigscience_bloom, - dist.get_rank() if dist.is_initialized() else 0, self.config.max_out_tokens, - self.config.min_out_tokens) - self._alloc_workspace = False - get_present = True + self.allocate_workspace(input.size()) + # We set the prev key/value to None when there is a prompt if input.shape[1] > 1: self.layer_past = None diff --git a/deepspeed/model_implementations/transformers/ds_transformer.py b/deepspeed/model_implementations/transformers/ds_transformer.py index d87d0de997b51..360113b78a3db 100644 --- a/deepspeed/model_implementations/transformers/ds_transformer.py +++ b/deepspeed/model_implementations/transformers/ds_transformer.py @@ -6,19 +6,18 @@ import torch import torch.nn as nn from deepspeed import comm as dist +from deepspeed.ops.transformer.inference.op_binding.layer_norm import LayerNormOp from deepspeed.utils.logging import log_dist from deepspeed.ops.transformer.inference.ds_mlp import DeepSpeedMLP from deepspeed.ops.transformer.inference.ds_attention import DeepSpeedSelfAttention, BloomSelfAttention +from deepspeed.ops.transformer.inference.op_binding.workspace import WorkspaceOp from deepspeed.accelerator import get_accelerator -from deepspeed.ops.op_builder import InferenceBuilder import deepspeed if deepspeed.HAS_TRITON: from deepspeed.ops.transformer.inference.triton.mlp import TritonMLP from deepspeed.ops.transformer.inference.triton.attention import TritonSelfAttention -inference_module = None - class DeepSpeedTransformerInference(nn.Module): """Initialize the DeepSpeed Transformer Layer. @@ -37,6 +36,7 @@ class DeepSpeedTransformerInference(nn.Module): for specific downstream tasks. 
""" layer_id = 0 + workspace = None def __init__(self, config, @@ -52,10 +52,6 @@ def __init__(self, DeepSpeedTransformerInference.layer_id += 1 data_type = torch.half if self.config.dtype == torch.int8 else self.config.dtype - global inference_module - if inference_module is None: - builder = InferenceBuilder() - inference_module = builder.load() if DeepSpeedTransformerInference.layer_id == 1: log_dist(f"DeepSpeed-Inference config: {self.config.__dict__}", [0]) @@ -88,22 +84,25 @@ def __init__(self, self.norm_b = nn.Parameter(torch.empty(self.config.hidden_size, dtype=data_type, device=device), requires_grad=False) self.layer_past = None - try: - if config.dtype == torch.float32: - self.allocate_workspace = inference_module.allocate_workspace_fp32 - elif config.dtype == torch.bfloat16: - self.allocate_workspace = inference_module.allocate_workspace_bf16 - else: - self.allocate_workspace = inference_module.allocate_workspace_fp32 - self._alloc_workspace = True - except AttributeError: - self.allocate_workspace = None - self._alloc_workspace = False + self.layer_norm = LayerNormOp() + if DeepSpeedTransformerInference.workspace is None: + DeepSpeedTransformerInference.workspace = WorkspaceOp(self.config) + self._should_allocate_workspace = True + + def allocate_workspace(self, size): + # Allocate memory only on first layer forward + if self.config.layer_id == 0 and self._should_allocate_workspace: + DeepSpeedTransformerInference.workspace.allocate_workspace( + self.config.hidden_size, self.config.heads, size[1], size[0], DeepSpeedTransformerInference.layer_id, + self.config.mp_size, self.config.bigscience_bloom, + dist.get_rank() if dist.is_initialized() else 0, self.config.max_out_tokens, + self.config.min_out_tokens) + self._should_allocate_workspace = False @classmethod def reset_cache(cls): - if inference_module is not None: - inference_module.reset_cache() + if cls.workspace is not None: + cls.workspace.reset_cache() def forward( self, @@ -136,15 +135,7 @@ def forward( input_mask = (input_mask if attn_mask is None else attn_mask) if attention_mask is None else attention_mask - # Allocate memory only on first layer forward - if self.config.layer_id == 0 and self._alloc_workspace: - self.allocate_workspace(self.config.hidden_size, self.config.heads, - input.size()[1], - input.size()[0], DeepSpeedTransformerInference.layer_id, self.config.mp_size, - self.config.bigscience_bloom, - dist.get_rank() if dist.is_initialized() else 0, self.config.max_out_tokens, - self.config.min_out_tokens) - self._alloc_workspace = False + self.allocate_workspace(input.size()) get_present = (get_present or get_key_value or use_cache) input_mask = input_mask if attention_mask is None else attention_mask @@ -178,14 +169,15 @@ def forward( output_attentions, self.norm_w, self.norm_b, - alibi) + alibi, + **kwargs) presents = (key, value) self.layer_past = presents if layer_past is None else None output = self.mlp(attention_output, input, inp_norm, self.attention.attn_ob) if not self.config.pre_layer_norm: - output = inference_module.layer_norm(output, self.norm_w, self.norm_b, self.config.epsilon) + output = self.layer_norm(output, self.norm_w, self.norm_b, self.config.epsilon) output = output.to(input_type) if get_present: diff --git a/deepspeed/ops/transformer/inference/config.py b/deepspeed/ops/transformer/inference/config.py index 9709328cc1335..c0dd29f4f9627 100644 --- a/deepspeed/ops/transformer/inference/config.py +++ b/deepspeed/ops/transformer/inference/config.py @@ -103,7 +103,6 @@ def __init__(self, 
self.return_tuple = return_tuple self.mlp_after_attn = mlp_after_attn self.mlp_act_func_type = mlp_act_func_type - self.specialized_mode = False self.training_mp_size = training_mp_size self.bigscience_bloom = bigscience_bloom self.max_out_tokens = max_out_tokens diff --git a/deepspeed/ops/transformer/inference/diffusers_attention.py b/deepspeed/ops/transformer/inference/diffusers_attention.py index 5efc560db75e4..3c2340ccfc6f9 100644 --- a/deepspeed/ops/transformer/inference/diffusers_attention.py +++ b/deepspeed/ops/transformer/inference/diffusers_attention.py @@ -10,10 +10,11 @@ from packaging import version as pkg_version from deepspeed.utils.logging import log_dist from deepspeed.accelerator import get_accelerator -from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer.inference.op_binding.workspace import WorkspaceOp +from deepspeed.ops.transformer.inference.op_binding.softmax_context import SoftmaxContextOp +from deepspeed.ops.transformer.inference.op_binding import LinearOp +from deepspeed.ops.transformer.inference.op_binding.pad_transform import PadTransformOp -# Cuda modules will be imported if needed -inference_module = None minus_inf = -10000.0 triton_flash_attn = None @@ -36,7 +37,8 @@ class DeepSpeedDiffusersAttentionFunction(Function): @staticmethod def forward(ctx, input, context, input_mask, config, attn_qkvw, attn_qw, attn_kw, attn_vw, attn_qkvb, num_attention_heads_per_partition, norm_factor, hidden_size_per_partition, attn_ow, attn_ob, - do_out_bias, score_context_func, linear_func, triton_flash_attn_kernel, rope_theta): + do_out_bias, score_context_func, linear_func, pad_transform_func, triton_flash_attn_kernel, + rope_theta): def _transpose_for_context(x): x = x.permute(0, 2, 1, 3) @@ -77,7 +79,7 @@ def selfAttention_fp(input, context, input_mask): query = query.contiguous() key = key.contiguous() value = value.contiguous() - query, key, value = inference_module.pad_transform_fp16(query, key, value, config.heads, do_flash_attn) + query, key, value = pad_transform_func(query, key, value, config.heads, do_flash_attn) attention_scores = (torch.matmul(query, key.transpose(-1, -2)) * scale).softmax(dim=-1) context_layer = _transpose_for_context(torch.matmul(attention_scores, value)) @@ -117,10 +119,6 @@ def __init__( data_type = self.config.dtype data_type_fp = torch.half if self.config.dtype == torch.int8 else self.config.dtype - global inference_module - if inference_module is None: - builder = InferenceBuilder() - inference_module = builder.load() if DeepSpeedDiffusersAttention.layer_id == 1: log_dist(f"DeepSpeed-Attention config: {self.config.__dict__}", [0]) @@ -171,26 +169,24 @@ def __init__( self.norm_factor *= math.sqrt(self.config.layer_id + 1) # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/gpt2/modeling_gpt2.py#L191 - if self.config.dtype in [torch.float16, torch.int8]: - self.score_context_func = inference_module.softmax_context_fp16 - self.linear_func = inference_module.linear_layer_fp16 - self.allocate_workspace = inference_module.allocate_workspace_fp16 - else: - self.score_context_func = inference_module.softmax_context_fp32 - self.linear_func = inference_module.linear_layer_fp32 - self.allocate_workspace = inference_module.allocate_workspace_fp32 + self.workspace = WorkspaceOp(self.config) + self.score_context_func = SoftmaxContextOp(self.config) + self.linear_func = LinearOp(self.config) + self.pad_transform_func = PadTransformOp(self.config) - def forward(self, input, context=None, 
input_mask=None): + def allocate_workspace(self, size): + # Allocate memory only on first layer forward if self.config.layer_id == 0: - self.allocate_workspace(self.config.hidden_size, self.config.heads, - input.size()[1], - input.size()[0], DeepSpeedDiffusersAttention.layer_id, self.config.mp_size, False, - 0, self.config.max_out_tokens, self.config.min_out_tokens) - output = DeepSpeedDiffusersAttentionFunction.apply(input, context, input_mask, self.config, self.attn_qkvw, - self.attn_qw, self.attn_kw, self.attn_vw, self.attn_qkvb, - self.num_attention_heads_per_partition, self.norm_factor, - self.hidden_size_per_partition, self.attn_ow, self.attn_ob, - self.do_out_bias, self.score_context_func, self.linear_func, - self.triton_flash_attn_kernel, self.config.rope_theta) + self.workspace.allocate_workspace(self.config.hidden_size, self.config.heads, size[1], size[0], + DeepSpeedDiffusersAttention.layer_id, self.config.mp_size, False, 0, + self.config.max_out_tokens, self.config.min_out_tokens) + + def forward(self, input, context=None, input_mask=None): + self.allocate_workspace(input.size()) + output = DeepSpeedDiffusersAttentionFunction.apply( + input, context, input_mask, self.config, self.attn_qkvw, self.attn_qw, self.attn_kw, self.attn_vw, + self.attn_qkvb, self.num_attention_heads_per_partition, self.norm_factor, self.hidden_size_per_partition, + self.attn_ow, self.attn_ob, self.do_out_bias, self.score_context_func, self.linear_func, + self.pad_transform_func, self.triton_flash_attn_kernel, self.config.rope_theta) return output diff --git a/deepspeed/ops/transformer/inference/diffusers_transformer_block.py b/deepspeed/ops/transformer/inference/diffusers_transformer_block.py index b0156f905a06e..d01638f36e401 100644 --- a/deepspeed/ops/transformer/inference/diffusers_transformer_block.py +++ b/deepspeed/ops/transformer/inference/diffusers_transformer_block.py @@ -10,26 +10,9 @@ from .diffusers_attention import DeepSpeedDiffusersAttention from .bias_add import nhwc_bias_add from .diffusers_2d_transformer import Diffusers2DTransformerConfig -from deepspeed.ops.op_builder import InferenceBuilder, SpatialInferenceBuilder from deepspeed.utils.types import ActivationFuncType - -# Ops will be loaded on demand -transformer_cuda_module = None -spatial_cuda_module = None - - -def load_transformer_module(): - global transformer_cuda_module - if transformer_cuda_module is None: - transformer_cuda_module = InferenceBuilder().load() - return transformer_cuda_module - - -def load_spatial_module(): - global spatial_cuda_module - if spatial_cuda_module is None: - spatial_cuda_module = SpatialInferenceBuilder().load() - return spatial_cuda_module +from .op_binding.gated_activation import GatedActivationOp +from .op_binding.layer_norm import LayerNormOp class DeepSpeedDiffusersTransformerBlock(nn.Module): @@ -76,8 +59,8 @@ def __init__(self, equivalent_module: nn.Module, config: Diffusers2DTransformerC else: self.attn_2_bias = nn.Paramaeter(torch.zeros_like(self.norm3_g), requires_grad=False) - self.transformer_cuda_module = load_transformer_module() - load_spatial_module() + self.gated_activation = GatedActivationOp() + self.layer_norm = LayerNormOp() def forward(self, hidden_states, context=None, timestep=None, **kwargs): # In v0.12.0 of diffuser, several new kwargs were added. 
Capturing @@ -88,17 +71,17 @@ def forward(self, hidden_states, context=None, timestep=None, **kwargs): if "encoder_hidden_states" in kwargs and kwargs["encoder_hidden_states"] is not None: context = kwargs["encoder_hidden_states"] - out_norm_1 = self.transformer_cuda_module.layer_norm(hidden_states, self.norm1_g, self.norm1_b, self.norm1_eps) + out_norm_1 = self.layer_norm(hidden_states, self.norm1_g, self.norm1_b, self.norm1_eps) out_attn_1 = self.attn_1(out_norm_1) - out_norm_2, out_attn_1 = self.transformer_cuda_module.layer_norm_residual_store_pre_ln_res( + out_norm_2, out_attn_1 = self.layer_norm.layer_norm_residual_store_pre_ln_res( out_attn_1, self.attn_1_bias, hidden_states, self.norm2_g, self.norm2_b, self.norm2_eps) out_attn_2 = self.attn_2(out_norm_2, context=context) - out_norm_3, out_attn_2 = self.transformer_cuda_module.layer_norm_residual_store_pre_ln_res( + out_norm_3, out_attn_2 = self.layer_norm.layer_norm_residual_store_pre_ln_res( out_attn_2, self.attn_2_bias, out_attn_1, self.norm3_g, self.norm3_b, self.norm3_eps) out_ff1 = nn.functional.linear(out_norm_3, self.ff1_w) - out_geglu = self.transformer_cuda_module.gated_activation(out_ff1, self.ff1_b, ActivationFuncType.GATED_GELU) + out_geglu = self.gated_activation(out_ff1, self.ff1_b, ActivationFuncType.GATED_GELU) out_ff2 = nn.functional.linear(out_geglu, self.ff2_w) return nhwc_bias_add(out_ff2, self.ff2_b, other=out_attn_2) diff --git a/deepspeed/ops/transformer/inference/ds_attention.py b/deepspeed/ops/transformer/inference/ds_attention.py index ffb58175daadd..24f710d224949 100644 --- a/deepspeed/ops/transformer/inference/ds_attention.py +++ b/deepspeed/ops/transformer/inference/ds_attention.py @@ -89,7 +89,7 @@ def __init__(self, config, mp_group=None, q_scales=None, q_groups=1, merge_count torch.empty(self.hidden_size_per_partition * 3, dtype=data_type_fp, device=device) ] - def compute_attention(self, qkv_out, input_mask, layer_past, alibi): + def compute_attention(self, qkv_out, input_mask, layer_past, alibi, is_prompt, token_idx, position_ids): if isinstance(qkv_out, list) or isinstance(qkv_out, tuple): qkv_out = qkv_out[0] @@ -108,7 +108,10 @@ def compute_attention(self, qkv_out, input_mask, layer_past, alibi): no_masking=no_masking, layer_id=self.config.layer_id, num_layers=DeepSpeedSelfAttention.num_layers, - alibi=alibi) + alibi=alibi, + is_prompt=is_prompt, + token_idx=token_idx, + position_ids=position_ids) context_layer, key_layer, value_layer = attn_key_value return context_layer, key_layer, value_layer @@ -136,7 +139,8 @@ def forward(self, output_attentions=False, norm_w=None, norm_b=None, - alibi=None): + alibi=None, + **kwargs): if self.attn_qkvw is None: self._attn_qkvw, self._attn_qkvb = self._merge_qkv() else: @@ -157,10 +161,17 @@ def forward(self, gamma=norm_w, beta=norm_b) + is_prompt = kwargs.get("first_token", qkv_out[0].shape[1] > 1) + token_idx = kwargs.get("token_idx", None) + position_ids = kwargs.get("position_ids", None) + context_layer, key_layer, value_layer = self.compute_attention(qkv_out=qkv_out, input_mask=input_mask, layer_past=layer_past, - alibi=alibi) + alibi=alibi, + is_prompt=is_prompt, + token_idx=token_idx, + position_ids=position_ids) output = self.vector_matmul_func(input=context_layer, weight=self.attn_ow) inp_norm = qkv_out[-1] @@ -210,7 +221,7 @@ def _split_tensor_along_last_dim(self, tensor, num_partitions, contiguous_split_ return tensor_list - def compute_attention(self, qkv_out, input_mask, layer_past, alibi): + def compute_attention(self, qkv_out, input_mask, 
layer_past, alibi, is_prompt, token_idx, position_ids): if isinstance(qkv_out, list) or isinstance(qkv_out, tuple): qkv_out = qkv_out[0] diff --git a/deepspeed/ops/transformer/inference/moe_inference.py b/deepspeed/ops/transformer/inference/moe_inference.py index fc001a86d42e5..3a9785985d196 100644 --- a/deepspeed/ops/transformer/inference/moe_inference.py +++ b/deepspeed/ops/transformer/inference/moe_inference.py @@ -7,16 +7,16 @@ import math import torch from torch.autograd import Function -# accelerator modules will be imported if needed -inference_module = None -specialized_mode = None import torch.nn as nn from .ds_attention import DeepSpeedSelfAttention from .config import DeepSpeedInferenceConfig +from .op_binding import SoftmaxOp, VectorMatMulOp, GELUGemmOp +from .op_binding.bias_residual import BiasResidualOp +from .op_binding.einsum_sec_sm_ecm import EinsumSecSmEcmOp +from .op_binding.layer_norm import LayerNormOp from ....moe.sharded_moe import TopKGate from deepspeed import comm as dist -from deepspeed.accelerator import get_accelerator -from deepspeed.ops.op_builder import InferenceBuilder +from .op_binding.moe_res_matmul import MoEResMatmulOp class DeepSpeedMoEInferenceConfig(DeepSpeedInferenceConfig): @@ -110,16 +110,13 @@ class DeepSpeedMLPFunction(Function): @staticmethod def forward(ctx, input, inter_w, inter_b, config, output_b, output_w, q_scales, q_groups, merge_count, mp_group, - async_op): + async_op, gelu_gemm_func, vector_matmul_func): if config.q_int8: - intermediate = inference_module.fused_gemm_gelu_int8(input, inter_w, inter_b, config.epsilon, q_scales[2], - (q_groups * (2**merge_count)), config.pre_layer_norm) - output = inference_module.vector_matmul_int8(intermediate, output_w, q_scales[3], q_groups, (merge_count)) + intermediate = gelu_gemm_func(input, inter_w, inter_b, config.epsilon, q_scales[2], + (q_groups * (2**merge_count)), config.pre_layer_norm) + output = vector_matmul_func(intermediate, output_w, q_scales[3], q_groups, (merge_count)) else: - mlp_gemm_func = inference_module.fused_gemm_gelu_fp16 if config.fp16 else \ - inference_module.fused_gemm_gelu_fp32 - - output = mlp_gemm_func(input, inter_w, inter_b, output_w, config.epsilon, config.pre_layer_norm, async_op) + output = gelu_gemm_func(input, inter_w, inter_b, output_w, config.epsilon, config.pre_layer_norm, async_op) if mp_group is not None and dist.get_world_size(group=mp_group) > 1: dist.all_reduce(output, group=mp_group, async_op=async_op) @@ -150,10 +147,13 @@ def __init__(self, config, q_scales=None, q_groups=1, merge_count=1, mlp_extra_g self.q_groups = q_groups * 2 if mlp_extra_grouping else q_groups self.merge_count = int(math.log2(merge_count)) self.mp_group = mp_group + self.gelu_gemm_func = GELUGemmOp(self.config) + self.vector_matmul_func = VectorMatMulOp(self.config) def forward(self, input, async_op=False): return DeepSpeedMLPFunction.apply(input, self.inter_w, self.inter_b, self.config, self.output_b, self.output_w, - self.q_scales, self.q_groups, self.merge_count, self.mp_group, async_op) + self.q_scales, self.q_groups, self.merge_count, self.mp_group, async_op, + self.gelu_gemm_func, self.vector_matmul_func) class DeepSpeedMoEInference(nn.Module): @@ -187,18 +187,7 @@ def __init__(self, self.config = config self.config.layer_id = DeepSpeedMoEInference.layer_id - global inference_module - global specialized_mode - if inference_module is None: - specialized_mode = False - # InferenceSpecializedBuilder is not among DeepSpeed provided builder yet, so we infer by builder name 
string - builder = get_accelerator().create_op_builder("InferenceSpecializedBuilder") - if builder is not None and builder.is_compatible(): - inference_module = builder.load() - specialized_mode = True - else: - inference_module = InferenceBuilder().load() - self.config.specialized_mode = specialized_mode + assert self.config.dtype != torch.bfloat16, "DeepSpeed MoE Transformer Inference not yet tested for bfloat support" DeepSpeedMoEInference.layer_id += 1 @@ -213,10 +202,8 @@ def __init__(self, self.res_mlp = DeepSpeedMoEMLP(config, quantize_scales, quantize_groups, merge_count, mlp_extra_grouping, mp_group) self.res_coef = nn.Parameter(torch.Tensor(self.config.hidden_size, 2)) - self.coef_func = inference_module.softmax_fp16 if self.config.dtype in [torch.float16, torch.int8] else \ - inference_module.softmax_fp32 - self.vector_matmul_func = inference_module.vector_matmul_fp16 if self.config.dtype == torch.float16 else \ - inference_module.vector_matmul_fp32 + self.coef_func = SoftmaxOp(self.config) + self.vector_matmul_func = VectorMatMulOp(self.config) config.mp_size = 1 self.mlp = nn.ModuleList( @@ -234,12 +221,10 @@ def __init__(self, print("DeepSpeed MoE Transformer Inference config is ", self.config.__dict__) - self.bias_residual_func = inference_module.bias_residual_fp16 if self.config.dtype in [torch.float16, torch.int8] else \ - inference_module.bias_residual_fp32 - self.ds_layernorm = inference_module.layer_norm_fp16 if self.config.dtype in [torch.float16, torch.int8] else \ - inference_module.layer_norm_fp32 - self.einsum_sec_sm_ecm = inference_module.einsum_sec_sm_ecm_fp16 if self.config.dtype in [torch.float16, torch.int8] else \ - inference_module.einsum_sec_sm_ecm_fp32 + self.bias_residual_func = BiasResidualOp(self.config) + self.ds_layernorm = LayerNormOp(self.config) + self.einsum_sec_sm_ecm = EinsumSecSmEcmOp(self.config) + self.moe_res_matmul = MoEResMatmulOp(self.config) def res_coef_func(self, inp, async_op): inp = self.vector_matmul_func(inp, self.res_coef, async_op) @@ -346,7 +331,7 @@ def forward(self, dim=0)[dist.get_rank(group=self.expert_mp_group)] if self.config.mlp_type == 'residual': - inference_module.moe_res_matmul(res_mlp_out, res_coef_out, output) + self.moe_res_matmul(res_mlp_out, res_coef_out, output) output = self.bias_residual_func(output, residual_add, torch.empty(1)) diff --git a/deepspeed/ops/transformer/inference/op_binding/bias_add.py b/deepspeed/ops/transformer/inference/op_binding/bias_add.py new file mode 100644 index 0000000000000..d2ae38f546eb4 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/bias_add.py @@ -0,0 +1,31 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class BiasAddOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig): + super(BiasAddOp, self).__init__(config) + + try: + if self.config.dtype == torch.float16: + self.bias_add_func = self.inference_module.bias_add_fp16 + elif self.config.dtype == torch.bfloat16: + self.bias_add_func = self.inference_module.bias_add_bf16 + else: + self.bias_add_func = self.inference_module.bias_add_fp32 + except AttributeError: + self.bias_add_func = self.bias_add_fallback + + @classmethod + def bias_add_fallback(cls, input, bias): + return torch.add(input, bias) + + def forward(self, activation: torch.Tensor, bias: torch.Tensor): + return self.bias_add_func(activation, bias) diff --git a/deepspeed/ops/transformer/inference/op_binding/bias_gelu.py b/deepspeed/ops/transformer/inference/op_binding/bias_gelu.py new file mode 100644 index 0000000000000..f0fee0b0d06ea --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/bias_gelu.py @@ -0,0 +1,33 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import torch.nn.functional as F +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class BiasGeluOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig): + super(BiasGeluOp, self).__init__(config) + + try: + if self.config.dtype == torch.float16: + self.bias_gelu_func = self.inference_module.bias_gelu_fp16 + elif self.config.dtype == torch.bfloat16: + self.bias_gelu_func = self.inference_module.bias_gelu_bf16 + else: + self.bias_gelu_func = self.inference_module.bias_gelu_fp32 + except AttributeError: + self.bias_gelu_func = self.bias_gelu_fallback + + @classmethod + def bias_gelu_fallback(cls, activations, bias): + # Expected behavior is that of casting to float32 internally and using the tanh approximation + return F.gelu(activations.to(torch.float32) + bias.to(torch.float32), approximate='tanh').to(activations.dtype) + + def forward(self, activation: torch.Tensor, bias: torch.Tensor): + return self.bias_gelu_func(activation, bias) diff --git a/deepspeed/ops/transformer/inference/op_binding/bias_relu.py b/deepspeed/ops/transformer/inference/op_binding/bias_relu.py new file mode 100644 index 0000000000000..ccfade1d95245 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/bias_relu.py @@ -0,0 +1,33 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import torch.nn.functional as F +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class BiasReluOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig): + super(BiasReluOp, self).__init__(config) + + try: + if self.config.dtype == torch.float16: + self.bias_relu_func = self.inference_module.bias_relu_fp16 + elif self.config.dtype == torch.bfloat16: + self.bias_relu_func = self.inference_module.bias_relu_bf16 + else: + self.bias_relu_func = self.inference_module.bias_relu_fp32 + except AttributeError: + self.bias_relu_func = self.bias_relu_fallback + + @classmethod + def bias_relu_fallback(cls, activations, bias): + # Expected behavior is that of casting to float32 internally + return F.relu(activations.to(torch.float32) + bias.to(torch.float32)).to(activations.dtype) + + def forward(self, activation: torch.Tensor, bias: torch.Tensor): + return self.bias_relu_func(activation, bias) diff --git a/deepspeed/ops/transformer/inference/op_binding/bias_residual.py b/deepspeed/ops/transformer/inference/op_binding/bias_residual.py new file mode 100644 index 0000000000000..ecad50e10ffef --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/bias_residual.py @@ -0,0 +1,29 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class BiasResidualOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig): + super(BiasResidualOp, self).__init__(config) + + try: + if self.config.dtype in [torch.float16, torch.int8]: + self.bias_residual_func = self.inference_module.bias_residual_fp16 + else: + self.bias_residual_func = self.inference_module.bias_residual_fp32 + except AttributeError: + self.bias_residual_func = self.bias_residual_fallback + + @classmethod + def bias_residual_fallback(cls, output, residual, bias): + raise NotImplementedError("bias residual fallback isn't implemented") + + def forward(self, output, residual, bias): + return self.bias_residual_func(output, residual, bias) diff --git a/deepspeed/ops/transformer/inference/op_binding/einsum_sec_sm_ecm.py b/deepspeed/ops/transformer/inference/op_binding/einsum_sec_sm_ecm.py new file mode 100644 index 0000000000000..f34b10f786d11 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/einsum_sec_sm_ecm.py @@ -0,0 +1,29 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class EinsumSecSmEcmOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig): + super(EinsumSecSmEcmOp, self).__init__(config) + + try: + if self.config.dtype in [torch.float16, torch.int8]: + self.einsum_sec_sm_ecm_func = self.inference_module.einsum_sec_sm_ecm_fp16 + else: + self.einsum_sec_sm_ecm_func = self.inference_module.einsum_sec_sm_ecm_fp32 + except AttributeError: + self.einsum_sec_sm_ecm_func = self.einsum_sec_sm_ecm_fallback + + @classmethod + def einsum_sec_sm_ecm_fallback(cls, Q, W): + raise NotImplementedError("einsum sec sm ecm fallback isn't implemented") + + def forward(self, Q, W): + return self.einsum_sec_sm_ecm_func(Q, W) diff --git a/deepspeed/ops/transformer/inference/op_binding/gated_activation.py b/deepspeed/ops/transformer/inference/op_binding/gated_activation.py new file mode 100644 index 0000000000000..d28d818ce4b33 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/gated_activation.py @@ -0,0 +1,40 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import torch.nn.functional as F +from deepspeed.utils.types import ActivationFuncType +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class GatedActivationOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + if config is None: + config = DeepSpeedInferenceConfig() + super(GatedActivationOp, self).__init__(config) + try: + self.gated_activation_func = self.inference_module.gated_activation + except AttributeError: + self.gated_activation_func = self.gated_activation_fallback + + @classmethod + def gated_activation_fallback(cls, activation, bias, activation_func_type): + # Expected behavior is that of casting to float32 internally + # Explicitly using the default GeLU + activation_func = None + activations = activation + bias.reshape(1, 1, -1) + hidden_states, gate = activations.chunk(2, dim=-1) + + if activation_func_type == ActivationFuncType.GATED_SILU: + activation_func = F.silu + elif activation_func_type == ActivationFuncType.GATED_GELU: + activation_func = F.gelu + + return hidden_states * activation_func(gate.to(torch.float32)).to(activations.dtype) + + def forward(self, activation: torch.Tensor, bias: torch.Tensor, activation_func_type: ActivationFuncType): + return self.gated_activation_func(activation, bias, activation_func_type) diff --git a/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py b/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py index 63323c150752b..60bbb4b48bdb3 100644 --- a/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py +++ b/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py @@ -4,6 +4,7 @@ # DeepSpeed Team import torch +import torch.nn.functional as F from ..config import DeepSpeedInferenceConfig from .base import BaseOp import deepspeed @@ -14,7 +15,9 @@ class GELUGemmOp(BaseOp): def __init__(self, config: DeepSpeedInferenceConfig): super(GELUGemmOp, self).__init__(config) try: - if self.config.dtype in [torch.float16, torch.int8]: + if self.config.dtype == torch.int8: + self.fused_gemm_gelu = self.inference_module.fused_gemm_gelu_int8 + elif self.config.dtype == torch.float16: if deepspeed.HAS_TRITON and self.config.use_triton and self.config.dtype == torch.float16: from deepspeed.ops.transformer.inference.triton.ops import fused_gemm_gelu as 
_triton_fused_gemm_gelu self.fused_gemm_gelu = _triton_fused_gemm_gelu # type: ignore @@ -28,7 +31,11 @@ def __init__(self, config: DeepSpeedInferenceConfig): self.fused_gemm_gelu = self.gelu_gemm_fallback def gelu_gemm_fallback(self, input, weight, scale, bias, out, out_scale, dtype, transpose): - raise NotImplementedError + tmp = torch.matmul(input, weight) + tmp = F.gelu(tmp.to(torch.float32) + bias.to(torch.float32), approximate="tanh").to(tmp.dtype) + output = torch.matmul(tmp, out) + + return output def forward(self, input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, weight_out: torch.Tensor): diff --git a/deepspeed/ops/transformer/inference/op_binding/layer_norm.py b/deepspeed/ops/transformer/inference/op_binding/layer_norm.py new file mode 100644 index 0000000000000..31219a58ac3c8 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/layer_norm.py @@ -0,0 +1,60 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import torch.nn.functional as F +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class LayerNormOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + super(LayerNormOp, self).__init__(config) + try: + if config is None: + self.layer_norm_func = self.inference_module.layer_norm + elif self.config.dtype in [torch.float16, torch.int8]: + self.layer_norm_func = self.inference_module.layer_norm_fp16 + else: + self.layer_norm_func = self.inference_module.layer_norm_fp32 + except AttributeError: + self.layer_norm_func = self.layer_norm_fallback + + @classmethod + def layer_norm_residual(cls, vals, bias, res, gamma, beta, epsilon): + channels = gamma.shape[0] + dtype = gamma.dtype + vals_f = vals.to(torch.float32) + bias_f = bias.to(torch.float32).reshape(1, 1, -1) + res_f = res.to(torch.float32) + gamma_f = gamma.to(torch.float32) + beta_f = beta.to(torch.float32) + return F.layer_norm(vals_f + bias_f + res_f, (channels, ), weight=gamma_f, bias=beta_f, eps=epsilon).to(dtype) + + @classmethod + def layer_norm_residual_store_pre_ln_res(cls, vals, bias, res, gamma, beta, epsilon): + channels = gamma.shape[0] + dtype = gamma.dtype + vals_f = vals.to(torch.float32) + bias_f = bias.to(torch.float32).reshape(1, 1, -1) + res_f = res.to(torch.float32) + gamma_f = gamma.to(torch.float32) + beta_f = beta.to(torch.float32) + res_output = vals_f + bias_f + res_f + norm_output = F.layer_norm(res_output, (channels, ), weight=gamma_f, bias=beta_f, eps=epsilon).to(dtype) + return norm_output, res_output.to(dtype) + + @classmethod + def layer_norm_fallback(cls, vals, gamma, beta, epsilon): + channels = gamma.shape[0] + dtype = gamma.dtype + vals_f = vals.to(torch.float32) + gamma_f = gamma.to(torch.float32) + beta_f = beta.to(torch.float32) + return F.layer_norm(vals_f, (channels, ), weight=gamma_f, bias=beta_f, eps=epsilon).to(dtype) + + def forward(self, vals, gamma, beta, epsilon): + return self.layer_norm_func(vals, gamma, beta, epsilon) diff --git a/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py b/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py index 3064c00d1755d..5f1f915ec0216 100644 --- a/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py +++ b/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py @@ -5,12 +5,12 @@ from typing import Optional -import os import torch import torch.nn.functional as F from ..config import DeepSpeedInferenceConfig from .base import BaseOp from deepspeed.utils.types import NormType 
+from .pre_rms_norm import PreRMSNormOp class MLPGemmOp(BaseOp): @@ -39,23 +39,45 @@ def __init__(self, config: DeepSpeedInferenceConfig): self.mlp_gemm_func = self.mlp_gemm_fallback elif self.config.norm_type == NormType.RMSNorm: self.mlp_gemm_func = self.rms_mlp_gemm_fallback + self.pre_rms_norm = PreRMSNormOp() def mlp_gemm_fallback(self, input, residual, input_bias, weight_interm, weight_out, bias, gamma, beta, eps, pre_layer_norm, mlp_after_attn, interm_scale, out_scale, dtype, mlp_act_func_type, transpose): - if os.environ.get('DS_KI_FALLBACK') == 'True' and mlp_after_attn and not transpose: - residual_add = F.layer_norm(input + residual + input_bias, (input.shape[2], ), gamma, beta, - self.config.epsilon) - tmp = torch.matmul(residual_add, weight_interm) + if mlp_after_attn: + residual_add = F.layer_norm(input + residual + input_bias, (input.shape[2], ), gamma, beta, eps) + tmp = torch.matmul(residual_add, weight_interm.t() if transpose else weight_interm) tmp = F.gelu(tmp + bias) - output = torch.matmul(tmp, weight_out) - return (output, residual_add) + output = torch.matmul(tmp, weight_out.t() if transpose else weight_out) + + return output, residual_add else: raise NotImplementedError def rms_mlp_gemm_fallback(self, input, residual, weight_interm, weight_out, gamma, eps, interm_scale, out_scale, dtype, mlp_act_func_type, transpose): - raise NotImplementedError + inp_norm, residual = self.pre_rms_norm(input, residual, gamma, eps) + tmp = torch.matmul(inp_norm.view([-1, inp_norm.size(2)]), weight_interm.t() if transpose else weight_interm) + up_proj, gate_proj = tmp.chunk(2, dim=1) + + from deepspeed.utils.types import ActivationFuncType + if mlp_act_func_type == ActivationFuncType.GELU: + intermediate = F.gelu(gate_proj) + elif mlp_act_func_type == ActivationFuncType.ReLU: + intermediate = F.relu(gate_proj) + elif mlp_act_func_type == ActivationFuncType.GATED_GELU: + intermediate = F.gelu(gate_proj) + elif mlp_act_func_type == ActivationFuncType.GATED_SILU: + intermediate = F.silu(gate_proj) + else: + raise f"rms_mlp_gemm_fallback not implemented for activation type {mlp_act_func_type}" + + intermediate = intermediate * up_proj + + output = torch.matmul(intermediate, weight_out.t() if transpose else weight_out) + output = output.view([input.size(0), input.size(1), -1]) + + return [output, residual] def forward(self, input: torch.Tensor, diff --git a/deepspeed/ops/transformer/inference/op_binding/moe_res_matmul.py b/deepspeed/ops/transformer/inference/op_binding/moe_res_matmul.py new file mode 100644 index 0000000000000..ef3558c8bc889 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/moe_res_matmul.py @@ -0,0 +1,29 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class MoEResMatmulOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + if config is None: + config = DeepSpeedInferenceConfig() + super(MoEResMatmulOp, self).__init__(config) + try: + self.moe_res_matmul_func = self.inference_module.moe_res_matmul + except AttributeError: + self.moe_res_matmul_func = self.moe_res_matmul_fallback + + @classmethod + def moe_res_matmul_fallback(cls, residual, coef, output): + coef_t = coef.transpose(1, 2).contiguous() + coef1, coef2 = torch.split(coef_t, split_size_or_sections=coef_t.shape[len(coef_t.shape) - 1] // 2, dim=-1) + return residual * coef1 + output * coef2 + + def forward(self, residual, coef, output): + return self.moe_res_matmul_func(residual, coef, output) diff --git a/deepspeed/ops/transformer/inference/op_binding/pad_transform.py b/deepspeed/ops/transformer/inference/op_binding/pad_transform.py new file mode 100644 index 0000000000000..876fefc3bcfbd --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/pad_transform.py @@ -0,0 +1,26 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class PadTransformOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + if config is None: + config = DeepSpeedInferenceConfig() + super(PadTransformOp, self).__init__(config) + try: + self.pad_transform_func = self.inference_module.pad_transform_fp16 + except AttributeError: + self.pad_transform_func = self.pad_transform_fallback + + @staticmethod + def pad_transform_fallback(query, key, value, heads, do_flash_attn): + raise NotImplementedError("pad_transform fallback is not implemented.") + + def forward(self, query, key, value, heads, do_flash_attn): + return self.pad_transform_func(query, key, value, heads, do_flash_attn) diff --git a/deepspeed/ops/transformer/inference/op_binding/pre_rms_norm.py b/deepspeed/ops/transformer/inference/op_binding/pre_rms_norm.py new file mode 100644 index 0000000000000..7969d20f0527b --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/pre_rms_norm.py @@ -0,0 +1,31 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp +from .rms_norm import RMSNormOp + + +class PreRMSNormOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + if config is None: + config = DeepSpeedInferenceConfig() + super(PreRMSNormOp, self).__init__(config) + try: + self.pre_rms_norm_func = self.inference_module.pre_rms_norm + except AttributeError: + self.pre_rms_norm_func = self.pre_rms_norm_fallback + + @staticmethod + def pre_rms_norm_fallback(vals, residual, gamma, epsilon): + residual = vals.to(torch.float32) + residual.to(torch.float32) + vals = residual + + return RMSNormOp.rms_norm_fallback(vals, gamma, epsilon), residual.to(gamma.dtype) + + def forward(self, vals, residual, gamma, epsilon): + return self.pre_rms_norm_func(vals, residual, gamma, epsilon) diff --git a/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py b/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py index 250bf9864e1e7..9ff5366fae5d6 100644 --- a/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py +++ b/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py @@ -3,11 +3,11 @@ # DeepSpeed Team -import os import torch import torch.nn.functional as F from ..config import DeepSpeedInferenceConfig from .base import BaseOp +from .rms_norm import RMSNormOp import deepspeed from deepspeed.utils.types import NormType @@ -56,19 +56,23 @@ def _triton_autotune(min_seqlen, max_seqlen, hidden_size, dtype=torch.float16): matmul(A, B) Fp16Matmul._update_autotune_table() - def qkv_gemm_fallback(self, input, weight, q_scale, bias, gamma, beta, eps, add_bias, q_int8, transpose): - if os.environ.get('DS_KI_FALLBACK') == 'True' and not transpose: - inp_norm = F.layer_norm(input, (input.shape[2], ), gamma, beta, eps) - tmp = torch.matmul(inp_norm, weight) - if add_bias: - tmp += bias - output = [tmp, inp_norm] - return output - else: - raise NotImplementedError + @staticmethod + def qkv_gemm_fallback(input, weight, q_scale, bias, gamma, beta, eps, add_bias, q_int8, transpose): + inp_norm = F.layer_norm(input, (input.shape[2], ), gamma, beta, eps) + tmp = torch.matmul(inp_norm, weight.t() if transpose else weight) + if add_bias: + tmp += bias + output = [tmp, inp_norm] + + return output + + @staticmethod + def rms_qkv_gemm_fallback(input, weight, q_scale, gamma, eps, q_int8, transpose): + inp_norm = RMSNormOp.rms_norm_fallback(input, gamma, eps) + tmp = torch.matmul(inp_norm, weight.t() if transpose else weight) + output = [tmp, inp_norm] - def rms_qkv_gemm_fallback(self, input, weight, q_scale, gamma, eps, q_int8, transpose): - raise NotImplementedError + return output def forward(self, input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, gamma: torch.Tensor, beta: torch.Tensor): diff --git a/deepspeed/ops/transformer/inference/op_binding/residual_add.py b/deepspeed/ops/transformer/inference/op_binding/residual_add.py index 6f9b35cbc05d7..93b229c5d1ac0 100644 --- a/deepspeed/ops/transformer/inference/op_binding/residual_add.py +++ b/deepspeed/ops/transformer/inference/op_binding/residual_add.py @@ -3,9 +3,10 @@ # DeepSpeed Team -import os import torch from typing import Optional + +from .vector_add import VectorAddOp from ..config import DeepSpeedInferenceConfig from .base import BaseOp @@ -22,11 +23,32 @@ def __init__(self, config: DeepSpeedInferenceConfig): else: self.residual_add_func = self.inference_module.residual_add_bias_fp32 except AttributeError: - 
self.residual_add_func = None - try: - self._vector_add = self.inference_module._vector_add - except AttributeError: - self._vector_add = None + self.residual_add_func = self.residual_add_fallback + self.vector_add = VectorAddOp() + + @staticmethod + def res_add_bias(hidden_state, residual, attn_output, attn_bias, final_bias, add_attn_bias, mp_size): + hidden_state += attn_output + (residual + final_bias) / mp_size + if add_attn_bias: + hidden_state += attn_bias / mp_size + + return hidden_state + + @staticmethod + def residual_add_fallback(hidden_state, residual, attention_output, attention_bias, final_bias, mp_size, + mlp_after_attn, add_bias, pre_layer_norm): + if mlp_after_attn: + if pre_layer_norm: + tmp = (residual.float() + attention_output.float() + attention_bias.float() + + final_bias.float()) / mp_size + hidden_state.float() + else: + tmp = residual.float() + hidden_state.float() + final_bias.float() + else: + tmp = ResidualAddOp.res_add_bias(hidden_state, residual, attention_output, attention_bias, final_bias, + add_bias, mp_size) + residual.copy_(tmp.to(hidden_state.dtype)) + + return residual def forward(self, hidden_state: torch.Tensor, @@ -37,28 +59,15 @@ def forward(self, attention_bias: Optional[torch.Tensor] = None, final_bias: Optional[torch.Tensor] = None): - if self.residual_add_func is not None: - if final_bias is None: - residual = self._vector_add(residual, hidden_state, 1.0 / self.config.mp_size) - else: - if not self.config.pre_layer_norm and residual_add is not None: - # only use residual add if its set and we are not pre layer norm - residual = residual_add - - self.residual_add_func(hidden_state, residual, attention_output, attention_bias, final_bias, - self.config.mp_size, self.config.mlp_after_attn, add_bias, - self.config.pre_layer_norm) + if final_bias is None and attention_bias is None: + residual = self.vector_add(residual + attention_output, hidden_state, 1.0 / self.config.mp_size) else: - # fallback - if os.environ.get('DS_KI_FALLBACK') == 'True' and self.config.mlp_after_attn: - if self.config.pre_layer_norm: - tmp = (residual.float() + attention_output.float() + attention_bias.float() + - final_bias.float()) / self.config.mp_size + hidden_state.float() - else: - tmp = residual.float() + hidden_state.float() + final_bias.float() + if not self.config.pre_layer_norm and residual_add is not None: + # only use residual add if its set and we are not pre layer norm + residual = residual_add + + self.residual_add_func(hidden_state, residual, attention_output, attention_bias, final_bias, + self.config.mp_size, self.config.mlp_after_attn, add_bias, + self.config.pre_layer_norm) - input_dtype = hidden_state.dtype - residual = tmp.to(input_dtype) - else: - raise NotImplementedError return residual diff --git a/deepspeed/ops/transformer/inference/op_binding/rms_norm.py b/deepspeed/ops/transformer/inference/op_binding/rms_norm.py new file mode 100644 index 0000000000000..128883ce5d438 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/rms_norm.py @@ -0,0 +1,33 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class RMSNormOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + if config is None: + config = DeepSpeedInferenceConfig() + super(RMSNormOp, self).__init__(config) + try: + self.rms_norm_func = self.inference_module.rms_norm + except AttributeError: + self.rms_norm_func = self.rms_norm_fallback + + @staticmethod + def rms_norm_fallback(vals, gamma, epsilon): + variance = vals.to(torch.float32).pow(2).mean(-1, keepdim=True) + vals = vals * torch.rsqrt(variance + epsilon) + + if gamma.dtype in [torch.float16, torch.bfloat16]: + vals = vals.to(gamma.dtype) + + return gamma * vals + + def forward(self, vals, gamma, epsilon): + return self.rms_norm_func(vals, gamma, epsilon) diff --git a/deepspeed/ops/transformer/inference/op_binding/softmax.py b/deepspeed/ops/transformer/inference/op_binding/softmax.py index bc309d94df147..2e08541596fa8 100644 --- a/deepspeed/ops/transformer/inference/op_binding/softmax.py +++ b/deepspeed/ops/transformer/inference/op_binding/softmax.py @@ -3,11 +3,11 @@ # DeepSpeed Team -import os import torch import torch.nn.functional as F from ..config import DeepSpeedInferenceConfig from .base import BaseOp +from deepspeed.ops.transformer.inference.op_binding.workspace import InferenceContext class SoftmaxOp(BaseOp): @@ -25,24 +25,42 @@ def __init__(self, config: DeepSpeedInferenceConfig): except AttributeError: self.softmax_func = self.softmax_fallback - def softmax_fallback(self, attn_scores, attn_mask, alibi, triangular, recompute, local_attention, window_size, - async_op, layer_scale, head_offset, mp_size): - if os.environ.get('DS_KI_FALLBACK') == 'True': - alibi = alibi[head_offset:head_offset + self.num_attention_heads_per_partition] - input_dtype = attn_scores.dtype - if (triangular): - tri = ~torch.tril(torch.ones(attn_scores.size(), device=attn_scores.device)).to(bool) - attn_scores = torch.masked_fill(attn_scores * layer_scale, tri, torch.finfo(input_dtype).min) - if alibi is not None: - attn_scores += alibi - if attn_mask is not None: - # expand atten_mask from two dim into 4 dim, insert two dims in the middle + @staticmethod + def softmax_fallback(attn_scores, attn_mask, alibi, triangular, recompute, local_attention, window_size, async_op, + layer_scale, head_offset, mp_size): + scores_len = len(attn_scores.size()) + heads = 1 + if scores_len > 1: + heads = attn_scores.size()[1] + num_attention_heads_per_partition = heads // mp_size + + if alibi is not None: + if len(alibi.shape) == 1: + alibi = None + else: + alibi = alibi[head_offset:head_offset + num_attention_heads_per_partition] + if attn_mask is not None and len(attn_mask.shape) == 1: + attn_mask = None + input_dtype = attn_scores.dtype + attn_scores *= layer_scale + + if alibi is not None: + attn_scores += alibi + if attn_mask is not None: + # expand atten_mask from two dim into 4 dim, insert two dims in the middle + if len(attn_mask.shape) == 2: attn_mask = attn_mask[:, None, None, :] - attn_scores += attn_mask - output = F.softmax(attn_scores, dim=-1, dtype=torch.float32).to(input_dtype) - return output - else: - raise NotImplementedError + attn_scores += attn_mask + if triangular: + if attn_scores.shape[2] == 1: # query using kv cache + token_idx = InferenceContext.Instance().current_tokens() + tri = torch.arange(attn_scores.shape[2], device=attn_scores.device).ge(token_idx) + else: + tri = 
~torch.tril(torch.ones(attn_scores.size(), device=attn_scores.device)).to(bool) + attn_scores = torch.masked_fill(attn_scores, tri, float('-inf')) + output = F.softmax(attn_scores, dim=-1, dtype=torch.float32).to(input_dtype) + + return output def forward(self, attn_scores: torch.Tensor, attn_mask: torch.Tensor, alibi: torch.Tensor, triangular: bool, recompute: bool, local_attention: bool, window_size: int, async_op: bool, layer_scale: float, diff --git a/deepspeed/ops/transformer/inference/op_binding/softmax_context.py b/deepspeed/ops/transformer/inference/op_binding/softmax_context.py index 0dc4e08a36335..d745df678e93b 100644 --- a/deepspeed/ops/transformer/inference/op_binding/softmax_context.py +++ b/deepspeed/ops/transformer/inference/op_binding/softmax_context.py @@ -7,6 +7,8 @@ from deepspeed import comm as dist from ..config import DeepSpeedInferenceConfig from .base import BaseOp +from .softmax import SoftmaxOp +from deepspeed.ops.transformer.inference.op_binding.workspace import InferenceContext class SoftmaxContextOp(BaseOp): @@ -23,13 +25,108 @@ def __init__(self, config: DeepSpeedInferenceConfig): except AttributeError: self.softmax_context_func = self.softmax_context_fallback + @staticmethod + def transform4d_0213(x, seq_length): + assert x.dim() == 3, F"Dim {x.dim()} is not supported" + batch_size, num_heads, seq_length_head_dim = x.shape + head_dim = seq_length_head_dim // seq_length + x = x.view(batch_size, num_heads, seq_length, head_dim) + x = x.permute(0, 2, 1, 3) + + return x + + @staticmethod + def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep <= 1 or num_key_value_heads == 1: + return hidden_states + + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + @staticmethod + def bias_add_transform_0213(input, bias, num_heads, trans_count, perform_bias=False): + assert trans_count == 1 or trans_count == 3, F"Trans count {trans_count} is not supported" + assert input.dim() == 3, F"Dim {input.dim()} is not supported" + input_biased = torch.add(input, bias) if perform_bias else input + batch_size, seq_length, value_size = input_biased.shape + hid_dim = value_size // trans_count + head_dim = hid_dim // num_heads + + if trans_count == 1: + query_layer = input.view(batch_size, seq_length, num_heads, head_dim) + query_layer = query_layer.permute(0, 2, 1, 3) + key_layer = torch.zeros_like(query_layer) + value_layer = torch.zeros_like(query_layer) + return query_layer, key_layer, value_layer + + qkv_layers = input.view(batch_size, seq_length, 3, num_heads, head_dim) + query_layer, key_layer, value_layer = qkv_layers[..., 0, :, :], qkv_layers[..., 1, :, :], qkv_layers[..., + 2, :, :] + query_layer = query_layer.transpose(1, 2) + key_layer = key_layer.transpose(1, 2) + value_layer = value_layer.transpose(1, 2) + + return query_layer, key_layer, value_layer + def softmax_context_fallback(self, query_key_value, attn_mask, rotary_dim, rotate_half, rotate_every_two, heads, num_kv, norm_factor, triangular_masking, local_attention, window_size, no_masking, - layer_id, num_layers, alibi, rope_theta): - raise NotImplementedError + layer_id, num_layers, alibi, rope_theta, is_prompt, token_idx, position_ids): + bat_0213_query, bat_0213_key, bat_0213_value = self.bias_add_transform_0213( + query_key_value, None, heads, 3, False) + + if rotary_dim > 0 and 
rotate_half: + from transformers.models.llama.modeling_llama import apply_rotary_pos_emb + + rotary = InferenceContext.Instance().get_rotary(rotary_dim, rope_theta, bat_0213_value.device) + cos, sin = rotary(bat_0213_value, InferenceContext.Instance().get_max_tokens_num()) + bat_0213_query, bat_0213_key = apply_rotary_pos_emb(bat_0213_query, bat_0213_key, cos, sin, position_ids) + + bat_0213_key, bat_0213_value = InferenceContext.Instance().update_cache(layer_id, token_idx, is_prompt, + bat_0213_key, bat_0213_value) + + bat_0213_key = self.repeat_kv(bat_0213_key, num_kv) + bat_0213_value = self.repeat_kv(bat_0213_value, num_kv) + + bsz = query_key_value.shape[0] + head_dim = query_key_value.shape[2] // (heads * 3) + + bmm_output = torch.bmm(bat_0213_query.reshape(bsz * heads, bat_0213_query.shape[2], head_dim), + bat_0213_key.reshape(bsz * heads, bat_0213_key.shape[2], head_dim).transpose(1, 2)) + + layer_scale = 1.0 + if alibi is not None and len(alibi.shape) > 1: + layer_scale = max(1, layer_id).to(float) + + alpha = norm_factor * norm_factor / layer_scale + bmm_output *= alpha + bmm_output_reshape = bmm_output.reshape(bsz, heads, bmm_output.shape[1], bmm_output.shape[2]) + + recompute = is_prompt + if attn_mask is not None and len(attn_mask.shape) > 1 and attn_mask.shape[-1] < bmm_output_reshape.shape[3]: + attn_mask = torch.nn.functional.pad(attn_mask, (0, bmm_output_reshape.shape[3] - attn_mask.shape[-1]), + value=torch.finfo(attn_mask.dtype).min) + softmax_output = SoftmaxOp.softmax_fallback(bmm_output_reshape, attn_mask, alibi, triangular_masking, + recompute, local_attention, window_size, None, layer_scale, 0, 1) + + output = torch.bmm(softmax_output.reshape(bsz * heads, softmax_output.shape[2], softmax_output.shape[3]), + bat_0213_value.reshape(bsz * heads, bat_0213_value.shape[2], head_dim)) + + output = output.reshape(bsz, heads, output.shape[1], head_dim) + output = output.reshape(bsz, heads, output.shape[2] * head_dim) + input_seq_len = query_key_value.shape[1] + t4d_0123_output = self.transform4d_0213(output, input_seq_len) + t4d_0123_output = t4d_0123_output.reshape(bsz, t4d_0123_output.shape[1], heads * head_dim) + + if layer_id == num_layers - 1: + InferenceContext.Instance().advance_tokens() + + return t4d_0123_output, bat_0213_key, bat_0213_value def forward(self, query_key_value: torch.Tensor, attn_mask: torch.Tensor, heads: int, num_kv: int, - norm_factor: float, no_masking: bool, layer_id: int, num_layers: int, alibi: torch.Tensor): + norm_factor: float, no_masking: bool, layer_id: int, num_layers: int, alibi: torch.Tensor, + is_prompt: bool, token_idx: torch.Tensor, position_ids: torch.Tensor): if alibi is not None: batch_heads = query_key_value.shape[0] * heads @@ -42,6 +139,6 @@ def forward(self, query_key_value: torch.Tensor, attn_mask: torch.Tensor, heads: self.config.rotate_every_two, heads, num_kv, norm_factor, self.config.triangular_masking, self.config.local_attention, self.config.window_size, no_masking, layer_id, num_layers, alibi, - self.config.rope_theta) + self.config.rope_theta, is_prompt, token_idx, position_ids) return output diff --git a/deepspeed/ops/transformer/inference/op_binding/vector_add.py b/deepspeed/ops/transformer/inference/op_binding/vector_add.py new file mode 100644 index 0000000000000..015340a1084b5 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/vector_add.py @@ -0,0 +1,28 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class VectorAddOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + if config is None: + config = DeepSpeedInferenceConfig() + super(VectorAddOp, self).__init__(config) + try: + self.vector_add_func = self.inference_module._vector_add + except AttributeError: + self.vector_add_func = self.vector_add_fallback + + @classmethod + def vector_add_fallback(cls, a, b, gamma): + """Based on csrc/transformer/inference/csrc/pt_binding.cpp code of _vector_add""" + dtype = a.dtype + return (gamma * a.float() + b.float()).to(dtype) + + def forward(self, a, b, gamma): + return self.vector_add_func(a, b, gamma) diff --git a/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py b/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py index 011be859634d5..cabab8d8c4ab1 100644 --- a/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py +++ b/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py @@ -3,7 +3,6 @@ # DeepSpeed Team -import os import torch from ..config import DeepSpeedInferenceConfig from .base import BaseOp @@ -25,7 +24,7 @@ def __init__(self, config: DeepSpeedInferenceConfig): else: self.vector_matmul_func = self.inference_module.vector_matmul_fp16 elif self.config.dtype == torch.int8: - self.vector_matmul_func = self.inference_module.vector_matmul_fp16 + self.vector_matmul_func = self.inference_module.vector_matmul_int8 elif self.config.dtype == torch.bfloat16: self.vector_matmul_func = self.inference_module.vector_matmul_bf16 else: @@ -34,10 +33,7 @@ def __init__(self, config: DeepSpeedInferenceConfig): self.vector_matmul_func = self.vector_matmul_fallback def vector_matmul_fallback(self, input, weight, async_op, q_scale, q_int8, transpose): - if os.environ.get('DS_KI_FALLBACK') == 'True' and not transpose: - return torch.matmul(input, weight) - else: - raise NotImplementedError + return torch.matmul(input, weight.t() if transpose else weight) def forward(self, input: torch.Tensor, weight: torch.Tensor, async_op: bool = False): q_scale = weight.scale if hasattr(weight, 'scale') else torch.empty(1) diff --git a/deepspeed/ops/transformer/inference/op_binding/workspace.py b/deepspeed/ops/transformer/inference/op_binding/workspace.py new file mode 100644 index 0000000000000..19de7d9576afe --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/workspace.py @@ -0,0 +1,222 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + +minus_inf = -10000.0 +key_idx = 0 +value_idx = 1 + + +class InferenceContext: + + __instance = None + + def __init__(self): + self.kv_cache = None + self.kv_cache_elem_dtype = None + self.num_tokens = 1 + self.kv_cache_num_layers = None + self.kv_cache_size = None + self.max_out_tokens = None + self.rotary = None + self.allocate_called = False + self.static_shapes = True + + @classmethod + def Instance(cls): + if InferenceContext.__instance is None: + InferenceContext.__instance = InferenceContext() + return InferenceContext.__instance + + def gen_workspace(self, num_layers, num_heads, batch_size, prompt_len, hidden_dim, mp_size, external_cache, + elem_dtype, rank, max_out_tokens, min_out_tokens): + self.allocate_called = True + self.kv_cache = None + if not external_cache: + self.kv_cache_num_layers = num_layers + self.max_out_tokens = max_out_tokens + head_size = hidden_dim // num_heads + self.kv_cache_size = torch.Size([batch_size, (num_heads // mp_size), max_out_tokens, head_size]) + self.kv_cache_elem_dtype = elem_dtype + self.num_tokens = 0 + self.static_shapes = True + return True + + def retake_workspace(self): + return True + + def _retake_workspace(self): + assert self.allocate_called, "retake workspace called before allocate workspace" + + import deepspeed.accelerator as accelerator + if self.kv_cache is None: + self.kv_cache = [] + for layer in range(self.kv_cache_num_layers): + self.kv_cache.append((torch.zeros(self.kv_cache_size, + dtype=self.kv_cache_elem_dtype, + device=accelerator.get_accelerator().device_name()), + torch.zeros(self.kv_cache_size, + dtype=self.kv_cache_elem_dtype, + device=accelerator.get_accelerator().device_name()))) + + return True + + def update_cache(self, layer_id, token_idx, is_prompt, bat_0213_key, bat_0213_value): + has_workspace = self._retake_workspace() + assert has_workspace, "Could not allocate workspace" + + # Update current token + if is_prompt: + self.static_shapes = True + if token_idx is None: + self.static_shapes = False + InferenceContext.Instance().reset_tokens(bat_0213_key.shape[2]) + else: + InferenceContext.Instance().reset_tokens(token_idx) + + if token_idx is None: + token_idx = InferenceContext.Instance().current_tokens() + + bsz = bat_0213_key.shape[0] + + # Update cache content + if is_prompt: + cache_max_seq = self.kv_cache_size[2] + cache_max_head_dim = self.kv_cache_size[3] + seq = bat_0213_key.shape[2] + + mask = torch.arange(cache_max_seq, device=bat_0213_key.device) + mask = mask.ge(token_idx) + mask = mask.unsqueeze(-1) + mask = mask.expand([cache_max_seq, cache_max_head_dim]) + + self.kv_cache[layer_id][key_idx][:bsz, :, :seq, :].copy_(bat_0213_key) + self.kv_cache[layer_id][key_idx][:bsz, :].masked_fill_(mask, 0) + self.kv_cache[layer_id][value_idx][:bsz, :, :seq, :].copy_(bat_0213_value) + self.kv_cache[layer_id][value_idx][:bsz, :].masked_fill_(mask, 0) + else: + if self.static_shapes: + assert type(token_idx) == torch.Tensor, "token_idx is expected to be torch.Tensor" + self.kv_cache[layer_id][key_idx][:bsz].index_copy_(2, token_idx - 1, bat_0213_key) + self.kv_cache[layer_id][value_idx][:bsz].index_copy_(2, token_idx - 1, bat_0213_value) + else: + assert type(token_idx) == int, "token_idx is expected to be int" + self.kv_cache[layer_id][key_idx][:bsz, :, token_idx - 1:token_idx, :] = bat_0213_key + self.kv_cache[layer_id][value_idx][:bsz, :, token_idx - 1:token_idx, 
:] = bat_0213_value + + bat_0213_key = self.kv_cache[layer_id][key_idx][:bsz] + bat_0213_value = self.kv_cache[layer_id][value_idx][:bsz] + + if not self.static_shapes: + bat_0213_key = bat_0213_key[:, :, :token_idx, :] + bat_0213_value = bat_0213_value[:, :, :token_idx, :] + + return bat_0213_key, bat_0213_value + + def release_workspace(self): + self.kv_cache = None + self.rotary = None + + def reset_tokens(self, initial_tokens=1): + self.num_tokens = initial_tokens + + def current_tokens(self): + return self.num_tokens + + def advance_tokens(self): + self.num_tokens = self.num_tokens + 1 + + def get_kv_cache(self): + return self.kv_cache + + def get_rotary(self, rotary_dim, rope_theta, device=None): + if self.rotary is None: + from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding + + self.rotary = LlamaRotaryEmbedding(rotary_dim, base=rope_theta, device=device) + + return self.rotary + + def get_max_tokens_num(self): + return self.max_out_tokens + + +class WorkspaceOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + if config is None: + config = DeepSpeedInferenceConfig() + self.inference_context = InferenceContext.Instance() + self._is_allocated = False + try: + super(WorkspaceOp, self).__init__(config) + if config.dtype == torch.float32: + self.allocate_workspace_func = self.inference_module.allocate_workspace_fp32 + elif config.dtype == torch.bfloat16: + self.allocate_workspace_func = self.inference_module.allocate_workspace_bf16 + else: + self.allocate_workspace_func = self.inference_module.allocate_workspace_fp16 + self.release_workspace_func = self.inference_module.release_workspace + self.retake_workspace_func = self.inference_module.retake_workspace + self.reset_cache_func = self.inference_module.reset_cache + except (ValueError, AttributeError) as e: + print(f"Using fallback functions in workspace because of {e}") + if config.dtype == torch.float32: + self.allocate_workspace_func = self.allocate_workspace_fp32_fallback + elif config.dtype == torch.bfloat16: + self.allocate_workspace_func = self.allocate_workspace_bf16_fallback + else: + self.allocate_workspace_func = self.allocate_workspace_fp16_fallback + self.release_workspace_func = self.release_workspace_fallback + self.retake_workspace_func = self.retake_workspace_fallback + self.reset_cache_func = self.reset_cache_fallback + + def allocate_workspace(self, *args, **kwargs): + self._is_allocated = True + return self.allocate_workspace_func(*args, **kwargs) + + def release_workspace(self): + self._is_allocated = False + return self.release_workspace_func() + + def reset_cache(self): + return self.reset_cache_func() if self.reset_cache_func else None + + def retake_workspace(self): + return self.retake_workspace_func() if self.retake_workspace_func else None + + def allocate_workspace_fp32_fallback(self, hidden_dim, num_heads, prompt_length, batch_size, num_layers, mp_size, + external_cache, rank, max_out_tokens, min_out_tokens): + return self.inference_context.gen_workspace(num_layers, num_heads, batch_size, prompt_length, hidden_dim, + mp_size, external_cache, torch.float, rank, max_out_tokens, + min_out_tokens) + + def allocate_workspace_bf16_fallback(self, hidden_dim, num_heads, prompt_length, batch_size, num_layers, mp_size, + external_cache, rank, max_out_tokens, min_out_tokens): + return self.inference_context.gen_workspace(num_layers, num_heads, batch_size, prompt_length, hidden_dim, + mp_size, external_cache, torch.bfloat16, rank, max_out_tokens, + min_out_tokens) + + 
def allocate_workspace_fp16_fallback(self, hidden_dim, num_heads, prompt_length, batch_size, num_layers, mp_size, + external_cache, rank, max_out_tokens, min_out_tokens): + return self.inference_context.gen_workspace(num_layers, num_heads, batch_size, prompt_length, hidden_dim, + mp_size, external_cache, torch.half, rank, max_out_tokens, + min_out_tokens) + + def reset_cache_fallback(self): + return self.inference_context.reset_tokens() + + def release_workspace_fallback(self): + return self.inference_context.release_workspace() + + def retake_workspace_fallback(self): + return self.inference_context.retake_workspace() + + def is_allocated(self): + return self._is_allocated diff --git a/deepspeed/ops/transformer/inference/triton/attention.py b/deepspeed/ops/transformer/inference/triton/attention.py index c05370ec74e59..6845d91b06be9 100644 --- a/deepspeed/ops/transformer/inference/triton/attention.py +++ b/deepspeed/ops/transformer/inference/triton/attention.py @@ -125,7 +125,7 @@ def _triton_autotune(min_seqlen, context_4d_matmul(output, qkv, head_size) Fp16Matmul._update_autotune_table() - def ds_compute_attention(self, qkv_out, input_mask, layer_past, alibi): + def ds_compute_attention(self, qkv_out, input_mask, layer_past, alibi, is_prompt, token_idx, position_ids): if isinstance(qkv_out, list): qkv_out = qkv_out[0] @@ -143,7 +143,10 @@ def ds_compute_attention(self, qkv_out, input_mask, layer_past, alibi): no_masking=no_masking, layer_id=self.config.layer_id, num_layers=TritonSelfAttention.num_layers, - alibi=alibi) + alibi=alibi, + is_prompt=is_prompt, + token_idx=token_idx, + position_ids=position_ids) context_layer, key_layer, value_layer = attn_key_value return context_layer, key_layer, value_layer @@ -161,7 +164,8 @@ def forward( norm_w=None, norm_b=None, alibi=None, - use_triton_attention=True): + use_triton_attention=True, + **kwargs): if not self.config.pre_layer_norm: qkv_out = self.linear_func(input=input, @@ -192,10 +196,16 @@ def forward( triangular=self.triangular_masking) key_layer, value_layer = qkv[:, :, self.hidden_size:2 * self.hidden_size], qkv[:, :, 2 * self.hidden_size:] else: + is_prompt = kwargs.get("first_token", qkv_out[0].shape[1] > 1) + token_idx = kwargs.get("token_idx", None) + position_ids = kwargs.get("position_ids", None) context_layer, key_layer, value_layer = self.ds_compute_attention(qkv_out=qkv_out, input_mask=input_mask, layer_past=layer_past, - alibi=alibi) + alibi=alibi, + is_prompt=is_prompt, + toke_idx=token_idx, + position_ids=position_ids) output = self.vector_matmul_func(input=context_layer, weight=self.attn_ow) inp_norm = qkv_out[-1] diff --git a/deepspeed/ops/transformer/inference/triton/ops.py b/deepspeed/ops/transformer/inference/triton/ops.py index dd87d08d4d2c6..dbed453137806 100644 --- a/deepspeed/ops/transformer/inference/triton/ops.py +++ b/deepspeed/ops/transformer/inference/triton/ops.py @@ -3,12 +3,10 @@ # DeepSpeed Team -import deepspeed -from deepspeed.ops.op_builder import InferenceBuilder import deepspeed.ops.transformer.inference.triton.matmul_ext as matmul_ext +from deepspeed.ops.transformer.inference.op_binding.layer_norm import LayerNormOp from deepspeed.ops.transformer.inference.triton.layer_norm import layer_norm, layer_norm_residual - -inference_module = None +from deepspeed.utils.types import ActivationFuncType def vector_matmul_func(input, weight, async_op, q_scale, q_int8, transposed_mode): @@ -76,15 +74,12 @@ def mlp_gemm_func(input, if use_triton_ln: mlp_input = layer_norm_residual(input, input_bias, residual, 
gamma, beta, epsilon) else: - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - mlp_input = inference_module._layer_norm_residual(input, input_bias, residual, gamma, beta, epsilon) + mlp_input = LayerNormOp.layer_norm_residual(input, input_bias, residual, gamma, beta, epsilon) # activation - if deepspeed.utils.types.ActivationFuncType(mlp_act_func_type) == deepspeed.utils.types.ActivationFuncType.GELU: + if ActivationFuncType(mlp_act_func_type) == ActivationFuncType.GELU: activation = "gelu" - elif deepspeed.utils.types.ActivationFuncType(mlp_act_func_type) == deepspeed.utils.types.ActivationFuncType.ReLU: + elif ActivationFuncType(mlp_act_func_type) == ActivationFuncType.ReLU: activation = "relu" else: activation = "" @@ -121,10 +116,7 @@ def qkv_gemm_func( if use_triton_ln: qkv_input = layer_norm(input, gamma, beta, epsilon) else: - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - qkv_input = inference_module.layer_norm(input, gamma, beta, epsilon) + qkv_input = LayerNormOp()(input, gamma, beta, epsilon) qkv_out = matmul_ext.matmul(qkv_input, weight, bias=(bias if add_bias else None), activation="", use_triton=True) diff --git a/deepspeed/runtime/hybrid_engine.py b/deepspeed/runtime/hybrid_engine.py index a991c43045635..8a6311bb6e834 100644 --- a/deepspeed/runtime/hybrid_engine.py +++ b/deepspeed/runtime/hybrid_engine.py @@ -17,16 +17,14 @@ from deepspeed.accelerator import get_accelerator from torch import nn from deepspeed.utils import logger - -from deepspeed.ops.op_builder import InferenceBuilder - from deepspeed.module_inject.layers import LinearLayer, Normalize, EmbeddingLayer, OPTEmbedding +from ..ops.transformer.inference.op_binding.workspace import WorkspaceOp + try: import transformers OPTLearnedPositionalEmbedding = transformers.models.opt.modeling_opt.OPTLearnedPositionalEmbedding except: OPTLearnedPositionalEmbedding = None -inference_cuda_module = None class DeepSpeedHybridEngine(DeepSpeedEngine): @@ -61,12 +59,8 @@ def __init__(self, args, model, **kwargs): self._total_batch_size = None self._gather_latency = 0 - global inference_cuda_module - if inference_cuda_module is None: - builder = InferenceBuilder() - inference_cuda_module = builder.load() - self.is_lora_fused = False + self.workspace = WorkspaceOp() def convert_to_linear_transposed(self, model): @@ -160,13 +154,13 @@ def unfuse_lora_weight_non_pinned(self): def retake_inference_cache(self): if self._config.hybrid_engine.release_inference_cache: - retake_success = inference_cuda_module.retake_workspace() + retake_success = self.workspace.retake_workspace() if not retake_success: logger.warning("Unable to acquire workspace on first attempt, emptying cache and retrying.") gc.collect() get_accelerator().empty_cache() - retake_success = inference_cuda_module.retake_workspace() + retake_success = self.workspace.retake_workspace() if not retake_success: raise RuntimeError("Unable to retake inference workspace.") @@ -269,7 +263,7 @@ def generate(self, *inputs, **kwargs): self.is_lora_fused = False if self._config.hybrid_engine.release_inference_cache: - inference_cuda_module.release_workspace() + self.workspace.release_workspace() gc.collect() get_accelerator().empty_cache() diff --git a/op_builder/hpu/__init__.py b/op_builder/hpu/__init__.py index 6527ace087b5f..5ad1b9a7f891e 100644 --- a/op_builder/hpu/__init__.py +++ b/op_builder/hpu/__init__.py @@ -7,4 +7,5 @@ from .cpu_adam import CPUAdamBuilder from 
.fused_adam import FusedAdamBuilder +from .transformer_inference import InferenceBuilder from .no_impl import NotImplementedBuilder diff --git a/op_builder/hpu/transformer_inference.py b/op_builder/hpu/transformer_inference.py new file mode 100644 index 0000000000000..e397c99200ec9 --- /dev/null +++ b/op_builder/hpu/transformer_inference.py @@ -0,0 +1,39 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +import importlib + +# DeepSpeed Team + +try: + # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 + from op_builder.builder import OpBuilder +except ImportError: + from deepspeed.ops.op_builder.builder import OpBuilder + + +class InferenceBuilder(OpBuilder): + BUILD_VAR = "DS_BUILD_TRANSFORMER_INFERENCE" + NAME = "transformer_inference" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=self.NAME) + + def absolute_name(self): + return f"deepspeed.ops.transformer.inference.{self.NAME}_op" + + def sources(self): + return [] + + def load(self, verbose=True): + if self.name in __class__._loaded_ops: + return __class__._loaded_ops[self.name] + + from deepspeed.git_version_info import installed_ops # noqa: F401 + if installed_ops.get(self.name, False): + op_module = importlib.import_module(self.absolute_name()) + __class__._loaded_ops[self.name] = op_module + return op_module diff --git a/tests/unit/ops/transformer/inference/test_bias_add.py b/tests/unit/ops/transformer/inference/test_bias_add.py index 843c9b889c2bf..f25bbc1be6925 100644 --- a/tests/unit/ops/transformer/inference/test_bias_add.py +++ b/tests/unit/ops/transformer/inference/test_bias_add.py @@ -8,12 +8,13 @@ import deepspeed from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer import DeepSpeedInferenceConfig +from deepspeed.ops.transformer.inference.op_binding.bias_add import BiasAddOp from .inference_test_utils import allclose, get_dtypes if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None torch_minor_version = None @@ -22,15 +23,8 @@ def run_bias_add_reference(activations, bias): def run_bias_add_ds(activations, bias): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - if activations.dtype == torch.float16: - return inference_module.bias_add_fp16(activations, bias) - elif activations.dtype == torch.bfloat16: - return inference_module.bias_add_bf16(activations, bias) - else: - return inference_module.bias_add_fp32(activations, bias) + config = DeepSpeedInferenceConfig(dtype=activations.dtype) + return BiasAddOp(config)(activations, bias) @pytest.mark.inference_ops diff --git a/tests/unit/ops/transformer/inference/test_bias_geglu.py b/tests/unit/ops/transformer/inference/test_bias_geglu.py index d5ab13964974c..05de4fbb4cf8d 100644 --- a/tests/unit/ops/transformer/inference/test_bias_geglu.py +++ b/tests/unit/ops/transformer/inference/test_bias_geglu.py @@ -8,13 +8,13 @@ import deepspeed from deepspeed.ops.op_builder import InferenceBuilder from deepspeed.accelerator import get_accelerator +from deepspeed.ops.transformer.inference.op_binding.gated_activation 
import GatedActivationOp from deepspeed.utils.types import ActivationFuncType from .inference_test_utils import allclose, get_dtypes if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None torch_minor_version = None @@ -27,10 +27,7 @@ def run_bias_geglu_reference(activations, bias): def run_bias_geglu_ds(activation, bias): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - return inference_module.gated_activation(activation, bias, ActivationFuncType.GATED_GELU) + return GatedActivationOp()(activation, bias, ActivationFuncType.GATED_GELU) @pytest.mark.inference_ops @@ -56,17 +53,14 @@ def run_gated_silu_reference(activations, bias): def run_gated_silu_ds(activation, bias): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - return inference_module.gated_activation(activation, bias, ActivationFuncType.GATED_SILU) + return GatedActivationOp()(activation, bias, ActivationFuncType.GATED_SILU) @pytest.mark.inference_ops @pytest.mark.parametrize("batch", [1, 2]) @pytest.mark.parametrize("sequence", [1, 128, 255]) @pytest.mark.parametrize("channels", [512, 1232, 4096]) -@pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) +@pytest.mark.parametrize("dtype", get_dtypes()) def test_gated_silu(batch, sequence, channels, dtype): activation = torch.randn((batch, sequence, channels * 2), dtype=dtype, device=get_accelerator().device_name()) bias = torch.randn((channels * 2), dtype=dtype, device=get_accelerator().device_name()) diff --git a/tests/unit/ops/transformer/inference/test_bias_gelu.py b/tests/unit/ops/transformer/inference/test_bias_gelu.py index fd82da51380c4..b69030e87ace5 100644 --- a/tests/unit/ops/transformer/inference/test_bias_gelu.py +++ b/tests/unit/ops/transformer/inference/test_bias_gelu.py @@ -8,13 +8,14 @@ import deepspeed from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer import DeepSpeedInferenceConfig +from deepspeed.ops.transformer.inference.op_binding.bias_gelu import BiasGeluOp from .inference_test_utils import allclose, get_dtypes from packaging import version as pkg_version if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None torch_minor_version = None @@ -25,15 +26,8 @@ def run_bias_gelu_reference(activations, bias): def run_bias_gelu_ds(activations, bias): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - if activations.dtype == torch.float16: - return inference_module.bias_gelu_fp16(activations, bias) - elif activations.dtype == torch.bfloat16: - return inference_module.bias_gelu_bf16(activations, bias) - else: - return inference_module.bias_gelu_fp32(activations, bias) + config = DeepSpeedInferenceConfig(dtype=activations.dtype) + return BiasGeluOp(config)(activations, bias) @pytest.mark.inference_ops diff --git a/tests/unit/ops/transformer/inference/test_bias_relu.py b/tests/unit/ops/transformer/inference/test_bias_relu.py index 881af78e92cf6..57134665b2412 100644 --- a/tests/unit/ops/transformer/inference/test_bias_relu.py +++ b/tests/unit/ops/transformer/inference/test_bias_relu.py @@ -8,12 +8,13 @@ import deepspeed from deepspeed.accelerator import get_accelerator from 
deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer import DeepSpeedInferenceConfig +from deepspeed.ops.transformer.inference.op_binding.bias_relu import BiasReluOp from .inference_test_utils import allclose, get_dtypes if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None torch_minor_version = None @@ -23,15 +24,8 @@ def run_bias_relu_reference(activations, bias): def run_bias_relu_ds(activations, bias): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - if activations.dtype == torch.float16: - return inference_module.bias_relu_fp16(activations, bias) - elif activations.dtype == torch.bfloat16: - return inference_module.bias_relu_bf16(activations, bias) - else: - return inference_module.bias_relu_fp32(activations, bias) + config = DeepSpeedInferenceConfig(dtype=activations.dtype) + return BiasReluOp(config)(activations, bias) @pytest.mark.inference_ops diff --git a/tests/unit/ops/transformer/inference/test_gelu.py b/tests/unit/ops/transformer/inference/test_gelu.py index 675860b00bdb9..beb74d09ab308 100644 --- a/tests/unit/ops/transformer/inference/test_gelu.py +++ b/tests/unit/ops/transformer/inference/test_gelu.py @@ -7,11 +7,12 @@ import torch import deepspeed from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer import DeepSpeedInferenceConfig +from deepspeed.ops.transformer.inference.op_binding.bias_gelu import BiasGeluOp if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None torch_minor_version = None @@ -45,13 +46,8 @@ def run_gelu_ds(activations, use_triton_ops=False): device = deepspeed.accelerator.get_accelerator().device_name() channels = activations.shape[-1] bias = torch.zeros((channels), dtype=activations.dtype, device=device) - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - if activations.dtype == torch.float16: - return inference_module.bias_gelu_fp16(activations, bias) - else: - return inference_module.bias_gelu_fp32(activations, bias) + config = DeepSpeedInferenceConfig(dtype=activations.dtype) + return BiasGeluOp(config)(activations, bias) @pytest.mark.inference_ops diff --git a/tests/unit/ops/transformer/inference/test_layer_norm.py b/tests/unit/ops/transformer/inference/test_layer_norm.py index 9eac612aa29ca..2912807e9f43e 100644 --- a/tests/unit/ops/transformer/inference/test_layer_norm.py +++ b/tests/unit/ops/transformer/inference/test_layer_norm.py @@ -8,6 +8,7 @@ import pytest from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer.inference.op_binding.layer_norm import LayerNormOp from .inference_test_utils import allclose, get_dtypes, assert_almost_equal try: import triton # noqa: F401 # type: ignore @@ -21,8 +22,6 @@ if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None - def ref_implementation(vals, gamma, beta, epsilon, channels, dtype): vals_f = vals.to(torch.float32) @@ -32,10 +31,7 @@ def ref_implementation(vals, gamma, beta, epsilon, channels, dtype): def ds_implementation(vals, gamma, beta, epsilon): - global inference_module - if inference_module 
is None: - inference_module = InferenceBuilder().load() - return inference_module.layer_norm(vals, gamma, beta, epsilon) + return LayerNormOp()(vals, gamma, beta, epsilon) def ds_triton_implementation(vals, gamma, beta, epsilon): @@ -83,10 +79,7 @@ def residual_ref_implementation(vals, bias, res, gamma, beta, epsilon, channels, def residual_ds_implementation(vals, bias, res, gamma, beta, epsilon): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - return inference_module._layer_norm_residual(vals, bias, res, gamma, beta, epsilon) + return LayerNormOp.layer_norm_residual(vals, bias, res, gamma, beta, epsilon) def residual_ds_triton_implementation(vals, bias, res, gamma, beta, epsilon): @@ -137,10 +130,7 @@ def residual_store_ref_implementation(vals, bias, res, gamma, beta, epsilon, cha def residual_store_ds_implementation(vals, bias, res, gamma, beta, epsilon): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - return inference_module.layer_norm_residual_store_pre_ln_res(vals, bias, res, gamma, beta, epsilon) + return LayerNormOp.layer_norm_residual_store_pre_ln_res(vals, bias, res, gamma, beta, epsilon) @pytest.mark.inference_ops diff --git a/tests/unit/ops/transformer/inference/test_moe_res_matmult.py b/tests/unit/ops/transformer/inference/test_moe_res_matmult.py index e1c8127a83ace..dcf9f16baaf16 100644 --- a/tests/unit/ops/transformer/inference/test_moe_res_matmult.py +++ b/tests/unit/ops/transformer/inference/test_moe_res_matmult.py @@ -8,24 +8,20 @@ import deepspeed from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer.inference.op_binding.moe_res_matmul import MoEResMatmulOp from .inference_test_utils import allclose, get_dtypes if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None - def run_moe_res_matmul_reference(residual, coef1, coef2, output): return residual * coef1 + output * coef2 def run_moe_res_matmul_ds(residual, coef, output): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() coef_t = coef.transpose(-1, -2).contiguous() - return inference_module.moe_res_matmul(residual, coef_t, output) + return MoEResMatmulOp()(residual, coef_t, output) @pytest.mark.inference_ops diff --git a/tests/unit/ops/transformer/inference/test_residual_add.py b/tests/unit/ops/transformer/inference/test_residual_add.py index 91830e25fc819..807da4904341f 100644 --- a/tests/unit/ops/transformer/inference/test_residual_add.py +++ b/tests/unit/ops/transformer/inference/test_residual_add.py @@ -8,6 +8,8 @@ import deepspeed from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer import DeepSpeedInferenceConfig +from deepspeed.ops.transformer.inference.op_binding import ResidualAddOp from .inference_test_utils import get_dtypes if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: @@ -36,11 +38,6 @@ def allclose(x, y): return torch.allclose(x, y, rtol=rtol, atol=atol) -@pytest.fixture(scope="module") -def inference_module(): - return InferenceBuilder().load() - - def res_add_bias_ref(hidden_state, residual, attn_output, attn_bias, final_bias, mp_size=1, pre_attn_norm=True): if pre_attn_norm: hidden_state += (residual + final_bias + attn_output + attn_bias) 
/ mp_size @@ -75,8 +72,8 @@ def run_residual_add_reference(hidden_state, residual, attn_output, attn_bias, f @pytest.mark.parametrize("mp_size", [1, 2]) @pytest.mark.parametrize("pre_attn_norm", [True, False]) @pytest.mark.parametrize("use_triton_ops", [True, False]) -def test_residual_add(inference_module, batch, sequence, hidden_dim, dtype, mlp_after_attn, add_bias, mp_size, - pre_attn_norm, use_triton_ops): +def test_residual_add(batch, sequence, hidden_dim, dtype, mlp_after_attn, add_bias, mp_size, pre_attn_norm, + use_triton_ops): if not deepspeed.HAS_TRITON and use_triton_ops: pytest.skip("triton has to be installed for the test") ds_out = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) @@ -96,19 +93,9 @@ def test_residual_add(inference_module, batch, sequence, hidden_dim, dtype, mlp_ if use_triton_ops: from deepspeed.ops.transformer.inference.triton import residual_add_bias ds_out = residual_add_bias(*res_add_args) - if dtype == torch.float16: - ds_out = inference_module.residual_add_bias_fp16(*res_add_args) - elif dtype == torch.float32: - ds_out = inference_module.residual_add_bias_fp32(*res_add_args) - elif dtype == torch.bfloat16: - ds_out = inference_module.residual_add_bias_bf16(*res_add_args) else: - if dtype == torch.float16: - ds_out = inference_module.residual_add_bias_fp16(*res_add_args) - elif dtype == torch.float32: - ds_out = inference_module.residual_add_bias_fp32(*res_add_args) - else: - raise ValueError(f"Unsupported dtype: {dtype}") + config = DeepSpeedInferenceConfig(dtype=dtype) + ds_out = ResidualAddOp(config).residual_add_func(*res_add_args) if not allclose(ds_out, ref_out): print((ds_out - ref_out).abs().max()) diff --git a/tests/unit/ops/transformer/inference/test_rms_norm.py b/tests/unit/ops/transformer/inference/test_rms_norm.py index 508a40e12e8df..fde9c9510771a 100644 --- a/tests/unit/ops/transformer/inference/test_rms_norm.py +++ b/tests/unit/ops/transformer/inference/test_rms_norm.py @@ -8,13 +8,13 @@ import pytest from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder # type: ignore +from deepspeed.ops.transformer.inference.op_binding.pre_rms_norm import PreRMSNormOp +from deepspeed.ops.transformer.inference.op_binding.rms_norm import RMSNormOp from .inference_test_utils import allclose, get_dtypes if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None - def ref_implementation(vals, gamma, epsilon): variance = vals.to(torch.float32).pow(2).mean(-1, keepdim=True) @@ -27,10 +27,7 @@ def ref_implementation(vals, gamma, epsilon): def ds_implementation(vals, gamma, epsilon): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - return inference_module.rms_norm(vals, gamma, epsilon) + return RMSNormOp()(vals, gamma, epsilon) @pytest.mark.inference_ops @@ -51,10 +48,7 @@ def test_rms_norm(batch, seq_len, channels, dtype): def pre_ds_implementation(vals, residual, gamma, epsilon): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - return inference_module.pre_rms_norm(vals, residual, gamma, epsilon) + return PreRMSNormOp()(vals, residual, gamma, epsilon) def pre_ref_implementation(vals, residual, gamma, epsilon): diff --git a/tests/unit/ops/transformer/inference/test_softmax.py b/tests/unit/ops/transformer/inference/test_softmax.py index 
7d0d6e14b6513..4f6d69160aa7f 100644 --- a/tests/unit/ops/transformer/inference/test_softmax.py +++ b/tests/unit/ops/transformer/inference/test_softmax.py @@ -11,7 +11,6 @@ if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None torch_minor_version = None @@ -43,6 +42,7 @@ def run_softmax_ds(input, use_triton_ops=False): def test_softmax(batch, sequence, channels, dtype, use_triton_ops): if not deepspeed.HAS_TRITON and use_triton_ops: pytest.skip("triton has to be installed for the test") + device = deepspeed.accelerator.get_accelerator().device_name() input_ds = torch.randn((batch, sequence, channels), dtype=dtype, device=device) input_ref = input_ds.clone().detach() From 1062a0c6583453af2d405e8ca2c0ad08b903d23e Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Wed, 9 Oct 2024 08:22:41 -0700 Subject: [PATCH 24/43] Unpin accelerate tests, update lightning with node16 removal. (#6611) HF accelerate fixes implemented in https://github.com/huggingface/accelerate/pull/3145 mean that we no longer need to pin the Accelerate version! nv-lightning tests now run on Ubuntu 20.04+, which supports Node versions newer than 16, so we can remove the explicit permission for Node 16 actions from the env config. --- .github/workflows/nv-accelerate-v100.yml | 1 - .github/workflows/nv-lightning-v100.yml | 2 -- 2 files changed, 3 deletions(-) diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml index 3bddc329e3dd1..346055e2685f4 100644 --- a/.github/workflows/nv-accelerate-v100.yml +++ b/.github/workflows/nv-accelerate-v100.yml @@ -47,7 +47,6 @@ jobs: unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch git clone https://github.com/huggingface/accelerate cd accelerate - git checkout ae9cb6e4db6f81fd18148c2cc67d72b903d81a46 git rev-parse --short HEAD # installing dependencies pip install .[testing] diff --git a/.github/workflows/nv-lightning-v100.yml b/.github/workflows/nv-lightning-v100.yml index a9a26b7ce816c..044c282ba1198 100644 --- a/.github/workflows/nv-lightning-v100.yml +++ b/.github/workflows/nv-lightning-v100.yml @@ -21,8 +21,6 @@ jobs: unit-tests: runs-on: [self-hosted, nvidia, cu121, v100] - env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions - steps: - uses: actions/checkout@v3 From 474a3288cd5a89c64c37f1e51978a3d031a87b4a Mon Sep 17 00:00:00 2001 From: gyou2021 Date: Wed, 9 Oct 2024 23:23:16 +0800 Subject: [PATCH 25/43] Enabled Qwen2-MoE Tensor Parallelism (TP) inference (#6551) Modified _replace_module in auto_tp.py: the change keeps the 'shared_expert_gate' and 'gate' layers in qwen2-moe as their original torch.nn.Linear type instead of converting them to LinearLayer. As a result, their weights are not split across multiple HPU/GPU cards, which allows qwen2-moe to run on multiple HPU/GPU cards. Because the 'gate' weights are not split, the corresponding all-gather operations are not needed, which may improve performance.
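For context, a minimal usage sketch of the path this enables is shown below. It is not part of this PR: the checkpoint name, launcher invocation, and generation arguments are assumptions, and only the unsharded handling of the gate layers comes from this change.

```python
# Hedged sketch: qwen2-moe inference with DeepSpeed automatic tensor parallelism.
# Assumed checkpoint name; launch with e.g. `deepspeed --num_gpus 2 qwen2_moe_tp.py`.
import os
import torch
import deepspeed
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen1.5-MoE-A2.7B"  # assumption: any qwen2-moe style checkpoint
world_size = int(os.getenv("WORLD_SIZE", "1"))

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

# Automatic TP shards the expert/attention linears across ranks; with this change
# the 'gate' and 'shared_expert_gate' linears remain plain torch.nn.Linear on every rank.
engine = deepspeed.init_inference(model, tensor_parallel={"tp_size": world_size})

inputs = tokenizer("DeepSpeed is", return_tensors="pt").to(engine.module.device)
outputs = engine.module.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```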
--------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepspeed/module_inject/auto_tp.py | 3 ++- docs/_tutorials/automatic-tensor-parallelism.md | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) mode change 100644 => 100755 deepspeed/module_inject/auto_tp.py mode change 100644 => 100755 docs/_tutorials/automatic-tensor-parallelism.md diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py old mode 100644 new mode 100755 index 52d7c95ec9d88..221d490a37d2a --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -333,7 +333,8 @@ def _replace(self, child, name, conv_linear_layer): weight_shape = child.weight.shape mp_replace = ReplaceWithTensorSlicing(mp_group=self.mp_group) # For mixtral-7x8b, need to skip MoE gate linear replace. - if name == "block_sparse_moe.gate": + if name == "block_sparse_moe.gate" or (('mlp.shared_expert_gate' == name or 'mlp.gate' == name) + and 'qwen2_moe' in str(type(self.module))): return child # For Yuan model if 'Yuan' in str(self.module): diff --git a/docs/_tutorials/automatic-tensor-parallelism.md b/docs/_tutorials/automatic-tensor-parallelism.md old mode 100644 new mode 100755 index e1903ed058920..d5a08b27bf4db --- a/docs/_tutorials/automatic-tensor-parallelism.md +++ b/docs/_tutorials/automatic-tensor-parallelism.md @@ -158,6 +158,7 @@ The following model families have been successfully tested with automatic tensor - plbart - qwen - qwen2 +- qwen2-moe - reformer - roberta - roformer From 55f7f3789eed805d8f5deab0e2276516a302d745 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Wed, 9 Oct 2024 10:48:39 -0700 Subject: [PATCH 26/43] Update version.txt after 0.15.2 release (#6615) **Auto-generated PR to update version.txt after a DeepSpeed release** Released version - 0.15.2 Author - @jomayeri Co-authored-by: jomayeri --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index 4312e0d0cae3a..1985d91413de8 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.15.2 +0.15.3 From 7d751ee8903187cd7cd82f43816acf1d5d0907ba Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Date: Wed, 9 Oct 2024 08:23:33 -0700 Subject: [PATCH 27/43] Clean up prefetched parameters (#6557) Parameters prefetched by ZeRO3 are sometimes not used. This occurs when the actual sub-module execution differs from previous tracing. As a result, the state of the allgather handle for such a parameter remains `INFLIGHT`, causing functions like `empty_partition_cache` to detect it and throw an error. This PR resolves the issue by ensuring that communication finishes and the parameters are freed. As this issue was mentioned in #6011, this includes the change of the branch. We need to merge #6011 first. 
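To illustrate the failure mode being fixed, here is a minimal, hedged sketch (not part of this PR): a forward pass that sometimes skips a submodule, so parameters prefetched according to an earlier trace are never used and remain in flight. The toy module and config values are assumptions; only `empty_partition_cache` and the inflight-parameter behavior come from this change.

```python
# Hedged sketch of the scenario (assumed toy module/config; run via the deepspeed launcher).
import torch
import deepspeed


class SometimesSkips(torch.nn.Module):

    def __init__(self):
        super().__init__()
        self.a = torch.nn.Linear(8, 8)
        self.b = torch.nn.Linear(8, 8)  # may be skipped, yet ZeRO-3 can still prefetch it

    def forward(self, x, use_b):
        x = self.a(x)
        return self.b(x) if use_b else x


ds_config = {
    "train_micro_batch_size_per_gpu": 1,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
    "zero_optimization": {"stage": 3},
}
model = SometimesSkips()
engine, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=ds_config)

# Step 1 records a trace that includes self.b.
loss = engine(torch.randn(1, 8, device=engine.device), use_b=True).sum()
engine.backward(loss)
engine.step()

# Step 2 skips self.b, so its prefetched partitions go unused.
loss = engine(torch.randn(1, 8, device=engine.device), use_b=False).sum()
engine.backward(loss)
engine.step()

# With this fix, leftover in-flight prefetches are waited on and released here
# instead of raising "still have inflight params".
engine.empty_partition_cache()
```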
--------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Olatunji Ruwase --- .../runtime/zero/partitioned_param_coordinator.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/deepspeed/runtime/zero/partitioned_param_coordinator.py b/deepspeed/runtime/zero/partitioned_param_coordinator.py index bdec8a55fcbc5..5780b2afd6def 100644 --- a/deepspeed/runtime/zero/partitioned_param_coordinator.py +++ b/deepspeed/runtime/zero/partitioned_param_coordinator.py @@ -153,11 +153,18 @@ def is_invalid_trace(self) -> bool: def is_record_trace(self) -> bool: return self.__trace_mode == ZeRoTraceMode.RECORD + def _clean_inflight_param_registry(self) -> None: + for param, handle in self.__inflight_param_registry.items(): + handle.wait() + self.__release_param(param) + self.__inflight_param_registry.clear() + def _invalidate_trace(self) -> None: if self.is_invalid_trace(): raise RuntimeError("attempted to invalidate already invalid trace") self.__trace_mode = ZeRoTraceMode.INVALID self._clear_trace_structures() + self._clean_inflight_param_registry() def trace_prologue(self, sub_module: Module) -> None: if self.is_complete_trace(): @@ -204,9 +211,7 @@ def construct_parameter_trace_from_module_trace(self): def reset_step(self) -> None: """indicate that we have completed one fwd+bwd for the model""" - if self.__inflight_param_registry: - raise RuntimeError(f"still have inflight params " - f"{[p.ds_summary() for p in self.__inflight_param_registry.keys()]}") + self._clean_inflight_param_registry() if not self.is_complete_trace(): # not self.trace_complete: # Make sure that recorded submodule orders are identical across ranks @@ -409,7 +414,7 @@ def release_and_reset_all(self, module: Module) -> None: """release all module parameters""" for param in iter_params(module, recurse=True): if param in self.__inflight_param_registry: - raise RuntimeError(f"param {param.ds_summary()} still in flight") + self.__inflight_param_registry.pop(param).wait() # TODO. make this throw if if there are still active submodules. currently # there's a hook execution issue From a1f98bdc703dacf95e11c4ae25f6fd11b6d1277e Mon Sep 17 00:00:00 2001 From: Joe Mayer <114769929+jomayeri@users.noreply.github.com> Date: Wed, 9 Oct 2024 14:07:31 -0700 Subject: [PATCH 28/43] AIO CPU Locked Tensor (#6592) Restoring the functionality of the cpu locked tensor in the AIO library. Make async_io operator available for CPU accelerator, i.e., CPU only environment. 
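As a quick usage illustration (not part of this PR), the sketch below exercises the restored handle-managed pinned CPU tensor in a CPU-only environment; the file path and buffer size are placeholders, while the handle methods match those listed in the DeepNVMe tutorial updated below.

```python
# Hedged sketch: pinned CPU buffer plus asynchronous write on a CPU-only build.
# The target path and buffer size are placeholders.
import torch
from deepspeed.ops.op_builder import AsyncIOBuilder

h = AsyncIOBuilder().load().aio_handle()  # defaults: 1MB blocks, queue_depth=128, intra_op_parallelism=1

# Page-locked CPU tensor allocated and tracked by the handle (the restored feature).
buf = h.new_cpu_locked_tensor(1024 * 1024, torch.empty(0, dtype=torch.uint8))
buf.random_(0, 255)

h.async_pwrite(buf, "/local_nvme/test_1MB.pt")  # non-blocking write
assert h.wait() == 1                            # reap the single completed op

h.free_cpu_locked_tensor(buf)
```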
--------- Co-authored-by: Olatunji Ruwase --- accelerator/cpu_accelerator.py | 6 +- csrc/aio/common/deepspeed_aio_common.cpp | 43 +++++---- csrc/aio/common/deepspeed_aio_common.h | 2 +- csrc/aio/common/deepspeed_aio_utils.cpp | 18 ++-- csrc/aio/common/deepspeed_aio_utils.h | 22 ++--- csrc/aio/py_lib/deepspeed_aio_op_desc.cpp | 8 +- csrc/aio/py_lib/deepspeed_aio_op_desc.h | 10 +- csrc/aio/py_lib/deepspeed_cpu_op.cpp | 79 ++++++++++------ csrc/aio/py_lib/deepspeed_cpu_op.h | 10 +- csrc/aio/py_lib/deepspeed_pin_tensor.cpp | 22 ++++- csrc/aio/py_lib/deepspeed_pin_tensor.h | 7 +- csrc/aio/py_lib/deepspeed_py_aio.cpp | 6 +- csrc/aio/py_lib/deepspeed_py_aio_handle.cpp | 8 +- csrc/aio/py_lib/deepspeed_py_aio_handle.h | 2 +- csrc/aio/py_lib/deepspeed_py_io_handle.cpp | 47 +++++----- csrc/aio/py_lib/deepspeed_py_io_handle.h | 24 ++--- csrc/aio/py_lib/py_ds_aio.cpp | 4 +- csrc/gds/py_lib/deepspeed_gds_op.cpp | 6 +- csrc/gds/py_lib/deepspeed_gds_op.h | 2 +- csrc/gds/py_lib/deepspeed_py_gds_handle.cpp | 20 ++-- csrc/gds/py_lib/deepspeed_py_gds_handle.h | 8 +- csrc/gds/py_lib/py_ds_gds.cpp | 4 +- docs/_tutorials/deepnvme.md | 16 ++-- op_builder/builder.py | 2 +- op_builder/cpu/__init__.py | 1 + op_builder/cpu/async_io.py | 91 +++++++++++++++++++ tests/unit/ops/aio/test_aio.py | 84 +++++++++-------- tests/unit/ops/aio/test_gds.py | 2 +- .../runtime/zero/test_nvme_checkpointing.py | 4 + 29 files changed, 362 insertions(+), 196 deletions(-) create mode 100644 op_builder/cpu/async_io.py diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index d4fcbb0b1e3eb..1e4335b192928 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -301,9 +301,9 @@ def get_op_builder(self, class_name): # is op_builder from deepspeed or a 3p version? 
this should only succeed if it's deepspeed # if successful this also means we're doing a local install and not JIT compile path from op_builder import __deepspeed__ # noqa: F401 # type: ignore - from op_builder.cpu import CCLCommBuilder, ShareMemCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder + from op_builder.cpu import AsyncIOBuilder, CCLCommBuilder, ShareMemCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder except ImportError: - from deepspeed.ops.op_builder.cpu import CCLCommBuilder, ShareMemCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder + from deepspeed.ops.op_builder.cpu import AsyncIOBuilder, CCLCommBuilder, ShareMemCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder if class_name == "CCLCommBuilder": return CCLCommBuilder @@ -313,6 +313,8 @@ def get_op_builder(self, class_name): return FusedAdamBuilder elif class_name == "CPUAdamBuilder": return CPUAdamBuilder + elif class_name == "AsyncIOBuilder": + return AsyncIOBuilder else: # return a NotImplementedBuilder to avoid get NoneType[Name] in unit tests return NotImplementedBuilder diff --git a/csrc/aio/common/deepspeed_aio_common.cpp b/csrc/aio/common/deepspeed_aio_common.cpp index a65cc500cc821..81c315e9a5583 100644 --- a/csrc/aio/common/deepspeed_aio_common.cpp +++ b/csrc/aio/common/deepspeed_aio_common.cpp @@ -68,8 +68,8 @@ static void _get_aio_latencies(std::vector>& raw_l std::accumulate(lat_usec.begin(), lat_usec.end(), 0) / lat_usec.size(); } -static void _do_io_submit_singles(const long long int n_iocbs, - const long long int iocb_index, +static void _do_io_submit_singles(const int64_t n_iocbs, + const int64_t iocb_index, std::unique_ptr& aio_ctxt, std::vector>& submit_times) { @@ -89,8 +89,8 @@ static void _do_io_submit_singles(const long long int n_iocbs, } } -static void _do_io_submit_block(const long long int n_iocbs, - const long long int iocb_index, +static void _do_io_submit_block(const int64_t n_iocbs, + const int64_t iocb_index, std::unique_ptr& aio_ctxt, std::vector>& submit_times) { @@ -109,18 +109,18 @@ static void _do_io_submit_block(const long long int n_iocbs, assert(submit_ret > 0); } -static int _do_io_complete(const long long int min_completes, - const long long int max_completes, +static int _do_io_complete(const int64_t min_completes, + const int64_t max_completes, std::unique_ptr& aio_ctxt, std::vector>& reap_times) { const auto start_time = std::chrono::high_resolution_clock::now(); - long long int n_completes = io_pgetevents(aio_ctxt->_io_ctxt, - min_completes, - max_completes, - aio_ctxt->_io_events.data(), - nullptr, - nullptr); + int64_t n_completes = io_pgetevents(aio_ctxt->_io_ctxt, + min_completes, + max_completes, + aio_ctxt->_io_events.data(), + nullptr, + nullptr); reap_times.push_back(std::chrono::high_resolution_clock::now() - start_time); assert(n_completes >= min_completes); return n_completes; @@ -134,7 +134,7 @@ void do_aio_operation_sequential(const bool read_op, { struct io_prep_context prep_ctxt(read_op, xfer_ctxt, aio_ctxt->_block_size, &aio_ctxt->_iocbs); - const auto num_io_blocks = static_cast( + const auto num_io_blocks = static_cast( ceil(static_cast(xfer_ctxt->_num_bytes) / aio_ctxt->_block_size)); #if DEBUG_DS_AIO_PERF const auto io_op_name = std::string(read_op ? 
"read" : "write"); @@ -145,15 +145,14 @@ void do_aio_operation_sequential(const bool read_op, std::vector> submit_times; std::vector> reap_times; const auto max_queue_bytes = - static_cast(aio_ctxt->_queue_depth * aio_ctxt->_block_size); + static_cast(aio_ctxt->_queue_depth * aio_ctxt->_block_size); auto start = std::chrono::high_resolution_clock::now(); - for (long long iocb_index = 0; iocb_index < num_io_blocks; - iocb_index += aio_ctxt->_queue_depth) { + for (int64_t iocb_index = 0; iocb_index < num_io_blocks; iocb_index += aio_ctxt->_queue_depth) { const auto start_offset = iocb_index * aio_ctxt->_block_size; const auto start_buffer = (char*)xfer_ctxt->_mem_buffer + start_offset; const auto n_iocbs = - min(static_cast(aio_ctxt->_queue_depth), (num_io_blocks - iocb_index)); + min(static_cast(aio_ctxt->_queue_depth), (num_io_blocks - iocb_index)); const auto num_bytes = min(max_queue_bytes, (xfer_ctxt->_num_bytes - start_offset)); prep_ctxt.prep_iocbs(n_iocbs, num_bytes, start_buffer, start_offset); @@ -285,13 +284,13 @@ int open_file(const char* filename, const bool read_op) int regular_read(const char* filename, std::vector& buffer) { - long long int num_bytes; + int64_t num_bytes; const auto f_size = get_file_size(filename, num_bytes); assert(f_size != -1); buffer.resize(num_bytes); const auto fd = open(filename, O_RDONLY, 0600); assert(fd != -1); - long long int read_bytes = 0; + int64_t read_bytes = 0; auto r = 0; do { const auto buffer_ptr = buffer.data() + read_bytes; @@ -309,7 +308,7 @@ int regular_read(const char* filename, std::vector& buffer) return 0; } -static bool _validate_buffer(const char* filename, void* aio_buffer, const long long int num_bytes) +static bool _validate_buffer(const char* filename, void* aio_buffer, const int64_t num_bytes) { std::vector regular_buffer; const auto reg_ret = regular_read(filename, regular_buffer); @@ -317,7 +316,7 @@ static bool _validate_buffer(const char* filename, void* aio_buffer, const long std::cout << "regular read of " << filename << " returned " << regular_buffer.size() << " bytes" << std::endl; - if (static_cast(regular_buffer.size()) != num_bytes) { return false; } + if (static_cast(regular_buffer.size()) != num_bytes) { return false; } return (0 == memcmp(aio_buffer, regular_buffer.data(), regular_buffer.size())); } @@ -325,7 +324,7 @@ static bool _validate_buffer(const char* filename, void* aio_buffer, const long bool validate_aio_operation(const bool read_op, const char* filename, void* aio_buffer, - const long long int num_bytes) + const int64_t num_bytes) { const auto msg_suffix = std::string("deepspeed_aio_") + std::string(read_op ? 
"read()" : "write()") + diff --git a/csrc/aio/common/deepspeed_aio_common.h b/csrc/aio/common/deepspeed_aio_common.h index 2940de945ee8f..aa4e49f4f4ede 100644 --- a/csrc/aio/common/deepspeed_aio_common.h +++ b/csrc/aio/common/deepspeed_aio_common.h @@ -35,4 +35,4 @@ int regular_read(const char* filename, std::vector& buffer); bool validate_aio_operation(const bool read_op, const char* filename, void* aio_buffer, - const long long int num_bytes); + const int64_t num_bytes); diff --git a/csrc/aio/common/deepspeed_aio_utils.cpp b/csrc/aio/common/deepspeed_aio_utils.cpp index 763b2c253a34c..0536ff6a362e7 100644 --- a/csrc/aio/common/deepspeed_aio_utils.cpp +++ b/csrc/aio/common/deepspeed_aio_utils.cpp @@ -18,8 +18,8 @@ const int c_block_size = 128 * 1024; const int c_io_queue_depth = 8; io_xfer_ctxt::io_xfer_ctxt(const int fd, - const long long int file_offset, - const long long int num_bytes, + const int64_t file_offset, + const int64_t num_bytes, const void* buffer) : _fd(fd), _base_offset(file_offset), _mem_buffer(buffer), _num_bytes(num_bytes) { @@ -36,7 +36,7 @@ io_prep_context::io_prep_context(const bool read_op, void io_prep_context::prep_iocbs(const int n_iocbs, const size_t num_bytes, const void* start_buffer, - const long long int start_offset) + const int64_t start_offset) { assert(static_cast(n_iocbs) <= _iocbs->size()); for (auto i = 0; i < n_iocbs; ++i) { @@ -64,24 +64,24 @@ io_prep_generator::io_prep_generator(const bool read_op, _next_iocb_index(0) { _num_io_blocks = - static_cast(ceil(static_cast(xfer_ctxt->_num_bytes) / block_size)); + static_cast(ceil(static_cast(xfer_ctxt->_num_bytes) / block_size)); _remaining_io_blocks = _num_io_blocks; } int io_prep_generator::prep_iocbs(const int n_iocbs, std::vector* iocbs) { if ((_remaining_bytes) == 0 || (_remaining_io_blocks == 0)) { - assert(static_cast(_remaining_bytes) == _remaining_io_blocks); + assert(static_cast(_remaining_bytes) == _remaining_io_blocks); return 0; } assert(static_cast(n_iocbs) <= iocbs->size()); - auto actual_n_iocbs = min(static_cast(n_iocbs), _remaining_io_blocks); + auto actual_n_iocbs = min(static_cast(n_iocbs), _remaining_io_blocks); for (auto i = 0; i < actual_n_iocbs; ++i, ++_next_iocb_index) { const auto xfer_offset = _xfer_ctxt->_base_offset + (_next_iocb_index * _block_size); const auto xfer_buffer = (char*)_xfer_ctxt->_mem_buffer + xfer_offset; - const auto num_bytes = min(static_cast(_block_size), _remaining_bytes); + const auto num_bytes = min(static_cast(_block_size), _remaining_bytes); if (_read_op) { io_prep_pread(iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, num_bytes, xfer_offset); @@ -95,7 +95,7 @@ int io_prep_generator::prep_iocbs(const int n_iocbs, std::vector* return actual_n_iocbs; } -int get_file_size(const char* filename, long long int& size) +int get_file_size(const char* filename, int64_t& size) { struct stat st; if (stat(filename, &st) == -1) { return -1; } @@ -103,7 +103,7 @@ int get_file_size(const char* filename, long long int& size) return 0; } -void* ds_page_aligned_alloc(const size_t size, const bool lock) +void* ds_page_aligned_alloc(const int64_t size, const bool lock) { void* ptr; int retval; diff --git a/csrc/aio/common/deepspeed_aio_utils.h b/csrc/aio/common/deepspeed_aio_utils.h index 9c58c2286610b..20e81fe8eebd7 100644 --- a/csrc/aio/common/deepspeed_aio_utils.h +++ b/csrc/aio/common/deepspeed_aio_utils.h @@ -30,13 +30,13 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
struct io_xfer_ctxt { const int _fd; - const long long int _base_offset; + const int64_t _base_offset; const void* _mem_buffer; - const long long int _num_bytes; + const int64_t _num_bytes; io_xfer_ctxt(const int fd, - const long long int file_offset, - const long long int num_bytes, + const int64_t file_offset, + const int64_t num_bytes, const void* buffer); }; @@ -54,7 +54,7 @@ struct io_prep_context { void prep_iocbs(const int n_iocbs, const size_t num_bytes, const void* start_buffer, - const long long int start_offset); + const int64_t start_offset); }; struct io_prep_generator { @@ -62,10 +62,10 @@ struct io_prep_generator { const std::unique_ptr& _xfer_ctxt; const size_t _block_size; - long long int _remaining_bytes; - long long int _num_io_blocks; - long long int _remaining_io_blocks; - long long int _next_iocb_index; + int64_t _remaining_bytes; + int64_t _num_io_blocks; + int64_t _remaining_io_blocks; + int64_t _next_iocb_index; io_prep_generator(const bool read_op, const std::unique_ptr& xfer_ctxt, @@ -74,6 +74,6 @@ struct io_prep_generator { int prep_iocbs(const int n_iocbs, std::vector* iocbs); }; -void* ds_page_aligned_alloc(const size_t size, const bool lock = false); +void* ds_page_aligned_alloc(const int64_t size, const bool lock = false); -int get_file_size(const char* filename, long long int& size); +int get_file_size(const char* filename, int64_t& size); diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp index dc820be528d05..6f311c5400c72 100644 --- a/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp +++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp @@ -11,16 +11,16 @@ io_op_desc_t::io_op_desc_t(const bool read_op, const torch::Tensor& buffer, const int fd, const char* filename, - const long long int file_num_bytes, - const int num_threads, + const int64_t file_num_bytes, + const int intra_op_parallelism, const bool validate) : _read_op(read_op), _buffer(buffer), _fd(fd), _filename(filename), _file_num_bytes(file_num_bytes), - _num_threads(num_threads), - _num_bytes_per_thread(file_num_bytes / num_threads), + _intra_op_parallelism(intra_op_parallelism), + _num_bytes_per_thread(file_num_bytes / intra_op_parallelism), _validate(validate) { } diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.h b/csrc/aio/py_lib/deepspeed_aio_op_desc.h index 350d28d29d58e..f841b8ce520a5 100644 --- a/csrc/aio/py_lib/deepspeed_aio_op_desc.h +++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.h @@ -14,9 +14,9 @@ struct io_op_desc_t { torch::Tensor _buffer; int _fd; const std::string _filename; - const long long int _file_num_bytes; - const int _num_threads; - const long long int _num_bytes_per_thread; + const int64_t _file_num_bytes; + const int _intra_op_parallelism; + const int64_t _num_bytes_per_thread; torch::Tensor _contiguous_buffer; const bool _validate; @@ -24,8 +24,8 @@ struct io_op_desc_t { const torch::Tensor& buffer, const int fd, const char* filename, - const long long int file_num_bytes, - const int num_threads, + const int64_t file_num_bytes, + const int intra_op_parallelism, const bool validate); virtual void run(const int tid, diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.cpp b/csrc/aio/py_lib/deepspeed_cpu_op.cpp index 41790b99bb889..da1a52d9c6e3d 100644 --- a/csrc/aio/py_lib/deepspeed_cpu_op.cpp +++ b/csrc/aio/py_lib/deepspeed_cpu_op.cpp @@ -4,31 +4,30 @@ // DeepSpeed Team #include "deepspeed_cpu_op.h" +#include "deepspeed_pin_tensor.h" using namespace std; -cpu_op_desc_t::cpu_op_desc_t(const bool read_op, - const torch::Tensor& buffer, - const 
int fd, - const char* filename, - const long long int file_num_bytes, - const int num_threads, - const bool validate) - : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate), - _cpu_buffer(buffer) +cpu_op_desc_t::cpu_op_desc_t( + const bool read_op, + const torch::Tensor& buffer, + const std::unique_ptr& pinned_tensor_mgr, + const int fd, + const char* filename, + const int64_t file_num_bytes, + const int intra_op_parallelism, + const bool validate) + : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, intra_op_parallelism, validate), + _cpu_buffer(buffer), + _pinned_tensor_mgr(pinned_tensor_mgr), + _is_managed_bounce_buffer(false) { // Need to use CPU bounce buffer if buffer is not a page-locked DRAM memory. - _use_bounce_buffer = !(_buffer.is_cpu() && _buffer.is_pinned()); + _use_bounce_buffer = + !(_buffer.is_cpu() && (_buffer.is_pinned() || _pinned_tensor_mgr->is_managed(_buffer))); if (_use_bounce_buffer) { - if (_read_op) { - auto options = torch::TensorOptions() - .dtype(_buffer.dtype()) - .layout(_buffer.layout()) - .device(torch::kCPU); - _cpu_buffer = torch::empty(_buffer.nbytes(), options).pin_memory(); - } else { - _cpu_buffer = _buffer.to(torch::kCPU).pin_memory(); - } + _alloc_bounce_buffer(); + if (!_read_op) { _cpu_buffer.copy_(_buffer); } } _contiguous_buffer = _cpu_buffer.contiguous(); } @@ -37,15 +36,20 @@ char* cpu_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_pt void cpu_op_desc_t::finish() { - if (_read_op) { - if (_buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); } - if (_buffer.is_xpu()) { _buffer.copy_(_cpu_buffer.to(torch::kXPU)); } + if (_use_bounce_buffer) { + if (_read_op) { + if (_buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); } + if (_buffer.is_xpu()) { _buffer.copy_(_cpu_buffer.to(torch::kXPU)); } + if (_buffer.is_cpu()) { _buffer.copy_(_cpu_buffer); } #if defined(__ENABLE_CANN__) - if (torch_npu::utils::is_npu(_buffer)) { - auto device = at::Device("npu:0"); - _buffer.copy_(_cpu_buffer.to(device)); - } + if (torch_npu::utils::is_npu(_buffer)) { + auto device = at::Device("npu:0"); + _buffer.copy_(_cpu_buffer.to(device)); + } #endif + } + + _free_bounce_buffer(); } } @@ -58,7 +62,7 @@ void cpu_op_desc_t::run(const int tid, std::unique_ptr& aio_ctxt, deepspeed_aio_config_t* aio_config) { - assert(tid < _num_threads); + assert(tid < _intra_op_parallelism); const auto base_offset = _num_bytes_per_thread * tid; std::unique_ptr xfer_ctxt( @@ -70,3 +74,24 @@ void cpu_op_desc_t::run(const int tid, do_aio_operation_sequential(_read_op, aio_ctxt, xfer_ctxt, aio_config, nullptr); } } + +void cpu_op_desc_t::_alloc_bounce_buffer() +{ + auto options = torch::TensorOptions() + .dtype(_buffer.dtype()) + .layout(_buffer.layout()) + .device(torch::kCPU) + .requires_grad(false); + +#if defined(__CUDA_ARCH__) + _cpu_buffer = torch::empty(_buffer.numel(), options).pin_memory(); +#else + _is_managed_bounce_buffer = true; + _cpu_buffer = _pinned_tensor_mgr->alloc(_buffer.numel(), options); +#endif +} + +void cpu_op_desc_t::_free_bounce_buffer() +{ + if (_is_managed_bounce_buffer) { _pinned_tensor_mgr->free(_cpu_buffer); } +} diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.h b/csrc/aio/py_lib/deepspeed_cpu_op.h index da96dd2b1d50b..9de2fa2540486 100644 --- a/csrc/aio/py_lib/deepspeed_cpu_op.h +++ b/csrc/aio/py_lib/deepspeed_cpu_op.h @@ -10,13 +10,16 @@ struct cpu_op_desc_t : io_op_desc_t { torch::Tensor _cpu_buffer; bool _use_bounce_buffer; + bool _is_managed_bounce_buffer; + const 
std::unique_ptr& _pinned_tensor_mgr; cpu_op_desc_t(const bool read_op, const torch::Tensor& buffer, + const std::unique_ptr& pinned_tensor_mgr, const int fd, const char* filename, - const long long int file_num_bytes, - const int num_threads, + const int64_t file_num_bytes, + const int intra_op_parallelism, const bool validate); void run(const int tid, @@ -28,4 +31,7 @@ struct cpu_op_desc_t : io_op_desc_t { void validate(); void finish(); + + void _alloc_bounce_buffer(); + void _free_bounce_buffer(); }; diff --git a/csrc/aio/py_lib/deepspeed_pin_tensor.cpp b/csrc/aio/py_lib/deepspeed_pin_tensor.cpp index 752823dc7dd2c..6d2800468e061 100644 --- a/csrc/aio/py_lib/deepspeed_pin_tensor.cpp +++ b/csrc/aio/py_lib/deepspeed_pin_tensor.cpp @@ -19,17 +19,23 @@ deepspeed_pin_tensor_t::~deepspeed_pin_tensor_t() _locked_tensors.clear(); } -torch::Tensor deepspeed_pin_tensor_t::alloc(const size_t num_elem, const at::ScalarType& elem_type) +torch::Tensor deepspeed_pin_tensor_t::alloc(const int64_t num_elem, + const torch::TensorOptions& options) { - const auto num_bytes = num_elem * elementSize(elem_type); + const auto scalar_dtype = torch::typeMetaToScalarType(options.dtype()); + const auto num_bytes = num_elem * torch::elementSize(scalar_dtype); auto pinned_buffer = ds_page_aligned_alloc(num_bytes, true); assert(nullptr != pinned_buffer); _locked_tensors[pinned_buffer] = num_bytes; - auto options = torch::TensorOptions().dtype(elem_type).device(torch::kCPU); + return at::from_blob(pinned_buffer, static_cast(num_elem), options); +} - return at::from_blob(pinned_buffer, static_cast(num_bytes), options); +torch::Tensor deepspeed_pin_tensor_t::alloc(const int64_t num_elem, const at::ScalarType& elem_type) +{ + auto options = torch::TensorOptions().dtype(elem_type).device(torch::kCPU).requires_grad(false); + return alloc(num_elem, options); } bool deepspeed_pin_tensor_t::free(torch::Tensor& locked_tensor) @@ -43,3 +49,11 @@ bool deepspeed_pin_tensor_t::free(torch::Tensor& locked_tensor) return false; } + +bool deepspeed_pin_tensor_t::is_managed(const torch::Tensor& buffer) +{ + if (!buffer.is_cpu()) { return false; } + auto addr = buffer.data_ptr(); + if (_locked_tensors.find(addr) != _locked_tensors.end()) { return true; } + return false; +}; diff --git a/csrc/aio/py_lib/deepspeed_pin_tensor.h b/csrc/aio/py_lib/deepspeed_pin_tensor.h index 4350a4ac7df67..4b8ad7e760858 100644 --- a/csrc/aio/py_lib/deepspeed_pin_tensor.h +++ b/csrc/aio/py_lib/deepspeed_pin_tensor.h @@ -15,13 +15,16 @@ Functionality for managing CPU tensors occupying page-locked memory. 
#include "deepspeed_py_aio.h" struct deepspeed_pin_tensor_t { - std::map _locked_tensors; + std::map _locked_tensors; deepspeed_pin_tensor_t() = default; ~deepspeed_pin_tensor_t(); - torch::Tensor alloc(const size_t num_elem, const at::ScalarType& elem_type); + torch::Tensor alloc(const int64_t num_elem, const at::ScalarType& elem_type); + torch::Tensor alloc(const int64_t num_elem, const torch::TensorOptions& options); bool free(torch::Tensor& locked_tensor); + + bool is_managed(const torch::Tensor& buffer); }; diff --git a/csrc/aio/py_lib/deepspeed_py_aio.cpp b/csrc/aio/py_lib/deepspeed_py_aio.cpp index eac268d334338..02b04057d1ac3 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp @@ -51,7 +51,7 @@ int deepspeed_py_aio_write(const torch::Tensor& buffer, if (fd == -1) { return -1; } auto write_buffer = (char*)buffer.data_ptr(); - const auto num_write_bytes = static_cast(buffer.nbytes()); + const auto num_write_bytes = static_cast(buffer.nbytes()); std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer)); std::unique_ptr aio_ctxt(new aio_context(config._block_size, config._queue_depth)); @@ -83,7 +83,7 @@ int deepspeed_py_aio_read(torch::Tensor& buffer, const bool validate) { const auto start_time = std::chrono::high_resolution_clock::now(); - long long num_file_bytes; + int64_t num_file_bytes; if (-1 == get_file_size(filename, num_file_bytes)) { const auto error_code = errno; report_file_error(filename, " fstat for read", error_code); @@ -95,7 +95,7 @@ int deepspeed_py_aio_read(torch::Tensor& buffer, if (fd == -1) { return -1; } auto read_buffer = (char*)buffer.data_ptr(); - assert(static_cast(buffer.nbytes()) == num_file_bytes); + assert(static_cast(buffer.nbytes()) == num_file_bytes); std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer)); std::unique_ptr aio_ctxt(new aio_context(config._block_size, config._queue_depth)); diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp index c7ca5e82afdef..2b1093e992865 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp @@ -16,8 +16,12 @@ deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size, const int queue_depth, const bool single_submit, const bool overlap_events, - const int num_threads) - : deepspeed_io_handle_t(block_size, queue_depth, single_submit, overlap_events, num_threads) + const int intra_op_parallelism) + : deepspeed_io_handle_t(block_size, + queue_depth, + single_submit, + overlap_events, + intra_op_parallelism) { } diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/csrc/aio/py_lib/deepspeed_py_aio_handle.h index eb6b90ea22f03..1398df9a56c96 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.h +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h @@ -16,7 +16,7 @@ struct deepspeed_aio_handle_t : deepspeed_io_handle_t { const int queue_depth, const bool single_submit, const bool overlap_events, - const int num_threads); + const int intra_op_parallelism); ~deepspeed_aio_handle_t(); }; diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp index bdf2a858d797a..48ea8a1339d45 100644 --- a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp @@ -18,16 +18,16 @@ deepspeed_io_handle_t::deepspeed_io_handle_t(const int block_size, const int queue_depth, const bool single_submit, const bool overlap_events, - const int num_threads) + const int 
intra_op_parallelism) : _aio_ctxt(new aio_context(block_size, queue_depth)), _single_submit(single_submit), _overlap_events(overlap_events), - _num_threads(num_threads), + _intra_op_parallelism(intra_op_parallelism), _aio_config(block_size, queue_depth, single_submit, overlap_events, false), _num_pending_ops(0), _pinned_tensor_mgr(new deepspeed_pin_tensor_t()) { - for (auto i = 0; i < num_threads; ++i) { + for (auto i = 0; i < intra_op_parallelism; ++i) { _thread_contexts.push_back(std::make_shared(i, _aio_config)); } @@ -56,7 +56,7 @@ const bool deepspeed_io_handle_t::get_single_submit() const { return _single_sub const bool deepspeed_io_handle_t::get_overlap_events() const { return _overlap_events; } -const int deepspeed_io_handle_t::get_thread_count() const { return _num_threads; } +const int deepspeed_io_handle_t::get_intra_op_parallelism() const { return _intra_op_parallelism; } int deepspeed_io_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate) { @@ -64,13 +64,13 @@ int deepspeed_io_handle_t::read(torch::Tensor& buffer, const char* filename, con assert(_aio_ctxt); - long long num_file_bytes; + int64_t num_file_bytes; if (-1 == get_file_size(filename, num_file_bytes)) { const auto error_code = errno; report_file_error(filename, " fstat for read", error_code); return -1; } - assert(static_cast(buffer.nbytes()) == num_file_bytes); + assert(static_cast(buffer.nbytes()) == num_file_bytes); const auto fd = open_file(filename, true); if (fd == -1) { return -1; } @@ -108,7 +108,7 @@ int deepspeed_io_handle_t::write(const torch::Tensor& buffer, if (fd == -1) { return -1; } auto write_buffer = (char*)buffer.data_ptr(); - const auto num_write_bytes = static_cast(buffer.nbytes()); + const auto num_write_bytes = static_cast(buffer.nbytes()); std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer)); if (_aio_config._overlap_events) { @@ -188,13 +188,12 @@ int deepspeed_io_handle_t::wait() return num_completed_ops; } -bool deepspeed_io_handle_t::_is_valid_parallel_aio_op(const bool read_op, - const long long int num_bytes) +bool deepspeed_io_handle_t::_is_valid_parallel_aio_op(const bool read_op, const int64_t num_bytes) { const auto op_string = read_op ? 
"Read" : "Write"; - if (num_bytes % get_thread_count()) { + if (num_bytes % get_intra_op_parallelism()) { std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes - << " not divisible by thread count = " << get_thread_count() << std::endl; + << " not divisible by thread count = " << get_intra_op_parallelism() << std::endl; return false; } @@ -206,11 +205,17 @@ std::shared_ptr deepspeed_io_handle_t::_create_io_op_desc( const torch::Tensor& buffer, const int fd, const char* filename, - const long long int file_num_bytes, + const int64_t file_num_bytes, const bool validate) { - return std::make_shared( - read_op, buffer, fd, filename, file_num_bytes, _num_threads, validate); + return std::make_shared(read_op, + buffer, + _pinned_tensor_mgr, + fd, + filename, + file_num_bytes, + _intra_op_parallelism, + validate); } int deepspeed_io_handle_t::pread(const torch::Tensor& buffer, @@ -218,19 +223,19 @@ int deepspeed_io_handle_t::pread(const torch::Tensor& buffer, const bool validate, const bool async) { - long long num_file_bytes; + int64_t num_file_bytes; if (-1 == get_file_size(filename, num_file_bytes)) { const auto error_code = errno; report_file_error(filename, " fstat for read", error_code); return -1; } - const auto buffer_bytes = static_cast(buffer.nbytes()); + const auto buffer_bytes = static_cast(buffer.nbytes()); if (buffer_bytes != num_file_bytes) { std::cout << filename << ": buffer nbytes != file bytes " << buffer_bytes << " != " << num_file_bytes << std::endl; } - assert(static_cast(buffer.nbytes()) == num_file_bytes); - assert((num_file_bytes % _num_threads) == 0); + assert(buffer_bytes == num_file_bytes); + assert((num_file_bytes % _intra_op_parallelism) == 0); if (!_is_valid_parallel_aio_op(true, num_file_bytes)) { return -1; } @@ -251,8 +256,8 @@ int deepspeed_io_handle_t::pwrite(const torch::Tensor& buffer, const bool validate, const bool async) { - const auto num_write_bytes = static_cast(buffer.nbytes()); - assert((num_write_bytes % _num_threads) == 0); + const auto num_write_bytes = static_cast(buffer.nbytes()); + assert((num_write_bytes % _intra_op_parallelism) == 0); if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; } @@ -288,7 +293,7 @@ int deepspeed_io_handle_t::async_pwrite(const torch::Tensor& buffer, const char* return pwrite(buffer, filename, false, true); } -at::Tensor deepspeed_io_handle_t::new_cpu_locked_tensor(const size_t num_elem, +at::Tensor deepspeed_io_handle_t::new_cpu_locked_tensor(const int64_t num_elem, const torch::Tensor& example_tensor) { return _pinned_tensor_mgr->alloc(num_elem, example_tensor.scalar_type()); diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.h b/csrc/aio/py_lib/deepspeed_py_io_handle.h index 2974ebe87bfc1..4fedf80808189 100644 --- a/csrc/aio/py_lib/deepspeed_py_io_handle.h +++ b/csrc/aio/py_lib/deepspeed_py_io_handle.h @@ -16,7 +16,7 @@ struct deepspeed_io_handle_t { std::unique_ptr _aio_ctxt; const bool _single_submit; const bool _overlap_events; - const int _num_threads; + const int _intra_op_parallelism; deepspeed_aio_config_t _aio_config; std::vector> _thread_contexts; @@ -28,7 +28,7 @@ struct deepspeed_io_handle_t { const int queue_depth, const bool single_submit, const bool overlap_events, - const int num_threads); + const int intra_op_parallelism); virtual ~deepspeed_io_handle_t() = 0; @@ -36,7 +36,7 @@ struct deepspeed_io_handle_t { const int get_queue_depth() const; const bool get_single_submit() const; const bool get_overlap_events() const; - const int 
get_thread_count() const; + const int get_intra_op_parallelism() const; int read(torch::Tensor& buffer, const char* filename, const bool validate); @@ -61,7 +61,8 @@ struct deepspeed_io_handle_t { int async_pwrite(const torch::Tensor& buffer, const char* filename); // TODO: Make API's args to be shape and dtype. - torch::Tensor new_cpu_locked_tensor(const size_t num_elem, const torch::Tensor& example_tensor); + torch::Tensor new_cpu_locked_tensor(const int64_t num_elem, + const torch::Tensor& example_tensor); bool free_cpu_locked_tensor(torch::Tensor&); @@ -73,13 +74,12 @@ struct deepspeed_io_handle_t { std::shared_ptr _wait_for_aio_work(); - bool _is_valid_parallel_aio_op(const bool read_op, const long long int num_bytes); + bool _is_valid_parallel_aio_op(const bool read_op, const int64_t num_bytes); - virtual std::shared_ptr _create_io_op_desc( - const bool read_op, - const torch::Tensor& buffer, - const int fd, - const char* filename, - const long long int file_num_bytes, - const bool validate); + virtual std::shared_ptr _create_io_op_desc(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const int64_t file_num_bytes, + const bool validate); }; diff --git a/csrc/aio/py_lib/py_ds_aio.cpp b/csrc/aio/py_lib/py_ds_aio.cpp index 3171d0c6bf3c2..b80fa2d6c8e6d 100644 --- a/csrc/aio/py_lib/py_ds_aio.cpp +++ b/csrc/aio/py_lib/py_ds_aio.cpp @@ -27,13 +27,13 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "queue_depth"_a = 128, "single_submit"_a = false, "overlap_events"_a = false, - "num_threads"_a = 1) + "intra_op_parallelism"_a = 1) .def("get_block_size", &deepspeed_aio_handle_t::get_block_size) .def("get_queue_depth", &deepspeed_aio_handle_t::get_queue_depth) .def("get_single_submit", &deepspeed_aio_handle_t::get_single_submit) .def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events) - .def("get_thread_count", &deepspeed_aio_handle_t::get_thread_count) + .def("get_intra_op_parallelism", &deepspeed_aio_handle_t::get_intra_op_parallelism) .def("read", &deepspeed_aio_handle_t::read, diff --git a/csrc/gds/py_lib/deepspeed_gds_op.cpp b/csrc/gds/py_lib/deepspeed_gds_op.cpp index c370a448e5a21..dae2eef21c6f1 100644 --- a/csrc/gds/py_lib/deepspeed_gds_op.cpp +++ b/csrc/gds/py_lib/deepspeed_gds_op.cpp @@ -95,9 +95,9 @@ gds_op_desc_t::gds_op_desc_t(const bool read_op, const int fd, const char* filename, const long long int file_num_bytes, - const int num_threads, + const int intra_op_parallelism, const bool validate) - : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate) + : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, intra_op_parallelism, validate) { _contiguous_buffer = _buffer.contiguous(); const int64_t device = _buffer.get_device(); @@ -123,7 +123,7 @@ void gds_op_desc_t::run(const int tid, std::unique_ptr& aio_ctxt, deepspeed_aio_config_t* aio_config) { - assert(tid < _num_threads); + assert(tid < _intra_op_parallelism); check_cudaruntimecall(cudaSetDevice(_buffer.get_device())); int64_t buf_offset = data_ptr() + (_num_bytes_per_thread * tid) - (char*)_base_ptr; const auto file_offset = _num_bytes_per_thread * tid; diff --git a/csrc/gds/py_lib/deepspeed_gds_op.h b/csrc/gds/py_lib/deepspeed_gds_op.h index b7fab64d40549..c9d4c076f1894 100644 --- a/csrc/gds/py_lib/deepspeed_gds_op.h +++ b/csrc/gds/py_lib/deepspeed_gds_op.h @@ -23,7 +23,7 @@ struct gds_op_desc_t : io_op_desc_t { const int fd, const char* filename, const long long int file_num_bytes, - const int num_threads, + const int 
intra_op_parallelism, const bool validate); void run(const int tid, diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp index 15fd516acaae5..43705939dc3e5 100644 --- a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp @@ -19,21 +19,25 @@ deepspeed_gds_handle_t::deepspeed_gds_handle_t(const int block_size, const int queue_depth, const bool single_submit, const bool overlap_events, - const int num_threads) - : deepspeed_io_handle_t(block_size, queue_depth, single_submit, overlap_events, 1) + const int intra_op_parallelism) + : deepspeed_io_handle_t(block_size, queue_depth, single_submit, overlap_events, 1), + _intra_gds_op_parallelism(intra_op_parallelism) { - _init_cuFile(block_size, queue_depth, num_threads); + _init_cuFile(block_size, queue_depth); } deepspeed_gds_handle_t::~deepspeed_gds_handle_t() { _close_cuFile(); } -void deepspeed_gds_handle_t::_init_cuFile(const int block_size, - const int queue_depth, - const int num_threads) +const int deepspeed_gds_handle_t::get_intra_op_parallelism() const +{ + return _intra_gds_op_parallelism; +} + +void deepspeed_gds_handle_t::_init_cuFile(const int block_size, const int queue_depth) { if (deepspeed_gds_handle_t::s_cuFile_init == 0) { std::string depthStr = std::to_string(queue_depth); - std::string threadsStr = std::to_string(num_threads); + std::string threadsStr = std::to_string(_intra_gds_op_parallelism); std::string json1 = R"({"execution": {"max_io_queue_depth": )" + depthStr + ", "; std::string json2 = R"("max_request_parallelism": )" + threadsStr + ", "; std::string json3 = R"("max_io_threads": )" + threadsStr + ", "; @@ -107,7 +111,7 @@ std::shared_ptr deepspeed_gds_handle_t::_create_io_op_desc( { if (buffer.is_cuda()) { return std::make_shared( - read_op, buffer, fd, filename, file_num_bytes, _num_threads, validate); + read_op, buffer, fd, filename, file_num_bytes, _intra_op_parallelism, validate); } return deepspeed_io_handle_t::_create_io_op_desc( read_op, buffer, fd, filename, file_num_bytes, validate); diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.h b/csrc/gds/py_lib/deepspeed_py_gds_handle.h index f324e6b65e80b..a3c10a4f64676 100644 --- a/csrc/gds/py_lib/deepspeed_py_gds_handle.h +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.h @@ -12,11 +12,13 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
#include "deepspeed_py_io_handle.h" struct deepspeed_gds_handle_t : deepspeed_io_handle_t { + const int _intra_gds_op_parallelism; + deepspeed_gds_handle_t(const int block_size, const int queue_depth, const bool single_submit, const bool overlap_events, - const int num_threads); + const int intra_op_parallelism); ~deepspeed_gds_handle_t(); @@ -29,10 +31,12 @@ struct deepspeed_gds_handle_t : deepspeed_io_handle_t { bool unpin_device_tensor(const torch::Tensor& buffer); - void _init_cuFile(const int block_size, const int queue_length, const int num_threads); + void _init_cuFile(const int block_size, const int queue_depth); void _close_cuFile(); + const int get_intra_op_parallelism() const; + std::shared_ptr _create_io_op_desc(const bool read_op, const torch::Tensor& buffer, const int fd, diff --git a/csrc/gds/py_lib/py_ds_gds.cpp b/csrc/gds/py_lib/py_ds_gds.cpp index 66eb34d4ea8cf..57bf8d2207c45 100644 --- a/csrc/gds/py_lib/py_ds_gds.cpp +++ b/csrc/gds/py_lib/py_ds_gds.cpp @@ -20,13 +20,13 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "queue_depth"_a = 128, "single_submit"_a = false, "overlap_events"_a = false, - "num_threads"_a = 1) + "intra_op_parallelism"_a = 1) .def("get_block_size", &deepspeed_gds_handle_t::get_block_size) .def("get_queue_depth", &deepspeed_gds_handle_t::get_queue_depth) .def("get_single_submit", &deepspeed_gds_handle_t::get_single_submit) .def("get_overlap_events", &deepspeed_gds_handle_t::get_overlap_events) - .def("get_thread_count", &deepspeed_gds_handle_t::get_thread_count) + .def("get_intra_op_parallelism", &deepspeed_gds_handle_t::get_intra_op_parallelism) .def("read", &deepspeed_gds_handle_t::read, diff --git a/docs/_tutorials/deepnvme.md b/docs/_tutorials/deepnvme.md index 70c6ac0979635..4ed528412eae9 100644 --- a/docs/_tutorials/deepnvme.md +++ b/docs/_tutorials/deepnvme.md @@ -50,7 +50,7 @@ Type "help", "copyright", "credits" or "license" for more information. >>> h = AsyncIOBuilder().load().aio_handle() >>> h. h.async_pread( h.free_cpu_locked_tensor( h.get_overlap_events( h.get_single_submit( h.new_cpu_locked_tensor( h.pwrite( h.sync_pread( h.wait( -h.async_pwrite( h.get_block_size( h.get_queue_depth( h.get_thread_count( h.pread( h.read( h.sync_pwrite( h.write( +h.async_pwrite( h.get_block_size( h.get_queue_depth( h.get_intra_op_parallelism( h.pread( h.read( h.sync_pwrite( h.write( ``` The APIs of interest for performing I/O operations are those named with `pread` and `pwrite` substrings. For brevity, we will focus on the file write APIs, namely `sync_pwrite`, `async_pwrite`, and `pwrite`. We will discuss only `sync_pwrite` and `async_pwrite` below because they are specializations of `pwrite`. @@ -107,7 +107,7 @@ Similar safety problems apply to reading the destination tensor of a non-blockin ### Parallel File Write -An important DeepNVMe optimization is the ability to parallelize individual I/O operations. This optimization is enabled by specifying the desired parallelism degree when constructing a DeepNVMe handle. Subsequent I/O operations with that handle are automatically parallelized over the requested number of host or device threads, as appropriate. I/O parallelism is composable with either the blocking or non-blocking I/O APIs. The example below illustrates 4-way parallelism of a file write using `async_pwrite`. Note the use of `num_threads` argument to specify the desired parallelism degree in handle creation. +An important DeepNVMe optimization is the ability to parallelize individual I/O operations. 
This optimization is enabled by specifying the desired parallelism degree when constructing a DeepNVMe handle. Subsequent I/O operations with that handle are automatically parallelized over the requested number of host or device threads, as appropriate. I/O parallelism is composable with either the blocking or non-blocking I/O APIs. The example below illustrates 4-way parallelism of a file write using `async_pwrite`. Note the use of `intra_op_parallelism` argument to specify the desired parallelism degree in handle creation. ```bash >>> import os @@ -116,7 +116,7 @@ False >>> import torch >>> t=torch.empty(1024**3, dtype=torch.uint8).cuda() >>> from deepspeed.ops.op_builder import AsyncIOBuilder ->>> h = AsyncIOBuilder().load().aio_handle(num_threads=4) +>>> h = AsyncIOBuilder().load().aio_handle(intra_op_parallelism=4) >>> h.async_pwrite(t,'/local_nvme/test_1GB.pt') >>> h.wait() 1 @@ -188,7 +188,7 @@ This tutorial has been significantly improved by feedback from [Guanhua Wang](ht ## Appendix ### Advanced Handle Creation -Achieving peak I/O performance with DeepNVMe requires careful configuration of handle creation. In particular, the parameters of `aio_handle` and `gds_handle` constructors are performance-critical because they determine how efficiently DeepNVMe interacts with the underlying storage subsystem (i.e., `libaio`, GDS, PCIe, and SSD). For convenience we make it possible to create handles using default parameter values which will provide decent performance in most scenarios. However, squeezing out every available performance in your environment will likely require tuning the constructor parameters, namely `block_size`, `queue_depth`, `single_submit`, `overlap_events`, and `num_threads`. The `aio_handle` constructor parameters and default values are illustrated below: +Achieving peak I/O performance with DeepNVMe requires careful configuration of handle creation. In particular, the parameters of `aio_handle` and `gds_handle` constructors are performance-critical because they determine how efficiently DeepNVMe interacts with the underlying storage subsystem (i.e., `libaio`, GDS, PCIe, and SSD). For convenience we make it possible to create handles using default parameter values which will provide decent performance in most scenarios. However, squeezing out every available performance in your environment will likely require tuning the constructor parameters, namely `block_size`, `queue_depth`, `single_submit`, `overlap_events`, and `intra_op_parallelism`. The `aio_handle` constructor parameters and default values are illustrated below: ```bash >>> from deepspeed.ops.op_builder import AsyncIOBuilder >>> help(AsyncIOBuilder().load().aio_handle()) @@ -203,7 +203,7 @@ class aio_handle(pybind11_builtins.pybind11_object) | Methods defined here: | | __init__(...) 
- | __init__(self: async_io.aio_handle, block_size: int = 1048576, queue_depth: int = 128, single_submit: bool = False, overlap_events: bool = False, num_threads: int = 1) -> None + | __init__(self: async_io.aio_handle, block_size: int = 1048576, queue_depth: int = 128, single_submit: bool = False, overlap_events: bool = False, intra_op_parallelism: int = 1) -> None | | AIO handle constructor ``` @@ -219,7 +219,7 @@ Best performance (GB/sec): read = 3.69, write = 3.18 "aio": { "single_submit": "false", "overlap_events": "true", - "num_threads": 8, + "intra_op_parallelism": 8, "queue_depth": 32, "block_size": 1048576 } @@ -233,7 +233,7 @@ The above tuning was executed on a Lambda workstation equipped with two NVIDIA A queue_depth=32, single_submit=False, overlap_events=True, - num_threads=8) + intra_op_parallelism=8) ``` @@ -292,6 +292,6 @@ Function | Description |---|---| get_queue_depth | Return queue depth setting | get_single_submit | Return whether single_submit is enabled | -get_thread_count | Return I/O parallelism degree | +get_intra_op_parallelism | Return I/O parallelism degree | get_block_size | Return I/O block size setting | get_overlap_events | Return whether overlap_event is enabled | diff --git a/op_builder/builder.py b/op_builder/builder.py index f95341f137b4b..1609bc9005f46 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -76,7 +76,7 @@ def get_default_compute_capabilities(): cuda_minor_mismatch_ok = { 10: ["10.0", "10.1", "10.2"], 11: ["11.0", "11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8"], - 12: ["12.0", "12.1", "12.2", "12.3", "12.4", "12.5"], + 12: ["12.0", "12.1", "12.2", "12.3", "12.4", "12.5", "12.6"], } diff --git a/op_builder/cpu/__init__.py b/op_builder/cpu/__init__.py index 30238add3f905..7084db8469f1d 100644 --- a/op_builder/cpu/__init__.py +++ b/op_builder/cpu/__init__.py @@ -8,3 +8,4 @@ from .fused_adam import FusedAdamBuilder from .cpu_adam import CPUAdamBuilder from .no_impl import NotImplementedBuilder +from .async_io import AsyncIOBuilder diff --git a/op_builder/cpu/async_io.py b/op_builder/cpu/async_io.py new file mode 100644 index 0000000000000..493ef174566e7 --- /dev/null +++ b/op_builder/cpu/async_io.py @@ -0,0 +1,91 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import distutils.spawn +import subprocess + +from .builder import CPUOpBuilder + + +class AsyncIOBuilder(CPUOpBuilder): + BUILD_VAR = "DS_BUILD_AIO" + NAME = "async_io" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.aio.{self.NAME}_op' + + def lib_sources(self): + src_list = [ + 'csrc/aio/py_lib/deepspeed_py_io_handle.cpp', 'csrc/aio/py_lib/deepspeed_py_aio.cpp', + 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', + 'csrc/aio/common/deepspeed_aio_utils.cpp', 'csrc/aio/common/deepspeed_aio_common.cpp', + 'csrc/aio/common/deepspeed_aio_types.cpp', 'csrc/aio/py_lib/deepspeed_cpu_op.cpp', + 'csrc/aio/py_lib/deepspeed_aio_op_desc.cpp', 'csrc/aio/py_lib/deepspeed_py_copy.cpp', + 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp' + ] + return src_list + + def sources(self): + return self.lib_sources() + ['csrc/aio/py_lib/py_ds_aio.cpp'] + + def include_paths(self): + return ['csrc/aio/py_lib', 'csrc/aio/common'] + + def cxx_args(self): + # -O0 for improved debugging, since performance is bound by I/O + args = super().cxx_args() + import torch + TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[0:2]) + if not (TORCH_MAJOR >= 2 and TORCH_MINOR >= 1): + args.remove('-std=c++17') + args.append('-std=c++14') + args += ['-Wall', '-O0', '-shared', '-fPIC', '-Wno-reorder'] + return args + + def extra_ldflags(self): + return ['-laio', '-fopenmp'] + + def check_for_libaio_pkg(self): + libs = dict( + dpkg=["-l", "libaio-dev", "apt"], + pacman=["-Q", "libaio", "pacman"], + rpm=["-q", "libaio-devel", "yum"], + ) + + found = False + for pkgmgr, data in libs.items(): + flag, lib, tool = data + path = distutils.spawn.find_executable(pkgmgr) + if path is not None: + cmd = [pkgmgr, flag, lib] + result = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if result.wait() == 0: + found = True + else: + self.warning(f"{self.NAME}: please install the {lib} package with {tool}") + break + return found + + def is_compatible(self, verbose=False): + # Check for the existence of libaio by using distutils + # to compile and link a test program that calls io_submit, + # which is a function provided by libaio that is used in the async_io op. + # If needed, one can define -I and -L entries in CFLAGS and LDFLAGS + # respectively to specify the directories for libaio.h and libaio.so. + aio_compatible = self.has_function('io_submit', ('aio', )) + if verbose and not aio_compatible: + self.warning(f"{self.NAME} requires the dev libaio .so object and headers but these were not found.") + + # Check for the libaio package via known package managers + # to print suggestions on which package to install. + self.check_for_libaio_pkg() + + self.warning( + "If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found." 
+ ) + return super().is_compatible(verbose) and aio_compatible diff --git a/tests/unit/ops/aio/test_aio.py b/tests/unit/ops/aio/test_aio.py index e6927efc38248..a074cfca317fe 100644 --- a/tests/unit/ops/aio/test_aio.py +++ b/tests/unit/ops/aio/test_aio.py @@ -23,12 +23,10 @@ pytest.skip('Skip tests since async-io is not compatible', allow_module_level=True) -def _skip_for_invalid_environment(use_cuda_device=True, use_cuda_pinned_tensor=True): - if not get_accelerator().is_available(): - if use_cuda_device: - pytest.skip("GPU tensors only supported in CUDA environments.") +def _skip_for_invalid_environment(use_cuda_pinned_tensor=True): + if get_accelerator().device_name() != 'cuda': if use_cuda_pinned_tensor: - pytest.skip("CUDA-pinned tensors only supported in CUDA environments.") + pytest.skip("torch.pin_memory is only supported in CUDA environments.") def _get_local_rank(): @@ -52,13 +50,13 @@ def _get_test_write_file(tmpdir, index): return os.path.join(tmpdir, f'_aio_write_random_{file_suffix}.pt') -def _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer, index=0): +def _get_test_write_file_and_unpinned_tensor(tmpdir, ref_buffer, index=0): test_file = _get_test_write_file(tmpdir, index) test_buffer = get_accelerator().ByteTensor(list(ref_buffer)) return test_file, test_buffer -def _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, aio_handle=None, index=0): +def _get_test_write_file_and_pinned_tensor(tmpdir, ref_buffer, aio_handle=None, index=0): test_file = _get_test_write_file(tmpdir, index) if aio_handle is None: test_buffer = get_accelerator().pin_memory(torch.ByteTensor(list(ref_buffer))) @@ -73,12 +71,12 @@ def _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, aio_handle=None, ind def _validate_handle_state(handle, single_submit, overlap_events): assert handle.get_single_submit() == single_submit assert handle.get_overlap_events() == overlap_events - assert handle.get_thread_count() == IO_PARALLEL + assert handle.get_intra_op_parallelism() == IO_PARALLEL assert handle.get_block_size() == BLOCK_SIZE assert handle.get_queue_depth() == QUEUE_DEPTH -@pytest.mark.parametrize("use_cuda_pinned_tensor", [True]) # TODO: aio_handle pinned tensor API is broken +@pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False]) @pytest.mark.parametrize("single_submit", [True, False]) @pytest.mark.parametrize("overlap_events", [True, False]) class TestRead(DistributedTest): @@ -89,12 +87,15 @@ class TestRead(DistributedTest): init_distributed = False set_dist_env = False - def test_parallel_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events): - _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + @pytest.mark.parametrize("use_unpinned_tensor", [True, False]) + def test_parallel_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_unpinned_tensor): + _skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) - if use_cuda_pinned_tensor: + if use_unpinned_tensor: + aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) + elif use_cuda_pinned_tensor: aio_buffer = get_accelerator().pin_memory(torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu')) else: aio_buffer = h.new_cpu_locked_tensor(IO_SIZE, torch.empty(0, dtype=torch.uint8)) @@ -112,14 +113,14 @@ def test_parallel_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, 
over if not use_cuda_pinned_tensor: h.free_cpu_locked_tensor(aio_buffer) - @pytest.mark.parametrize("cuda_device", [True, False]) - def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + @pytest.mark.parametrize("use_unpinned_tensor", [True, False]) + def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_unpinned_tensor): + _skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) use_cpu_locked_tensor = False h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) - if cuda_device: + if use_unpinned_tensor: aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) elif use_cuda_pinned_tensor: aio_buffer = get_accelerator().pin_memory(torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu')) @@ -144,7 +145,7 @@ def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap h.free_cpu_locked_tensor(aio_buffer) -@pytest.mark.parametrize("use_cuda_pinned_tensor", [True]) # TODO: aio_handle pinned tensor API is broken +@pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False]) @pytest.mark.parametrize("single_submit", [True, False]) @pytest.mark.parametrize("overlap_events", [True, False]) class TestWrite(DistributedTest): @@ -155,16 +156,19 @@ class TestWrite(DistributedTest): init_distributed = False set_dist_env = False - def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events): - _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + @pytest.mark.parametrize("use_unpinned_tensor", [True, False]) + def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_unpinned_tensor): + _skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_file, ref_buffer = _do_ref_write(tmpdir) h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) + if use_unpinned_tensor: + aio_file, aio_buffer = _get_test_write_file_and_unpinned_tensor(tmpdir, ref_buffer) if use_cuda_pinned_tensor: - aio_file, aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer) + aio_file, aio_buffer = _get_test_write_file_and_pinned_tensor(tmpdir, ref_buffer) else: - aio_file, aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, h) + aio_file, aio_buffer = _get_test_write_file_and_pinned_tensor(tmpdir, ref_buffer, h) _validate_handle_state(h, single_submit, overlap_events) @@ -179,20 +183,20 @@ def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, ove filecmp.clear_cache() assert filecmp.cmp(ref_file, aio_file, shallow=False) - @pytest.mark.parametrize("cuda_device", [True, False]) - def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + @pytest.mark.parametrize("use_unpinned_tensor", [True, False]) + def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_unpinned_tensor): + _skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_file, ref_buffer = _do_ref_write(tmpdir) h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, 
overlap_events, IO_PARALLEL) use_cpu_locked_tensor = False - if cuda_device: - aio_file, aio_buffer = _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer) + if use_unpinned_tensor: + aio_file, aio_buffer = _get_test_write_file_and_unpinned_tensor(tmpdir, ref_buffer) elif use_cuda_pinned_tensor: - aio_file, aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer) + aio_file, aio_buffer = _get_test_write_file_and_pinned_tensor(tmpdir, ref_buffer) else: - aio_file, aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, h) + aio_file, aio_buffer = _get_test_write_file_and_pinned_tensor(tmpdir, ref_buffer, h) use_cpu_locked_tensor = True _validate_handle_state(h, single_submit, overlap_events) @@ -213,8 +217,8 @@ def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overla @pytest.mark.sequential -@pytest.mark.parametrize("use_cuda_pinned_tensor", [True]) # TODO: aio_handle pinned tensor API is broken -@pytest.mark.parametrize("cuda_device", [True, False]) +@pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False]) +@pytest.mark.parametrize("use_unpinned_tensor", [True, False]) class TestAsyncQueue(DistributedTest): world_size = 1 requires_cuda_env = False @@ -223,8 +227,8 @@ class TestAsyncQueue(DistributedTest): set_dist_env = False @pytest.mark.parametrize("async_queue", [2, 3]) - def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, use_unpinned_tensor): + _skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_files = [] for i in range(async_queue): @@ -236,7 +240,7 @@ def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device): h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) use_cpu_locked_tensor = False - if cuda_device: + if use_unpinned_tensor: aio_buffers = [ torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) for _ in range(async_queue) @@ -270,8 +274,8 @@ def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device): h.free_cpu_locked_tensor(t) @pytest.mark.parametrize("async_queue", [2, 3]) - def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, use_unpinned_tensor): + _skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_files = [] ref_buffers = [] @@ -287,16 +291,16 @@ def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device): aio_files = [] aio_buffers = [] for i in range(async_queue): - if cuda_device: - f, buf = _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffers[i], i) + if use_unpinned_tensor: + f, buf = _get_test_write_file_and_unpinned_tensor(tmpdir, ref_buffers[i], i) elif use_cuda_pinned_tensor: - f, buf = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffers[i], None, i) + f, buf = _get_test_write_file_and_pinned_tensor(tmpdir, ref_buffers[i], None, i) else: - f, buf = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffers[i], h, i) + f, buf = _get_test_write_file_and_pinned_tensor(tmpdir, ref_buffers[i], h, i) aio_files.append(f) aio_buffers.append(buf) - use_cpu_locked_tensor = not (cuda_device or 
use_cuda_pinned_tensor) + use_cpu_locked_tensor = not (use_unpinned_tensor or use_cuda_pinned_tensor) _validate_handle_state(h, single_submit, overlap_events) diff --git a/tests/unit/ops/aio/test_gds.py b/tests/unit/ops/aio/test_gds.py index 53655994b5601..e94d42cd22af8 100644 --- a/tests/unit/ops/aio/test_gds.py +++ b/tests/unit/ops/aio/test_gds.py @@ -54,7 +54,7 @@ def _get_test_write_file_and_device_buffer(tmpdir, ref_buffer, gds_handle, index def _validate_handle_state(handle, single_submit, overlap_events): assert handle.get_single_submit() == single_submit assert handle.get_overlap_events() == overlap_events - assert handle.get_thread_count() == IO_PARALLEL + assert handle.get_intra_op_parallelism() == IO_PARALLEL assert handle.get_block_size() == BLOCK_SIZE assert handle.get_queue_depth() == QUEUE_DEPTH diff --git a/tests/unit/runtime/zero/test_nvme_checkpointing.py b/tests/unit/runtime/zero/test_nvme_checkpointing.py index 75cba2e789c16..850c8eb3e3496 100644 --- a/tests/unit/runtime/zero/test_nvme_checkpointing.py +++ b/tests/unit/runtime/zero/test_nvme_checkpointing.py @@ -15,6 +15,7 @@ from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum from deepspeed.runtime.zero.partition_parameters import Init from deepspeed.ops.aio import AsyncIOBuilder +from deepspeed.accelerator import get_accelerator class TestNVMeCheckpointing(DistributedTest): @@ -29,6 +30,9 @@ def test_nvme_checkpointing(self, tmpdir, param_offload_device, optim_offload_de first_stage_steps, second_stage_steps = 2, 2 + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") + if not deepspeed.ops.__compatible_ops__[AsyncIOBuilder.NAME]: pytest.skip('Skip tests since async-io is not compatible') From d7ca3d83732854eea41e6c83e603639699faf1d4 Mon Sep 17 00:00:00 2001 From: Nir Sonnenschein Date: Thu, 10 Oct 2024 03:47:44 +0300 Subject: [PATCH 29/43] reduce setting global variables to reduce torch compile graph breaks (#6541) setting global variables during training will create a graph breaks when using torch.compile (reading global variables doesn't). this commit attempts to reduce the setting of global variables in the checkpointing flows. there are 2 main uses setting global variables: 1. Share data between functions 2. Establish that this is the first call to the code For most of the cases the data in the global variables is data that can be computed on demand or set once in an initial state in a configure function. For "check that this is the first run" use case the code was moved to the configure function. --------- Co-authored-by: Olatunji Ruwase Co-authored-by: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- .../activation_checkpointing/checkpointing.py | 100 +++++++----------- 1 file changed, 37 insertions(+), 63 deletions(-) diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py index f955cf5ebcad6..d65126999199c 100644 --- a/deepspeed/runtime/activation_checkpointing/checkpointing.py +++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py @@ -36,8 +36,10 @@ # MP parameters mpu = None -mp_rank = None -mp_size = None + +#set default values +mp_rank = 0 +mp_size = 1 mp_group = None # Model Parameters @@ -61,8 +63,6 @@ # Default name for the model parallel rng tracker. 
_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng' -transport_stream = None -cuda_device = None def detach_variable(inputs, device=None): @@ -518,35 +518,10 @@ def save_args_for_backward(*all_args): global mp_rank, mp_size, mp_group global contiguous_data_buffers, contiguous_size_buffers global data_offsets, size_offsets - if mp_rank is None: - if mpu is not None: - if hasattr(mpu, 'get_tensor_model_parallel_rank'): - mp_rank = mpu.get_tensor_model_parallel_rank() - mp_size = mpu.get_tensor_model_parallel_world_size() - mp_group = mpu.get_tensor_model_parallel_group() - else: - mp_rank = mpu.get_model_parallel_rank() - mp_size = mpu.get_model_parallel_world_size() - mp_group = mpu.get_model_parallel_group() - else: - mp_rank = 0 - mp_size = 1 - mp_group = None - - global cuda_device, transport_stream, PARTITION_ACTIVATIONS, buffer_0, buffer_1, buffer_0_offset, buffer_1_offset - - if cuda_device is None: - see_memory_usage("First Forward Beginning", force=False) - if dist.get_rank() == 0: - logger.info(f"Activation Checkpointing Information") - logger.info(f"----Partition Activations {PARTITION_ACTIVATIONS}, CPU CHECKPOINTING {CPU_CHECKPOINT}") - logger.info( - f"----contiguous Memory Checkpointing {CONTIGUOUS_CHECKPOINTING} with {num_layers} total layers") - logger.info(f"----Synchronization {SYNCHRONIZE}") - logger.info(f"----Profiling time in checkpointing {PROFILE_TIME}") + global PARTITION_ACTIVATIONS, buffer_0, buffer_1, buffer_0_offset, buffer_1_offset - cuda_device = get_accelerator().current_device_name() - transport_stream = get_accelerator().Stream(device=cuda_device) + cuda_device = get_accelerator().current_device_name() + transport_stream = get_accelerator().Stream(device=cuda_device) if PARTITION_ACTIVATIONS: inputs = partition_activations(args, CPU_CHECKPOINT, CONTIGUOUS_CHECKPOINTING) @@ -631,8 +606,9 @@ def backward(ctx, *grads): raise RuntimeError("Checkpointing is not compatible with .grad(), " "please use .backward() if possible") - global cuda_device, transport_stream, PARTITION_ACTIVATIONS - + global PARTITION_ACTIVATIONS + cuda_device = get_accelerator().current_device_name() + transport_stream = get_accelerator().Stream(device=cuda_device) # Rebuild deepspeed_saved_tensors for t in ctx.deepspeed_saved_tensors: if t is not None and hasattr(t, 'saved_data') and t.saved_data is not None: @@ -764,35 +740,10 @@ def save_args_for_backward(*all_args): global mp_rank, mp_size, mp_group global contiguous_data_buffers, contiguous_size_buffers global data_offsets, size_offsets - if mp_rank is None: - if mpu is not None: - if hasattr(mpu, 'get_tensor_model_parallel_rank'): - mp_rank = mpu.get_tensor_model_parallel_rank() - mp_size = mpu.get_tensor_model_parallel_world_size() - mp_group = mpu.get_tensor_model_parallel_group() - else: - mp_rank = mpu.get_model_parallel_rank() - mp_size = mpu.get_model_parallel_world_size() - mp_group = mpu.get_model_parallel_group() - else: - mp_rank = 0 - mp_size = 1 - mp_group = None - - global cuda_device, transport_stream, PARTITION_ACTIVATIONS, buffer_0, buffer_1, buffer_0_offset, buffer_1_offset - - if cuda_device is None: - see_memory_usage("First Forward Beginning", force=False) - if dist.get_rank() == 0: - logger.info(f"Activation Checkpointing Information") - logger.info(f"----Partition Activations {PARTITION_ACTIVATIONS}, CPU CHECKPOINTING {CPU_CHECKPOINT}") - logger.info( - f"----contiguous Memory Checkpointing {CONTIGUOUS_CHECKPOINTING} with {num_layers} total layers") - logger.info(f"----Synchronization {SYNCHRONIZE}") - 
logger.info(f"----Profiling time in checkpointing {PROFILE_TIME}") + global PARTITION_ACTIVATIONS, buffer_0, buffer_1, buffer_0_offset, buffer_1_offset - cuda_device = get_accelerator().current_device_name() - transport_stream = get_accelerator().Stream(device=cuda_device) + cuda_device = get_accelerator().current_device_name() + transport_stream = get_accelerator().Stream(device=cuda_device) if PARTITION_ACTIVATIONS: inputs = partition_activations(args, CPU_CHECKPOINT, CONTIGUOUS_CHECKPOINTING) @@ -899,7 +850,9 @@ def replay_unpack(none_value): raise RuntimeError("Checkpointing is not compatible with .grad(), " "please use .backward() if possible") - global cuda_device, transport_stream, PARTITION_ACTIVATIONS + global PARTITION_ACTIVATIONS + cuda_device = get_accelerator().current_device_name() + transport_stream = get_accelerator().Stream(device=cuda_device) # gather inputs which is partitioned or checkpointed before first forward if PARTITION_ACTIVATIONS: @@ -1152,6 +1105,27 @@ def configure( if CONTIGUOUS_CHECKPOINTING: assert num_layers is not None, "Must specify the number of layers with contiguous memory checkpointing" + global mp_rank, mp_size, mp_group + + if mpu is not None: + if hasattr(mpu, 'get_tensor_model_parallel_rank'): + mp_rank = mpu.get_tensor_model_parallel_rank() + mp_size = mpu.get_tensor_model_parallel_world_size() + mp_group = mpu.get_tensor_model_parallel_group() + else: + mp_rank = mpu.get_model_parallel_rank() + mp_size = mpu.get_model_parallel_world_size() + mp_group = mpu.get_model_parallel_group() + + #print configuration only once + see_memory_usage("After configuration", force=False) + if dist.get_rank() == 0: + logger.info(f"Activation Checkpointing Information") + logger.info(f"----Partition Activations {PARTITION_ACTIVATIONS}, CPU CHECKPOINTING {CPU_CHECKPOINT}") + logger.info(f"----contiguous Memory Checkpointing {CONTIGUOUS_CHECKPOINTING} with {num_layers} total layers") + logger.info(f"----Synchronization {SYNCHRONIZE}") + logger.info(f"----Profiling time in checkpointing {PROFILE_TIME}") + def is_configured(): """True if deepspeed activation checkpointing has been configured From adec99121b411709e1b185a486d18aa846c82c64 Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Date: Wed, 9 Oct 2024 19:59:26 -0700 Subject: [PATCH 30/43] Add API to get devices of offload states (#6586) This PR adds an API `deepspeed.runtime.zero.offload_states get_state_devices`, which gets devices of offload states as suggested in this [comment](https://github.com/microsoft/DeepSpeed/pull/6011#issuecomment-2358068777). We could lift this up to `deepspeed.utils` but would need to resolve a circular import: User code -> `deepspeed.utils` -> `deepspeed.utils.offload_states` -> `deepspeed.runtime.zero` -> `deepspeed.runtime.zero.partition_parameters` -> `deepspeed.utils` This will require a significant refactoring as long as we have `OffloadStateTypeEnum` in `deepspeed.runtime.zero`. 
--------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Olatunji Ruwase --- deepspeed/runtime/utils.py | 10 +-- deepspeed/runtime/zero/offload_states.py | 74 +++++++++++++++++++ deepspeed/runtime/zero/stage3.py | 3 +- docs/code-docs/source/zero3.rst | 16 ++++ .../unit/runtime/zero/test_offload_states.py | 23 +++--- 5 files changed, 110 insertions(+), 16 deletions(-) create mode 100644 deepspeed/runtime/zero/offload_states.py diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index adcadd349803b..b9617d3e632fd 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -9,28 +9,28 @@ """ from collections.abc import Iterable -from deepspeed.moe.utils import is_moe_param import os import psutil import gc from math import sqrt +from numpy import prod + import torch -from deepspeed import comm as dist +from torch.nn import functional as F try: from torch._six import inf except ModuleNotFoundError: from torch import inf +from deepspeed import comm as dist +from deepspeed.moe.utils import is_moe_param from deepspeed.utils import groups, logger from deepspeed.utils.bwc import (bwc_tensor_model_parallel_rank, bwc_pipeline_parallel_world_size, bwc_pipeline_parallel_group) from deepspeed.runtime.constants import PIPE_REPLICATED -from numpy import prod from deepspeed.accelerator import get_accelerator - from deepspeed.module_inject.policy import transpose -from torch.nn import functional as F torch_memory_reserved = get_accelerator().memory_reserved torch_max_memory_reserved = get_accelerator().max_memory_reserved diff --git a/deepspeed/runtime/zero/offload_states.py b/deepspeed/runtime/zero/offload_states.py new file mode 100644 index 0000000000000..f521a11a7aa45 --- /dev/null +++ b/deepspeed/runtime/zero/offload_states.py @@ -0,0 +1,74 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Set +import torch + +from deepspeed.accelerator import get_accelerator +from deepspeed.runtime.zero.offload_config import OffloadStateTypeEnum + +from deepspeed.utils.tensor_fragment import safe_get_local_fp32_param, safe_get_local_optimizer_state + + +def _make_offload_state_key(key): + return f"{key}_offload_buffer" + + +def offload_adam_states(optimizer, device, pin_memory: bool = False, non_blocking: bool = False): + """Move optimizer states to device. Note that this assumes the state structure of DeepSpeed Adam.""" + + def move_key(state, key): + offload_buf_key = _make_offload_state_key(key) + if offload_buf_key not in state: + state[offload_buf_key] = torch.empty_like(state[key], device=device) + if pin_memory: + state[offload_buf_key] = get_accelerator().pin_memory(state[offload_buf_key]) + state[offload_buf_key].copy_(state[key], non_blocking=non_blocking) + state[key].data = state[offload_buf_key] + + for _, state in optimizer.state.items(): + if "exp_avg" in state: + move_key(state, "exp_avg") + if "exp_avg_sq" in state: + move_key(state, "exp_avg_sq") + + +def reload_adam_states(optimizer, device, non_blocking: bool = False): + """Move optimizer states to device. 
Note that this assumes the state structure of DeepSpeed Adam.""" + + def move_back_key(state, key): + state[key].data = state[_make_offload_state_key(key)].to(device, non_blocking=non_blocking) + + for _, state in optimizer.state.items(): + if "exp_avg" in state: + move_back_key(state, "exp_avg") + if "exp_avg_sq" in state: + move_back_key(state, "exp_avg_sq") + + +def get_state_devices(model, state: OffloadStateTypeEnum) -> Set[torch.device]: + """Retrieve the devices of the specified state of the model. + + Args: + model (DeepSpeedEngine): The model whose device allocations are to be checked. + state (OffloadStateTypeEnum): The specific state for which the devices should be retrieved. + + Returns: + Set[torch.device]: A set of devices of the specified state. + + """ + if state == OffloadStateTypeEnum.hp_params: + return set(safe_get_local_fp32_param(p).device for p in model.parameters()) + elif state == OffloadStateTypeEnum.lp_params: + return set(p.ds_tensor.device for p in model.parameters()) + elif state == OffloadStateTypeEnum.lp_grads: + return {model.optimizer.grad_partitions_flat_buffer.device} + elif state == OffloadStateTypeEnum.optim_states: + return set(safe_get_local_optimizer_state(p, "exp_avg").device for p in model.parameters()) | \ + set(safe_get_local_optimizer_state(p, "exp_avg_sq").device for p in model.parameters()) + elif state == OffloadStateTypeEnum.contiguous_grad_buffer: + if model.optimizer._DeepSpeedZeroOptimizer_Stage3__ipg_bucket_flat_buffer == None: + return {} + return {model.optimizer._DeepSpeedZeroOptimizer_Stage3__ipg_bucket_flat_buffer.device} diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index fb75d2bcebd58..6895916783f15 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -18,12 +18,13 @@ from deepspeed.utils import logger from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced, all_to_all_quant_reduce -from deepspeed.runtime.utils import inf, is_model_parallel_parameter, get_only_unique_item, offload_adam_states, reload_adam_states +from deepspeed.runtime.utils import inf, is_model_parallel_parameter, get_only_unique_item from deepspeed.runtime.zero.partition_parameters import * from deepspeed.runtime.zero.config import ZeroStageEnum from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum, OffloadStateTypeEnum from deepspeed.runtime.zero.parameter_offload import DeepSpeedZeRoOffload from deepspeed.runtime.zero.utils import apply_to_tensors_only, get_mapping_to_flat_buffer +from deepspeed.runtime.zero.offload_states import offload_adam_states, reload_adam_states from deepspeed.ops.adam import DeepSpeedCPUAdam from deepspeed.runtime.swap_tensor.partitioned_param_swapper import PartitionedParamStatus from deepspeed.runtime.swap_tensor.optimizer_utils import OptimizerSwapper diff --git a/docs/code-docs/source/zero3.rst b/docs/code-docs/source/zero3.rst index f0974c08c9f3b..ae7cedd1a8b39 100644 --- a/docs/code-docs/source/zero3.rst +++ b/docs/code-docs/source/zero3.rst @@ -509,3 +509,19 @@ Below is an example code snippet demonstrating how to offload FP32 parameters an ... # Load states back to device memory ds_engine.reload_states() + +``deepspeed.runtime.zero.offload_states.get_state_devices`` returns devices of the specified state. + +.. 
code-block:: python + + def get_state_devices(model, state: OffloadStateTypeEnum) -> Set[torch.device]: + """Retrieve the devices of the specified state of the model. + + Args: + model (DeepSpeedEngine): The model whose device allocations are to be checked. + state (OffloadStateTypeEnum): The specific state for which the devices should be retrieved. + + Returns: + Set[torch.device]: A set of devices of the specified state. + + """ diff --git a/tests/unit/runtime/zero/test_offload_states.py b/tests/unit/runtime/zero/test_offload_states.py index cc60908d3c337..9105a54661fac 100644 --- a/tests/unit/runtime/zero/test_offload_states.py +++ b/tests/unit/runtime/zero/test_offload_states.py @@ -15,19 +15,22 @@ import deepspeed from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum, OffloadStateTypeEnum from deepspeed.utils import safe_get_local_fp32_param, safe_get_local_optimizer_state +from deepspeed.runtime.zero.offload_states import get_state_devices def validate_device(model, device: torch.device, include) -> None: - # Make sure the model parameters are offloaded - if include is None or OffloadStateTypeEnum.hp_params in include: - assert all(safe_get_local_fp32_param(p).device == device for p in model.parameters()) - if include is None or OffloadStateTypeEnum.lp_params in include: - assert all(p.ds_tensor.device == device for p in model.parameters()) - if include is None or OffloadStateTypeEnum.lp_grads in include: - assert model.optimizer.grad_partitions_flat_buffer.device == device - if include is None or OffloadStateTypeEnum.optim_states in include: - assert all(safe_get_local_optimizer_state(p, "exp_avg").device == device for p in model.parameters()) - assert all(safe_get_local_optimizer_state(p, "exp_avg_sq").device == device for p in model.parameters()) + + def compare_device(state) -> bool: + devices = get_state_devices(model, state) + return len(devices) == 1 and device in devices + + for state in OffloadStateTypeEnum: + if include is None or state in include: + if state == OffloadStateTypeEnum.contiguous_grad_buffer and device == torch.device("cpu"): + assert len(get_state_devices(model, + state)) == 0, f"State {state} must be removed after offload_states()" + else: + assert compare_device(state), f"State {state} is not on device {device}" def run_model(model, config_dict, hidden_dim, dtype, include, pin_memory, non_blocking): From 5c4b97f1092b798508dab4321b2ac79a9f554e72 Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka Date: Fri, 11 Oct 2024 19:41:10 +0000 Subject: [PATCH 31/43] apply fp16 autocast only to floating point values --- deepspeed/runtime/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index ba0e0f8756253..05bb23e8ddd94 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -1928,7 +1928,7 @@ def _cast_inputs_half(self, inputs): for k, v in inputs.items(): new_inputs[k] = self._cast_inputs_half(v) return new_inputs - elif hasattr(inputs, 'half'): + elif hasattr(inputs, 'half') and inputs.is_floating_point(): return inputs.half() else: return inputs From 7a5bc4fdf90d3a1cd711973ed9d0113b582f143e Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Date: Mon, 14 Oct 2024 09:08:44 -0700 Subject: [PATCH 32/43] Ignore reuse_dist_env (#6623) Tests with `reuse_dist_env = True` often causes memory leaks. This PR ignores `reuse_dist_env` and forcibly sets it to `False`. 
This change might slow down the tests, but I think it is better to manually restart runners and relaunch tests. Memory usages (See #6578): - `reuse_dist_env == True`: https://github.com/microsoft/DeepSpeed/actions/runs/11302940871/job/31439471512 - `reuse_dist_env == False`: https://github.com/microsoft/DeepSpeed/actions/runs/11303250613/job/31440137894 --- tests/unit/common.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/unit/common.py b/tests/unit/common.py index c9eb7ffaa5f46..69ba4c2708ace 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -25,6 +25,8 @@ # Worker timeout for tests that hang DEEPSPEED_TEST_TIMEOUT = int(os.environ.get('DS_UNITTEST_TIMEOUT', '600')) +warn_reuse_dist_env = False + def is_rocm_pytorch(): return hasattr(torch.version, 'hip') and torch.version.hip is not None @@ -179,6 +181,13 @@ def _launch_daemonic_procs(self, num_procs): print("Ignoring reuse_dist_env for hpu") self.reuse_dist_env = False + global warn_reuse_dist_env + if self.reuse_dist_env and not warn_reuse_dist_env: + # Currently we see memory leak for tests that reuse distributed environment + print("Ignoring reuse_dist_env and forcibly setting it to False") + warn_reuse_dist_env = True + self.reuse_dist_env = False + if self.reuse_dist_env: if num_procs not in self._pool_cache: self._pool_cache[num_procs] = mp.Pool(processes=num_procs) From cf41e8c4e8b7b9ad43f31c8e5b255455278ef15b Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Tue, 15 Oct 2024 01:31:34 +0800 Subject: [PATCH 33/43] [compile] Show breakdown of graph break (#6601) This PR extends https://github.com/microsoft/DeepSpeed/pull/6570 by showing a breakdown of graph breaks. So we can see how graph breaks are distributed among different reasons. An example of graph break output can be seen from the following workflow run https://github.com/microsoft/DeepSpeed/actions/runs/11199157962 --- .github/workflows/xpu-compile.yml | 10 ++++- tests/torch_compile/ds_config_z2.json | 40 +++++++++++++++++++ .../{ds_config.json => ds_config_z3.json} | 0 tests/torch_compile/test_compile.py | 33 ++++++++------- 4 files changed, 64 insertions(+), 19 deletions(-) create mode 100644 tests/torch_compile/ds_config_z2.json rename tests/torch_compile/{ds_config.json => ds_config_z3.json} (100%) diff --git a/.github/workflows/xpu-compile.yml b/.github/workflows/xpu-compile.yml index c2392091012f3..e095e089fc30f 100644 --- a/.github/workflows/xpu-compile.yml +++ b/.github/workflows/xpu-compile.yml @@ -51,9 +51,15 @@ jobs: - name: Compile Status shell: bash run: | + echo "# torch.compile graph breaks" >> $GITHUB_STEP_SUMMARY export FI_HMEM=system ulimit -n 1048575 cd tests/torch_compile export ZE_AFFINITY_MASK=0,1 - deepspeed test_compile.py --deepspeed_config ds_config.json 2>&1 | tee log.txt - cat log.txt | grep "'graph_breaks'" | sed 's/,/ /g' | awk '{print $2}' >> $GITHUB_STEP_SUMMARY + echo "## ZeRO stage 3" >> $GITHUB_STEP_SUMMARY + deepspeed test_compile.py --deepspeed_config ds_config_z3.json 2>&1 | tee log_z3.txt + # for each line start with 'dynamo_output', extract the second field and following fields and append to GITHUB_STEP_SUMMARY using awk + cat log_z3.txt | awk '/^dynamo_output/ {$1=""; print $0}' >> $GITHUB_STEP_SUMMARY + echo "## ZeRO stage 2" >> $GITHUB_STEP_SUMMARY + deepspeed test_compile.py --deepspeed_config ds_config_z2.json 2>&1 | tee log_z2.txt + cat log_z2.txt | awk '/^dynamo_output/ {$1=""; print $0}' >> $GITHUB_STEP_SUMMARY diff --git a/tests/torch_compile/ds_config_z2.json 
b/tests/torch_compile/ds_config_z2.json new file mode 100644 index 0000000000000..30e1237c558c9 --- /dev/null +++ b/tests/torch_compile/ds_config_z2.json @@ -0,0 +1,40 @@ +{ + "train_batch_size": 8, + "steps_per_print": 2000, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [ + 0.8, + 0.999 + ], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 0.001, + "warmup_num_steps": 1000 + } + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "bf16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 15 + }, + "wall_clock_breakdown": false, + "zero_optimization": { + "stage": 2, + "overlap_comm": false, + "contiguous_gradients": false + } +} diff --git a/tests/torch_compile/ds_config.json b/tests/torch_compile/ds_config_z3.json similarity index 100% rename from tests/torch_compile/ds_config.json rename to tests/torch_compile/ds_config_z3.json diff --git a/tests/torch_compile/test_compile.py b/tests/torch_compile/test_compile.py index 529ca56ae0a8e..adbf6eaa947aa 100644 --- a/tests/torch_compile/test_compile.py +++ b/tests/torch_compile/test_compile.py @@ -14,22 +14,9 @@ torch._dynamo.config.cache_size_limit = 100 -import collections - def get_dynamo_stats(): - # TODO: consider deepcopy'ing the entire counters struct and - # adding a helper to do subtraction on it - return collections.Counter({ - "calls_captured": torch._dynamo.utils.counters["stats"]["calls_captured"], - "unique_graphs": torch._dynamo.utils.counters["stats"]["unique_graphs"], - "graph_breaks": sum(torch._dynamo.utils.counters["graph_break"].values()), - # NB: The plus removes zero counts - "unique_graph_breaks": len(+torch._dynamo.utils.counters["graph_break"]), - "autograd_captures": torch._dynamo.utils.counters["compiled_autograd"]["captures"], - "autograd_compiles": torch._dynamo.utils.counters["compiled_autograd"]["compiles"], - "cudagraph_skips": torch._dynamo.utils.counters["inductor"]["cudagraph_skips"], - }) + return torch._dynamo.utils.counters["graph_break"] class RandomDataset(Dataset): @@ -70,7 +57,7 @@ def forward(self, data, residual): parser.add_argument('--local_rank', type=int, default=-1, help='local rank passed from distributed launcher') parser.add_argument('--deepspeed_config', type=str, - default='ds_config.json', + default='ds_config_z3.json', help='path to DeepSpeed configuration file') cmd_args = parser.parse_args() @@ -82,6 +69,11 @@ def forward(self, data, residual): start_stats = get_dynamo_stats() +if comm.get_rank() == 0: + #print(dynamo_stats['graph_breaks']) + for item in start_stats.items(): + print(item) + for step, batch in enumerate(rand_loader): if step % 10 == 0 and comm.get_rank() == 0: print(f'step={step}') @@ -93,7 +85,14 @@ def forward(self, data, residual): model_engine.step() dynamo_stats = get_dynamo_stats() -dynamo_stats.subtract(start_stats) if comm.get_rank() == 0: - print(dynamo_stats) + # print break down of graph break stats with markdown, print in table format, start with reason, then count + # print a tag 'dynamo_output' before each line to allow post processing + print("dynamo_output | Reason | Count |") + print("dynamo_output | ------ | ----- |") + for item in dynamo_stats.items(): + # replace '|' in item[0] with a literal '|' to avoid mess with table format + item = (item[0].replace('|', r'\|'), item[1]) + print(f"dynamo_output | {item[0]} | {item[1]} |") + 
print(f"dynamo_output | Total | {sum(dynamo_stats.values())} |") From 65ab64481f47f92fd427bd98b30e4faf604e5c9f Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Mon, 14 Oct 2024 13:35:41 -0400 Subject: [PATCH 34/43] Add API for updating ZeRO gradients (#6590) --- deepspeed/runtime/zero/stage3.py | 50 ++++++- deepspeed/utils/__init__.py | 6 +- deepspeed/utils/mixed_precision_linkage.py | 3 +- deepspeed/utils/tensor_fragment.py | 127 +++++++++++------- docs/code-docs/source/zero3.rst | 44 +++++- .../runtime/zero/test_zero_tensor_fragment.py | 49 +++++-- 6 files changed, 200 insertions(+), 79 deletions(-) diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 6895916783f15..e2c273fd913fb 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -2299,6 +2299,24 @@ def get_fp32_grad_for_param(self, param) -> Tensor: return self._fp32_state_allgather(param, fp32_grad) + def set_fp32_grad_for_param(self, value, param): + if not param.requires_grad: + return + + if not get_accelerator().resolves_data_dependency(): + self.reduce_and_partition_stream.synchronize() + + if self.offload_optimizer: + group_idx, dest_offset, num_elements = self.grad_position[self.get_param_id(param)] + fp32_grad = self.fp32_partitioned_groups_flat[group_idx].grad.narrow(0, dest_offset, num_elements) + else: + fp32_grad = self.__param_id_to_grad_partition[param.ds_id] + + my_rank = dist.get_rank(group=self.dp_process_group) + value_partition = value.flatten().narrow(0, fp32_grad.numel() * my_rank, fp32_grad.numel()) + + fp32_grad.data.copy_(value_partition.data) + def _get_fp32_opt_state_partition(self, param, optim_state_key=None): if not get_accelerator().resolves_data_dependency(): self.reduce_and_partition_stream.synchronize() @@ -2347,12 +2365,6 @@ def set_full_hp_param(self, value, param, optim_state_key=None): ### Local API START ### - def get_local_fp32_param(self, param, optim_state_key=None) -> Tensor: - if not param.requires_grad: - return None - fp32_opt_state, group_idx = self._get_fp32_opt_state_partition(param, optim_state_key) - return fp32_opt_state - def get_local_fp32_grad_for_param(self, param) -> Tensor: if not param.requires_grad: return None @@ -2367,6 +2379,30 @@ def get_local_fp32_grad_for_param(self, param) -> Tensor: fp32_grad = self.__param_id_to_grad_partition[param.ds_id].float() return fp32_grad + def set_local_grad_for_param(self, value, param): + if not param.requires_grad: + return + + assert value.numel() == param.ds_tensor.numel( + ), f" Number of elements do not match: {value.numel()} != {param.ds_tensor.ds_numel}" + + if not get_accelerator().resolves_data_dependency(): + self.reduce_and_partition_stream.synchronize() + + if self.offload_optimizer: + group_idx, dest_offset, num_elements = self.grad_position[self.get_param_id(param)] + fp32_grad = self.fp32_partitioned_groups_flat[group_idx].grad.narrow(0, dest_offset, num_elements) + else: + fp32_grad = self.__param_id_to_grad_partition[param.ds_id] + + fp32_grad.data.copy_(value.flatten().data) + + def get_local_fp32_param(self, param, optim_state_key=None) -> Tensor: + if not param.requires_grad: + return None + fp32_opt_state, group_idx = self._get_fp32_opt_state_partition(param, optim_state_key) + return fp32_opt_state + def set_local_hp_param(self, value, param, optim_state_key=None): if not param.requires_grad: return @@ -2381,7 +2417,7 @@ def set_local_hp_param(self, value, param, optim_state_key=None): if self._swappable_optimizer_subgroup(group_idx): 
self._optimizer_states_and_gradient_swap_out(group_idx) - logger.info(f"[set_local_hp_param][update the params' value successfully]") + # logger.info(f"[set_local_hp_param][update the params' value successfully]") ### Local API END ### diff --git a/deepspeed/utils/__init__.py b/deepspeed/utils/__init__.py index 75fb6aa9d30ad..c6a202d485eba 100644 --- a/deepspeed/utils/__init__.py +++ b/deepspeed/utils/__init__.py @@ -12,10 +12,10 @@ # TODO: Move tensor fragment and mixed precision to zero utils from .tensor_fragment import tensor_fragment, get_full_hp_param, get_hp_fragment_mapping, fragment_address, get_full_hp_grad, map_to_flat_opt_states from .tensor_fragment import safe_get_full_fp32_param, safe_get_full_grad, safe_get_full_optimizer_state -from .tensor_fragment import set_full_hp_param -from .tensor_fragment import safe_set_full_fp32_param, safe_set_full_optimizer_state +from .tensor_fragment import set_full_hp_param, set_full_hp_grad +from .tensor_fragment import safe_set_full_fp32_param, safe_set_full_optimizer_state, safe_set_full_grad from .tensor_fragment import safe_get_local_fp32_param, safe_get_local_grad, safe_get_local_optimizer_state -from .tensor_fragment import safe_set_local_fp32_param, safe_set_local_optimizer_state +from .tensor_fragment import safe_set_local_fp32_param, safe_set_local_grad, safe_set_local_optimizer_state from .z3_leaf_module import set_z3_leaf_modules, unset_z3_leaf_modules, get_z3_leaf_modules, z3_leaf_module, z3_leaf_parameter from .mixed_precision_linkage import link_hp_params, lazy_init_hp_params_optimizer_state from deepspeed.runtime.dataloader import RepeatingLoader diff --git a/deepspeed/utils/mixed_precision_linkage.py b/deepspeed/utils/mixed_precision_linkage.py index 7dea6ba322dbe..c97515ca8fef7 100644 --- a/deepspeed/utils/mixed_precision_linkage.py +++ b/deepspeed/utils/mixed_precision_linkage.py @@ -5,7 +5,7 @@ import types from deepspeed.utils import get_full_hp_param, get_full_hp_grad, get_hp_fragment_mapping -from deepspeed.utils import set_full_hp_param +from deepspeed.utils import set_full_hp_param, set_full_hp_grad def link_hp_params(lp_param_list, flat_hp_partition, gradient_dict, offload_gradient_dict, use_offload, @@ -35,6 +35,7 @@ def _init_lp_to_hp_mapping(lp_param_list, partition_start, partition_size, dp_gr lp_param.get_full_hp_param = types.MethodType(get_full_hp_param, lp_param) lp_param.get_full_hp_grad = types.MethodType(get_full_hp_grad, lp_param) lp_param.set_full_hp_param = types.MethodType(set_full_hp_param, lp_param) + lp_param.set_full_hp_grad = types.MethodType(set_full_hp_grad, lp_param) # lp_param overlaps with partition if both are true # 1) current_offset < partition_end, diff --git a/deepspeed/utils/tensor_fragment.py b/deepspeed/utils/tensor_fragment.py index b34722580ddd1..053c8b5adad06 100644 --- a/deepspeed/utils/tensor_fragment.py +++ b/deepspeed/utils/tensor_fragment.py @@ -57,6 +57,17 @@ def get_hp_fragment(self, optim_state_key=None): return self.hp_fragment return self.get_optim_state_fragment(optim_state_key) + def get_lp_grad_fragment(self, index_in_param_group): + if self.use_offload: + gradient_dict = self.offload_gradient_dict + else: + gradient_dict = self.gradient_dict + + if self.param_group_index not in gradient_dict or gradient_dict[self.param_group_index] is None: + raise ValueError("Gradients are only available immediately after backward and before engine step") + + return gradient_dict[self.param_group_index][index_in_param_group] + def map_to_flat_opt_states(flat_hp_tensor, lp_tensors, 
optim_state, opt_keys): for key in opt_keys: @@ -95,17 +106,7 @@ def set_full_hp_param(self, value, optim_state_key=None): def get_full_hp_grad(self): reduce_buffer = torch.zeros_like(self, dtype=torch.float32).flatten() if self._hp_mapping is not None: - hp_mapping = self._hp_mapping - - if hp_mapping.use_offload: - gradient_dict = hp_mapping.offload_gradient_dict - else: - gradient_dict = hp_mapping.gradient_dict - - if hp_mapping.param_group_index not in gradient_dict or gradient_dict[hp_mapping.param_group_index] is None: - raise ValueError("Gradients are only available immediately after backward and before engine step") - - lp_grad_fragment = gradient_dict[hp_mapping.param_group_index][self._index_in_param_group] + lp_grad_fragment = self._hp_mapping.get_lp_grad_fragment(self._index_in_param_group) hp_grad_fragment = lp_grad_fragment.to(torch.float32).flatten() lp_frag_address = self._hp_mapping.lp_fragment_address @@ -120,6 +121,14 @@ def get_full_hp_grad(self): return reduce_buffer.reshape_as(self) +def set_full_hp_grad(self, value): + if self._hp_mapping is not None: + lp_grad_fragment = self._hp_mapping.get_lp_grad_fragment(self._index_in_param_group) + lp_frag_address = self._hp_mapping.lp_fragment_address + value_fragment = torch.narrow(value.flatten(), 0, lp_frag_address.start, lp_frag_address.numel) + lp_grad_fragment.data.copy_(value_fragment.data.reshape_as(lp_grad_fragment.data)) + + def safe_get_full_fp32_param(param): """Assemble and return the fp32 parameter of a low-precision (e.g., fp16) parameter. @@ -188,7 +197,10 @@ def safe_set_full_optimizer_state(param, value, optim_state_key): # TODO: Figure out the correct return dtype def safe_get_full_grad(param): - """Assemble and return the fp32 gradient of a low-precision (e.g., fp16) parameter. + """ + Assemble and return the fp32 gradient of a low-precision (e.g., fp16) parameter. + The return data type is that used for gradient accumulation. This is usually the param data type, + but could also be different (e.g., bf16 param training with fp32 gradient accumulation). Args: param (``torch.nn.Parameter``): A model parameter @@ -207,74 +219,95 @@ def safe_get_full_grad(param): return None +def safe_set_full_grad(param, value): + """ + Update the partitioned gradient of a low-precision (e.g., fp16) parameter. + To avoid precision issues, the update value should have the data type of + gradient accumulation. + + Args: + param (``torch.nn.Parameter``): A model parameter + value (``torch.Tensor``): The un-partitioned new gradient value. + """ + if param.grad is not None: + param.grad.copy_(value) + elif hasattr(param, 'ds_id'): + # ZeRO stage 3 param + param._z3_optimizer.set_fp32_grad_for_param(value, param) + elif hasattr(param, '_hp_mapping'): + # ZeRO stage 1, 2, and bf16_optimizer params + param.set_full_hp_grad(value) + + ### Local API START ### def safe_get_local_grad(param): - """Get the fp32 gradient of a partitioned parameter. + """ + Get the local gradient partition of a ZeRO-3 partitioned parameter. + The return data type is that used for gradient accumulation. This is usually the param data type, + but could also be different (e.g., bf16 param training with fp32 gradient accumulation). 
Args: param (``torch.nn.Parameter``): A model parameter """ - if param.grad is not None: - return param.grad + assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + return param._z3_optimizer.get_local_fp32_grad_for_param(param) - # ZeRO stage 3 param - if hasattr(param, 'ds_id'): - return param._z3_optimizer.get_local_fp32_grad_for_param(param) - return None +def safe_set_local_grad(param, value): + """ + Update the local gradient partition of a ZeRO-3 partitioned parameter. + To avoid precision issues, the update value should have the data type of + gradient accumulation. + + Args: + param (``torch.nn.Parameter``): A model parameter. + value (``torch.Tensor``): New value of local gradient partition. + """ + assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + param._z3_optimizer.set_local_grad_for_param(value, param) def safe_get_local_fp32_param(param): - """Get the fp32 partitioned parameter. + """Get the local partition of a ZeRO-3 partitioned parameter in fp32 precision. Args: - param (``torch.nn.Parameter``): A model parameter + param (``torch.nn.Parameter``): A model parameter. """ - # ZeRO stage 3 param - if hasattr(param, 'ds_id'): - return param._z3_optimizer.get_local_fp32_param(param) - - return None + assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + return param._z3_optimizer.get_local_fp32_param(param) def safe_get_local_optimizer_state(param, optim_state_key): - """Get the fp32 optimizer state of a partitioned parameter. + """Get the local optimizer state partition of ZeRO-3 partitioned parameter in fp32 precision. Args: param (``torch.nn.Parameter``): A model parameter optim_state_key (``string``): Key value of optimizer state (e.g., `exp_avg` in Adam optimizer) """ - # ZeRO stage 3 param - if hasattr(param, 'ds_id'): - return param._z3_optimizer.get_local_fp32_param(param, optim_state_key) - - return None + assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + return param._z3_optimizer.get_local_fp32_param(param, optim_state_key) def safe_set_local_optimizer_state(param, value, optim_state_key): - """Update the fp32 optimizer state of a partitioned parameter. + """Update the local optimizer state partition of a ZeRO-3 partitioned parameter. Args: - param (``torch.nn.Parameter``): A model parameter - value (``torch.Tensor``): New value - optim_state_key (``string``): Key value of optimizer state (e.g., `exp_avg` in Adam optimizer) + param (``torch.nn.Parameter``): A model parameter. + value (``torch.Tensor``): New value of local optimizer state partition. + optim_state_key (``string``): Key value of optimizer state (e.g., `exp_avg` in Adam optimizer). """ - # ZeRO stage 3 param - if hasattr(param, 'ds_id'): - param._z3_optimizer.set_local_hp_param(value, param, optim_state_key) + assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + param._z3_optimizer.set_local_hp_param(value, param, optim_state_key) def safe_set_local_fp32_param(param, value): - """Update the partitioned fp32 parameter. + """Update the local partition of ZeRO-3 partitioned parameter. Args: - param (``torch.nn.Parameter``): A model parameter - value (``torch.Tensor``): New value + param (``torch.nn.Parameter``): A model parameter. + value (``torch.Tensor``): New value of local parameter partition. 
""" - # ZeRO stage 3 param - if hasattr(param, 'ds_id'): - param._z3_optimizer.set_local_hp_param(value, param) + assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + param._z3_optimizer.set_local_hp_param(value, param) ### Local API END ### -# TODO: Implement API for setting ZeRO partitioned gradients - def get_hp_fragment_mapping(lp_param, lp_start, flat_hp_partition, gradient_dict, offload_gradient_dict, use_offload, param_group_index, partition_start, partition_size): diff --git a/docs/code-docs/source/zero3.rst b/docs/code-docs/source/zero3.rst index ae7cedd1a8b39..aa8139a654a10 100644 --- a/docs/code-docs/source/zero3.rst +++ b/docs/code-docs/source/zero3.rst @@ -369,13 +369,13 @@ These routines can be used in a training loop as shown in the following snippet. from deepspeed.utils import safe_get_full_fp32_param, safe_get_full_grad, safe_get_full_optimizer_state for n, lp in model.named_parameters(): # 1. Access the full states - # 1) gradient lookup + # 1.1) gradient lookup # For zero1 and zero2, gradient lookup must be called after `backward` and before `step` # For zero3, gradient lookup must be called after `backward` hp_grad = safe_get_full_grad(lp) - # 2) fp32 and optim states can probably be called anywhere in the training loop, but will be updated after `step` + # 1.2) fp32 and optim states can probably be called anywhere in the training loop, but will be updated after `step` hp = safe_get_full_fp32_param(lp) exp_avg = safe_get_full_optimizer_state(lp, "exp_avg") exp_avg_sq = safe_get_full_optimizer_state(lp, "exp_avg_sq") @@ -396,34 +396,39 @@ These routines can be used in a training loop as shown in the following snippet. Modifying Partitioned States ---------------------------- -Sometimes, a user may want to modify parameters or optimizer states outside of the regular training loop. This is currently difficult in ZeRO training because of partitioning. To overcome that, DeepSpeed provides the following routines for modifying the fp32 master parameters and the fp32 optimizer states. +Sometimes, a user may want to modify parameters, gradients, or optimizer states outside of the regular training loop. This is currently difficult in ZeRO training because of partitioning. To overcome that, DeepSpeed provides the following routines for modifying the fp32 master parameters and the fp32 optimizer states. .. autofunction:: deepspeed.utils.safe_set_full_fp32_param .. autofunction:: deepspeed.utils.safe_set_full_optimizer_state +.. autofunction:: deepspeed.utils.safe_set_full_grad + .. autofunction:: deepspeed.utils.safe_set_local_fp32_param +.. autofunction:: deepspeed.utils.safe_set_local_grad + .. autofunction:: deepspeed.utils.safe_set_local_optimizer_state -These routines can be used at any point after initialization of the DeepSpeed engine (i.e., ``deepspeed.initialize()``) as shown in the following snippet. +The routines for modifying parameters and optimizer states can be used at any point after initialization of the DeepSpeed engine (i.e., ``deepspeed.initialize()``) as shown in the following snippet. .. code-block:: python [...] + from deepspeed.runtime.zero.utils import is_zero_param from deepspeed.utils import safe_set_full_fp32_param, safe_set_full_optimizer_state from deepspeed.utils import safe_set_local_fp32_param, safe_set_local_optimizer_state # Here is an example to zero all the fp32 parameters and optimizer states. for n, lp in model.named_parameters(): - # 1. 
For zero stage 1 or 2, set the full fp32 and their full optim states - zero_tensor = torch.zeros_like(lp) + # 1. For zero stage 1, 2, or 3 set the full fp32 and their full optim states + zero_tensor = torch.zeros(lp.ds_shape) if is_zero_param(lp) else torch.zeros(lp.shape) safe_set_full_fp32_param(lp, zero_tensor) safe_get_full_optimizer_state(lp, zero_tensor, "exp_avg") safe_get_full_optimizer_state(lp, zero_tensor, "exp_avg_sq") # 2. For zero stage 3, each process sets its local fp32 parameters and their local optimizer states individually - zero_tensor_local = torch.zeros_like(lp.ds_tensor.shape) + zero_tensor_local = torch.zeros(lp.ds_tensor.shape) safe_set_local_fp32_param(lp, zero_tensor_local) safe_set_local_optimizer_state(lp, zero_tensor_local, "exp_avg") @@ -432,6 +437,31 @@ These routines can be used at any point after initialization of the DeepSpeed en [...] +The routines for modifying gradients can be used after ``backward`` but before ``step`` as shown in the following snippet. + +.. code-block:: python + + backward(loss) + [...] + from deepspeed.runtime.zero.utils import is_zero_param + from deepspeed.utils import safe_set_full_grad, safe_set_local_grad + # Here is an example of how to zero all the gradients. + for n, lp in model.named_parameters(): + # 1. For zero stage 1, 2, or 3 set the full gradient. + zero_tensor = torch.zeros(lp.ds_shape) if is_zero_param(lp) else torch.zeros(lp.shape) + + safe_set_full_grad(lp, zero_tensor) + + # 2. For zero stage 3, each process sets its local gradient partition. + zero_tensor_local = torch.zeros_like(lp.ds_tensor.shape) + + safe_set_local_grad(lp, zero_tensor_local) + + [...] + optimizer.step() + + + GPU Memory Management --------------------- diff --git a/tests/unit/runtime/zero/test_zero_tensor_fragment.py b/tests/unit/runtime/zero/test_zero_tensor_fragment.py index 3bb4af3e3d912..2e3a652668ed5 100644 --- a/tests/unit/runtime/zero/test_zero_tensor_fragment.py +++ b/tests/unit/runtime/zero/test_zero_tensor_fragment.py @@ -13,9 +13,9 @@ import deepspeed from deepspeed.utils import safe_get_full_fp32_param, safe_get_full_grad, safe_get_full_optimizer_state -from deepspeed.utils import safe_set_full_fp32_param, safe_set_full_optimizer_state +from deepspeed.utils import safe_set_full_fp32_param, safe_set_full_grad, safe_set_full_optimizer_state from deepspeed.utils import safe_get_local_fp32_param, safe_get_local_grad, safe_get_local_optimizer_state -from deepspeed.utils import safe_set_local_fp32_param, safe_set_local_optimizer_state +from deepspeed.utils import safe_set_local_fp32_param, safe_set_local_grad, safe_set_local_optimizer_state from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum from deepspeed.ops.aio import AsyncIOBuilder from deepspeed.accelerator import get_accelerator @@ -23,6 +23,7 @@ WEIGHT_KEY = 'weight' FIRST_ORDER_KEY = 'exp_avg' SECOND_ORDER_KEY = 'exp_avg_sq' +GRADIENT_KEY = 'gradient' def validate_tensor(model, api_type, opt_states): @@ -180,13 +181,14 @@ def test_bf16_fragments(self, frozen_weights): run_fragmented_model(model, config_dict, hidden_dim, torch.bfloat16, validate_after_bwd, validate_after_step) -def create_random_values(model, key_list, group, use_cuda=True): +def create_random_values(model, key_list, group, grad_dtype, use_cuda=True): param_values = {} for n, lp in model.named_parameters(): param_shape = lp.ds_shape if hasattr(lp, 'ds_id') else lp.shape param_values[n] = {} for key in key_list: - rand_value = torch.rand(param_shape, dtype=torch.float32, device=model.device) + 
dtype = grad_dtype if key == GRADIENT_KEY else torch.float32 + rand_value = torch.rand(param_shape, dtype=dtype, device=model.device) dist.broadcast(rand_value, src=0, group=group) param_values[n][key] = rand_value return param_values @@ -195,7 +197,9 @@ def create_random_values(model, key_list, group, use_cuda=True): def set_param_values_with_dict(model, value_dict): for n, lp in model.named_parameters(): for key, value_tensor in value_dict[n].items(): - if key == WEIGHT_KEY: + if key == GRADIENT_KEY: + safe_set_full_grad(lp, value_tensor) + elif key == WEIGHT_KEY: safe_set_full_fp32_param(lp, value_tensor) else: safe_set_full_optimizer_state(lp, value_tensor, key) @@ -204,21 +208,25 @@ def set_param_values_with_dict(model, value_dict): def validate_param_values_with_dict(model, value_dict): for n, lp in model.named_parameters(): for key, expected_tensor in value_dict[n].items(): - if key == WEIGHT_KEY: + if key == GRADIENT_KEY: + actual_tensor = safe_get_full_grad(lp) + elif key == WEIGHT_KEY: actual_tensor = safe_get_full_fp32_param(lp) else: actual_tensor = safe_get_full_optimizer_state(lp, key) + assert torch.equal(expected_tensor, actual_tensor) -def create_random_values_for_local(model, key_list, group, use_cuda=True): +def create_random_values_for_local(model, key_list, group, grad_dtype, use_cuda=True): param_values = {} for n, lp in model.named_parameters(): param_shape = lp.ds_tensor.shape param_values[n] = {} for key in key_list: device = model.device if use_cuda else "cpu" - rand_value = torch.rand(param_shape, dtype=torch.float32, device=device) + dtype = grad_dtype if key == GRADIENT_KEY else torch.float32 + rand_value = torch.rand(param_shape, dtype=dtype, device=device) # dist.broadcast(rand_value, src=0, group=group) param_values[n][key] = rand_value return param_values @@ -228,7 +236,9 @@ def set_local_param_values_with_dict(model, value_dict): for n, lp in model.named_parameters(): for key, value_tensor in value_dict[n].items(): - if key == WEIGHT_KEY: + if key == GRADIENT_KEY: + safe_set_local_grad(lp, value_tensor) + elif key == WEIGHT_KEY: safe_set_local_fp32_param(lp, value_tensor) else: safe_set_local_optimizer_state(lp, value_tensor, key) @@ -237,10 +247,13 @@ def set_local_param_values_with_dict(model, value_dict): def validate_local_param_values_with_dict(model, value_dict): for n, lp in model.named_parameters(): for key, expected_tensor in value_dict[n].items(): - if key == WEIGHT_KEY: + if key == GRADIENT_KEY: + actual_tensor = safe_get_local_grad(lp) + elif key == WEIGHT_KEY: actual_tensor = safe_get_local_fp32_param(lp) else: actual_tensor = safe_get_local_optimizer_state(lp, key) + assert torch.equal(expected_tensor, actual_tensor) @@ -325,12 +338,20 @@ def test_zero_fragments(self, tmpdir, api_type, zero_stage, offload_device, dtyp dist.barrier() - def validate_func(model): - optim_keys = [WEIGHT_KEY, FIRST_ORDER_KEY, SECOND_ORDER_KEY] + def after_bwd_validate_func(model): + state_keys = [WEIGHT_KEY, GRADIENT_KEY] + helper_funcs = helper_funcs_mapping[api_type] + optim_state_values = helper_funcs["create_random_values"]( + model, state_keys, group, grad_dtype=dtype, use_cuda=offload_device == OffloadDeviceEnum.none) + helper_funcs["set_param_values_with_dict"](model, optim_state_values) + helper_funcs["validate_param_values_with_dict"](model, optim_state_values) + + def after_step_validate_func(model): + state_keys = [WEIGHT_KEY, FIRST_ORDER_KEY, SECOND_ORDER_KEY] helper_funcs = helper_funcs_mapping[api_type] optim_state_values = 
helper_funcs["create_random_values"]( - model, optim_keys, group, use_cuda=offload_device == OffloadDeviceEnum.none) + model, state_keys, group, grad_dtype=dtype, use_cuda=offload_device == OffloadDeviceEnum.none) helper_funcs["set_param_values_with_dict"](model, optim_state_values) helper_funcs["validate_param_values_with_dict"](model, optim_state_values) - run_fragmented_model(model, config_dict, hidden_dim, dtype, lambda _: None, validate_func) + run_fragmented_model(model, config_dict, hidden_dim, dtype, after_bwd_validate_func, after_step_validate_func) From 13c16c9562dc41e153679278cf2ecad058a9fbc7 Mon Sep 17 00:00:00 2001 From: diskkid Date: Tue, 15 Oct 2024 04:26:24 +0900 Subject: [PATCH 35/43] Accept btl_tcp_if_include option through launcher_args (#6613) This patch fixes issue #4460. When `btl_tcp_if_include` option is provided through `--launcher_args`, we use the provided option instead of the hardcoded `--mca btl_tcp_if_include eth0`. Otherwise we use `--mca btl_tcp_if_include eth0` as the default for compatibility. Fixes #4460 --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Olatunji Ruwase --- deepspeed/launcher/multinode_runner.py | 16 +++++++++++---- tests/unit/launcher/test_multinode_runner.py | 21 ++++++++++++++++++++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/deepspeed/launcher/multinode_runner.py b/deepspeed/launcher/multinode_runner.py index a816b56857603..74d20a6d53e52 100644 --- a/deepspeed/launcher/multinode_runner.py +++ b/deepspeed/launcher/multinode_runner.py @@ -141,6 +141,17 @@ def validate_args(self): def get_cmd(self, environment, active_resources): total_process_count = sum(self.resource_pool.values()) + launcher_args = split(self.args.launcher_args) + + # If btl_tcp_if_include option is provided through launcher_args, we use it. Otherwise, we add + # `--mca btl_tcp_if_include eth0` option as a default value for compatibility. 
+ btl_tcp_opt = ['--mca', 'btl_tcp_if_include', 'eth0'] + if len(launcher_args) >= 2: + for i in range(len(launcher_args) - 1): + if launcher_args[i] in ['-mca', '--mca'] and launcher_args[i + 1] == 'btl_tcp_if_include': + btl_tcp_opt = [] + break + mpirun_cmd = [ 'mpirun', '-n', @@ -150,10 +161,7 @@ def get_cmd(self, environment, active_resources): '--mca', 'btl', '^openib', - '--mca', - 'btl_tcp_if_include', - 'eth0', - ] + split(self.args.launcher_args) + ] + btl_tcp_opt + launcher_args export_cmd = [] for k, v in self.exports.items(): diff --git a/tests/unit/launcher/test_multinode_runner.py b/tests/unit/launcher/test_multinode_runner.py index ec0459ab0a6fe..a3b50a4c90ab2 100644 --- a/tests/unit/launcher/test_multinode_runner.py +++ b/tests/unit/launcher/test_multinode_runner.py @@ -32,6 +32,27 @@ def test_openmpi_runner(runner_info): runner = mnrunner.OpenMPIRunner(args, world_info, resource_pool) cmd = runner.get_cmd(env, resource_pool) assert cmd[0] == 'mpirun' + assert 'eth0' in cmd + + +def test_btl_nic_openmpi_runner(runner_info): + env, resource_pool, world_info, _ = runner_info + args = parse_args(['--launcher_arg', '-mca btl_tcp_if_include eth1', 'test_launcher.py']) + + runner = mnrunner.OpenMPIRunner(args, world_info, resource_pool) + cmd = runner.get_cmd(env, resource_pool) + assert 'eth0' not in cmd + assert 'eth1' in cmd + + +def test_btl_nic_two_dashes_openmpi_runner(runner_info): + env, resource_pool, world_info, _ = runner_info + args = parse_args(['--launcher_arg', '--mca btl_tcp_if_include eth1', 'test_launcher.py']) + + runner = mnrunner.OpenMPIRunner(args, world_info, resource_pool) + cmd = runner.get_cmd(env, resource_pool) + assert 'eth0' not in cmd + assert 'eth1' in cmd def test_mpich_runner(runner_info): From 85b7469ea00f7719a27e3e8d1ffaa8765575f820 Mon Sep 17 00:00:00 2001 From: Joe Mayer <114769929+jomayeri@users.noreply.github.com> Date: Mon, 14 Oct 2024 12:31:45 -0700 Subject: [PATCH 36/43] Add first Step in LR Schedulers (#6597) Some (not all) of the LR schedulers in runtime were missing the initialization of the optimizer group lr. --------- Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepspeed/runtime/lr_schedules.py | 38 ++++++++++++------------ tests/unit/runtime/test_lr_schedulers.py | 8 +++++ 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/deepspeed/runtime/lr_schedules.py b/deepspeed/runtime/lr_schedules.py index d7f7e15a4dbda..f25a19e8e4991 100755 --- a/deepspeed/runtime/lr_schedules.py +++ b/deepspeed/runtime/lr_schedules.py @@ -247,6 +247,12 @@ def get_lr_from_config(config): return lr_params[WARMUP_MAX_LR], '' +def update_lr(param_groups, lrs): + for param_group, lr in zip(param_groups, lrs): + param_group['lr'] = lr + return [group['lr'] for group in param_groups] + + """ Only optimizers that are subclass of torch.optim.Optimizer are supported. So check the passed optimizer and wrapped optimizer to see if requirement is satisfied. 
@@ -328,7 +334,7 @@ def __init__(self, self.interval_fn = self._staircase_interval if lr_range_test_staircase else self._continuous_interval if last_batch_iteration == -1: - self._update_optimizer(self.min_lr) + self._last_lr = update_lr(self.optimizer.param_groups, self.min_lr) def _staircase_interval(self): return math.floor(float(self.last_batch_iteration + 1) / self.step_size) @@ -349,16 +355,11 @@ def get_last_lr(self): assert getattr(self, '_last_lr', None) is not None, "need to call step() first" return self._last_lr - def _update_optimizer(self, group_lrs): - for param_group, lr in zip(self.optimizer.param_groups, group_lrs): - param_group['lr'] = lr - def step(self, batch_iteration=None): if batch_iteration is None: batch_iteration = self.last_batch_iteration + 1 self.last_batch_iteration = batch_iteration - self._update_optimizer(self.get_lr()) - self._last_lr = [group['lr'] for group in self.optimizer.param_groups] + self._last_lr = update_lr(self.optimizer.param_groups, self.get_lr()) def state_dict(self): return {'last_batch_iteration': self.last_batch_iteration} @@ -615,9 +616,7 @@ def step(self, batch_iteration=None): batch_iteration = self.last_batch_iteration + 1 self.last_batch_iteration = batch_iteration - for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): - param_group['lr'] = lr - self._last_lr = [group['lr'] for group in self.optimizer.param_groups] + self._last_lr = update_lr(self.optimizer.param_groups, self.get_lr()) if self.cycle_momentum: momentums = self.get_mom() @@ -675,11 +674,14 @@ def __init__(self, self.warmup_type = warmup_type self.inverse_log_warm_up = 1.0 / math.log(self.warmup_num_steps) self.last_batch_iteration = last_batch_iteration + # Initialize lr in optimizer + if last_batch_iteration == -1: + self._last_lr = update_lr(self.optimizer.param_groups, self.get_lr()) def get_lr(self): if self.last_batch_iteration < 0: logger.warning("Attempting to get learning rate from scheduler before it has started") - return [0.0] + return self.min_lrs gamma = self._get_gamma() return [min_lr + (delta_lr * gamma) for min_lr, delta_lr in zip(self.min_lrs, self.delta_lrs)] @@ -693,9 +695,7 @@ def step(self, last_batch_iteration=None): if last_batch_iteration is None: last_batch_iteration = self.last_batch_iteration + 1 self.last_batch_iteration = last_batch_iteration - for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): - param_group['lr'] = lr - self._last_lr = [group['lr'] for group in self.optimizer.param_groups] + self._last_lr = update_lr(self.optimizer.param_groups, self.get_lr()) def state_dict(self): return {'last_batch_iteration': self.last_batch_iteration} @@ -819,6 +819,10 @@ def __init__(self, total_num_steps, warmup_num_steps)) self.org_lrs = [group['lr'] for group in self.optimizer.param_groups] + # Initialize lrs in optimizer groups + if last_batch_iteration == -1: + self._last_lr = update_lr(self.optimizer.param_groups, self.get_lr()) + def get_lr_ratio(self): if self.last_batch_iteration < 0: logger.warning("Attempting to get learning rate from scheduler before it has started") @@ -844,11 +848,7 @@ def step(self, last_batch_iteration=None): if last_batch_iteration is None: last_batch_iteration = self.last_batch_iteration + 1 self.last_batch_iteration = last_batch_iteration - - lrs = self.get_lr() - for param_group, lr in zip(self.optimizer.param_groups, lrs): - param_group['lr'] = lr - self._last_lr = [group['lr'] for group in self.optimizer.param_groups] + self._last_lr = 
update_lr(self.optimizer.param_groups, self.get_lr()) def get_lr(self): if self.last_batch_iteration < 0: diff --git a/tests/unit/runtime/test_lr_schedulers.py b/tests/unit/runtime/test_lr_schedulers.py index bcfc485f2b8f5..47734c0cd864d 100644 --- a/tests/unit/runtime/test_lr_schedulers.py +++ b/tests/unit/runtime/test_lr_schedulers.py @@ -37,6 +37,9 @@ def _verify_staircase_increase(values, step_size): (WARMUP_DECAY_LR, { WARMUP_NUM_STEPS: 10, TOTAL_NUM_STEPS: 20 + }), (WARMUP_COSINE_LR, { + WARMUP_NUM_STEPS: 10, + TOTAL_NUM_STEPS: 20 }), (ONE_CYCLE, { CYCLE_MIN_LR: 0, CYCLE_MAX_LR: 0.1 @@ -71,6 +74,11 @@ def test(self, scheduler_type, params): hidden_dim=hidden_dim, device=model.device, dtype=torch.float) + + true_lrs = lr_scheduler.get_lr() + for group, true_lr in zip(model.optimizer.param_groups, true_lrs): + assert group['lr'] == true_lr, f"True lr {true_lr}, optimizer lr {group['lr']}" + for n, batch in enumerate(data_loader): # get lr before training starts lr_scheduler.get_lr() From bf60fc0ca63b74722d3cf1bbabf17ea9fff37ffe Mon Sep 17 00:00:00 2001 From: Xu Song Date: Tue, 15 Oct 2024 19:22:31 +0800 Subject: [PATCH 37/43] Support safetensors export (#6579) ## Feature This commit implements the following features: - [x] support saving checkpoint as safetensors (more commonly used format) - [x] support sharding checkpoints (which is important for very large models) Most of the codes are borrowed from https://github.com/huggingface/transformers/blob/v4.45.1/src/transformers/modeling_utils.py#L2490 ## Usage For `pytorch_model.bin` export ``` python zero_to_fp32.py . output_dir/ ``` For `model.safetensors` export ``` python zero_to_fp32.py . output_dir/ --safe_serialization ``` --------- Co-authored-by: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepspeed/utils/zero_to_fp32.py | 98 ++++++++++++++++++++++++++++----- 1 file changed, 84 insertions(+), 14 deletions(-) diff --git a/deepspeed/utils/zero_to_fp32.py b/deepspeed/utils/zero_to_fp32.py index 24cc342e78d1a..e69ecd9acb5a2 100755 --- a/deepspeed/utils/zero_to_fp32.py +++ b/deepspeed/utils/zero_to_fp32.py @@ -10,7 +10,10 @@ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any # application. # -# example: python zero_to_fp32.py . pytorch_model.bin +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization import argparse import torch @@ -18,6 +21,8 @@ import math import os import re +import json +from tqdm import tqdm from collections import OrderedDict from dataclasses import dataclass @@ -139,7 +144,6 @@ def parse_model_states(files): def parse_optim_states(files, ds_checkpoint_dir): - total_files = len(files) state_dicts = [] for f in files: @@ -420,12 +424,10 @@ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero offset = 0 total_numel = 0 total_params = 0 - for name, shape in param_shapes.items(): - + for name, shape in tqdm(param_shapes.items(), desc='Gathering Sharded Weights'): unpartitioned_numel = shape.numel() total_numel += unpartitioned_numel total_params += 1 - partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) if debug: @@ -521,21 +523,75 @@ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_f return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) -def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): """ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. Args: - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) - - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` - ``exclude_frozen_parameters``: exclude frozen parameters """ - + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) - print(f"Saving fp32 state dict to {output_file}") - torch.save(state_dict, output_file) + + # Shard the model if it is too big. 
+ weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + state_dict_split = split_torch_state_dict_into_shards(state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard = {tensor: state_dict[tensor].contiguous() for tensor in tensors} + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard, output_path, metadata={"format": "pt"}) + else: + torch.save(shard, output_path) + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): @@ -578,15 +634,27 @@ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): if __name__ == "__main__": - parser = argparse.ArgumentParser() parser.add_argument("checkpoint_dir", type=str, help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") parser.add_argument( - "output_file", + "--max_shard_size", type=str, - help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") parser.add_argument("-t", "--tag", type=str, @@ -599,6 +667,8 @@ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): debug = args.debug convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, - args.output_file, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, tag=args.tag, exclude_frozen_parameters=args.exclude_frozen_parameters) From ce468c3756561dc868672219b2895a56df2babe2 Mon Sep 17 00:00:00 2001 From: Shelly Nahir <73890534+ShellyNR@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:30:42 +0300 Subject: [PATCH 38/43] add option to disable logger while compiling to avoid graph breaks (#6496) adding an option to disable calls for logger while compiling to avoid graph breaks. 
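As a rough usage sketch (not part of the patch itself; it only assumes the `DISABLE_LOGS_WHILE_COMPILING` environment variable introduced here and the public `deepspeed.utils.logger`):

```python
import os
# Must be set before the DeepSpeed logger is created, i.e. before importing deepspeed.
os.environ["DISABLE_LOGS_WHILE_COMPILING"] = "1"

import torch
from deepspeed.utils import logger


def loss_fn(x):
    # With the env var set, this call becomes a no-op while torch.compiler.is_compiling(),
    # so it no longer forces a graph break inside the compiled region.
    logger.info("inside compiled region")
    return (x * x).sum()


compiled_loss = torch.compile(loss_fn)
print(compiled_loss(torch.randn(4)))
```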
Here I used an environment variable to determine whether to activate this option, but it can also be determined using the json config file or any other way you see fit. --------- Co-authored-by: snahir Co-authored-by: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> --- deepspeed/utils/logging.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/deepspeed/utils/logging.py b/deepspeed/utils/logging.py index 1e62d96e10325..55fb72f6c9fba 100644 --- a/deepspeed/utils/logging.py +++ b/deepspeed/utils/logging.py @@ -7,6 +7,8 @@ import logging import sys import os +import torch +from deepspeed.runtime.compiler import is_compile_supported log_levels = { "debug": logging.DEBUG, @@ -19,6 +21,31 @@ class LoggerFactory: + def create_warning_filter(logger): + warn = False + + def warn_once(record): + nonlocal warn + if is_compile_supported() and torch.compiler.is_compiling() and not warn: + warn = True + logger.warning("To avoid graph breaks caused by logger in compile-mode, it is recommended to" + " disable logging by setting env var DISABLE_LOGS_WHILE_COMPILING=1") + return True + + return warn_once + + @staticmethod + def logging_decorator(func): + + @functools.wraps(func) + def wrapper(*args, **kwargs): + if torch.compiler.is_compiling(): + return + else: + return func(*args, **kwargs) + + return wrapper + @staticmethod def create_logger(name=None, level=logging.INFO): """create a logger @@ -44,6 +71,12 @@ def create_logger(name=None, level=logging.INFO): ch.setLevel(level) ch.setFormatter(formatter) logger_.addHandler(ch) + if os.getenv("DISABLE_LOGS_WHILE_COMPILING", "0") == "1": + for method in ['info', 'debug', 'error', 'warning', 'critical', 'exception']: + original_logger = getattr(logger_, method) + setattr(logger_, method, LoggerFactory.logging_decorator(original_logger)) + else: + logger_.addFilter(LoggerFactory.create_warning_filter(logger_)) return logger_ From 1a45bd8e8ca27ce32a7091e64d07a04b2adb2bb5 Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Date: Tue, 15 Oct 2024 14:49:37 -0700 Subject: [PATCH 39/43] Lock cache file of HF model list (#6628) The error in the following log suggests that the cache file for HF model list can be broken: https://github.com/microsoft/DeepSpeed/actions/runs/11343665365/job/31546708118?pr=6614 The actual cause of the above error is unclear, but `_hf_model_list` potentially breaks the cache file when it is concurrently called from multiple processes. This PR locks the cache file to ensure `_hf_model_list` safely reads and writes the file. 
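For reference, the shared/exclusive `flock` pattern used here, shown in isolation (a minimal sketch; the file name is illustrative and `fcntl` is POSIX-only, matching the CI environment):

```python
import fcntl
import pickle

CACHE_FILE = "model_cache.pkl"  # illustrative path, not the real cache location


def read_cache():
    with open(CACHE_FILE, "rb") as f:
        try:
            fcntl.flock(f, fcntl.LOCK_SH)  # shared lock: concurrent readers are fine
            return pickle.load(f)
        finally:
            fcntl.flock(f, fcntl.LOCK_UN)


def write_cache(data):
    with open(CACHE_FILE, "wb") as f:
        try:
            fcntl.flock(f, fcntl.LOCK_EX)  # exclusive lock: waits for all other holders
            pickle.dump(data, f)
        finally:
            fcntl.flock(f, fcntl.LOCK_UN)
```

Note that opening with `"wb"` truncates the file before the exclusive lock is taken; the read path already tolerates that by catching load errors.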
--- tests/unit/inference/test_inference.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index 581a2ce433edc..9b563523dbebb 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -10,6 +10,7 @@ import os import time import requests +import fcntl from dataclasses import dataclass from typing import List @@ -95,9 +96,12 @@ def _hf_model_list() -> List[ModelInfo]: if os.path.isfile(cache_file_path): with open(cache_file_path, 'rb') as f: try: + fcntl.flock(f, fcntl.LOCK_SH) model_data = pickle.load(f) except Exception as e: print(f"Error loading cache file {cache_file_path}: {e}") + finally: + fcntl.flock(f, fcntl.LOCK_UN) current_time = time.time() @@ -125,7 +129,11 @@ def _hf_model_list() -> List[ModelInfo]: # Save the updated cache os.makedirs(cache_dir, exist_ok=True) with open(cache_file_path, 'wb') as f: - pickle.dump(model_data, f) + try: + fcntl.flock(f, fcntl.LOCK_EX) + pickle.dump(model_data, f) + finally: + fcntl.flock(f, fcntl.LOCK_UN) return model_data["model_list"] From c9899dc14a391538ce8f0c4d58920c390dc381de Mon Sep 17 00:00:00 2001 From: jiahao su Date: Wed, 16 Oct 2024 07:36:10 +0800 Subject: [PATCH 40/43] Add README Pipeline Status for Huawei Ascend NPU (#6588) Hello! Following the merge of https://github.com/microsoft/DeepSpeed/pull/6445, I have implemented a CI pipeline to validate the Huawei Ascend NPU. --------- Co-authored-by: sjh Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Olatunji Ruwase Co-authored-by: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 86711c4374f83..b302e32dfd9ca 100755 --- a/README.md +++ b/README.md @@ -142,6 +142,7 @@ DeepSpeed has been integrated with several different popular open-source DL fram | PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) | | Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) [![nv-mii](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml) [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) [![nv-sd](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml) | | Misc | 
[![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)[![python](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml) | +| Huawei Ascend NPU | [![Huawei Ascend NPU](https://github.com/cosdt/DeepSpeed/actions/workflows/huawei-ascend-npu.yml/badge.svg?branch=master)](https://github.com/cosdt/DeepSpeed/actions/workflows/huawei-ascend-npu.yml) | # Installation From a36db9cc1c167c24c7ce6b1d3f38a8550c6ec4b7 Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Date: Thu, 17 Oct 2024 10:50:55 -0700 Subject: [PATCH 41/43] Update torch version in workflows (#6631) Set PyTorch version in CI workflows to v2.5. Context: The [error](https://github.com/microsoft/DeepSpeed/actions/runs/11371525624/job/31633793986?pr=6630) in #6630 might have been caused by the PyTorch version mismatch or something. --- .github/workflows/cpu-torch-latest.yml | 4 ++-- .github/workflows/nv-nightly.yml | 2 +- .github/workflows/nv-torch-latest-v100.yml | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cpu-torch-latest.yml b/.github/workflows/cpu-torch-latest.yml index bb2b002b1a174..0de6832b37c13 100644 --- a/.github/workflows/cpu-torch-latest.yml +++ b/.github/workflows/cpu-torch-latest.yml @@ -50,5 +50,5 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.4" - HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.4" + HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.5" + HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.5" diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml index e6032941ebb30..8c56d0445c9c7 100644 --- a/.github/workflows/nv-nightly.yml +++ b/.github/workflows/nv-nightly.yml @@ -58,7 +58,7 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="2.4" --cuda_ver="12.1" + pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="2.5" --cuda_ver="12.1" - name: Open GitHub issue if nightly CI fails if: ${{ failure() && (github.event_name == 'schedule') }} diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index e888c472638fd..0b8f504d8b5a7 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -55,5 +55,5 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.4" --cuda_ver="12.1" - pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.4" --cuda_ver="12.1" + pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.5" --cuda_ver="12.1" + pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.5" --cuda_ver="12.1" From 
c9fc34a4be558efce2a26d2b4e08cd8880524dc8 Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Date: Thu, 17 Oct 2024 15:15:25 -0700 Subject: [PATCH 42/43] Use file store for tests (#6632) This PR changes the `init_method` for tests to `FileStore` for robustness. --- tests/unit/common.py | 50 +++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/tests/unit/common.py b/tests/unit/common.py index 69ba4c2708ace..685f943df2fec 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -147,16 +147,13 @@ class DistributedExec(ABC): def run(self): ... - def __call__(self, request=None): + def __call__(self, request): self._fixture_kwargs = self._get_fixture_kwargs(request, self.run) world_size = self.world_size if self.requires_cuda_env and not get_accelerator().is_available(): pytest.skip("only supported in accelerator environments.") - if isinstance(world_size, int): - world_size = [world_size] - for procs in world_size: - self._launch_procs(procs) + self._launch_with_file_store(request, world_size) def _get_fixture_kwargs(self, request, func): if not request: @@ -172,7 +169,7 @@ def _get_fixture_kwargs(self, request, func): pass # test methods can have kwargs that are not fixtures return fixture_kwargs - def _launch_daemonic_procs(self, num_procs): + def _launch_daemonic_procs(self, num_procs, init_method): # Create process pool or use cached one master_port = None @@ -198,7 +195,7 @@ def _launch_daemonic_procs(self, num_procs): master_port = get_master_port() # Run the test - args = [(local_rank, num_procs, master_port) for local_rank in range(num_procs)] + args = [(local_rank, num_procs, master_port, init_method) for local_rank in range(num_procs)] skip_msgs_async = pool.starmap_async(self._dist_run, args) try: @@ -218,7 +215,7 @@ def _launch_daemonic_procs(self, num_procs): assert len(set(skip_msgs)) == 1, "Multiple different skip messages received" pytest.skip(skip_msgs[0]) - def _launch_non_daemonic_procs(self, num_procs): + def _launch_non_daemonic_procs(self, num_procs, init_method): assert not self.reuse_dist_env, "Cannot reuse distributed environment with non-daemonic processes" master_port = get_master_port() @@ -227,7 +224,7 @@ def _launch_non_daemonic_procs(self, num_procs): prev_start_method = mp.get_start_method() mp.set_start_method('spawn', force=True) for local_rank in range(num_procs): - p = mp.Process(target=self._dist_run, args=(local_rank, num_procs, master_port, skip_msg)) + p = mp.Process(target=self._dist_run, args=(local_rank, num_procs, master_port, init_method, skip_msg)) p.start() processes.append(p) mp.set_start_method(prev_start_method, force=True) @@ -269,7 +266,7 @@ def _launch_non_daemonic_procs(self, num_procs): # add a check here to assert all exit messages are equal pytest.skip(skip_msg.get()) - def _launch_procs(self, num_procs): + def _launch_procs(self, num_procs, init_method): # Verify we have enough accelerator devices to run this test if get_accelerator().is_available() and get_accelerator().device_count() < num_procs: pytest.skip( @@ -284,11 +281,11 @@ def _launch_procs(self, num_procs): mp.set_start_method('forkserver', force=True) if self.non_daemonic_procs: - self._launch_non_daemonic_procs(num_procs) + self._launch_non_daemonic_procs(num_procs, init_method) else: - self._launch_daemonic_procs(num_procs) + self._launch_daemonic_procs(num_procs, init_method) - def _dist_run(self, local_rank, num_procs, master_port, skip_msg=""): + def _dist_run(self, 
local_rank, num_procs, master_port, init_method, skip_msg=""): if not dist.is_initialized(): """ Initialize deepspeed.comm and execute the user function. """ if self.set_dist_env: @@ -312,7 +309,10 @@ def _dist_run(self, local_rank, num_procs, master_port, skip_msg=""): get_accelerator().set_device(local_rank) if self.init_distributed: - deepspeed.init_distributed(dist_backend=self.backend) + deepspeed.init_distributed(dist_backend=self.backend, + init_method=init_method, + rank=local_rank, + world_size=num_procs) dist.barrier() try: @@ -328,6 +328,22 @@ def _dist_run(self, local_rank, num_procs, master_port, skip_msg=""): return skip_msg + def _launch_with_file_store(self, request, world_size): + tmpdir = request.getfixturevalue("tmpdir") + dist_file_store = tmpdir.join("dist_file_store") + assert not os.path.exists(dist_file_store) + init_method = f"file://{dist_file_store}" + + if isinstance(world_size, int): + world_size = [world_size] + for procs in world_size: + try: + self._launch_procs(procs, init_method) + finally: + if os.path.exists(dist_file_store): + os.remove(dist_file_store) + time.sleep(0.5) + def _dist_destroy(self): if (dist is not None) and dist.is_initialized(): dist.barrier() @@ -473,11 +489,7 @@ def __call__(self, request): else: world_size = self._fixture_kwargs.get("world_size", self.world_size) - if isinstance(world_size, int): - world_size = [world_size] - for procs in world_size: - self._launch_procs(procs) - time.sleep(0.5) + self._launch_with_file_store(request, world_size) def _get_current_test_func(self, request): # DistributedTest subclasses may have multiple test methods From 6eefc3d0ead2c6360eec248daab0cae66a737ea0 Mon Sep 17 00:00:00 2001 From: Joe Mayer <114769929+jomayeri@users.noreply.github.com> Date: Thu, 17 Oct 2024 19:58:06 -0700 Subject: [PATCH 43/43] Fix Memory Leak In AIO (#6630) Fixing a memory leak in AIO pinned tensor as well as an incorrect function type for gds op. 
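To make the pinned-buffer lifecycle concrete, here is a minimal Python-side sketch of the usage that used to leak host memory (the handle constructor arguments and buffer size are illustrative assumptions, not taken from this patch):

```python
import torch
from deepspeed.ops.aio import AsyncIOBuilder

aio = AsyncIOBuilder().load()
# Assumed argument order: block_size, queue_depth, single_submit, overlap_events, parallelism.
handle = aio.aio_handle(1024 * 1024, 128, False, False, 1)

for _ in range(100):
    # Allocates a page-locked host buffer (malloc + mlock) inside deepspeed_pin_tensor_t.
    pinned = handle.new_cpu_locked_tensor(1 << 20, torch.empty(0, dtype=torch.uint8))
    # ... use `pinned` as a bounce buffer for file reads/writes ...
    # Before this fix, free_cpu_locked_tensor only munlock'ed the pages; the malloc'd
    # memory itself was never released, so loops like this grew host memory over time.
    handle.free_cpu_locked_tensor(pinned)
```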
--------- Co-authored-by: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> --- csrc/aio/py_lib/deepspeed_cpu_op.cpp | 5 ++++- csrc/aio/py_lib/deepspeed_pin_tensor.cpp | 2 ++ csrc/gds/py_lib/deepspeed_gds_op.cpp | 3 +-- csrc/gds/py_lib/deepspeed_gds_op.h | 2 +- csrc/gds/py_lib/deepspeed_py_gds_handle.cpp | 2 +- csrc/gds/py_lib/deepspeed_py_gds_handle.h | 2 +- 6 files changed, 10 insertions(+), 6 deletions(-) diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.cpp b/csrc/aio/py_lib/deepspeed_cpu_op.cpp index da1a52d9c6e3d..da2ff568d74bf 100644 --- a/csrc/aio/py_lib/deepspeed_cpu_op.cpp +++ b/csrc/aio/py_lib/deepspeed_cpu_op.cpp @@ -38,7 +38,10 @@ void cpu_op_desc_t::finish() { if (_use_bounce_buffer) { if (_read_op) { - if (_buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); } + if (_buffer.is_cuda()) { + _buffer.copy_(_cpu_buffer.to(torch::Device(torch::kCUDA, _buffer.get_device()), + /*non_blocking=*/true)); + } if (_buffer.is_xpu()) { _buffer.copy_(_cpu_buffer.to(torch::kXPU)); } if (_buffer.is_cpu()) { _buffer.copy_(_cpu_buffer); } #if defined(__ENABLE_CANN__) diff --git a/csrc/aio/py_lib/deepspeed_pin_tensor.cpp b/csrc/aio/py_lib/deepspeed_pin_tensor.cpp index 6d2800468e061..a97a4ac18ba81 100644 --- a/csrc/aio/py_lib/deepspeed_pin_tensor.cpp +++ b/csrc/aio/py_lib/deepspeed_pin_tensor.cpp @@ -15,6 +15,7 @@ deepspeed_pin_tensor_t::~deepspeed_pin_tensor_t() { for (auto iter = _locked_tensors.begin(); iter != _locked_tensors.end(); ++iter) { munlock(iter->first, iter->second); + std::free((void*)iter->first); } _locked_tensors.clear(); } @@ -43,6 +44,7 @@ bool deepspeed_pin_tensor_t::free(torch::Tensor& locked_tensor) auto addr = locked_tensor.data_ptr(); if (_locked_tensors.find(addr) != _locked_tensors.end()) { munlock(addr, _locked_tensors[addr]); + std::free(addr); _locked_tensors.erase(addr); return true; } diff --git a/csrc/gds/py_lib/deepspeed_gds_op.cpp b/csrc/gds/py_lib/deepspeed_gds_op.cpp index dae2eef21c6f1..f49f74394374e 100644 --- a/csrc/gds/py_lib/deepspeed_gds_op.cpp +++ b/csrc/gds/py_lib/deepspeed_gds_op.cpp @@ -58,7 +58,6 @@ void gds_op_desc_t::add_buffer_to_registry(const torch::Tensor& buffer) const int64_t device = buffer.get_device(); void* reg_ptr = buffer.data_ptr(); - // std::cout << "REG PTR " << reg_ptr << std::endl; // TODO: add checking to make sure pointer isn't already in set const auto it = base_ptr_registry.find(device); if (it == base_ptr_registry.end()) { @@ -94,7 +93,7 @@ gds_op_desc_t::gds_op_desc_t(const bool read_op, const torch::Tensor& buffer, const int fd, const char* filename, - const long long int file_num_bytes, + const int64_t file_num_bytes, const int intra_op_parallelism, const bool validate) : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, intra_op_parallelism, validate) diff --git a/csrc/gds/py_lib/deepspeed_gds_op.h b/csrc/gds/py_lib/deepspeed_gds_op.h index c9d4c076f1894..380bb0b9b6aed 100644 --- a/csrc/gds/py_lib/deepspeed_gds_op.h +++ b/csrc/gds/py_lib/deepspeed_gds_op.h @@ -22,7 +22,7 @@ struct gds_op_desc_t : io_op_desc_t { const torch::Tensor& buffer, const int fd, const char* filename, - const long long int file_num_bytes, + const int64_t file_num_bytes, const int intra_op_parallelism, const bool validate); diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp index 43705939dc3e5..c052144a0190b 100644 --- a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp @@ -106,7 +106,7 @@ std::shared_ptr 
deepspeed_gds_handle_t::_create_io_op_desc( const torch::Tensor& buffer, const int fd, const char* filename, - const long long int file_num_bytes, + const int64_t file_num_bytes, const bool validate) { if (buffer.is_cuda()) { diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.h b/csrc/gds/py_lib/deepspeed_py_gds_handle.h index a3c10a4f64676..131e83e7b838a 100644 --- a/csrc/gds/py_lib/deepspeed_py_gds_handle.h +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.h @@ -41,7 +41,7 @@ struct deepspeed_gds_handle_t : deepspeed_io_handle_t { const torch::Tensor& buffer, const int fd, const char* filename, - const long long int file_num_bytes, + const int64_t file_num_bytes, const bool validate); static int s_cuFile_init;