From 1b58ba5ec04493a112fae10d9cc9c824dfbd40ca Mon Sep 17 00:00:00 2001 From: xyxie Date: Wed, 11 Dec 2024 02:31:11 +0800 Subject: [PATCH 01/15] Merge LoCo with Zero++ (#6730) ### Integration of LoCo Method into ZeRO++ #### Overview This PR introduces the integration of the **LoCo** method, as outlined in [this paper](https://arxiv.org/abs/2407.04480), into the ZeRO++ framework of DeepSpeed. The key enhancement involves applying error feedback compensation to 4-bit gradients before communication. This approach ***improves pre-training loss outcomes without additional time overhead***, though it requires extra GPU memory. The extent of this memory increase depends on model size and training configuration. #### Experimental Results We conducted pre-training experiments using the Llama2 architecture, adjusting the number of layers and hidden size. The experiments included: - **A smaller-scale model with 0.8B parameters trained on 30B tokens**. - **A larger-scale model with 8B parameters trained on 5B tokens**. The training data was sampled from **Redpajama-V2**.

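For intuition, the compensation added by the new `loco_swizzled_quant` / `loco_dequant_reduce` kernels in this PR keeps an error-feedback buffer that is added to the gradient before quantization and is then refreshed with a moving average of the fresh quantization residual, weighted by `err_beta`. The sketch below is a minimal NumPy illustration of that update under simplified assumptions (per-tensor symmetric quantization, hypothetical helper names); it is not the fused CUDA implementation:

```python
import numpy as np

def fake_quant_dequant(x, num_bits=4):
    # Illustrative symmetric per-tensor quantize -> dequantize round trip (not the DeepSpeed kernel).
    qmax = 2 ** (num_bits - 1) - 1
    scale = max(float(np.abs(x).max()), 1e-8) / qmax
    q = np.clip(np.round(x / scale), -qmax - 1, qmax)
    return q * scale

def loco_compress(grad, err_buf, err_beta=0.8, num_bits=4):
    compensated = grad + err_buf                   # inject the accumulated error before quantizing
    communicated = fake_quant_dequant(compensated, num_bits)
    residual = compensated - communicated          # error left behind by this quantization step
    err_buf = err_beta * err_buf + (1.0 - err_beta) * residual  # moving-average error feedback
    return communicated, err_buf                   # the communicated values feed the all-to-all
```

In the actual kernels this update is applied twice per step (once for the intra-node swizzled quantization and once for the inter-node dequant-reduce), and the buffers are reset every `reset_T` steps as configured through `zeropp_loco_param` (defaults: `err_beta=0.8`, `reset_T=1024`).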
**Findings**: - **Smaller Models (0.8B parameters)**: Significant gains were observed when applying the LoCo method. - **Larger Models (8B parameters)**: The gains were present but less pronounced. This could be due to: 1. Relatively smaller data volume. 2. Lower pre-training loss for larger models, making significant improvements harder to achieve. However, even a smaller pre-training loss gap in larger models can translate to meaningful gains in downstream tasks. #### Example Script For reference, the [run.sh](https://github.com/user-attachments/files/17679552/zeroplus-7b3.zip) script used for the 8B parameter, 5B tokens experiment is attached. The experiment was conducted using the **DeepSpeed-Megatron** platform. #### Acknowledgments Special thanks to cc @GuanhuaWang for ongoing communication and guidance throughout this work. --- We appreciate your consideration of this PR and welcome any feedback or questions! --------- Co-authored-by: ChuanxinTang Co-authored-by: root Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Logan Adams Co-authored-by: Hongwei Chen <33092912+hwchen2017@users.noreply.github.com> --- csrc/includes/quantization.h | 30 ++ csrc/includes/quantization_utils.h | 1 + csrc/quantization/pt_binding.cpp | 106 +++++++ csrc/quantization/quant_reduce.cu | 294 ++++++++++++++++++ csrc/quantization/swizzled_quantize.cu | 231 ++++++++++++++ .../runtime/comm/coalesced_collectives.py | 79 ++++- deepspeed/runtime/engine.py | 7 +- deepspeed/runtime/zero/config.py | 13 +- deepspeed/runtime/zero/stage3.py | 32 +- .../comm/test_coalesced_collectives.py | 63 ++++ 10 files changed, 851 insertions(+), 5 deletions(-) diff --git a/csrc/includes/quantization.h b/csrc/includes/quantization.h index 45828832d8d2..5bdc96061a31 100644 --- a/csrc/includes/quantization.h +++ b/csrc/includes/quantization.h @@ -52,6 +52,36 @@ void launch_swizzled_quant(int8_t* q_data, int devices_per_node, cudaStream_t stream); +void launch_loco_swizzled_quant(int8_t* quantized_data, + float* quantized_scales, + const __half* uncompressed_data, + __half* error_feedback, + const float err_beta, + int num_bits, + quantize::Type quant_type, + int groups, + int elems_per_group, + int pipelining, + int nodes, + int devices_per_node, + cudaStream_t stream); + +void launch_loco_dequant_reduce(int8_t* reduced_data, + float* reduced_scales, + const int8_t* input_data, + const float* input_scales, + int num_gpus, + int num_bits, + quantize::Type quant_type, + int out_groups, + int elems_per_out_group, + int elems_per_in_tensor, + int groups_per_in_tensor, + int elems_per_in_group, + __half2* error_feedback, + const float err_beta, + cudaStream_t stream); + void launch_dequant_reduce(int8_t* reduced_data, float* reduced_scales, const int8_t* input_data, diff --git a/csrc/includes/quantization_utils.h b/csrc/includes/quantization_utils.h index 26db86ec1e0b..61630d0aae57 100644 --- a/csrc/includes/quantization_utils.h +++ b/csrc/includes/quantization_utils.h @@ -24,6 +24,7 @@ constexpr int max_threads = 1024; Class to hold the quantization parameters for a given tensor. Holds the implementation of the quantization operation. 
*/ + template class Params { public: diff --git a/csrc/quantization/pt_binding.cpp b/csrc/quantization/pt_binding.cpp index e74c541699d7..b48eaacd0881 100644 --- a/csrc/quantization/pt_binding.cpp +++ b/csrc/quantization/pt_binding.cpp @@ -176,6 +176,53 @@ at::Tensor dequantize_int8_to_half_experimental(at::Tensor& data_in, return output; } +std::vector ds_loco_swizzle_quant(at::Tensor& input_vals, + at::Tensor& error_feedback, + float err_beta, + int groups, + int num_bits, + quantize::Type quant_type, + int pipeline_size, + int nodes, + int devices_per_node) +{ + auto scales_options = at::TensorOptions() + .dtype(at::kFloat) + .layout(at::kStrided) + .device(at::kCUDA) + .requires_grad(false); + const int scales_elems = (quantize::requires_offset(quant_type)) ? 2 : 1; + auto scales = torch::empty({groups, scales_elems}, scales_options); + + auto output_options = at::TensorOptions() + .dtype(at::kChar) + .layout(at::kStrided) + .device(at::kCUDA) + .requires_grad(false); + + const int quantization_scalar = 8 / num_bits; + const int compressed_vals = at::numel(input_vals) / quantization_scalar; + + auto output = torch::empty({compressed_vals}, output_options); + const int elems_per_group = at::numel(input_vals) / groups; + + launch_loco_swizzled_quant(reinterpret_cast(output.data_ptr()), + reinterpret_cast(scales.data_ptr()), + reinterpret_cast(input_vals.data_ptr()), + reinterpret_cast<__half*>(error_feedback.data_ptr()), + err_beta, + num_bits, + quant_type, + groups, + elems_per_group, + pipeline_size, + nodes, + devices_per_node, + at::cuda::getCurrentCUDAStream()); + + return {output, scales}; +} + std::vector ds_swizzle_quant(at::Tensor& input_vals, int groups, int num_bits, @@ -265,6 +312,61 @@ std::vector quantized_reduction(at::Tensor& input_vals, return {output, scales}; } +std::vector loco_quantized_reduction(at::Tensor& input_vals, + at::Tensor& input_scales, + at::Tensor& error_feedback, + float err_beta, + int in_groups, + int out_groups, + int num_bits, + quantize::Type quant_type, + int devices_per_node) +{ + auto scales_options = at::TensorOptions() + .dtype(at::kFloat) + .layout(at::kStrided) + .device(at::kCUDA) + .requires_grad(false); + + const int scales_elems = (quantize::requires_offset(quant_type)) ? 
2 : 1; + + auto scales = torch::empty({out_groups, scales_elems}, scales_options); + + auto output_options = at::TensorOptions() + .dtype(at::kChar) + .layout(at::kStrided) + .device(at::kCUDA) + .requires_grad(false); + + std::vector sz(input_vals.sizes().begin(), input_vals.sizes().end()); + sz[sz.size() - 1] = sz.back() / devices_per_node; + + const int elems_per_in_tensor = at::numel(input_vals) / devices_per_node; + + auto output = torch::empty(sz, output_options); + + const int elems_per_in_group = elems_per_in_tensor / (in_groups / devices_per_node); + const int elems_per_out_group = elems_per_in_tensor / out_groups; + + launch_loco_dequant_reduce((int8_t*)output.data_ptr(), + (float*)scales.data_ptr(), + (const int8_t*)input_vals.data_ptr(), + (const float*)input_scales.data_ptr(), + devices_per_node, + num_bits, + quant_type, + out_groups, + elems_per_out_group, + elems_per_in_tensor, + in_groups / devices_per_node, + elems_per_in_group, + (__half2*)error_feedback.data_ptr(), + err_beta, + at::cuda::getCurrentCUDAStream()); + + return {output, scales}; +} + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("ds_quantize_fp32", &ds_quantize, "DeepSpeed Quantize with fp32 (CUDA)"); @@ -295,4 +397,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "Dequantize int8 to half (experimental)"); m.def("swizzle_quant", &ds_swizzle_quant); m.def("quantized_reduction", &quantized_reduction); + m.def("loco_swizzle_quant", &ds_loco_swizzle_quant, "LoCo Swizzled Quantization Kernel"); + m.def("loco_quantized_reduction", + &loco_quantized_reduction, + "LoCo Quantization and Reduction Kernel"); } diff --git a/csrc/quantization/quant_reduce.cu b/csrc/quantization/quant_reduce.cu index 26db1118c831..4100c5174b80 100644 --- a/csrc/quantization/quant_reduce.cu +++ b/csrc/quantization/quant_reduce.cu @@ -261,3 +261,297 @@ void launch_dequant_reduce(int8_t* reduced_data, } } } + +/* +Modified loco_dequant_reduce function that performs dequantization and reduction, +and incorporates error-feedback by updating the error_feedback tensor in-place. +*/ + +template +__global__ void __launch_bounds__(1024) loco_dequant_reduce(int8_t* reduced_data, + float* reduced_scales, + const int8_t* input_data, + const float* input_scales, + int elems_per_out_group, + int elems_per_in_tensor, + int groups_per_in_tensor, + int elems_per_in_group, + int num_tensors, + __half2* error_feedback, + const float err_beta) +{ + cg::thread_block tb = cg::this_thread_block(); + cg::thread_block_tile warp = cg::tiled_partition(tb); + + constexpr int mem_granularity = (numBits == 8) ? 
8 : 4; + constexpr int elems_per_load = mem_granularity / sizeof(int8_t); + constexpr int storage_values = 16 / sizeof(__half2); + + const int block_offset = tb.group_index().x * elems_per_out_group; + const int elem_offset = tb.thread_index().x * elems_per_load; + const int base_offset = block_offset + elem_offset; + const int stride = tb.group_dim().x * elems_per_load; + + constexpr int scaling_factor = elems_per_load / storage_values; + const int block_offset_err = block_offset / scaling_factor; + const int elem_offset_err = tb.thread_index().x * storage_values; + const int base_offset_err = block_offset_err + elem_offset_err; + const int stride_err = tb.group_dim().x * storage_values; + + __half2 local_buffer[totalChunks * storage_values]; + __half2 err_buffer[totalChunks * storage_values]; + + quantize::GroupStats stats; + +#pragma unroll + for (int i = 0; i < totalChunks; i++) { + __half2* iteration_buffer = local_buffer + i * storage_values; + __half2* iter_err_buffer = err_buffer + i * storage_values; + +#pragma unroll + for (int j = 0; j < storage_values; j++) { + iteration_buffer[j] = reduce::init(); + } + + const int iter_offset = i * stride + base_offset; + const int iter_offset_err = i * stride_err + base_offset_err; + const int iter_scale_idx = iter_offset / elems_per_in_group; + bool do_loads = i * stride + elem_offset < elems_per_out_group; + + if (numTensors > 0) { +#pragma unroll + for (int j = 0; j < numTensors; j++) { + if (do_loads) { + int8_t load_buffer[elems_per_load]; + + mem_access::load_global( + load_buffer, input_data + j * elems_per_in_tensor + iter_offset); + + quantize::Params params( + input_scales + j * groups_per_in_tensor, iter_scale_idx); + + __half2 dequant_buffer[storage_values]; + dequantize::chunk(dequant_buffer, load_buffer, params); + +#pragma unroll + for (int k = 0; k < storage_values; k++) { + iteration_buffer[k] = + reduce::element(iteration_buffer[k], dequant_buffer[k]); + } + } + } + } else { +#pragma unroll 4 + for (int j = 0; j < num_tensors; j++) { + if (do_loads) { + int8_t load_buffer[elems_per_load]; + + mem_access::load_global( + load_buffer, input_data + j * elems_per_in_tensor + iter_offset); + + quantize::Params params( + input_scales + j * groups_per_in_tensor, iter_scale_idx); + + __half2 dequant_buffer[storage_values]; + dequantize::chunk(dequant_buffer, load_buffer, params); + +#pragma unroll + for (int k = 0; k < storage_values; k++) { + iteration_buffer[k] = + reduce::element(iteration_buffer[k], dequant_buffer[k]); + } + } + } + } + mem_access::load_global( + iter_err_buffer, error_feedback + iter_offset_err, do_loads); +#pragma unroll + for (int k = 0; k < storage_values; k++) { + iteration_buffer[k] = __hadd2(iteration_buffer[k], iter_err_buffer[k]); + stats.update(iteration_buffer[k]); + } + } + + auto params = stats.template get_params(tb, warp); + + // Initialize dequantization parameters based on params + auto de_params = params; + de_params.scale = 1.0f / params.scale; + if constexpr (quantType == quantize::Type::Asymmetric) { de_params.offset = params.offset; } + + if (tb.thread_index().x == 0) { params.store(reduced_scales, tb.group_index().x); } + +#pragma unroll + for (int i = 0; i < totalChunks; i++) { + const int iter_offset = i * stride + base_offset; + const int iter_offset_err = i * stride_err + base_offset_err; + __half2* iteration_buffer = local_buffer + i * storage_values; + __half2* iter_err_buffer = err_buffer + i * storage_values; + + if (i * stride + elem_offset < elems_per_out_group) { + // 
----------- Begin Error-Feedback Modification ----------- + int8_t local_output[elems_per_load]; + quantize::_chunk(local_output, iteration_buffer, params); + mem_access::store_global(reduced_data + iter_offset, local_output); + + // Dequantize the quantized output to compute the dequantized value + __half2 dequant_buffer[storage_values]; + dequantize::chunk(dequant_buffer, local_output, de_params); + +#pragma unroll + for (int k = 0; k < storage_values; k++) { + // __half2 to float2 + float2 iter_buf_f = __half22float2(iteration_buffer[k]); + float2 dequant_buf_f = __half22float2(dequant_buffer[k]); + + // Update within float precision + float2 new_error_f; + new_error_f.x = iter_buf_f.x - dequant_buf_f.x; + new_error_f.y = iter_buf_f.y - dequant_buf_f.y; + + float2 iter_err_buf_f = __half22float2(iter_err_buffer[k]); + + iter_err_buf_f.x = err_beta * iter_err_buf_f.x + (1.0f - err_beta) * new_error_f.x; + iter_err_buf_f.y = err_beta * iter_err_buf_f.y + (1.0f - err_beta) * new_error_f.y; + + // float2 back to __half2 + iter_err_buffer[k] = __float22half2_rn(iter_err_buf_f); + } + mem_access::store_global(error_feedback + iter_offset_err, + iter_err_buffer); + } + } +} + +#define LAUNCH_LOCO_DEQUANT_REDUCE(num_chunks) \ + loco_dequant_reduce \ + <<>>(reduced_data, \ + reduced_scales, \ + input_data, \ + input_scales, \ + elems_per_out_group, \ + elems_per_in_tensor, \ + groups_per_in_tensor, \ + elems_per_in_group, \ + num_tensors, \ + error_feedback, \ + err_beta); + +template +void launch_loco_dequant_reduce_impl(int8_t* reduced_data, + float* reduced_scales, + const int8_t* input_data, + const float* input_scales, + int out_groups, + int elems_per_out_group, + int elems_per_in_tensor, + int groups_per_in_tensor, + int elems_per_in_group, + int num_tensors, + __half2* error_feedback, + const float err_beta, + cudaStream_t stream) +{ + constexpr int elems_per_thread = numBits; + const int one_step_threads = + next_pow2((elems_per_out_group + elems_per_thread - 1) / (elems_per_thread)); + const int threads = (one_step_threads < 1024) ? one_step_threads : 1024; + + dim3 block(threads); + dim3 grid(out_groups); + + const int elems_per_step = threads * elems_per_thread; + const int unroll_raw = (elems_per_out_group + elems_per_step - 1) / elems_per_step; + + const int unroll = (unroll_raw >= 4) ? 
pow2_round<1>(unroll_raw) : unroll_raw; + + if (unroll == 1) { + LAUNCH_LOCO_DEQUANT_REDUCE(1); + } else if (unroll == 2) { + LAUNCH_LOCO_DEQUANT_REDUCE(2); + } else if (unroll == 3) { + LAUNCH_LOCO_DEQUANT_REDUCE(3); + } else if (unroll == 4) { + LAUNCH_LOCO_DEQUANT_REDUCE(4); + } else if (unroll == 6) { + LAUNCH_LOCO_DEQUANT_REDUCE(6); + } else if (unroll == 8) { + LAUNCH_LOCO_DEQUANT_REDUCE(8); + } else if (unroll == 10) { + LAUNCH_LOCO_DEQUANT_REDUCE(10); + } else if (unroll == 12) { + LAUNCH_LOCO_DEQUANT_REDUCE(12); + } else { + assert(false); + } +} + +#define LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(NUM_BITS, NUM_GPUS, QUANT_TYPE) \ + launch_loco_dequant_reduce_impl(reduced_data, \ + reduced_scales, \ + input_data, \ + input_scales, \ + out_groups, \ + elems_per_out_group, \ + elems_per_in_tensor, \ + groups_per_in_tensor, \ + elems_per_in_group, \ + num_gpus, \ + error_feedback, \ + err_beta, \ + stream); + +void launch_loco_dequant_reduce(int8_t* reduced_data, + float* reduced_scales, + const int8_t* input_data, + const float* input_scales, + int num_gpus, + int num_bits, + quantize::Type quant_type, + int out_groups, + int elems_per_out_group, + int elems_per_in_tensor, + int groups_per_in_tensor, + int elems_per_in_group, + __half2* error_feedback, + const float err_beta, + cudaStream_t stream) +{ + if (quant_type == quantize::Type::Symmetric) { + if (num_bits == 4) { + if (num_gpus == 8) { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(4, 8, quantize::Type::Symmetric); + } else if (num_gpus == 16) { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(4, 16, quantize::Type::Symmetric); + } else { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(4, -1, quantize::Type::Symmetric); + } + } else if (num_bits == 8) { + if (num_gpus == 8) { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(8, 8, quantize::Type::Symmetric); + } else if (num_gpus == 16) { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(8, 16, quantize::Type::Symmetric); + } else { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(8, -1, quantize::Type::Symmetric); + } + } + } else if (quant_type == quantize::Type::Asymmetric) { + if (num_bits == 4) { + if (num_gpus == 8) { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(4, 8, quantize::Type::Asymmetric); + } else if (num_gpus == 16) { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(4, 16, quantize::Type::Asymmetric); + } else { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(4, -1, quantize::Type::Asymmetric); + } + } else if (num_bits == 8) { + if (num_gpus == 8) { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(8, 8, quantize::Type::Asymmetric); + } else if (num_gpus == 16) { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(8, 16, quantize::Type::Asymmetric); + } else { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(8, -1, quantize::Type::Asymmetric); + } + } + } +} diff --git a/csrc/quantization/swizzled_quantize.cu b/csrc/quantization/swizzled_quantize.cu index 5a02a0ae8120..a4b6096c81af 100644 --- a/csrc/quantization/swizzled_quantize.cu +++ b/csrc/quantization/swizzled_quantize.cu @@ -3,6 +3,7 @@ // DeepSpeed Team +#include "dequantization_utils.h" #include "memory_access_utils.h" #include "quantization_utils.h" #include "reduction_utils.h" @@ -194,3 +195,233 @@ void launch_swizzled_quant(int8_t* q_data, } } } + +template +__global__ void loco_swizzled_quant_kernel(int8_t* quantized_data, + float* quantized_scales, + const __half* uncompressed_data, + __half* error_feedback, + const float err_beta, + int groups, + int elems_per_group, + int pipelining, + int nodes, + int devices_per_node) +{ + cg::thread_block tb = cg::this_thread_block(); + cg::thread_block_tile warp = cg::tiled_partition(tb); + + // Indexing offsets, same as normal 
quantization for in-case + const int block_rank_data = + blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y; + const int block_offset_data = block_rank_data * elems_per_group; + const int elem_offset = tb.thread_index().x * quantize::h_per_load; + const int base_offset_data = block_offset_data + elem_offset; + const int stride = tb.size() * quantize::h_per_load; + const __half* uncompressed_data_base = uncompressed_data + base_offset_data; + + const int partition_id = blockIdx.z; + const int partition_offset = partition_id / devices_per_node; + const int partition_base = (partition_id % devices_per_node) * nodes; + const int pipelining_offset = blockIdx.y * (devices_per_node * nodes); + const int output_partition = (pipelining_offset + partition_base + partition_offset); + const int block_rank_err = output_partition * gridDim.x + blockIdx.x; + + const int block_offset_err = block_rank_err * elems_per_group; + const int base_offset_err = block_offset_err + elem_offset; + __half* error_feedback_base = error_feedback + base_offset_err; + + __half2 local_buffer[totalChunks * quantize::h2_per_load]; + __half2 err_buffer[totalChunks * quantize::h2_per_load]; + + quantize::GroupStats stats; + +#pragma unroll + for (int i = 0; i < totalChunks; i++) { + __half2* iteration_buffer = local_buffer + i * quantize::h2_per_load; + __half2* iter_err_buffer = err_buffer + i * quantize::h2_per_load; + const int i_stride = i * stride; + bool do_loads = (elem_offset + i_stride) < elems_per_group; + + mem_access::load_global( + iteration_buffer, uncompressed_data_base + i_stride, do_loads); + + mem_access::load_global( + iter_err_buffer, error_feedback_base + i_stride, do_loads); + +#pragma unroll + for (int j = 0; j < quantize::h2_per_load; j++) { + iteration_buffer[j] = __hadd2(iteration_buffer[j], iter_err_buffer[j]); + stats.update(iteration_buffer[j]); + } + } + + auto params = stats.template get_params(tb, warp); + + // Initialize dequantization parameters based on params + auto de_params = params; + de_params.scale = 1.0f / params.scale; + if constexpr (quantType == quantize::Type::Asymmetric) { de_params.offset = params.offset; } + + if (threadIdx.x == 0) { params.store(quantized_scales, block_rank_err); } + + constexpr int out_scalar_effect = 8 / numBits; + const int out_block_offset = block_rank_err * elems_per_group / out_scalar_effect; + const int out_base_offset = out_block_offset + elem_offset / out_scalar_effect; + int8_t* out_base = quantized_data + out_base_offset; + + const int out_stride = stride / out_scalar_effect; + constexpr int num_int8_out = quantize::h_per_load / out_scalar_effect; + +#pragma unroll + for (int i = 0; i < totalChunks; i++) { + const int i_stride = i * stride; + __half2* iteration_buffer = local_buffer + i * quantize::h2_per_load; + __half2* iter_err_buffer = err_buffer + i * quantize::h2_per_load; + + if (i_stride + elem_offset < elems_per_group) { + int8_t local_output[quantize::h_per_load / out_scalar_effect]; + quantize::_chunk(local_output, iteration_buffer, params); + mem_access::store_global(out_base + i * out_stride, local_output); + + // Dequantize the quantized output to compute the dequantized value + __half2 dequant_buffer[quantize::h2_per_load]; + dequantize::chunk(dequant_buffer, local_output, de_params); + +// Compute new error: sum - dequant_buffer +#pragma unroll + for (int k = 0; k < quantize::h2_per_load; k++) { + // __half2 to float2 + float2 iter_buf_f = __half22float2(iteration_buffer[k]); + float2 dequant_buf_f = 
__half22float2(dequant_buffer[k]); + + // Update within float precision + float2 new_error_f; + new_error_f.x = iter_buf_f.x - dequant_buf_f.x; + new_error_f.y = iter_buf_f.y - dequant_buf_f.y; + + float2 iter_err_buf_f = __half22float2(iter_err_buffer[k]); + + iter_err_buf_f.x = err_beta * iter_err_buf_f.x + (1.0f - err_beta) * new_error_f.x; + iter_err_buf_f.y = err_beta * iter_err_buf_f.y + (1.0f - err_beta) * new_error_f.y; + + // float2 back to __half2 + iter_err_buffer[k] = __float22half2_rn(iter_err_buf_f); + } + __half2* error_feedback_base_h2 = reinterpret_cast<__half2*>(error_feedback_base); + mem_access::store_global(error_feedback_base_h2 + i_stride / 2, + iter_err_buffer); + } + } +} + +#define LAUNCH_LOCO_SWIZZLE_QUANT(total_chunks, threads) \ + loco_swizzled_quant_kernel \ + <<>>(output_data, \ + params, \ + input_data, \ + error_feedback, \ + err_beta, \ + groups, \ + elems_per_group, \ + pipelining, \ + nodes, \ + devices_per_node); + +template +void launch_loco_swizzled_quant_impl(int8_t* output_data, + float* params, + const __half* input_data, + __half* error_feedback, + const float err_beta, + int groups, + int elems_per_group, + int pipelining, + int nodes, + int devices_per_node, + cudaStream_t stream) +{ + const int one_step_threads = + next_pow2((elems_per_group + swiz_quant::h_per_step - 1) / swiz_quant::h_per_step); + const int max_threads = (one_step_threads < swiz_quant::max_threads) ? one_step_threads + : swiz_quant::max_threads; + const int threads = (max_threads < swiz_quant::min_threads) ? swiz_quant::min_threads + : max_threads; + + dim3 block(threads); + const int groups_per_partition = groups / (nodes * devices_per_node); + assert(groups_per_partition % pipelining == 0); + const int contiguous_groups = groups_per_partition / pipelining; + const int partitions = nodes * devices_per_node; + dim3 grid(contiguous_groups, pipelining, partitions); + + const int elems_per_step = threads * swiz_quant::h_per_step; + const int external_unroll = ((elems_per_group + elems_per_step - 1) / elems_per_step); + const int total_unroll = external_unroll * swiz_quant::step_granularity; + + assert(total_unroll % 2 == 0); + + if (threads == 32) { + LAUNCH_LOCO_SWIZZLE_QUANT(2, 32); + } else if (threads == 64) { + LAUNCH_LOCO_SWIZZLE_QUANT(2, 64); + } else if (threads == 128) { + LAUNCH_LOCO_SWIZZLE_QUANT(2, 128); + } else if (threads == 256) { + LAUNCH_LOCO_SWIZZLE_QUANT(2, 256); + } else if (threads == 512) { + if (total_unroll == 2) { + LAUNCH_LOCO_SWIZZLE_QUANT(2, 512); + } else if (total_unroll == 4) { + LAUNCH_LOCO_SWIZZLE_QUANT(4, 512); + } else if (total_unroll == 6) { + LAUNCH_LOCO_SWIZZLE_QUANT(6, 512); + } else if (total_unroll == 8) { + LAUNCH_LOCO_SWIZZLE_QUANT(8, 512); + } else if (total_unroll == 10) { + LAUNCH_LOCO_SWIZZLE_QUANT(10, 512); + } + } +} + +#define DISPATCH_LOCO_SWIZZLE_QUANT(num_bits, qtype) \ + launch_loco_swizzled_quant_impl(output_data, \ + params, \ + input_data, \ + error_feedback, \ + err_beta, \ + groups, \ + elems_per_group, \ + pipelining, \ + nodes, \ + devices_per_node, \ + stream); + +void launch_loco_swizzled_quant(int8_t* output_data, + float* params, + const __half* input_data, + __half* error_feedback, + const float err_beta, + int num_bits, + quantize::Type q_type, + int groups, + int elems_per_group, + int pipelining, + int nodes, + int devices_per_node, + cudaStream_t stream) +{ + if (num_bits == 4) { + if (q_type == quantize::Type::Asymmetric) { + DISPATCH_LOCO_SWIZZLE_QUANT(4, quantize::Type::Asymmetric); + } else if (q_type == 
quantize::Type::Symmetric) { + DISPATCH_LOCO_SWIZZLE_QUANT(4, quantize::Type::Symmetric); + } + } else if (num_bits == 8) { + if (q_type == quantize::Type::Asymmetric) { + DISPATCH_LOCO_SWIZZLE_QUANT(8, quantize::Type::Asymmetric); + } else if (q_type == quantize::Type::Symmetric) { + DISPATCH_LOCO_SWIZZLE_QUANT(8, quantize::Type::Symmetric); + } + } +} diff --git a/deepspeed/runtime/comm/coalesced_collectives.py b/deepspeed/runtime/comm/coalesced_collectives.py index 543795126fab..c2fa907d7dbb 100644 --- a/deepspeed/runtime/comm/coalesced_collectives.py +++ b/deepspeed/runtime/comm/coalesced_collectives.py @@ -8,7 +8,7 @@ """ import math -from typing import List +from typing import List, Any import torch from torch import Tensor from deepspeed import comm as dist @@ -76,6 +76,83 @@ def all_to_all_quant_reduce(tensors: List[Tensor], groups: {}) -> List[Tensor]: return output_lst +@instrument_w_nvtx +@torch.no_grad() +def all_to_all_loco_quant_reduce( + params: List[Tensor], + groups: {}, + loco_param: Any = None, +) -> List[Tensor]: + global quantizer_module + global loco_idx + if quantizer_module is None: + quantizer_module = op_builder.QuantizerBuilder().load() + local_world_size = get_accelerator().device_count() + global_world_size = dist.get_world_size() + num_nodes = global_world_size // local_world_size + this_rank = dist.get_rank() + intra_idx = int(this_rank / local_world_size) + inter_idx = this_rank % local_world_size + output_lst: List[Tensor] = [None] * len(params) + for idx, p in enumerate(params): + tensor = p.grad + if tensor.dim() == 1: + output_lst[idx] = reduce_scatter_coalesced([tensor])[0] + elif tensor.numel() % (2 * global_world_size) != 0: + # Due to the constraint of 2-stage all-to-all, the input tensor must be divisible by 2 * global_world_size + # Otherwise, all-to-all cannot be performed because of shape mismatch. + # See more at https://github.com/microsoft/DeepSpeed/pull/5056 + logger.warning( + f"qgZ falls back to reduce_scatter because tensor size = {tensor.numel()} is not divisible by (2 * global_world_size) = {2 * global_world_size}. 
Please consider allocating a new world to enable qgZ" + ) + output_lst[idx] = reduce_scatter_coalesced([tensor])[0] + else: + err_beta = loco_param['err_beta'] + reset_T = loco_param['reset_T'] + if not hasattr(p, 'intra_ef_buf') or loco_idx > reset_T: + loco_idx = 0 + intra_err = torch.zeros_like(p.grad) + inter_err = torch.zeros(tensor.numel() // local_world_size, device=tensor.device, dtype=tensor.dtype) + else: + intra_err = quantizer_module.dequantize(p.intra_ef_buf[0], p.intra_ef_buf[1], + p.intra_ef_buf[1].numel(), 8, quantizer_module.Symmetric) + inter_err = quantizer_module.dequantize(p.inter_ef_buf[0], p.inter_ef_buf[1], + p.inter_ef_buf[1].numel(), 8, quantizer_module.Symmetric) + + intra_quant_group = max(tensor.shape[0], tensor.shape[1], global_world_size) + inter_quant_group = intra_quant_group // local_world_size + intra_quant_int4, intra_q_scales = quantizer_module.loco_swizzle_quant(tensor, intra_err, err_beta, + intra_quant_group, 4, + quantizer_module.Symmetric, 1, + num_nodes, local_world_size) + local_output = torch.empty_like(intra_quant_int4) + scale_output = torch.empty_like(intra_q_scales) + all_to_all_single(local_output, intra_quant_int4, group=groups[f'local_{intra_idx}']) + all_to_all_single(scale_output, intra_q_scales, group=groups[f'local_{intra_idx}']) + + p.intra_ef_buf = quantizer_module.quantize(intra_err, intra_quant_group, 8, quantizer_module.Symmetric) + + global_input_tensor, global_scales = quantizer_module.loco_quantized_reduction( + local_output, scale_output, inter_err, err_beta, intra_quant_group, inter_quant_group, 4, + quantizer_module.Symmetric, local_world_size) + + global_output = torch.empty_like(global_input_tensor) + global_scale_output = torch.empty_like(global_scales) + all_to_all_single(global_output, global_input_tensor, group=groups[f'global_{inter_idx}']) + all_to_all_single(global_scale_output, global_scales, group=groups[f'global_{inter_idx}']) + + p.inter_ef_buf = quantizer_module.quantize(inter_err, inter_quant_group, 8, quantizer_module.Symmetric) + + final_output = quantizer_module.dequantize(global_output, global_scale_output, global_scale_output.numel(), + 4, quantizer_module.Symmetric) + assert final_output.numel( + ) % num_nodes == 0, f"final_output.numel()={final_output.numel()} is not divisible by num_nodes={num_nodes}" + output_lst[idx] = (sum(list(final_output.chunk(num_nodes))) / num_nodes).view(-1) + loco_idx += 1 + + return output_lst + + @instrument_w_nvtx @torch.no_grad() def reduce_scatter_coalesced( diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 8c5da36e5a78..0aad018528d3 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -912,6 +912,9 @@ def zero_quantized_nontrainable_weights(self): def zero_quantized_gradients(self): return self._config.zero_config.zero_quantized_gradients + def zeropp_loco_param(self): + return self._config.zero_config.zeropp_loco_param + def dump_state(self): return self._config.dump_state @@ -1191,7 +1194,8 @@ def _configure_distributed_model(self, model): # Query the groups module to get information about various parallel groups self.local_all_to_all_group = None if self.zero_quantized_gradients(): - log_dist("Using quantized gradients", ranks=[0]) + message = "Using LoCo quantized gradients" if self.zeropp_loco_param() else "Using quantized gradients" + log_dist(message, ranks=[0]) self.local_all_to_all_group = groups._get_local_all_to_all_group() self.data_parallel_group = groups._get_data_parallel_group() self.dp_world_size = 
groups._get_data_parallel_world_size() @@ -1667,6 +1671,7 @@ def _configure_zero_optimizer(self, optimizer): zero_quantized_weights=self.zero_quantized_weights(), zero_quantized_nontrainable_weights=self.zero_quantized_nontrainable_weights(), zero_module_granularity_threshold=self.zero_module_granularity_threshold(), + zeropp_loco_param=self.zeropp_loco_param(), ) else: diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py index 19b272ce9e92..cbc6a15c2057 100644 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -4,7 +4,7 @@ # DeepSpeed Team import sys -from typing import Optional +from typing import Optional, Dict, Any from enum import Enum from pydantic import Field, model_validator from deepspeed.runtime.config_utils import get_scalar_param, pp_int, DeepSpeedConfigModel @@ -44,6 +44,7 @@ "zero_quantized_gradients": [true|false], "memory_efficient_linear": [true|false], "override_module_apply": [true|false], + "zeropp_loco_param": {...}, } } """ @@ -310,6 +311,16 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): Boolean indicating whether to use quantized zero gradients for efficient all_2_all_reduce comm """ + zeropp_loco_param: Optional[Dict[str, Any]] = None + """ + This dictionary contains parameters for using LoCo-Zero++, with two key parameters: + - `err_beta`: A coefficient for the moving average of quantization errors before and after gradient computation. + It ranges between 0 and 1, with a default value of 0.8. + - `reset_T`: The number of steps after which the moving-average error buffer is cleared. The default value is 1024. + These parameters can be adjusted based on performance needs. Example configuration in ds config: + "zeropp_loco_param": { "err_beta": 0.8, "reset_T": 1024 }. + See LoCo paper for more details: (https://arxiv.org/abs/2407.04480). 
+ """ mics_shard_size: int = Field(-1, json_schema_extra={"new_param": "mics_shard_size"}) diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 99a5ecf41a2f..459cffce52c8 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -17,7 +17,7 @@ from deepspeed.runtime.base_optimizer import ZeROOptimizer from deepspeed.utils import logger from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler -from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced, all_to_all_quant_reduce +from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced, all_to_all_quant_reduce, all_to_all_loco_quant_reduce from deepspeed.runtime.utils import inf, is_model_parallel_parameter, get_only_unique_item from deepspeed.runtime.zero.partition_parameters import * from deepspeed.runtime.zero.config import ZeroStageEnum @@ -158,6 +158,7 @@ def __init__( zero_quantized_weights=False, zero_quantized_nontrainable_weights=False, zero_module_granularity_threshold=0, + zeropp_loco_param=None, ): see_memory_usage("Stage 3 initialize beginning", force=True) @@ -284,6 +285,8 @@ def __init__( self.partition_count = dist.get_world_size(group=self.dp_process_group) + self.zeropp_loco_param = zeropp_loco_param + if mpu is None: self.model_parallel_group = None self.model_parallel_rank = 0 @@ -1383,7 +1386,10 @@ def __avg_scatter_grads(self, params_to_reduce: List[Parameter]) -> List[Tensor] global_world_size = dist.get_world_size() num_nodes = global_world_size // local_world_size if self.all2all_process_group is not None and num_nodes > 1: - grad_partitions_for_rank = all_to_all_quant_reduce(full_grads_for_rank, self.all2all_process_group) + grad_partitions_for_rank = (all_to_all_loco_quant_reduce(params_to_reduce, self.all2all_process_group, + self.zeropp_loco_param) + if self.zeropp_loco_param is not None else all_to_all_quant_reduce( + full_grads_for_rank, self.all2all_process_group)) else: grad_partitions_for_rank = reduce_scatter_coalesced(full_grads_for_rank, self.dp_process_group) @@ -2009,6 +2015,25 @@ def _overflow_clean_up(self, prev_scale): see_memory_usage('After overflow after clearing gradients', force=False) + def _loco_err_buf_update(self, overflow: bool, scale=1.0): + """ + Loco Error Buffer update. + """ + if not overflow and scale == 1.0: return + if dist.get_rank() == 0: + logger.info(f"update loco-zero++ error buffer with overflow: {overflow}") + # FP32 grad should never exist. 
+ # For speed, set model fp16 grad to None by default + for group in self.fp16_groups: + for p in group: + if hasattr(p, 'intra_ef_buf'): + if overflow: + del p.intra_ef_buf + del p.inter_ef_buf + else: + p.intra_ef_buf[1] *= scale + p.inter_ef_buf[1] *= scale + @instrument_w_nvtx def _overflow_check_and_loss_scale_update(self): @@ -2023,6 +2048,9 @@ def _overflow_check_and_loss_scale_update(self): if self.overflow: self._overflow_clean_up(prev_scale) + #update loco error buf + self._loco_err_buf_update(self.overflow, self.loss_scale / prev_scale) + return self.overflow @instrument_w_nvtx diff --git a/tests/unit/runtime/comm/test_coalesced_collectives.py b/tests/unit/runtime/comm/test_coalesced_collectives.py index 17b2ffbb9d29..2d5db192f2ca 100644 --- a/tests/unit/runtime/comm/test_coalesced_collectives.py +++ b/tests/unit/runtime/comm/test_coalesced_collectives.py @@ -96,3 +96,66 @@ def test_non_divisible(self): elif dist.get_rank() == 1: assert output.shape == (24, ) assert torch.allclose(output, torch.zeros_like(output)) + + +class TestLocoQuantized(DistributedTest): + + world_size = 1 + + @pytest.mark.parametrize("num_bits", [4, 8]) + @pytest.mark.parametrize("tensor_size", [(16, 16), (64, 64)]) + @pytest.mark.parametrize("devices_per_node", [4, 8]) + def test_loco_quantized_reduction(self, num_bits, tensor_size, devices_per_node): + from deepspeed.ops.op_builder import QuantizerBuilder + if not deepspeed.ops.__compatible_ops__[QuantizerBuilder.NAME]: + pytest.skip("QuantizerBuilder is not implemented") + + quantizer_module = QuantizerBuilder().load() + + tensor = torch.randn(tensor_size, device='cuda', dtype=torch.half) + + num_nodes = 2 # Fake world size + total_elements = tensor.numel() + total_devices = devices_per_node * num_nodes + num_groups = max(tensor.shape[0], tensor.shape[1], total_devices) + + # Initialize error_feedback tensor + error_feedback = torch.randn(tensor_size, device=tensor.device, dtype=tensor.dtype) + error_feedback_ori = error_feedback.clone() + # Swizzle the original tensor + tensor_reshaped = tensor.reshape(num_nodes, devices_per_node, total_elements // total_devices) + swizzled_tensor = tensor_reshaped.permute(1, 0, 2).reshape(tensor.size()) + + # Perform loco_swizzle_quant + output, scales = quantizer_module.loco_swizzle_quant(tensor, error_feedback, 0.0, num_groups, num_bits, + quantizer_module.Symmetric, 1, num_nodes, + devices_per_node) + + # Compare swizzled_tensor with the output of loco_swizzle_quant + dequantized = quantizer_module.dequantize(output, scales, scales.numel(), num_bits, + quantizer_module.Symmetric).view(tensor.size()) + + assert torch.allclose(swizzled_tensor + error_feedback_ori, dequantized + error_feedback) + + # Calculate elements per group and groups per partition + elements_per_group = total_elements // num_groups + groups_per_partition = num_groups // devices_per_node + + # Reshape dequantized data to match the grouping in loco_quantized_reduction + dequantized_reshaped = dequantized.view(devices_per_node, groups_per_partition, elements_per_group) + + # Perform reduction across devices_per_node dimension + reduced_dequantized = dequantized_reshaped.cumsum(dim=0)[-1] + # Initialize error_feedback tensor + error_feedback = torch.randn(reduced_dequantized.shape, device=tensor.device, dtype=dequantized.dtype) + error_feedback_ori = error_feedback.clone() + + # perform loco_quantized_reduction + output, scales = quantizer_module.loco_quantized_reduction(output, scales, error_feedback, 0.0, num_groups, + num_groups // 
devices_per_node, num_bits, + quantizer_module.Symmetric, devices_per_node) + + dequantized_reduced = quantizer_module.dequantize(output, scales, scales.numel(), num_bits, + quantizer_module.Symmetric).view(error_feedback.size()) + + assert torch.allclose(reduced_dequantized + error_feedback_ori, dequantized_reduced + error_feedback) From 9e3125281f992a15486ca083f1acfa5f61931497 Mon Sep 17 00:00:00 2001 From: Yichen Yan Date: Wed, 11 Dec 2024 04:12:42 +0800 Subject: [PATCH 02/15] Fix type error in `ZeROOrderedDict` (#6794) As @keskival pointed in https://github.com/microsoft/DeepSpeed/commit/3d5cf739ead7c78f518a518ccaa15a323bd5c8da#r149582004, I've confirmed there's a type error, which this PR fixes. I didn't run into this because our internal version still use `*r2`. Co-authored-by: Tero Keski-Valkama Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepspeed/runtime/zero/parameter_offload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/zero/parameter_offload.py b/deepspeed/runtime/zero/parameter_offload.py index f945f5166190..0be88a1e1ba6 100644 --- a/deepspeed/runtime/zero/parameter_offload.py +++ b/deepspeed/runtime/zero/parameter_offload.py @@ -52,7 +52,7 @@ def __init__(self, parent_module, *args, **kwargs): def __reduce__(self): r0, _, *r2 = super().__reduce__() - return (r0, (self._parent_module, )) + r2 + return (r0, (self._parent_module, )) + tuple(r2) def __getitem__(self, key): param = super().__getitem__(key) From ecb4bf32614fa8d6d4433ea579ca867008d34140 Mon Sep 17 00:00:00 2001 From: Eugene29 <52263376+Eugene29@users.noreply.github.com> Date: Tue, 10 Dec 2024 17:58:25 -0600 Subject: [PATCH 03/15] Fix uneven head sequence parallelism bug (#6774) (#6797) Here `gather_idx < 2` represents `is_first_all2all`. During the first all2all, `uneven_head_all2all` will be called if either `num_heads % seq_world_size != 0` or `get_num_kv_heads() is None`. During the second all2all, it'll return return `uneven_head_all2all` if and only if `get_num_kv_heads() is None` which is always set during the first uneven all2all. This means that there will no longer be issue where `uneven_head_all2all ` is returned for the second all2all because of `num_heads % seq_world_size != 0`. 
Fixes: #6774 --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepspeed/sequence/layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/sequence/layer.py b/deepspeed/sequence/layer.py index 3dfe149057e9..ccc43fdf7164 100644 --- a/deepspeed/sequence/layer.py +++ b/deepspeed/sequence/layer.py @@ -184,7 +184,7 @@ def single_all_to_all(input, scatter_idx, gather_idx, batch_dim_idx, group, asyn # we only need num_heads once num_heads = input.shape[2] - if get_num_kv_heads() is not None or num_heads % seq_world_size != 0: + if get_num_kv_heads() is not None or (num_heads % seq_world_size != 0 and not scatter_idx < 2): # Assuming here that the number of heads for q is consistent with kv # If not, additional logic is required for cases like GQA if get_num_kv_heads() is None: From 074d5c69c3eeffdf34b406153b9ece971ae7f115 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Wed, 11 Dec 2024 10:34:31 -0800 Subject: [PATCH 04/15] Fix nv-torch-nightly test by pinning transformers (#6849) --- .github/workflows/nv-torch-nightly-v100.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml index 74495812add5..c2d10a454f94 100644 --- a/.github/workflows/nv-torch-nightly-v100.yml +++ b/.github/workflows/nv-torch-nightly-v100.yml @@ -2,6 +2,9 @@ name: nv-torch-nightly-v100 on: workflow_dispatch: + pull_request: + paths: + - '.github/workflows/nv-torch-nightly-v100.yml' schedule: - cron: "0 0 * * *" @@ -34,7 +37,7 @@ jobs: git clone https://github.com/huggingface/transformers cd transformers # if needed switch to the last known good SHA until transformers@master is fixed - # git checkout 1cc453d33 + git checkout 6c3f168b3 git rev-parse --short HEAD pip install . 
From bd6fd50e9f49a81d71284a49b8449c334bf11074 Mon Sep 17 00:00:00 2001 From: kaiksi-bb <122809228+kaiksi-bb@users.noreply.github.com> Date: Thu, 12 Dec 2024 01:48:18 +0100 Subject: [PATCH 05/15] Remove broken links to non-active site (#6854) The site referenced in various places on the README is no longer active: https://deepspeed4science.ai ![image](https://github.com/user-attachments/assets/8ec47313-2abf-40d6-b1f8-9a9234c15e2f) Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2334fd366ae4..9f06af022853 100755 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ * [2023/11] [DeepSpeed ZeRO-Offload++: 6x Higher Training Throughput via Collaborative CPU/GPU Twin-Flow](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-offloadpp) * [2023/11] [DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/japanese/README.md)] * [2023/10] [DeepSpeed-VisualChat: Improve Your Chat Experience with Multi-Round Multi-Image Inputs](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md)] -* [2023/09] Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies [[DeepSpeed4Science website](https://deepspeed4science.ai/)] [[Tutorials](https://www.deepspeed.ai/deepspeed4science/)] [[White paper](https://arxiv.org/abs/2310.04610)] [[Blog](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md)] +* [2023/09] Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies [[Tutorials](https://www.deepspeed.ai/deepspeed4science/)] [[White paper](https://arxiv.org/abs/2310.04610)] [[Blog](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md)] @@ -78,7 +78,7 @@ To further increase the inference efficiency, DeepSpeed offers easy-to-use and f ## DeepSpeed4Science -In line with Microsoft's mission to solve humanity's most pressing challenges, the DeepSpeed team at Microsoft is responding to this opportunity by launching a new initiative 
called *DeepSpeed4Science*, aiming to build unique capabilities through AI system technology innovations to help domain experts to unlock today's biggest science mysteries. Learn more: [DeepSpeed4Science website](https://deepspeed4science.ai/) and [tutorials](https://www.deepspeed.ai/deepspeed4science/) +In line with Microsoft's mission to solve humanity's most pressing challenges, the DeepSpeed team at Microsoft is responding to this opportunity by launching a new initiative called *DeepSpeed4Science*, aiming to build unique capabilities through AI system technology innovations to help domain experts to unlock today's biggest science mysteries. Learn more: [tutorials](https://www.deepspeed.ai/deepspeed4science/) --- From 91829476a8fd4d0d9268c03c1d56795d20a51c12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=84=8D=F0=9D=95=A0=F0=9D=95=9D=F0=9D=95=9D=F0=9D=95=A0?= =?UTF-8?q?=F0=9D=95=A8=20=F0=9D=95=84=F0=9D=95=92=F0=9D=95=9F?= Date: Thu, 12 Dec 2024 22:48:32 +0200 Subject: [PATCH 06/15] Avoid poisoning process with CUDA calls as soon as importing (#6810) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Call `torch.cuda.device_count() > 0` before `torch.cuda.is_available()`, to give priority to nvml based availability, so that we can try not to poison process with CUDA calls as soon as we execute `import deepspeed`. https://github.com/pytorch/pytorch/blob/v2.5.1/torch/cuda/__init__.py#L120-L124 There are 2 reasons to make this change: Firstly, if we accidentally import deepspeed, since the CUDA runtime initializes when the first CUDA API call is made and caches the device list, changing the CUDA_VISIBLE_DEVICES within the same process after initialization won't have any effect on the visible devices. The specific case: https://github.com/OpenRLHF/OpenRLHF/pull/524#issuecomment-2501505023 A demo for reproduction before the fix is applied: ```python import torch import os os.environ["CUDA_VISIBLE_DEVICES"] = "" import deepspeed os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" torch.cuda.set_device('cuda:0') ``` Secondly, https://pytorch.org/docs/stable/notes/cuda.html When assessing the availability of CUDA in a given environment (is_available()), PyTorch’s default behavior is to call the CUDA Runtime API method cudaGetDeviceCount. Because this call in turn initializes the CUDA Driver API (via cuInit) if it is not already initialized, subsequent forks of a process that has run is_available() will fail with a CUDA initialization error. Signed-off-by: Hollow Man Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- accelerator/real_accelerator.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py index 69e96d285bb8..a6173ac70abd 100644 --- a/accelerator/real_accelerator.py +++ b/accelerator/real_accelerator.py @@ -167,7 +167,12 @@ def get_accelerator(): import torch # Determine if we are on a GPU or x86 CPU with torch. - if torch.cuda.is_available(): #ignore-cuda + # "torch.cuda.is_available()" provides a stronger guarantee, #ignore-cuda + # ensuring that we are free from CUDA initialization errors. 
+ # While "torch.cuda.device_count() > 0" check ensures that #ignore-cuda + # we won't try to do any CUDA calls when no device is available + # For reference: https://github.com/microsoft/DeepSpeed/pull/6810 + if torch.cuda.device_count() > 0 and torch.cuda.is_available(): #ignore-cuda accelerator_name = "cuda" else: if accel_logger is not None: From 853a97648b9ba3acbb990018eab1dd928a08c390 Mon Sep 17 00:00:00 2001 From: Liangliang Ma Date: Sat, 14 Dec 2024 03:29:48 +0800 Subject: [PATCH 07/15] Fix xpu tests workflow failure by changing pip index url (#6864) Update xpu-max1100.yml and xpu-compile.yml --- .github/workflows/xpu-compile.yml | 8 ++++---- .github/workflows/xpu-max1100.yml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/xpu-compile.yml b/.github/workflows/xpu-compile.yml index e095e089fc30..9e8bd9d792fb 100644 --- a/.github/workflows/xpu-compile.yml +++ b/.github/workflows/xpu-compile.yml @@ -31,10 +31,10 @@ jobs: run: | apt-get update apt-get install clinfo libaio-dev python3-pip -y - pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torch/ - pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/intel-extension-for-pytorch/ - pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/oneccl-bind-pt/ - pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torchvision/ + pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torch/ + pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/intel-extension-for-pytorch/ + pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/oneccl-bind-pt/ + pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torchvision/ pip install https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v3.0.0b2/triton_xpu-3.0.0b2-cp310-cp310-linux_x86_64.whl pip install py-cpuinfo numpy pip install .[dev,autotuning] diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml index d19e73aeef1c..56bff4a88ba9 100644 --- a/.github/workflows/xpu-max1100.yml +++ b/.github/workflows/xpu-max1100.yml @@ -47,10 +47,10 @@ jobs: run: | apt-get update apt-get install clinfo libaio-dev python3-pip -y - pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torch/ - pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/intel-extension-for-pytorch/ - pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/oneccl-bind-pt/ - pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torchvision/ + pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torch/ + pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/intel-extension-for-pytorch/ + pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/oneccl-bind-pt/ + pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torchvision/ pip install py-cpuinfo numpy pip install .[dev,autotuning] From 
d7750c34291b9dcd892de4a795ecd0e35b28f6ee Mon Sep 17 00:00:00 2001 From: Guanhua Wang Date: Fri, 13 Dec 2024 11:40:41 -0800 Subject: [PATCH 08/15] Domino updates (#6861) Updating our website for Domino --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- docs/_tutorials/domino.md | 6 ++++++ docs/index.md | 6 +++--- 2 files changed, 9 insertions(+), 3 deletions(-) create mode 100644 docs/_tutorials/domino.md diff --git a/docs/_tutorials/domino.md b/docs/_tutorials/domino.md new file mode 100644 index 000000000000..6b116cb87463 --- /dev/null +++ b/docs/_tutorials/domino.md @@ -0,0 +1,6 @@ +--- +title: "Domino" +tags: training +--- + +Domino achieves near-complete communication hiding behind computation for tensor parallel training. Please find our [Domino-tutorial](https://github.com/microsoft/DeepSpeedExamples/blob/master/training/DeepSpeed-Domino/README.md) in DeepSpeedExample repo. diff --git a/docs/index.md b/docs/index.md index 3279682b42d4..3d5f290f2bde 100755 --- a/docs/index.md +++ b/docs/index.md @@ -7,25 +7,25 @@ title: "Latest News" --- DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat). +* [2024/12] [DeepSpeed Domino: Communication-Free LLM Training Engine](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-domino/README.md) + * [2024/08] [DeepSpeed on Windows](https://github.com/microsoft/DeepSpeed/blob/master/blogs/windows/08-2024/README.md)[[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/windows/08-2024/japanese/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/windows/08-2024/chinese/README.md)] * [2024/08] [DeepNVMe: Improving DL Applications through I/O Optimizations](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-gds/README.md)[[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-gds/japanese/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-gds/chinese/README.md)] * [2024/07] [DeepSpeed Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/README.md)[[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/japanese/README.md)] * [2024/03] [DeepSpeed-FP6: The Power of FP6-Centric Serving for Large Language Models](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md)] -* [2024/01] [DeepSpeed-FastGen: Introducting Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19)
More news
From b5e3fac6a599f80a7db1899e47ea6993131bb51b Mon Sep 17 00:00:00 2001 From: Guanhua Wang Date: Fri, 13 Dec 2024 12:59:08 -0800 Subject: [PATCH 09/15] add domino navigation (#6866) add domino item into navigation list --- docs/_data/navigation.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml index b17685b8dc4d..c3ebad4f86af 100755 --- a/docs/_data/navigation.yml +++ b/docs/_data/navigation.yml @@ -75,6 +75,8 @@ lnav: url: /tutorials/data-efficiency/ - title: 'DeepNVMe' url: /tutorials/deepnvme/ + - title: 'Domino' + url: /tutorials/domino/ - title: 'DS4Sci_EvoformerAttention' url: /tutorials/ds4sci_evoformerattention/ - title: 'Flops Profiler' From 8efbcc495c3c7c072d10bfd672932807fb9eb8e5 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Fri, 13 Dec 2024 16:49:08 -0500 Subject: [PATCH 10/15] Update TSC (#6867) --- COMMITTERS.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/COMMITTERS.md b/COMMITTERS.md index bcb8579bf1f7..8418bdf8629d 100644 --- a/COMMITTERS.md +++ b/COMMITTERS.md @@ -5,5 +5,7 @@ | Olatunji Ruwase | [tjruwase](https://github.com/tjruwase) | Microsoft | | Logan Adams | [loadams](https://github.com/loadams) | Microsoft | | Masahiro Tanaka | [tohtana](https://github.com/tohtana) | Microsoft | -| Jeff Rasley | [jeffra](https://github.com/jeffra) | SnowFlake | -| Minjia Zhang | [minjiazhang](https://github.com/minjiazhang) | UIUC | +| Jeff Rasley | [jeffra](https://github.com/jeffra) | SnowFlake | +| Minjia Zhang | [minjiazhang](https://github.com/minjiazhang) | UIUC | +| Ashwin Aji | [ashwinma](https://github.com/ashwinma) | AMD | +| Sam Foreman | [saforem2](https://github.com/saforem2) | Argonne National Laboratory | From 6e3e13cb280b684ebedb5c2aecb36efb545ebfce Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Fri, 13 Dec 2024 15:35:12 -0800 Subject: [PATCH 11/15] Remove warnings from autodoc and sphinx (#6788) Co-authored-by: Olatunji Ruwase --- deepspeed/runtime/fp16/onebit/zoadam.py | 4 +++- deepspeed/runtime/lr_schedules.py | 4 ++-- docs/code-docs/source/monitor.rst | 18 +++++++++--------- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/deepspeed/runtime/fp16/onebit/zoadam.py b/deepspeed/runtime/fp16/onebit/zoadam.py index 803bd929742d..70282ec41714 100644 --- a/deepspeed/runtime/fp16/onebit/zoadam.py +++ b/deepspeed/runtime/fp16/onebit/zoadam.py @@ -12,9 +12,11 @@ class ZeroOneAdam(torch.optim.Optimizer): - """Implements the 0/1 Adam algorithm. Currently GPU-only. + """ + Implements the 0/1 Adam algorithm. Currently GPU-only. For usage example please see https://www.deepspeed.ai/tutorials/zero-one-adam/ For technical details please read https://arxiv.org/abs/2202.06009 + Arguments: params (iterable): iterable of parameters to optimize or dicts defining parameter groups. diff --git a/deepspeed/runtime/lr_schedules.py b/deepspeed/runtime/lr_schedules.py index f25a19e8e499..899358e2c5ef 100755 --- a/deepspeed/runtime/lr_schedules.py +++ b/deepspeed/runtime/lr_schedules.py @@ -274,7 +274,7 @@ class LRRangeTest(object): """Sets the learning rate of each parameter group according to learning rate range test (LRRT) policy. The policy increases learning rate starting from a base value with a constant frequency, as detailed in - the paper `A disciplined approach to neural network hyper-parameters: Part1`_. 
+ the paper `A disciplined approach to neural network hyper-parameters: Part 1 `_ LRRT policy is used for finding maximum LR that trains a model without divergence, and can be used to configure the LR boundaries for Cyclic LR schedules. @@ -379,7 +379,7 @@ class OneCycle(object): 1CLR policy changes the learning rate after every batch. `step` should be called after a batch has been used for training. - This implementation was adapted from the github repo: `pytorch/pytorch`_ + This implementation was adapted from the github repo: `PyTorch `_. Args: optimizer (Optimizer): Wrapped optimizer. diff --git a/docs/code-docs/source/monitor.rst b/docs/code-docs/source/monitor.rst index 694c72b9b870..b185ed433c1c 100644 --- a/docs/code-docs/source/monitor.rst +++ b/docs/code-docs/source/monitor.rst @@ -9,15 +9,15 @@ overview of what DeepSpeed will log automatically. :header: "Field", "Description", "Condition" :widths: 20, 20, 10 - `Train/Samples/train_loss`,The training loss.,None - `Train/Samples/lr`,The learning rate during training.,None - `Train/Samples/loss_scale`,The loss scale when training using `fp16`.,`fp16` must be enabled. - `Train/Eigenvalues/ModelBlockParam_{i}`,Eigen values per param block.,`eigenvalue` must be enabled. - `Train/Samples/elapsed_time_ms_forward`,The global duration of the forward pass.,`flops_profiler.enabled` or `wall_clock_breakdown`. - `Train/Samples/elapsed_time_ms_backward`,The global duration of the forward pass.,`flops_profiler.enabled` or `wall_clock_breakdown`. - `Train/Samples/elapsed_time_ms_backward_inner`,The backward time that does not include the gradient reduction time. Only in cases where the gradient reduction is not overlapped, if it is overlapped then the inner time should be about the same as the entire backward time.,`flops_profiler.enabled` or `wall_clock_breakdown`. - `Train/Samples/elapsed_time_ms_backward_allreduce`,The global duration of the allreduce operation.,`flops_profiler.enabled` or `wall_clock_breakdown`. - `Train/Samples/elapsed_time_ms_step`,The optimizer step time,`flops_profiler.enabled` or `wall_clock_breakdown`. + `Train/Samples/train_loss`,"The training loss.",None + `Train/Samples/lr`,"The learning rate during training.",None + `Train/Samples/loss_scale`,"The loss scale when training using `fp16`.",`fp16` must be enabled. + `Train/Eigenvalues/ModelBlockParam_{i}`,"Eigen values per param block.",`eigenvalue` must be enabled. + `Train/Samples/elapsed_time_ms_forward`,"The global duration of the forward pass.",`flops_profiler.enabled` or `wall_clock_breakdown`. + `Train/Samples/elapsed_time_ms_backward`,"The global duration of the forward pass.",`flops_profiler.enabled` or `wall_clock_breakdown`. + `Train/Samples/elapsed_time_ms_backward_inner`,"The backward time that does not include the gradient reduction time. Only in cases where the gradient reduction is not overlapped, if it is overlapped then the inner time should be about the same as the entire backward time.",`flops_profiler.enabled` or `wall_clock_breakdown`. + `Train/Samples/elapsed_time_ms_backward_allreduce`,"The global duration of the allreduce operation.",`flops_profiler.enabled` or `wall_clock_breakdown`. + `Train/Samples/elapsed_time_ms_step`,"The optimizer step time.",`flops_profiler.enabled` or `wall_clock_breakdown`. 
TensorBoard ----------- From fc7c07007fe341bf6d78a9126d0cb5a914ce28fd Mon Sep 17 00:00:00 2001 From: keiwoo Date: Sat, 14 Dec 2024 08:41:43 +0800 Subject: [PATCH 12/15] Update real_accelerator.py (#6845) ### Remove the early `accelerator_name = "cpu"` assignment when `xpu` is not detected. When `DS_ACCELERATOR` is set, a missing `xpu` is simply skipped at lines 68 to 74. However, when `DS_ACCELERATOR` is not set, `accelerator_name` is assigned `cpu` at lines 125 to 133 as soon as `intel_extension_for_pytorch` cannot be imported or `xpu` is not found. I spent an afternoon tracking this down: `intel_extension_for_pytorch` had been installed as a dependency of another package I do not actually use, and with it importable but no `xpu` available, `accelerator_name` was set to `cpu` directly, which prevented `cuda` detection. The `cpu` fallback is still applied at lines 170 to 177 if `cuda` is not detected either, so the early assignment is unnecessary. --------- Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- accelerator/real_accelerator.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py index a6173ac70abd..ced9218d7aca 100644 --- a/accelerator/real_accelerator.py +++ b/accelerator/real_accelerator.py @@ -125,10 +125,9 @@ def get_accelerator(): if accelerator_name is None: try: import intel_extension_for_pytorch as ipex + if ipex._C._has_xpu(): accelerator_name = "xpu" - else: - accelerator_name = "cpu" except ImportError as e: pass if accelerator_name is None: @@ -162,7 +161,6 @@ def get_accelerator(): except ImportError as e: pass if accelerator_name is None: - # borrow this log from PR#5084 try: import torch @@ -174,16 +172,16 @@ def get_accelerator(): # For reference: https://github.com/microsoft/DeepSpeed/pull/6810 if torch.cuda.device_count() > 0 and torch.cuda.is_available(): #ignore-cuda accelerator_name = "cuda" - else: - if accel_logger is not None: - accel_logger.warn( - "Setting accelerator to CPU. If you have GPU or other accelerator, we were unable to detect it." - ) - accelerator_name = "cpu" except (RuntimeError, ImportError) as e: # TODO need a more decent way to detect which accelerator to use, consider using nvidia-smi command for detection - accelerator_name = "cuda" pass + if accelerator_name is None: + # borrow this log from PR#5084 + if accel_logger is not None: + accel_logger.warn( + "Setting accelerator to CPU. If you have GPU or other accelerator, we were unable to detect it.") + # cpu added as catch-all when accelerator detection fails + accelerator_name = "cpu" ds_set_method = "auto detect" From db98cc3ad1e0a20807e0c2513f0eee40f626860e Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com> Date: Mon, 16 Dec 2024 11:05:55 -0800 Subject: [PATCH 13/15] Fix assertion for offloading states (#6855) This PR fixes the assertions in `offload_states` method mentioned in #6833.
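For illustration only (this sketch is not part of the patch), the snippet below shows the configuration the corrected assertions are intended to accept: ZeRO stage 3 with no optimizer or parameter offload, after which `offload_states`/`reload_states` can move ZeRO states between devices. The toy model, config values, and launcher assumptions are placeholders.

```python
import torch
import deepspeed
from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum

# Illustrative ZeRO-3 config with no "offload_optimizer"/"offload_param"
# sections, which is the combination the updated assertions allow.
ds_config = {
    "train_batch_size": 1,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
    "zero_optimization": {"stage": 3},
}

model = torch.nn.Linear(8, 8)  # stand-in for a real model
# Assumes the script is started with the `deepspeed` launcher so that
# distributed initialization succeeds.
engine, _, _, _ = deepspeed.initialize(model=model,
                                       model_parameters=model.parameters(),
                                       config=ds_config)

# Temporarily move ZeRO-3 states to CPU, then restore them.
engine.offload_states(device=OffloadDeviceEnum.cpu)
engine.reload_states()
```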
Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepspeed/runtime/engine.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 0aad018528d3..5f023d87f375 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -3738,6 +3738,11 @@ def offload_states(self, assert self.zero_optimization_stage( ) == ZeroStageEnum.weights, "Moving buffers across devices is supported only for ZeRO stage 3." + opt_offload_config = self.zero_offload_optimizer() + assert opt_offload_config is None or opt_offload_config.device == OffloadDeviceEnum.none, "Moving states across devices is not supported for offloaded optimizer states." + param_offload_config = self.zero_offload_param() + assert param_offload_config is None or param_offload_config.device == OffloadDeviceEnum.none, "Moving states across devices is not supported for offloaded parameters." + assert not self.zero_offload_param(), "Moving states across devices is not supported for offloaded parameters." if device == OffloadDeviceEnum.none: From 87c650681eb285ab34a69a011b520f756f42d4b9 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Mon, 16 Dec 2024 11:21:51 -0800 Subject: [PATCH 14/15] Remove pin from transformers version and fix Processing/Threading issues in tests (#6822) Changes from https://github.com/huggingface/transformers/pull/34966 caused the `nv-torch-latest-v100` tests to fail with the following error: ``` File "/tmp/azureml/cr/j/e4bfd57a509846d6bbc4914639ad248d/exe/wd/actions-runner/_work/DeepSpeed/DeepSpeed/unit-test-venv/lib/python3.10/site-packages/transformers/modeling_utils.py", line 3941, in from_pretrained raise EnvironmentError( OSError: Can't load the model for 'hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2' is the correct path to a directory containing a file named pytorch_model.bin, tf_model.h5, model.ckpt or flax_model.msgpack. ``` Sample failure here: https://github.com/microsoft/DeepSpeed/actions/runs/12169422174/job/33942348835?pr=6794#step:8:3506 This was resolved on the Transformers side here: https://github.com/huggingface/transformers/pull/35236 --- .github/workflows/cpu-torch-latest.yml | 2 +- .github/workflows/nv-torch-latest-v100.yml | 2 +- .github/workflows/nv-torch-nightly-v100.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cpu-torch-latest.yml b/.github/workflows/cpu-torch-latest.yml index 51bc60c2c2ae..78a51905834b 100644 --- a/.github/workflows/cpu-torch-latest.yml +++ b/.github/workflows/cpu-torch-latest.yml @@ -42,7 +42,7 @@ jobs: git clone https://github.com/huggingface/transformers cd transformers # if needed switch to the last known good SHA until transformers@master is fixed - git checkout 6c3f168b3 + # git checkout 6c3f168b3 git rev-parse --short HEAD pip install . 
diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index 2d69d0b94cb5..a1ba4937d164 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -38,7 +38,7 @@ jobs: git clone https://github.com/huggingface/transformers cd transformers # if needed switch to the last known good SHA until transformers@master is fixed - git checkout 6c3f168b3 + # git checkout 6c3f168b3 git rev-parse --short HEAD pip install . diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml index c2d10a454f94..0a9570a1ceaa 100644 --- a/.github/workflows/nv-torch-nightly-v100.yml +++ b/.github/workflows/nv-torch-nightly-v100.yml @@ -37,7 +37,7 @@ jobs: git clone https://github.com/huggingface/transformers cd transformers # if needed switch to the last known good SHA until transformers@master is fixed - git checkout 6c3f168b3 + # git checkout 6c3f168b3 git rev-parse --short HEAD pip install . From da771ed42e41a44d5047813ca4672f1cfe9d1731 Mon Sep 17 00:00:00 2001 From: Yejing-Lai Date: Tue, 17 Dec 2024 06:14:53 +0800 Subject: [PATCH 15/15] Add MLP/lm_head tp grain size setting. (#6828) This PR adds an MLP/lm_head tensor-parallel grain size setting to the deepspeed.init_inference() API, so the MLP/lm_head sharding granularity can be configured flexibly. DNN libraries favor tensor sizes that are powers of 2, so 64 is used as the default grain size. This is a preliminary solution; if there is a better one, we are happy to discuss it. Thanks! A usage sketch is included after the diff below. --------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Olatunji Ruwase --- deepspeed/inference/config.py | 3 +++ deepspeed/module_inject/replace_module.py | 5 ++++- deepspeed/module_inject/tp_shard.py | 11 ++++++++--- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/deepspeed/inference/config.py b/deepspeed/inference/config.py index c7c7684fff79..42ffebbc4386 100644 --- a/deepspeed/inference/config.py +++ b/deepspeed/inference/config.py @@ -40,6 +40,9 @@ class DeepSpeedTPConfig(DeepSpeedConfigModel): tp_size: int = 1 """ Number of devices to split the model across using tensor parallelism. """ + tp_grain_size: int = 64 + "Desired MLP/lm_head tp size granularity. DNN library favors tensor size in granularity of power of 2, we pick 64 as a default size." + mpu: object = None """ A model parallelism unit object that implements diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 7afe6ca903fb..e59f84bc8453 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -17,7 +17,7 @@ from .layers import TensorParallelOcShardConv2d, TensorParallelIcShardConv2d from deepspeed import comm as dist -from deepspeed.module_inject.tp_shard import set_num_kv_heads, set_n_embd, set_num_attention_heads +from deepspeed.module_inject.tp_shard import set_num_kv_heads, set_n_embd, set_num_attention_heads, set_tp_grain_size from .load_checkpoint import load_model_with_checkpoint import time @@ -303,6 +303,9 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None): if hasattr(model_config, 'num_attention_heads'): set_num_attention_heads(getattr(model_config, 'num_attention_heads')) + # 4.4 set tp_grain_size + set_tp_grain_size(config.tensor_parallel.tp_grain_size) + # 5.
Set linear policies _autotp.update_linear_policies() diff --git a/deepspeed/module_inject/tp_shard.py b/deepspeed/module_inject/tp_shard.py index 57be0c793856..3e6fc2b63ef1 100644 --- a/deepspeed/module_inject/tp_shard.py +++ b/deepspeed/module_inject/tp_shard.py @@ -22,6 +22,11 @@ def set_n_embd(num): n_embd = num +def set_tp_grain_size(num): + global tp_grain_size + tp_grain_size = num + + def get_num_kv_heads(): global num_kv_heads if 'num_kv_heads' in globals(): @@ -45,9 +50,9 @@ def get_shard_size(total_size, mp_size, name=None, rank=None): my_slices = (num_kv_heads // mp_size) + (1 if rank < (num_kv_heads % mp_size) else 0) return total_size * my_slices // num_kv_heads else: - if total_size >= 64: - grain_size = total_size // 64 - return (grain_size // mp_size + (1 if rank < (grain_size % mp_size) else 0)) * 64 + if total_size >= tp_grain_size: + grain_size = total_size // tp_grain_size + return (grain_size // mp_size + (1 if rank < (grain_size % mp_size) else 0)) * tp_grain_size else: return total_size // mp_size + (1 if rank < (total_size % mp_size) else 0)
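As a hedged usage sketch (not taken from this PR; the model name, `tp_size`, dtype, and kernel-injection setting are placeholders), the new `tp_grain_size` knob is passed through the existing `tensor_parallel` section of `deepspeed.init_inference()`:

```python
import torch
import deepspeed
from transformers import AutoModelForCausalLM

# Placeholder model; any AutoTP-supported Hugging Face model follows the same pattern.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

# tp_grain_size rounds each MLP/lm_head shard to a multiple of the given value
# (default 64), matching the power-of-2 granularity DNN libraries prefer.
engine = deepspeed.init_inference(
    model,
    tensor_parallel={"tp_size": 2, "tp_grain_size": 64},
    dtype=torch.float16,
    replace_with_kernel_inject=False,  # AutoTP path, where get_shard_size() applies
)
```

For a layer whose size is not evenly divisible by `tp_size`, the extra grains go to the lower ranks, so every rank still receives a shard that is a multiple of `tp_grain_size`.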