From 5e6c9b931184bf8f0a245a2fef183078139b6d7e Mon Sep 17 00:00:00 2001
From: Jeff Rasley
Date: Tue, 23 Apr 2024 12:24:37 -0700
Subject: [PATCH 01/11] OptimizedLinear implementation (#5355)

Optimized version of `nn.Linear` that adds features such as:
* LoRA w. base weight sharding
* FP [6,8,12] quantization

Depends on #5336 being merged first

Co-authored-by: @rajhans
Co-authored-by: @aurickq

---------

Co-authored-by: Rajhans Samdani
Co-authored-by: Jeff Rasley
---
 deepspeed/linear/__init__.py           |   7 ++
 deepspeed/linear/config.py             |  39 +++++++
 deepspeed/linear/optimized_linear.py   | 150 +++++++++++++++++++++
 deepspeed/linear/quantization.py       | 137 ++++++++++++++++++++
 deepspeed/ops/fp_quantizer/__init__.py |   2 +-
 deepspeed/ops/fp_quantizer/quantize.py |  33 ++++-
 tests/unit/linear/test_linear.py       | 128 ++++++++++++++++++
 tests/unit/linear/test_quant_param.py  |  58 ++++++++
 8 files changed, 550 insertions(+), 4 deletions(-)
 create mode 100644 deepspeed/linear/__init__.py
 create mode 100644 deepspeed/linear/config.py
 create mode 100644 deepspeed/linear/optimized_linear.py
 create mode 100644 deepspeed/linear/quantization.py
 create mode 100644 tests/unit/linear/test_linear.py
 create mode 100644 tests/unit/linear/test_quant_param.py

diff --git a/deepspeed/linear/__init__.py b/deepspeed/linear/__init__.py
new file mode 100644
index 000000000000..a27f1c3eaee7
--- /dev/null
+++ b/deepspeed/linear/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from .optimized_linear import OptimizedLinear
+from .config import LoRAConfig, QuantizationConfig
diff --git a/deepspeed/linear/config.py b/deepspeed/linear/config.py
new file mode 100644
index 000000000000..ae9050a3c92b
--- /dev/null
+++ b/deepspeed/linear/config.py
@@ -0,0 +1,39 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from dataclasses import dataclass
+
+
+@dataclass
+class LoRAConfig:
+    """
+    Configuration settings for LoRAOptimizedLinear.
+
+    Attributes:
+        lora_r (int): LoRA attention dimension, also known as the rank. Default is 64.
+        lora_alpha (float): LoRA scaling factor, default is 16.
+        base_weight_sharding (int): The degree to which the base weights are sharded,
+            should typically be set to the data-parallel world size to maximize the memory
+            reduction benefits. Defaults to 1, which means this feature is disabled.
+    """
+    lora_r: int = 64
+    lora_alpha: float = 16.
+    base_weight_sharding: int = 1
+
+
+@dataclass
+class QuantizationConfig:
+    """
+    Configuration settings for quantization for LoRAOptimizedLinear, QuantizedLinear,
+    and QuantizedParameter.
+
+    Attributes:
+        q_bits (int): The number of bits used for quantization. Default is 8.
+        mantissa_bits (int): The number of bits reserved for the mantissa in floating-point quantization. Default is 3.
+        group_size (int): The size of the group used for quantization. Default is 512.
+    """
+    q_bits: int = 8
+    mantissa_bits: int = 3
+    group_size: int = 512
diff --git a/deepspeed/linear/optimized_linear.py b/deepspeed/linear/optimized_linear.py
new file mode 100644
index 000000000000..138bd493ffc7
--- /dev/null
+++ b/deepspeed/linear/optimized_linear.py
@@ -0,0 +1,150 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import torch
+import math
+import torch.nn as nn
+import torch.nn.functional as F
+from dataclasses import is_dataclass
+from deepspeed.accelerator import get_accelerator
+import deepspeed.comm as dist
+
+from .config import LoRAConfig, QuantizationConfig
+from .quantization import QuantizedParameter, QuantizedLinear
+
+
+class OptimizedLinear(nn.Module):
+    """
+    Optimized version of nn.Linear that adds features such as:
+      * LoRA w. base weight sharding
+      * FP [6,8,12] quantization
+
+    Arguments:
+        input_dim: Required: size of each input sample
+        output_dim: Required: size of each output sample
+        bias: Optional: If set to False, the layer will not learn an additive bias. Default: False
+        lora_config: Optional: LoRAConfig defining lora features and base-weight-sharding degree
+        quantization_config: Optional: QuantizationConfig defining quantization features
+        dtype: Optional: parameter dtype, only supports bfloat16 currently
+
+    Returns:
+        Returns a new nn.Module depending on the input config. Either native
+        torch.nn.Linear, QuantizedLinear, or the full-featured LoRAOptimizedLinear.
+    """

+    def __new__(self,
+                input_dim: int,
+                output_dim: int,
+                bias: bool = False,
+                lora_config: LoRAConfig = None,
+                quantization_config: QuantizationConfig = None,
+                dtype=torch.bfloat16):
+
+        if quantization_config is not None and not is_dataclass(quantization_config):
+            raise ValueError(f"Expecting QuantizationConfig but received {type(quantization_config)}")
+        if lora_config is not None and not is_dataclass(lora_config):
+            raise ValueError(f"Expecting LoRAConfig but received {type(lora_config)}")
+        if lora_config is None and quantization_config is None:
+            # Everything disabled, fall back to normal nn.Linear
+            self = nn.Linear(input_dim, output_dim, bias=bias, dtype=dtype)
+
+        elif lora_config:
+            # lora enabled, quantization may or may not be
+            self = LoRAOptimizedLinear(input_dim=input_dim,
+                                       output_dim=output_dim,
+                                       bias=bias,
+                                       lora_config=lora_config,
+                                       quantization_config=quantization_config,
+                                       dtype=dtype)
+
+        elif quantization_config:
+            # only quantization enabled, no lora
+            self = QuantizedLinear(input_dim=input_dim,
+                                   output_dim=output_dim,
+                                   bias=bias,
+                                   quantization_config=quantization_config,
+                                   dtype=dtype)
+        return self
+
+
+class LoRAOptimizedLinear(nn.Module):
+
+    def __init__(self,
+                 input_dim: int,
+                 output_dim: int,
+                 bias: bool = False,
+                 lora_config: LoRAConfig = None,
+                 quantization_config: QuantizationConfig = None,
+                 device=None,
+                 dtype=torch.bfloat16):
+        super().__init__()
+        self.input_dim = input_dim
+        self.output_dim = output_dim
+        self.bias = bias
+        self.lora_config = lora_config
+        self.quantization_config = quantization_config
+        device = get_accelerator().current_device() if device is None else device
+        assert self.lora_config is not None, "LoRAOptimizedLinear requires a LoRA config"
+
+        self.zero_shards = self.lora_config.base_weight_sharding
+        self.sharded_weight_size = int(float(self.input_dim) // self.zero_shards)
+        w = torch.nn.Parameter(torch.empty((self.output_dim, self.sharded_weight_size), dtype=dtype))
+        torch.nn.init.xavier_uniform_(w)
+
+        if self.quantization_config is not None:
+            assert dtype == torch.bfloat16, "only bfloat16 is supported when using quantization"
+            self.base_weight = QuantizedParameter(w, quantization_config=quantization_config)
+        else:
+            self.base_weight = w
+
+        self.base_weight.requires_grad = False
+
+        # Use rsLoRA (rank-stabilized LoRA) scaling for now.
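+        # rsLoRA divides alpha by sqrt(r) instead of the classic LoRA
+        # factor alpha / r, which keeps the update magnitude stable as
+        # the rank r grows.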
+        self.lora_scaling_factor = self.lora_config.lora_alpha / math.sqrt(self.lora_config.lora_r)
+        # Keeping lora weights in bf16 precision for ease of training.
+        self.lora_weight_1 = nn.Linear(self.input_dim,
+                                       self.lora_config.lora_r,
+                                       bias=self.bias,
+                                       device=device,
+                                       dtype=dtype)
+        self.lora_weight_2 = nn.Linear(self.lora_config.lora_r,
+                                       self.output_dim,
+                                       bias=self.bias,
+                                       device=device,
+                                       dtype=dtype)
+        self.lora_weight_1.weight.requires_grad = True
+        self.lora_weight_2.weight.requires_grad = True
+
+    def full_weight(self):
+        # This assumes weights are evenly sharded across GPUs, which might not be correct.
+        # In that case, we should flatten before all_gather.
+        local_weight = self.base_weight.dequantized() if isinstance(self.base_weight,
+                                                                    QuantizedParameter) else self.base_weight
+        tensor_list = [
+            torch.zeros_like(local_weight, device=local_weight.device, dtype=local_weight.dtype)
+            for _ in range(self.zero_shards)
+        ]
+        dist.all_gather(tensor_list, local_weight)
+        weight = nn.Parameter(torch.cat([tensor for tensor in tensor_list], dim=1))
+        return weight
+
+    def linear_without_F_linear(self, input, weight):
+        output = torch.mm(input.reshape(-1, input.shape[-1]), weight)
+        output = output.view(*input.shape[:-1], weight.shape[1])
+        return output
+
+    def forward(self, input_tensor):
+        # Gather the sharded base weight
+        if self.zero_shards > 1:
+            with torch.no_grad():
+                base_weight = self.full_weight()
+        elif self.quantization_config:
+            base_weight = self.base_weight.dequantized()
+        else:
+            base_weight = self.base_weight
+
+        base_weight_output = F.linear(input_tensor, base_weight)
+        lora_output = self.lora_weight_2(self.lora_weight_1(input_tensor))
+        return base_weight_output + self.lora_scaling_factor * lora_output
diff --git a/deepspeed/linear/quantization.py b/deepspeed/linear/quantization.py
new file mode 100644
index 000000000000..f5343af45fb8
--- /dev/null
+++ b/deepspeed/linear/quantization.py
@@ -0,0 +1,137 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import copy
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from typing import Optional
+
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.fp_quantizer import Quantizer, FP_Quantize
+from .config import QuantizationConfig
+
+
+class QuantizedParameter(nn.Parameter):
+    """
+    Quantized parameter class that implements weight quantization. Weights
+    are stored in quantized form on GPUs, and can be dequantized on-the-fly when
+    needed by the model. The weights are actually quantized during any `.to(device)`.
+
+    Arguments:
+        data (Tensor): parameter tensor.
+        requires_grad (bool, optional): if the parameter requires gradient. Defaults
+            to False; setting it to True is not supported. Argument provided only for
+            interface compatibility with torch.nn.Parameter.
+        quantization_config (QuantizationConfig, optional): quantization settings,
+            defaults to a new QuantizationConfig() when not provided.
+        quantizer (Quantizer, optional): Defaults to FP_Quantize but can be any quantizer
+            that implements deepspeed.ops.fp_quantizer.Quantizer. The quantizer is stashed
+            in the Parameter itself because some models may clone the Parameter by passing
+            its __dict__. For an example, see
+            tests/unit/linear/test_quant_param.py::TestQuantParam::test_hf_clone
+    """
+
+    def __new__(
+        cls,
+        data: Optional[torch.Tensor] = None,
+        requires_grad: bool = False,  # quantized weights must be frozen
+        quantization_config: QuantizationConfig = None,
+        quantizer: Quantizer = None,
+    ):
+        if requires_grad:
+            raise ValueError("requires_grad=True is not supported with QuantizedParameter")
+        if data is None:
+            data = torch.empty(0)
+        self = torch.Tensor._make_subclass(cls, data, requires_grad)
+        self.quantization_config = QuantizationConfig() if quantization_config is None else quantization_config
+        if quantizer is not None:
+            self.quantizer = quantizer
+        else:
+            # if FPQuantizerBuilder is not compatible in this env this init will fail
+            self.quantizer = FP_Quantize(group_size=self.quantization_config.group_size)
+        self._ensure_quantized(self)
+        return self
+
+    def _ensure_quantized(self, tensor: torch.Tensor):
+        # If the tensor is on the accelerator and is not quantized, then quantize it in-place.
+        if get_accelerator().on_accelerator(tensor) and tensor.dtype != torch.int8:
+            with get_accelerator().stream(get_accelerator().current_stream(tensor.device)):
+                tensor.data = self.quantizer.quantize(tensor.data,
+                                                      q_bits=self.quantization_config.q_bits,
+                                                      q_mantisa_bits=self.quantization_config.mantissa_bits)
+            assert tensor.dtype == torch.int8
+
+    def dequantized(self) -> torch.Tensor:
+        """
+        Return a tensor containing the dequantized weights of this parameter.
+        """
+        if get_accelerator().on_accelerator(self.data) and self.data.dtype == torch.int8:
+            with get_accelerator().stream(get_accelerator().current_stream(self.data.device)):
+                return self.quantizer.dequantize(self.data,
+                                                 q_bits=self.quantization_config.q_bits,
+                                                 q_mantisa_bits=self.quantization_config.mantissa_bits)
+        return self.data
+
+    def __getstate__(self):
+        state = self.__dict__
+        state["data"] = self.data
+        state["quantization_config"] = self.quantization_config
+        state["requires_grad"] = self.requires_grad
+        return state
+
+    def __setstate__(self, state):
+        self.quantizer = state["quantizer"]
+        self.quantization_config = state["quantization_config"]
+        self.data = state["data"]
+        self.requires_grad = state["requires_grad"]
+
+    def __deepcopy__(self, memo):
+        new_instance = type(self).__new__(type(self))
+        state = self.__getstate__()
+        new_instance.__setstate__(state)
+        new_instance.quantizer = copy.deepcopy(state["quantizer"])
+        new_instance.quantization_config = copy.deepcopy(state["quantization_config"])
+        new_instance.data = copy.deepcopy(state["data"])
+        return new_instance
+
+    def __copy__(self):
+        new_instance = type(self).__new__(type(self))
+        state = self.__getstate__()
+        new_instance.__setstate__(state)
+        return new_instance
+
+    def cuda(self, device=None, non_blocking=False):
+        return self.to(device="cuda" if device is None else device, non_blocking=non_blocking)
+
+    def to(self, *args, **kwargs):
+        """
+        Move the parameter to the given device. Then, if the device is an accelerator
+        device, quantize it.
+        """
+        tensor = super().to(*args, **kwargs)
+        self._ensure_quantized(tensor)
+        return tensor
+
+
+class QuantizedLinear(nn.Linear):
+    """
+    Linear layer that implements weight quantization. Parameters
+    are stored via `QuantizedParameter` and are dequantized on-the-fly during any
+    forward pass.
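+
+    A minimal usage sketch (the dimensions below are illustrative only):
+
+        config = QuantizationConfig(q_bits=8)
+        layer = QuantizedLinear(4096, 1024, quantization_config=config)
+        out = layer(torch.rand(8, 4096, dtype=torch.bfloat16))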
+    """
+
+    def __init__(self,
+                 input_dim: int,
+                 output_dim: int,
+                 bias: bool = False,
+                 quantization_config: QuantizationConfig = None,
+                 dtype=torch.bfloat16):
+        super().__init__(input_dim, output_dim, bias=bias, dtype=dtype)
+        assert dtype == torch.bfloat16, "currently only supports bfloat16 dtype"
+        self.weight = QuantizedParameter(self.weight.data, quantization_config=quantization_config)
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return F.linear(input, self.weight.dequantized(), self.bias)
diff --git a/deepspeed/ops/fp_quantizer/__init__.py b/deepspeed/ops/fp_quantizer/__init__.py
index 5575f3567185..995bbae4aeaf 100644
--- a/deepspeed/ops/fp_quantizer/__init__.py
+++ b/deepspeed/ops/fp_quantizer/__init__.py
@@ -3,4 +3,4 @@
 
 # DeepSpeed Team
 
-from .quantize import FP_Quantize
+from .quantize import FP_Quantize, Quantizer
diff --git a/deepspeed/ops/fp_quantizer/quantize.py b/deepspeed/ops/fp_quantizer/quantize.py
index 0d4bf7bc6db1..f8435bda16c1 100644
--- a/deepspeed/ops/fp_quantizer/quantize.py
+++ b/deepspeed/ops/fp_quantizer/quantize.py
@@ -4,20 +4,47 @@
 # DeepSpeed Team
 
 import torch
+import abc
+from abc import ABC
 
 from deepspeed.ops.op_builder import FPQuantizerBuilder
 
 fp_quant_module = None
 
 
-class FP_Quantize:
+class Quantizer(ABC):
+    """
+    Abstract Quantizer class that implements quantize/dequantize methods.
+
+    Arguments:
+        group_size (int, optional): number of values or elements that are grouped
+            together for the quantization process.
+    """
+
+    def __init__(self, group_size=512) -> None:
+        self.group_size = group_size
+
+    @abc.abstractmethod
+    def quantize(self,
+                 input,
+                 q_bits=8,
+                 q_mantisa_bits=3,
+                 stochastic_mode=False,
+                 return_meta_tensor=False) -> torch.Tensor:
+        ...
+
+    @abc.abstractmethod
+    def dequantize(self, input_q, fp_out=None, q_bits=8, q_mantisa_bits=3, scale=None) -> torch.Tensor:
+        ...
+
+
+class FP_Quantize(Quantizer):
 
     def __init__(self, group_size=512) -> None:
         global fp_quant_module
+        super().__init__(group_size=group_size)
         if fp_quant_module is None:
             fp_quant_module = FPQuantizerBuilder().load()
-
-        self.group_size = group_size
         self.orig_dtype = None
 
     def quantize(self,
diff --git a/tests/unit/linear/test_linear.py b/tests/unit/linear/test_linear.py
new file mode 100644
index 000000000000..ccd26b4cd726
--- /dev/null
+++ b/tests/unit/linear/test_linear.py
@@ -0,0 +1,128 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import pytest
+import torch
+import deepspeed
+import deepspeed.comm as dist
+
+from deepspeed.accelerator import get_accelerator
+from deepspeed.linear import OptimizedLinear, LoRAConfig, QuantizationConfig
+from unit.common import DistributedTest
+
+from deepspeed.ops.op_builder import FPQuantizerBuilder
+
+if not deepspeed.ops.__compatible_ops__[FPQuantizerBuilder.NAME]:
+    pytest.skip("FPQuantizer op is not available on this system", allow_module_level=True)
+
+
+class TestBasicLinear(DistributedTest):
+    world_size = 2
+
+    def test(self):
+        lora_config = None
+        quantization_config = None
+
+        input_features = 64  # Number of input features
+        output_features = 64  # Number of output features
+        batch_size = 1  # Number of samples in a batch
+
+        linear_layer = OptimizedLinear(input_dim=input_features,
+                                       output_dim=output_features,
+                                       lora_config=lora_config,
+                                       quantization_config=quantization_config,
+                                       dtype=torch.bfloat16)
+
+        dummy_input = torch.rand(batch_size, input_features, dtype=torch.bfloat16)
+        output = linear_layer(dummy_input)
+        assert output.shape == (batch_size, output_features)
+
+
+@pytest.mark.parametrize("base_weight_sharding", [1, 2])
+class TestLoRALinear(DistributedTest):
+    world_size = 2
+
+    def test(self, base_weight_sharding):
+        rank = dist.get_rank()
+        lora_config = None
+        quantization_config = None
+
+        input_features = 64  # Number of input features
+        output_features = 64  # Number of output features
+        batch_size = 5  # Number of samples in a batch
+
+        lora_config = LoRAConfig(lora_r=16, lora_alpha=16, base_weight_sharding=base_weight_sharding)
+
+        linear_layer = OptimizedLinear(input_dim=input_features,
+                                       output_dim=output_features,
+                                       lora_config=lora_config,
+                                       quantization_config=quantization_config,
+                                       dtype=torch.bfloat16)
+        device = get_accelerator().current_device_name()
+        linear_layer = linear_layer.to(device)
+        if rank == 0:
+            for n, p in linear_layer.named_parameters():
+                print(f"{n}, {p.shape}")
+
+        dummy_input = torch.rand(batch_size, input_features, device=device, dtype=torch.bfloat16)
+
+        output = linear_layer(dummy_input)
+        assert output.shape == (batch_size, output_features)
+
+
+@pytest.mark.parametrize("q_bits", [8, 6])
+class TestQuantLinear(DistributedTest):
+    world_size = 2
+
+    def test(self, q_bits):
+        rank = dist.get_rank()
+        lora_config = None
+
+        input_features = 64  # Number of input features
+        output_features = 64  # Number of output features
+        batch_size = 5  # Number of samples in a batch
+
+        lora_config = None
+        quantization_config = QuantizationConfig(q_bits=q_bits)
+
+        linear_layer = OptimizedLinear(input_dim=input_features,
+                                       output_dim=output_features,
+                                       lora_config=lora_config,
+                                       quantization_config=quantization_config,
+                                       dtype=torch.bfloat16)
+        device = get_accelerator().current_device_name()
+        linear_layer = linear_layer.to(device)
+        dummy_input = torch.rand([batch_size, input_features], device=device, dtype=torch.bfloat16)
+
+        output = linear_layer(dummy_input)
+        assert output.shape == (batch_size, output_features)
+
+
+@pytest.mark.parametrize("base_weight_sharding", [1, 2], ids=['bws1', 'bws2'])
+@pytest.mark.parametrize("q_bits", [8, 6], ids=['qbit8', 'qbit6'])
+class TestOptimizedLinear(DistributedTest):
+    world_size = 2
+
+    def test(self, base_weight_sharding, q_bits):
+        rank = dist.get_rank()
+        lora_config = None
+
+        input_features = 64  # Number of input features
+        output_features = 64  # Number of output features
+        batch_size = 5  # Number of samples in a batch
+
+        lora_config = LoRAConfig(lora_r=16, lora_alpha=16, base_weight_sharding=base_weight_sharding)
+        quantization_config = QuantizationConfig(q_bits=q_bits)
+
+        linear_layer = OptimizedLinear(input_dim=input_features,
+                                       output_dim=output_features,
+                                       lora_config=lora_config,
+                                       quantization_config=quantization_config,
+                                       dtype=torch.bfloat16)
+        device = get_accelerator().current_device_name()
+        linear_layer = linear_layer.to(device)
+        dummy_input = torch.rand([batch_size, input_features], device=device, dtype=torch.bfloat16)
+        output = linear_layer(dummy_input)
+        assert output.shape == (batch_size, output_features)
diff --git a/tests/unit/linear/test_quant_param.py b/tests/unit/linear/test_quant_param.py
new file mode 100644
index 000000000000..9479b3cba8a0
--- /dev/null
+++ b/tests/unit/linear/test_quant_param.py
@@ -0,0 +1,58 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import pytest
+import torch
+import deepspeed
+
+from deepspeed.accelerator import get_accelerator
+from deepspeed.linear.quantization import QuantizedParameter
+from deepspeed.linear.config import QuantizationConfig
+
+from deepspeed.ops.op_builder import FPQuantizerBuilder
+
+from unit.common import DistributedTest
+
+if not deepspeed.ops.__compatible_ops__[FPQuantizerBuilder.NAME]:
+    pytest.skip("FPQuantizer op is not available on this system", allow_module_level=True)
+
+
+class TestQuantParam(DistributedTest):
+    world_size = 1
+
+    @pytest.mark.parametrize('dtype', [torch.half, torch.float])
+    def test_unsupported_dtypes(self, dtype):
+        device = get_accelerator().current_device_name()
+        data = torch.rand(5, 5, device='cpu', dtype=dtype)
+        qp = QuantizedParameter(data)
+        with pytest.raises(AssertionError):
+            qp.to(device)
+
+    def test_requires_grad(self):
+        data = torch.rand(5, 5, dtype=torch.bfloat16)
+        with pytest.raises(ValueError):
+            QuantizedParameter(data, requires_grad=True)
+
+    def test_move_to_accelerator(self):
+        device = get_accelerator().current_device()
+        data = torch.rand(5, 5, device='cpu', dtype=torch.bfloat16)
+        qp = QuantizedParameter(data)
+        assert qp.device == torch.device('cpu')
+        qp = qp.to(get_accelerator().current_device_name())
+        assert qp.device == torch.device(device)
+        assert qp.dtype == torch.int8
+
+    def test_hf_clone(self):
+        device = get_accelerator().current_device_name()
+        data = torch.rand(5, 5, device=device, dtype=torch.bfloat16)
+
+        quantization_config = QuantizationConfig(q_bits=6)
+        qp = QuantizedParameter(data, quantization_config=quantization_config)
+
+        # should be able to clone parameter via dict, HF expects this to work
+        qp_copy = QuantizedParameter(qp.data, **qp.__dict__)
+
+        assert all(qp.data == qp_copy.data)
+        assert qp.quantization_config == qp_copy.quantization_config
From ad2027952f9730cbd1a8385e4a441e470248645e Mon Sep 17 00:00:00 2001
From: Jhonso7393 <167781426+Jhonso7393@users.noreply.github.com>
Date: Tue, 23 Apr 2024 23:45:47 +0300
Subject: [PATCH 02/11] Update README.md (#5453)

Fixing a minor typo in the README file

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 201b9016f8ab..f9d81eddfdae 100755
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@
 DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat).
 
 * [2024/03] [DeepSpeed-FP6:The power of FP6-Centric Serving for Large Language Models](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md)]
-* [2024/01] [DeepSpeed-FastGen: Introducting Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19)
+* [2024/01] [DeepSpeed-FastGen: Introducing Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19)
 * [2023/11] [Llama 2 Inference on 4th Gen Intel® Xeon® Scalable Processor with DeepSpeed](https://github.com/microsoft/DeepSpeed/tree/master/blogs/intel-inference) [[Intel version]](https://www.intel.com/content/www/us/en/developer/articles/technical/xllama-2-on-xeon-scalable-processor-with-deepspeed.html)
 * [2023/11] [DeepSpeed ZeRO-Offload++: 6x Higher Training Throughput via Collaborative CPU/GPU Twin-Flow](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-offloadpp)
 * [2023/11] [DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/japanese/README.md)]
From 5f631abc2f930ecece38fae05dc9bd3923c555dd Mon Sep 17 00:00:00 2001
From: Logan Adams <114770087+loadams@users.noreply.github.com>
Date: Tue, 23 Apr 2024 16:24:12 -0700
Subject: [PATCH 03/11] Update PyTest torch version to match PyTorch latest
 official (2.3.0) (#5454)

---
 .github/workflows/cpu-torch-latest.yml     | 4 ++--
 .github/workflows/nv-torch-latest-v100.yml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/cpu-torch-latest.yml b/.github/workflows/cpu-torch-latest.yml
index 9c1ad02f75a6..5727ff2e1cde 100644
--- a/.github/workflows/cpu-torch-latest.yml
+++ b/.github/workflows/cpu-torch-latest.yml
@@ -50,5 +50,5 @@ jobs:
       run: |
         unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
         cd tests
-        TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.2"
-        TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.2"
+        TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.3"
+        TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.3"
diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml
index 3ca8ac43dfa4..2e0490c18ba7 100644
--- a/.github/workflows/nv-torch-latest-v100.yml
+++ b/.github/workflows/nv-torch-latest-v100.yml
@@ -29,7 +29,7 @@ jobs:
 
     - name: Install pytorch
       run: |
-        pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118
+        pip install -U --cache-dir $TORCH_CACHE torch==2.2.2 torchvision --index-url https://download.pytorch.org/whl/cu118
        python -c "import torch; print('torch:', torch.__version__, torch)"
        python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
From fbdf0eaf15f45cd2a8b5846ad9428609a3860b41 Mon Sep 17 00:00:00 2001
From: Michael Wyatt
Date: Tue, 23 Apr 2024 16:27:27 -0700
Subject: [PATCH 04/11] Update version.txt after 0.14.2 release (#5458)

**Auto-generated PR to update version.txt after a DeepSpeed release**
Released version - 0.14.2
Author - @loadams

Co-authored-by: loadams
---
 version.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.txt b/version.txt
index e867cc2a66a8..ac4a79626c87 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.14.2
+0.14.3
From fa8458b1a80d6ba55091b17f092de19bbf95eb3d Mon Sep 17 00:00:00 2001
From: vikram singh shekhawat
Date: Wed, 24 Apr 2024 20:55:18 +0530
Subject: [PATCH 05/11] Add getter and setter methods for compile_backend
 across accelerators. (#5299)

Add getter and setter methods for `compile_backend` across accelerators,
providing a mechanism to retrieve and configure the compile backend. These
APIs handle user-defined backend selection and raise a `ValueError` with an
informative error message for unsupported backends.

---------

Co-authored-by: Olatunji Ruwase
Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
---
 accelerator/abstract_accelerator.py                 |  9 +++++++++
 accelerator/cpu_accelerator.py                      | 12 ++++++++++++
 accelerator/cuda_accelerator.py                     | 12 ++++++++++++
 accelerator/hpu_accelerator.py                      | 12 ++++++++++++
 accelerator/mps_accelerator.py                      | 12 ++++++++++++
 accelerator/npu_accelerator.py                      | 12 ++++++++++++
 accelerator/xpu_accelerator.py                      | 12 ++++++++++++
 tests/unit/runtime/compile/test_compile_wrapper.py  |  4 +---
 tests/unit/runtime/compile/test_compile_zero.py     |  4 +---
 tests/unit/runtime/compile/test_load_config.py      |  4 +---
 10 files changed, 84 insertions(+), 9 deletions(-)

diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py
index 768d5ea34e5e..690f19292299 100644
--- a/accelerator/abstract_accelerator.py
+++ b/accelerator/abstract_accelerator.py
@@ -12,6 +12,7 @@ class DeepSpeedAccelerator(ABC):
     def __init__(self):
         self._name = None
         self._communication_backend_name = None
+        self._compile_backend = None
 
     @abc.abstractmethod
     def is_synchronized_device(self):
@@ -295,3 +296,11 @@ def visible_devices_envs(self):
     @abc.abstractmethod
     def set_visible_devices_envs(self, current_env, local_accelerator_ids):
         ...
+
+    @abc.abstractmethod
+    def get_compile_backend(self):
+        ...
+
+    @abc.abstractmethod
+    def set_compile_backend(self, backend):
+        ...
diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py
index 237e7f51dcb4..bd11d034f312 100644
--- a/accelerator/cpu_accelerator.py
+++ b/accelerator/cpu_accelerator.py
@@ -20,6 +20,7 @@ class CPU_Accelerator(DeepSpeedAccelerator):
 
     def __init__(self):
         self._name = 'cpu'
+        self._compile_backend = "inductor"
         if oneccl_imported_p:
             self._communication_backend_name = 'ccl'
         else:
@@ -330,3 +331,14 @@ def visible_devices_envs(self):
     def set_visible_devices_envs(self, current_env, local_accelerator_ids):
         for env in self.visible_devices_envs():
             current_env[env] = ",".join(map(str, local_accelerator_ids))
+
+    def get_compile_backend(self):
+        return self._compile_backend
+
+    def set_compile_backend(self, backend):
+        supported_backends = torch._dynamo.list_backends(exclude_tags=())
+        if backend in supported_backends:
+            self._compile_backend = backend
+        else:
+            raise ValueError(
+                f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}")
diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py
index 2fc0cfd94125..60d66b6cdbab 100644
--- a/accelerator/cuda_accelerator.py
+++ b/accelerator/cuda_accelerator.py
@@ -25,6 +25,7 @@ class CUDA_Accelerator(DeepSpeedAccelerator):
     def __init__(self):
         self._name = 'cuda'
         self._communication_backend_name = 'nccl'
+        self._compile_backend = "inductor"
         if pynvml is None:
             self._init_pynvml()
 
@@ -367,3 +368,14 @@ def visible_devices_envs(self):
     def set_visible_devices_envs(self, current_env, local_accelerator_ids):
         for env in self.visible_devices_envs():
             current_env[env] = ",".join(map(str, local_accelerator_ids))
+
+    def get_compile_backend(self):
+        return self._compile_backend
+
+    def set_compile_backend(self, backend):
+        supported_backends = torch._dynamo.list_backends(exclude_tags=())
+        if backend in supported_backends:
+            self._compile_backend = backend
+        else:
+            raise ValueError(
+                f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}")
diff --git a/accelerator/hpu_accelerator.py b/accelerator/hpu_accelerator.py
index 326efc8fa01b..114f367e879d 100644
--- a/accelerator/hpu_accelerator.py
+++ b/accelerator/hpu_accelerator.py
@@ -16,6 +16,7 @@ class HPU_Accelerator(DeepSpeedAccelerator):
     def __init__(self):
         self._name = 'hpu'
         self._communication_backend_name = 'hccl'
+        self._compile_backend = "hpu_backend"
         try:
             import habana_frameworks.torch.hpu as hpu
             hpu.setDeterministic(True)
@@ -301,3 +302,14 @@ def visible_devices_envs(self):
     def set_visible_devices_envs(self, current_env, local_accelerator_ids):
         for env in self.visible_devices_envs():
             current_env[env] = ",".join(map(str, local_accelerator_ids))
+
+    def get_compile_backend(self):
+        return self._compile_backend
+
+    def set_compile_backend(self, backend):
+        supported_backends = torch._dynamo.list_backends(exclude_tags=())
+        if backend in supported_backends:
+            self._compile_backend = backend
+        else:
+            raise ValueError(
+                f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}")
diff --git a/accelerator/mps_accelerator.py b/accelerator/mps_accelerator.py
index ff70b860d7c7..5fc9b1c8cfb6 100644
--- a/accelerator/mps_accelerator.py
+++ b/accelerator/mps_accelerator.py
@@ -20,6 +20,7 @@ class MPS_Accelerator(DeepSpeedAccelerator):
     def __init__(self):
         self._name = "mps"
         self._communication_backend_name = None
+        self._compile_backend = "inductor"
 
     def is_synchronized_device(self):
         return False
@@ -267,3 +268,14 @@ def visible_devices_envs(self):
     def set_visible_devices_envs(self, current_env, local_accelerator_ids):
         for env in self.visible_devices_envs():
             current_env[env] = ",".join(map(str, local_accelerator_ids))
+
+    def get_compile_backend(self):
+        return self._compile_backend
+
+    def set_compile_backend(self, backend):
+        supported_backends = torch._dynamo.list_backends(exclude_tags=())
+        if backend in supported_backends:
+            self._compile_backend = backend
+        else:
+            raise ValueError(
+                f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}")
diff --git a/accelerator/npu_accelerator.py b/accelerator/npu_accelerator.py
index 5d891ecb707d..b0e0ff948e52 100644
--- a/accelerator/npu_accelerator.py
+++ b/accelerator/npu_accelerator.py
@@ -20,6 +20,7 @@ def __init__(self):
         super().__init__()
         self._name = 'npu'
         self._communication_backend_name = 'hccl'
+        self._compile_backend = "inductor"
         # dict that holds class name <--> class type mapping i.e.
        # 'AsyncIOBuilder': <class 'op_builder.async_io.AsyncIOBuilder'>
        # this dict will be filled at init stage
@@ -285,3 +286,14 @@ def visible_devices_envs(self):
     def set_visible_devices_envs(self, current_env, local_accelerator_ids):
         for env in self.visible_devices_envs():
             current_env[env] = ",".join(map(str, local_accelerator_ids))
+
+    def get_compile_backend(self):
+        return self._compile_backend
+
+    def set_compile_backend(self, backend):
+        supported_backends = torch._dynamo.list_backends(exclude_tags=())
+        if backend in supported_backends:
+            self._compile_backend = backend
+        else:
+            raise ValueError(
+                f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}")
diff --git a/accelerator/xpu_accelerator.py b/accelerator/xpu_accelerator.py
index c59f60077d2f..9c4a9c903f96 100644
--- a/accelerator/xpu_accelerator.py
+++ b/accelerator/xpu_accelerator.py
@@ -14,6 +14,7 @@ class XPU_Accelerator(DeepSpeedAccelerator):
     def __init__(self):
         self._name = 'xpu'
         self._communication_backend_name = 'ccl'
+        self._compile_backend = "inductor"
         self.aligned_tensors = []
 
     def is_synchronized_device(self):
@@ -296,3 +297,14 @@ def visible_devices_envs(self):
     def set_visible_devices_envs(self, current_env, local_accelerator_ids):
         for env in self.visible_devices_envs():
             current_env[env] = ",".join(map(str, local_accelerator_ids))
+
+    def get_compile_backend(self):
+        return self._compile_backend
+
+    def set_compile_backend(self, backend):
+        supported_backends = torch._dynamo.list_backends(exclude_tags=())
+        if backend in supported_backends:
+            self._compile_backend = backend
+        else:
+            raise ValueError(
+                f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}")
diff --git a/tests/unit/runtime/compile/test_compile_wrapper.py b/tests/unit/runtime/compile/test_compile_wrapper.py
index d1830534f6ea..62af25ac3ba4 100644
--- a/tests/unit/runtime/compile/test_compile_wrapper.py
+++ b/tests/unit/runtime/compile/test_compile_wrapper.py
@@ -31,11 +31,9 @@ def base_config():
         },
         "compile": {
             "enabled": True,
-            "backend": "inductor"
+            "backend": get_accelerator().get_compile_backend()
         }
     }
 
-    if get_accelerator().device_name() == 'hpu':
-        config_dict['compile']['backend'] = 'hpu_backend'
 
     return config_dict
diff --git a/tests/unit/runtime/compile/test_compile_zero.py b/tests/unit/runtime/compile/test_compile_zero.py
index 7568c27e3ed2..9890ea708eec 100644
--- a/tests/unit/runtime/compile/test_compile_zero.py
+++ b/tests/unit/runtime/compile/test_compile_zero.py
@@ -51,12 +51,10 @@ def test_compile_zero(self, tmpdir, zero_stage, dtype, offload_device):
             },
             "compile": {
                 "enabled": True,
-                "backend": "inductor"
+                "backend": get_accelerator().get_compile_backend()
             }
         }
 
-        if get_accelerator().device_name() == 'hpu':
-            config_dict['compile']['backend'] = 'hpu_backend'
         if offload_device == OffloadDeviceEnum.cpu:
             config_dict["zero_optimization"]["offload_optimizer"] = {"device": offload_device}
         elif offload_device == OffloadDeviceEnum.nvme:
diff --git a/tests/unit/runtime/compile/test_load_config.py b/tests/unit/runtime/compile/test_load_config.py
index 601adae58884..cee8d3b23f6b 100644
--- a/tests/unit/runtime/compile/test_load_config.py
+++ b/tests/unit/runtime/compile/test_load_config.py
@@ -47,12 +47,10 @@ def base_config():
         },
         "compile": {
             "enabled": True,
-            "backend": "inductor"
+            "backend": get_accelerator().get_compile_backend()
         }
     }
 
-    if get_accelerator().device_name() == 'hpu':
-        config_dict['compile']['backend'] = 'hpu_backend'
 
     return config_dict
From fcc731f09d8e09b04a816b3ea0f83ab1d15169b3 Mon Sep 17 00:00:00 2001
From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com>
Date: Thu, 25 Apr 2024 11:01:35 -0700
Subject: [PATCH 06/11] Fix torch.compile error for PyTorch v2.3 (#5463)

PyTorch v2.3 throws an error when it tries to compile `iter_params` used for
ZeRO3. This PR excludes the function from the compilation targets. After this
PR is merged, we can [unpin the torch version for unit
tests](https://github.com/microsoft/DeepSpeed/pull/5459).

---
 deepspeed/runtime/zero/partitioned_param_coordinator.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/deepspeed/runtime/zero/partitioned_param_coordinator.py b/deepspeed/runtime/zero/partitioned_param_coordinator.py
index 8fc962c4f2a7..bdec8a55fcbc 100644
--- a/deepspeed/runtime/zero/partitioned_param_coordinator.py
+++ b/deepspeed/runtime/zero/partitioned_param_coordinator.py
@@ -34,6 +34,7 @@ def get_all_parameters(sub_module, recurse=False):
         return itertools.chain(sub_module.named_parameters(recurse=recurse), sub_module.ds_external_parameters())
 
 
+@compiler.disable
 def iter_params(module: Module, recurse=False) -> Iterable[Parameter]:
     return map(lambda pair: pair[1], get_all_parameters(module, recurse))
 
From bc48371c5e1fb8fd70fc79285e66201dbb65679b Mon Sep 17 00:00:00 2001
From: Lev Kurilenko <113481193+lekurile@users.noreply.github.com>
Date: Thu, 25 Apr 2024 11:37:15 -0700
Subject: [PATCH 07/11] Revert "stage3: efficient compute of
 scaled_global_grad_norm (#5256)" (#5461)

This reverts commit 54c06872647ca60699f752e60ac1643bd05aa63c due to #5256
causing bugs when the ZeRO3 + ZeRO Offload features are enabled. This bug was
discovered due to failures in the DS Chat CI workflow.

Failing tests across the CI runs:

| Failing Test Name |
| --- |
| test_ds_chat[zero3--offload-] |
| test_ds_chat[zero3--offload-lora] |
| test_ds_chat[zero3-he-offload-] |
| test_ds_chat[zero3-he-offload-lora] |

Error message:
```
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:3 and cpu!
```

It seems that `torch.stack()` or `torch.norm()` is having issues when the
offload feature is enabled and tensors are split between CPU/GPU; however,
this is just an initial guess and would require more investigation.

@nelyahu Since you are the original author of the PR, if you have some
bandwidth, any help here is greatly appreciated!

After reverting this commit, all tests pass in the DS Chat CI workflow:
https://github.com/microsoft/DeepSpeed/actions/runs/8824064414/job/24225802763

@tjruwase for context.
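For intuition, a minimal sketch of the suspected failure mode (a hypothetical
two-group example, not taken from the failing test):

```python
import torch

# Under ZeRO3 + ZeRO Offload, per-group gradient norms can live on
# different devices, e.g. one on an accelerator and one on the CPU.
norm_groups = [torch.tensor(1.5, device="cuda"), torch.tensor(0.7, device="cpu")]

# torch.stack() requires all of its inputs to be on one device, so this
# raises the "Expected all tensors to be on the same device" RuntimeError.
torch.norm(torch.stack(norm_groups))
```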
---
 deepspeed/runtime/zero/stage3.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index 68cab13c4a93..c6ff216edfcb 100644
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -15,7 +15,7 @@
 from deepspeed.utils import logger
 from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler
 from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced, all_to_all_quant_reduce
-from deepspeed.runtime.utils import inf, is_model_parallel_parameter, get_only_unique_item
+from deepspeed.runtime.utils import inf, get_global_norm, is_model_parallel_parameter, get_only_unique_item
 from deepspeed.runtime.zero.partition_parameters import *
 from deepspeed.runtime.zero.config import ZeroStageEnum
 from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum
@@ -2027,7 +2027,7 @@ def step(self, closure=None):
             return
 
         norm_groups = self._get_norm_groups()
-        scaled_global_grad_norm = torch.norm(torch.stack(norm_groups))
+        scaled_global_grad_norm = get_global_norm(norm_list=norm_groups)
 
         # Stash unscaled gradient norm
         self._global_grad_norm = scaled_global_grad_norm / self.loss_scale
From e37296b23c1df99dfba508a678d9f310b22aeeaa Mon Sep 17 00:00:00 2001
From: Lev Kurilenko <113481193+lekurile@users.noreply.github.com>
Date: Thu, 25 Apr 2024 13:36:46 -0700
Subject: [PATCH 08/11] Update ds-chat CI workflow paths to include zero stage
 1-3 files (#5462)

This PR updates the ds-chat CI workflow to run when ZeRO stage 1-3 files are
updated.

---
 .github/workflows/nv-ds-chat.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml
index 94571eb101bb..cf8756fbd528 100644
--- a/.github/workflows/nv-ds-chat.yml
+++ b/.github/workflows/nv-ds-chat.yml
@@ -10,6 +10,10 @@ on:
         required: false
         default: 'master'
         type: string
+  pull_request:
+    paths:
+      - "deepspeed/runtime/zero/stage_1_and_2.py"
+      - "deepspeed/runtime/zero/stage3.py"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
From 4c15ad9f8d51a1950842c69bbbc9d93c73afbcfc Mon Sep 17 00:00:00 2001
From: Logan Adams <114770087+loadams@users.noreply.github.com>
Date: Thu, 25 Apr 2024 14:44:39 -0700
Subject: [PATCH 09/11] Update with ops not supported on Windows (#5468)

---
 build_win.bat | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/build_win.bat b/build_win.bat
index ec8c8a362a78..af5c5103fa4b 100644
--- a/build_win.bat
+++ b/build_win.bat
@@ -1,6 +1,10 @@
 @echo off
 
 set DS_BUILD_AIO=0
+set DS_BUILD_CUTLASS_OPS=0
+set DS_BUILD_EVOFORMER_ATTN=0
+set DS_BUILD_FP_QUANTIZER=0
+set DS_BUILD_RAGGED_DEVICE_OPS=0
 set DS_BUILD_SPARSE_ATTN=0
 
 echo Administrative permissions required. Detecting permissions...
From 059bb2085cf404caa5874004a252a56fc74c952e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ant=C3=B4nio=20Vieira?=
Date: Mon, 29 Apr 2024 14:37:54 -0300
Subject: [PATCH 10/11] fix: swapping order of parameters in
 create_dir_symlink method. (#5465)

The order of parameters in the create_dir_symlink method looks wrong. Because
of this, we get the error "PermissionError: [WinError 5] Denied access:
'.\\deepspeed\\ops\\csrc'" when installing deepspeed >= 0.4.0 in a Windows
environment. Please check this out @eltonzheng and @jeffra.
---------

Co-authored-by: Olatunji Ruwase
Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
---
 setup.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index f1367b850e02..839941b989c9 100755
--- a/setup.py
+++ b/setup.py
@@ -219,9 +219,9 @@ def create_dir_symlink(src, dest):
 if sys.platform == "win32":
     # This creates symbolic links on Windows.
     # It needs Administrator privilege to create symlinks on Windows.
-    create_dir_symlink('..\\..\\csrc', '.\\deepspeed\\ops\\csrc')
-    create_dir_symlink('..\\..\\op_builder', '.\\deepspeed\\ops\\op_builder')
-    create_dir_symlink('..\\accelerator', '.\\deepspeed\\accelerator')
+    create_dir_symlink('.\\deepspeed\\ops\\csrc', '..\\..\\csrc')
+    create_dir_symlink('.\\deepspeed\\ops\\op_builder', '..\\..\\op_builder')
+    create_dir_symlink('.\\deepspeed\\accelerator', '..\\accelerator')
     egg_info.manifest_maker.template = 'MANIFEST_win.in'
 
 # Parse the DeepSpeed version string from version.txt.
From f32ad3e1c562be80bd8be6b7b2246dc3041b5bfd Mon Sep 17 00:00:00 2001
From: Logan Adams <114770087+loadams@users.noreply.github.com>
Date: Mon, 29 Apr 2024 16:39:12 -0700
Subject: [PATCH 11/11] Un-pin torch version in nv-torch-latest back to latest
 and skip test_compile_zero tests on v100 (#5459)

Torch updating to 2.3.0 broke some test_compile_zero tests, so we pinned it;
@tohtana pushed fixes in #5463, and this should un-pin and move us back to the
latest. The failing test indicating that the generated code cannot run bf16 on
V100 is
[here](https://github.com/microsoft/DeepSpeed/actions/runs/8838672379/job/24270349996?pr=5459#step:8:5157).

---
 .github/workflows/nv-torch-latest-v100.yml      | 6 +++---
 tests/unit/runtime/compile/test_compile_zero.py | 6 ++++--
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml
index 2e0490c18ba7..3109f6060944 100644
--- a/.github/workflows/nv-torch-latest-v100.yml
+++ b/.github/workflows/nv-torch-latest-v100.yml
@@ -29,7 +29,7 @@ jobs:
 
     - name: Install pytorch
       run: |
-        pip install -U --cache-dir $TORCH_CACHE torch==2.2.2 torchvision --index-url https://download.pytorch.org/whl/cu118
+        pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118
         python -c "import torch; print('torch:', torch.__version__, torch)"
         python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -55,5 +55,5 @@
       run: |
         unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
         cd tests
-        pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.2" --cuda_ver="11.8"
-        pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.2" --cuda_ver="11.8"
+        pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.3" --cuda_ver="11.8"
+        pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.3" --cuda_ver="11.8"
diff --git a/tests/unit/runtime/compile/test_compile_zero.py b/tests/unit/runtime/compile/test_compile_zero.py
index 9890ea708eec..a0736b0f5425 100644
--- a/tests/unit/runtime/compile/test_compile_zero.py
+++ b/tests/unit/runtime/compile/test_compile_zero.py
@@ -12,7 +12,7 @@
 from unit.runtime.compile.util import compare_loss
 from unit.common import DistributedTest
-from unit.util import bf16_required_version_check
+from unit.util import bf16_required_version_check, skip_on_arch
 
 pytestmark = pytest.mark.skipif(not required_torch_version(min_version=2.1),
                                 reason="Compile tests requires Pytorch version 2.1 or above")
@@ -26,9 +26,11 @@ class TestZeRO(DistributedTest):
     @pytest.mark.parametrize('zero_stage', [1, 2, 3])
     @pytest.mark.parametrize('offload_device', [OffloadDeviceEnum.none, OffloadDeviceEnum.cpu, OffloadDeviceEnum.nvme])
     def test_compile_zero(self, tmpdir, zero_stage, dtype, offload_device):
+        if dtype == torch.bfloat16:
+            skip_on_arch(min_arch=8)
         if dtype == torch.bfloat16 and not bf16_required_version_check():
             pytest.skip(
-                " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly"
+                "DeepSpeed BFloat16 tests need NCCL >= 2.10.3, CUDA >=11.0, and HW support for BFloat16 to run correctly"
             )
         if get_accelerator().device_name() == "cpu":
             pytest.skip("CPU does not support this test yet")