From 5e6c9b931184bf8f0a245a2fef183078139b6d7e Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Tue, 23 Apr 2024 12:24:37 -0700 Subject: [PATCH 1/5] OptimizedLinear implementation (#5355) Optimized version of `nn.Linear` that adds features such as: * LoRA w. base weight sharding * FP [6,8,12] quantization Depends on #5336 being merged first Co-authored-by: @rajhans Co-authored-by: @aurickq --------- Co-authored-by: Rajhans Samdani Co-authored-by: Jeff Rasley --- deepspeed/linear/__init__.py | 7 ++ deepspeed/linear/config.py | 39 +++++++ deepspeed/linear/optimized_linear.py | 150 +++++++++++++++++++++++++ deepspeed/linear/quantization.py | 137 ++++++++++++++++++++++ deepspeed/ops/fp_quantizer/__init__.py | 2 +- deepspeed/ops/fp_quantizer/quantize.py | 33 +++++- tests/unit/linear/test_linear.py | 128 +++++++++++++++++++++ tests/unit/linear/test_quant_param.py | 58 ++++++++++ 8 files changed, 550 insertions(+), 4 deletions(-) create mode 100644 deepspeed/linear/__init__.py create mode 100644 deepspeed/linear/config.py create mode 100644 deepspeed/linear/optimized_linear.py create mode 100644 deepspeed/linear/quantization.py create mode 100644 tests/unit/linear/test_linear.py create mode 100644 tests/unit/linear/test_quant_param.py diff --git a/deepspeed/linear/__init__.py b/deepspeed/linear/__init__.py new file mode 100644 index 000000000000..a27f1c3eaee7 --- /dev/null +++ b/deepspeed/linear/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .optimized_linear import OptimizedLinear +from .config import LoRAConfig, QuantizationConfig diff --git a/deepspeed/linear/config.py b/deepspeed/linear/config.py new file mode 100644 index 000000000000..ae9050a3c92b --- /dev/null +++ b/deepspeed/linear/config.py @@ -0,0 +1,39 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from dataclasses import dataclass + + +@dataclass +class LoRAConfig: + """ + Configuration settings for LoRAOptimizedLinear. + + Attributes: + lora_r (int): LoRA attention dimension, also known as the rank. Default is 64. + lora_alpha (float): LoRA scaling factor, default is 16. + base_weight_sharding (int): The degree to which the base weights are sharded, + should typically be set to the data-parallel world size to maximize the memory + reduction benefits. Defaults to 1, which means this feature is disabled. + """ + lora_r: int = 64 + lora_alpha: float = 16. + base_weight_sharding: int = 1 + + +@dataclass +class QuantizationConfig: + """ + Configuration settings for quantization for LoRAOptimizedLinear, QuantizedLinear, + and QuantizedParameter + + Attributes: + q_bits (int): The number of bits used for quantization. Default is 8. + mantissa_bits (int): The number of bits reserved for the mantissa in floating-point quantization. Default is 3. + group_size (int): The size of the group used for quantization. Default is 512. + """ + q_bits: int = 8 + mantissa_bits: int = 3 + group_size: int = 512 diff --git a/deepspeed/linear/optimized_linear.py b/deepspeed/linear/optimized_linear.py new file mode 100644 index 000000000000..138bd493ffc7 --- /dev/null +++ b/deepspeed/linear/optimized_linear.py @@ -0,0 +1,150 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import math +import torch.nn as nn +import torch.nn.functional as F +from dataclasses import is_dataclass +from deepspeed.accelerator import get_accelerator +import deepspeed.comm as dist + +from .config import LoRAConfig, QuantizationConfig +from .quantization import QuantizedParameter, QuantizedLinear + + +class OptimizedLinear(nn.Module): + """ + Optimized version of nn.Linear that adds features such as: + * LoRA w. 
base weight sharding + * FP [6,8,12] quantization + + Arguments: + input_dim: Required: size of each input sample + output_dim: Required: size of each output sample + bias: Optional: If set to False, the layer will not learn an additive bias. Default: False + lora_config: Optional: LoRAConfig defining lora features and base-weight-sharding degree + quantization_config: Optional: QuantizationConfig defining quantization features + dtype: Optional: parameter dtype, only supports bfloat16 currently + + Returns: + Returns a new nn.Module depending on the input config. Either native + torch.nn.Linear, QuantizedLinear, or the full-featured LoRAOptimizedLinear. + """ + + def __new__(self, + input_dim: int, + output_dim: int, + bias: bool = False, + lora_config: LoRAConfig = None, + quantization_config: QuantizationConfig = None, + dtype=torch.bfloat16): + + if quantization_config is not None and not is_dataclass(quantization_config): + raise ValueError(f"Expecting QuantizationConfig but received {type(quantization_config)}") + if lora_config is not None and not is_dataclass(lora_config): + raise ValueError(f"Expecting LoRAConfig but received {type(lora_config)}") + if lora_config is None and quantization_config is None: + # Everything disabled, fall back to normal nn.Linear + self = nn.Linear(input_dim, output_dim, bias=bias, dtype=dtype) + + elif lora_config: + # lora enabled, quantization may or may not be + self = LoRAOptimizedLinear(input_dim=input_dim, + output_dim=output_dim, + bias=bias, + lora_config=lora_config, + quantization_config=quantization_config, + dtype=dtype) + + elif quantization_config: + # only quantization enabled, no lora + self = QuantizedLinear(input_dim=input_dim, + output_dim=output_dim, + bias=bias, + quantization_config=quantization_config, + dtype=dtype) + return self + + +class LoRAOptimizedLinear(nn.Module): + + def __init__(self, + input_dim: int, + output_dim: int, + bias: bool = False, + lora_config: LoRAConfig = None, + 
quantization_config: QuantizationConfig = None, + device=None, + dtype=torch.bfloat16): + super().__init__() + self.input_dim = input_dim + self.output_dim = output_dim + self.bias = bias + self.lora_config = lora_config + self.quantization_config = quantization_config + device = get_accelerator().current_device() if device is None else device + assert self.lora_config is not None, "DSOptimizedLinear requires a LoRA config" + + self.zero_shards = self.lora_config.base_weight_sharding + self.sharded_weight_size = int(float(self.input_dim) // self.zero_shards) + w = torch.nn.Parameter(torch.empty((self.output_dim, self.sharded_weight_size), dtype=dtype)) + torch.nn.init.xavier_uniform_(w) + + if self.quantization_config is not None: + assert dtype == torch.bfloat16, "only bfloat16 is supported when using quantization" + self.base_weight = QuantizedParameter(w, quantization_config=quantization_config) + else: + self.base_weight = w + + self.base_weight.requires_grad = False + + # Use RS lora for now. + self.lora_scaling_factor = self.lora_config.lora_alpha / math.sqrt(self.lora_config.lora_r) + # Keeping lora weights in bf16 precision for ease of training. + self.lora_weight_1 = nn.Linear(self.input_dim, + self.lora_config.lora_r, + bias=self.bias, + device=device, + dtype=dtype) + self.lora_weight_2 = nn.Linear(self.lora_config.lora_r, + self.output_dim, + bias=self.bias, + device=device, + dtype=dtype) + self.lora_weight_1.weight.requires_grad = True + self.lora_weight_2.weight.requires_grad = True + + def full_weight(self): + # This assumes weights are evenly sharded across gpus. which might not be correct. + # in that case, we should flatten before all_gather. 
+ local_weight = self.base_weight.dequantized() if isinstance(self.base_weight, + QuantizedParameter) else self.base_weight + tensor_list = [ + torch.zeros_like(local_weight, device=local_weight.device, dtype=local_weight.dtype) + for _ in range(self.zero_shards) + ] + dist.all_gather(tensor_list, local_weight) + weight = nn.Parameter(torch.cat([tensor for tensor in tensor_list], dim=1)) + return weight + + def linear_without_F_linear(self, input, weight): + output = torch.mm(input.reshape(-1, input.shape[-1]), weight) + output = output.view(*input.shape[:-1], weight.shape[1]) + return output + + def forward(self, input_tensor): + # Gather the sharded base weight + if self.zero_shards > 1: + with torch.no_grad(): + base_weight = self.full_weight() + elif self.quantization_config: + base_weight = self.base_weight.dequantized() + else: + base_weight = self.base_weight + + base_weight_output = F.linear(input_tensor, base_weight) + lora_output = self.lora_weight_2(self.lora_weight_1(input_tensor)) + return base_weight_output + self.lora_scaling_factor * lora_output diff --git a/deepspeed/linear/quantization.py b/deepspeed/linear/quantization.py new file mode 100644 index 000000000000..f5343af45fb8 --- /dev/null +++ b/deepspeed/linear/quantization.py @@ -0,0 +1,137 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import copy +import torch +import torch.nn as nn +import torch.nn.functional as F + +from typing import Optional + +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.fp_quantizer import Quantizer, FP_Quantize +from .config import QuantizationConfig + + +class QuantizedParameter(nn.Parameter): + """ + Quantized parameter class that implements weight quantization. Weights + are stored in quantized form on GPUs, and can be dequantized on-the-fly when + needed by the model. The weights are actually quantized during any `.to(device)`. + + Arguments: + data (Tensor): parameter tensor. 
+ requires_grad (bool, optional): if the parameter requires gradient. Defaults + to False and is not supported to be True. Argument provided only for interface + compatibility with torch.nn.Parameter. + quantization_config (QuantizationConfig, optional): + quantizer (Quantizer, optional): Defaults to FP_Quantize but can be any quantizer + that implements deepspeed.ops.fp_quantizer.Quantizer. This argument is also + required since the quantizer is stashed in the Parameter itself, some models + may clone the Parameter by passing an attribute __dict__. For an example, see + tests/unit/linear/test_quant_param.py::TestQuantParam::test_hf_clone + """ + + def __new__( + cls, + data: Optional[torch.Tensor] = None, + requires_grad: bool = False, # quantized weights must be frozen + quantization_config: QuantizationConfig = None, + quantizer: Quantizer = None, + ): + if requires_grad: + raise ValueError(f"requires_grad=True is not supported with QuantizedParameter") + if data is None: + data = torch.empty(0) + self = torch.Tensor._make_subclass(cls, data, requires_grad) + self.quantization_config = QuantizationConfig() if quantization_config is None else quantization_config + if quantizer is not None: + self.quantizer = quantizer + else: + # if FPQuantizerBuilder is not compatible in this env this init will fail + self.quantizer = FP_Quantize(group_size=self.quantization_config.group_size) + self._ensure_quantized(self) + return self + + def _ensure_quantized(self, tensor: torch.Tensor): + # If the tensor is on the accelerator and is not quantized, then quantize it in-place. 
+ if get_accelerator().on_accelerator(tensor) and tensor.dtype != torch.int8: + with get_accelerator().stream(get_accelerator().current_stream(tensor.device)): + tensor.data = self.quantizer.quantize(tensor.data, + q_bits=self.quantization_config.q_bits, + q_mantisa_bits=self.quantization_config.mantissa_bits) + assert tensor.dtype == torch.int8 + + def dequantized(self) -> torch.Tensor: + """ + Return a tensor containing the dequantized weights of this parameter. + """ + if get_accelerator().on_accelerator(self.data) and self.data.dtype == torch.int8: + with get_accelerator().stream(get_accelerator().current_stream(self.data.device)): + return self.quantizer.dequantize(self.data, + q_bits=self.quantization_config.q_bits, + q_mantisa_bits=self.quantization_config.mantissa_bits) + return self.data + + def __getstate__(self): + state = self.__dict__ + state["data"] = self.data + state["quantization_config"] = self.quantization_config + state["requires_grad"] = self.requires_grad + return state + + def __setstate__(self, state): + self.quantizer = state["quantizer"] + self.quantization_config = state["quantization_config"] + self.data = state["data"] + self.requires_grad = state["requires_grad"] + + def __deepcopy__(self, memo): + new_instance = type(self).__new__(type(self)) + state = self.__getstate__() + new_instance.__setstate__(state) + new_instance.quantizer = copy.deepcopy(state["quantizer"]) + new_instance.quantization_config = copy.deepcopy(state["quantization_config"]) + new_instance.data = copy.deepcopy(state["data"]) + return new_instance + + def __copy__(self): + new_instance = type(self).__new__(type(self)) + state = self.__getstate__() + new_instance.__setstate__(state) + return new_instance + + def cuda(self, device=None, non_blocking=False): + return self.to(device="cuda" if device is None else device, non_blocking=non_blocking) + + def to(self, *args, **kwargs): + """ + Move the parameter to the given device. 
Then, if the device is a cuda device, + quantize it. + """ + tensor = super().to(*args, **kwargs) + self._ensure_quantized(tensor) + return tensor + + +class QuantizedLinear(nn.Linear): + """ + Linear layer that implements weight quantization. Parameters + are stored via `QuantizedParameter` and are dequantized on-the-fly during any + forward pass. + """ + + def __init__(self, + input_dim: int, + output_dim: int, + bias: bool = False, + quantization_config: QuantizationConfig = None, + dtype=torch.bfloat16): + super().__init__(input_dim, output_dim, bias=bias, dtype=dtype) + assert dtype == torch.bfloat16, "currently only supports bfloat16 dtype" + self.weight = QuantizedParameter(self.weight.data, quantization_config=quantization_config) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return F.linear(input, self.weight.dequantized(), self.bias) diff --git a/deepspeed/ops/fp_quantizer/__init__.py b/deepspeed/ops/fp_quantizer/__init__.py index 5575f3567185..995bbae4aeaf 100644 --- a/deepspeed/ops/fp_quantizer/__init__.py +++ b/deepspeed/ops/fp_quantizer/__init__.py @@ -3,4 +3,4 @@ # DeepSpeed Team -from .quantize import FP_Quantize +from .quantize import FP_Quantize, Quantizer diff --git a/deepspeed/ops/fp_quantizer/quantize.py b/deepspeed/ops/fp_quantizer/quantize.py index 0d4bf7bc6db1..f8435bda16c1 100644 --- a/deepspeed/ops/fp_quantizer/quantize.py +++ b/deepspeed/ops/fp_quantizer/quantize.py @@ -4,20 +4,47 @@ # DeepSpeed Team import torch +import abc +from abc import ABC from deepspeed.ops.op_builder import FPQuantizerBuilder fp_quant_module = None -class FP_Quantize: +class Quantizer(ABC): + """ + Abstract Quantizer class that implements quantize/dequantize methods. + + Arguments: + group_size (int, optional): number of values or elements that are grouped + together for the quantization process. 
+ """ + + def __init__(self, group_size=512) -> None: + self.group_size = group_size + + @abc.abstractmethod + def quantize(self, + input, + q_bits=8, + q_mantisa_bits=3, + stochastic_mode=False, + return_meta_tensor=False) -> torch.Tensor: + ... + + @abc.abstractmethod + def dequantize(self, input_q, fp_out=None, q_bits=8, q_mantisa_bits=3, scale=None) -> torch.Tensor: + ... + + +class FP_Quantize(Quantizer): def __init__(self, group_size=512) -> None: global fp_quant_module + super().__init__(group_size=group_size) if fp_quant_module is None: fp_quant_module = FPQuantizerBuilder().load() - - self.group_size = group_size self.orig_dtype = None def quantize(self, diff --git a/tests/unit/linear/test_linear.py b/tests/unit/linear/test_linear.py new file mode 100644 index 000000000000..ccd26b4cd726 --- /dev/null +++ b/tests/unit/linear/test_linear.py @@ -0,0 +1,128 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import torch +import deepspeed +import deepspeed.comm as dist + +from deepspeed.accelerator import get_accelerator +from deepspeed.linear import OptimizedLinear, LoRAConfig, QuantizationConfig +from unit.common import DistributedTest + +from deepspeed.ops.op_builder import FPQuantizerBuilder + +if not deepspeed.ops.__compatible_ops__[FPQuantizerBuilder.NAME]: + pytest.skip("FPQuantizer op is not available on this system", allow_module_level=True) + + +class TestBasicLinear(DistributedTest): + world_size = 2 + + def test(self): + lora_config = None + quantization_config = None + + input_features = 64 # Number of input features + output_features = 64 # Number of output features + batch_size = 1 # Number of samples in a batch + + linear_layer = OptimizedLinear(input_dim=input_features, + output_dim=output_features, + lora_config=lora_config, + quantization_config=quantization_config, + dtype=torch.bfloat16) + + dummy_input = torch.rand(batch_size, input_features, dtype=torch.bfloat16) + 
output = linear_layer(dummy_input) + assert output.shape == (batch_size, output_features) + + +@pytest.mark.parametrize("base_weight_sharding", [1, 2]) +class TestLoRALinear(DistributedTest): + world_size = 2 + + def test(self, base_weight_sharding): + rank = dist.get_rank() + lora_config = None + quantization_config = None + + input_features = 64 # Number of input features + output_features = 64 # Number of output features + batch_size = 5 # Number of samples in a batch + + lora_config = LoRAConfig(lora_r=16, lora_alpha=16, base_weight_sharding=base_weight_sharding) + + linear_layer = OptimizedLinear(input_dim=input_features, + output_dim=output_features, + lora_config=lora_config, + quantization_config=quantization_config, + dtype=torch.bfloat16) + device = get_accelerator().current_device_name() + linear_layer = linear_layer.to(device) + if rank == 0: + for n, p in linear_layer.named_parameters(): + print(f"{n}, {p.shape}") + + dummy_input = torch.rand(batch_size, input_features, device=device, dtype=torch.bfloat16) + + output = linear_layer(dummy_input) + assert output.shape == (batch_size, output_features) + + +@pytest.mark.parametrize("q_bits", [8, 6]) +class TestQuantLinear(DistributedTest): + world_size = 2 + + def test(self, q_bits): + rank = dist.get_rank() + lora_config = None + + input_features = 64 # Number of input features + output_features = 64 # Number of output features + batch_size = 5 # Number of samples in a batch + + lora_config = None + quantization_config = QuantizationConfig(q_bits=q_bits) + + linear_layer = OptimizedLinear(input_dim=input_features, + output_dim=output_features, + lora_config=lora_config, + quantization_config=quantization_config, + dtype=torch.bfloat16) + device = get_accelerator().current_device_name() + linear_layer = linear_layer.to(device) + dummy_input = torch.rand([batch_size, input_features], device=device, dtype=torch.bfloat16) + + output = linear_layer(dummy_input) + assert output.shape == (batch_size, 
output_features) + + +@pytest.mark.parametrize("base_weight_sharding", [1, 2], ids=['bws1', 'bws2']) +@pytest.mark.parametrize("q_bits", [8, 6], ids=['qbit8', 'qbit6']) +class TestOptimizedLinear(DistributedTest): + world_size = 2 + + def test(self, base_weight_sharding, q_bits): + rank = dist.get_rank() + lora_config = None + + input_features = 64 # Number of input features + output_features = 64 # Number of output features + batch_size = 5 # Number of samples in a batch + + lora_config = LoRAConfig(lora_r=16, lora_alpha=16, base_weight_sharding=base_weight_sharding) + quantization_config = QuantizationConfig(q_bits=q_bits) + + linear_layer = OptimizedLinear(input_dim=input_features, + output_dim=output_features, + lora_config=lora_config, + quantization_config=quantization_config, + dtype=torch.bfloat16) + device = get_accelerator().current_device_name() + linear_layer = linear_layer.to(device) + dummy_input = torch.rand([batch_size, input_features], device=device, dtype=torch.bfloat16) + output = linear_layer(dummy_input) + assert output.shape == (batch_size, output_features) diff --git a/tests/unit/linear/test_quant_param.py b/tests/unit/linear/test_quant_param.py new file mode 100644 index 000000000000..9479b3cba8a0 --- /dev/null +++ b/tests/unit/linear/test_quant_param.py @@ -0,0 +1,58 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import torch +import deepspeed + +from deepspeed.accelerator import get_accelerator +from deepspeed.linear.quantization import QuantizedParameter +from deepspeed.linear.config import QuantizationConfig + +from deepspeed.ops.op_builder import FPQuantizerBuilder + +from unit.common import DistributedTest + +if not deepspeed.ops.__compatible_ops__[FPQuantizerBuilder.NAME]: + pytest.skip("FPQuantizer op is not available on this system", allow_module_level=True) + + +class TestQuantParam(DistributedTest): + world_size = 1 + + @pytest.mark.parametrize('dtype', [torch.half, torch.float]) + def test_unsupported_dtypes(self, dtype): + device = get_accelerator().current_device_name() + data = torch.rand(5, 5, device='cpu', dtype=dtype) + qp = QuantizedParameter(data) + with pytest.raises(AssertionError): + qp.to(device) + + def test_requires_grad(self): + data = torch.rand(5, 5, dtype=torch.bfloat16) + with pytest.raises(ValueError): + QuantizedParameter(data, requires_grad=True) + + def test_move_to_accelerator(self): + device = get_accelerator().current_device() + data = torch.rand(5, 5, device='cpu', dtype=torch.bfloat16) + qp = QuantizedParameter(data) + assert qp.device == torch.device('cpu') + qp = qp.to(get_accelerator().current_device_name()) + assert qp.device == torch.device(device) + assert qp.dtype == torch.int8 + + def test_hf_clone(self): + device = get_accelerator().current_device_name() + data = torch.rand(5, 5, device=device, dtype=torch.bfloat16) + + quantization_config = QuantizationConfig(q_bits=6) + qp = QuantizedParameter(data, quantization_config=quantization_config) + + # should be able to clone parameter via dict, HF expects this to work + qp_copy = QuantizedParameter(qp.data, **qp.__dict__) + + assert all(qp.data == qp_copy.data) + assert qp.quantization_config == qp_copy.quantization_config From ad2027952f9730cbd1a8385e4a441e470248645e Mon Sep 17 00:00:00 2001 From: 
Jhonso7393 <167781426+Jhonso7393@users.noreply.github.com> Date: Tue, 23 Apr 2024 23:45:47 +0300 Subject: [PATCH 2/5] Update README.md (#5453) Fixing a minor typo at the README file Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 201b9016f8ab..f9d81eddfdae 100755 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat). * [2024/03] [DeepSpeed-FP6:The power of FP6-Centric Serving for Large Language Models](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md)] -* [2024/01] [DeepSpeed-FastGen: Introducting Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19) +* [2024/01] [DeepSpeed-FastGen: Introducing Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19) * [2023/11] [Llama 2 Inference on 4th Gen Intel® Xeon® Scalable Processor with DeepSpeed](https://github.com/microsoft/DeepSpeed/tree/master/blogs/intel-inference) [[Intel version]](https://www.intel.com/content/www/us/en/developer/articles/technical/xllama-2-on-xeon-scalable-processor-with-deepspeed.html) * [2023/11] [DeepSpeed ZeRO-Offload++: 6x Higher Training Throughput via Collaborative CPU/GPU 
Twin-Flow](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-offloadpp) * [2023/11] [DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/japanese/README.md)] From 5f631abc2f930ecece38fae05dc9bd3923c555dd Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Tue, 23 Apr 2024 16:24:12 -0700 Subject: [PATCH 3/5] Update PyTest torch version to match PyTorch latest official (2.3.0) (#5454) --- .github/workflows/cpu-torch-latest.yml | 4 ++-- .github/workflows/nv-torch-latest-v100.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cpu-torch-latest.yml b/.github/workflows/cpu-torch-latest.yml index 9c1ad02f75a6..5727ff2e1cde 100644 --- a/.github/workflows/cpu-torch-latest.yml +++ b/.github/workflows/cpu-torch-latest.yml @@ -50,5 +50,5 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.2" - TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.2" + TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.3" + TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.3" diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index 3ca8ac43dfa4..2e0490c18ba7 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -29,7 +29,7 @@ jobs: - name: Install pytorch 
run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch==2.2.2 torchvision --index-url https://download.pytorch.org/whl/cu118 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" From fbdf0eaf15f45cd2a8b5846ad9428609a3860b41 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 23 Apr 2024 16:27:27 -0700 Subject: [PATCH 4/5] Update version.txt after 0.14.2 release (#5458) **Auto-generated PR to update version.txt after a DeepSpeed release** Released version - 0.14.2 Author - @loadams Co-authored-by: loadams --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index e867cc2a66a8..ac4a79626c87 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.14.2 +0.14.3 From fa8458b1a80d6ba55091b17f092de19bbf95eb3d Mon Sep 17 00:00:00 2001 From: vikram singh shekhawat Date: Wed, 24 Apr 2024 20:55:18 +0530 Subject: [PATCH 5/5] Add getter and setter methods for compile_backend across accelerators. (#5299) Add getter and setter methods for `compile_backend` across accelerators, which provide a mechanism to retrieve the compile backend. These APIs handle user-defined backend selection and raise a `ValueError` with informative error messages for unsupported backends. 
--------- Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- accelerator/abstract_accelerator.py | 9 +++++++++ accelerator/cpu_accelerator.py | 12 ++++++++++++ accelerator/cuda_accelerator.py | 12 ++++++++++++ accelerator/hpu_accelerator.py | 12 ++++++++++++ accelerator/mps_accelerator.py | 12 ++++++++++++ accelerator/npu_accelerator.py | 12 ++++++++++++ accelerator/xpu_accelerator.py | 12 ++++++++++++ tests/unit/runtime/compile/test_compile_wrapper.py | 4 +--- tests/unit/runtime/compile/test_compile_zero.py | 4 +--- tests/unit/runtime/compile/test_load_config.py | 4 +--- 10 files changed, 84 insertions(+), 9 deletions(-) diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py index 768d5ea34e5e..690f19292299 100644 --- a/accelerator/abstract_accelerator.py +++ b/accelerator/abstract_accelerator.py @@ -12,6 +12,7 @@ class DeepSpeedAccelerator(ABC): def __init__(self): self._name = None self._communication_backend_name = None + self._compile_backend = None @abc.abstractmethod def is_synchronized_device(self): @@ -295,3 +296,11 @@ def visible_devices_envs(self): @abc.abstractmethod def set_visible_devices_envs(self, current_env, local_accelerator_ids): ... + + @abc.abstractmethod + def get_compile_backend(self): + ... + + @abc.abstractmethod + def set_compile_backend(self, backend): + ... 
diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index 237e7f51dcb4..bd11d034f312 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -20,6 +20,7 @@ class CPU_Accelerator(DeepSpeedAccelerator): def __init__(self): self._name = 'cpu' + self._compile_backend = "inductor" if oneccl_imported_p: self._communication_backend_name = 'ccl' else: @@ -330,3 +331,14 @@ def visible_devices_envs(self): def set_visible_devices_envs(self, current_env, local_accelerator_ids): for env in self.visible_devices_envs(): current_env[env] = ",".join(map(str, local_accelerator_ids)) + + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py index 2fc0cfd94125..60d66b6cdbab 100644 --- a/accelerator/cuda_accelerator.py +++ b/accelerator/cuda_accelerator.py @@ -25,6 +25,7 @@ class CUDA_Accelerator(DeepSpeedAccelerator): def __init__(self): self._name = 'cuda' self._communication_backend_name = 'nccl' + self._compile_backend = "inductor" if pynvml is None: self._init_pynvml() @@ -367,3 +368,14 @@ def visible_devices_envs(self): def set_visible_devices_envs(self, current_env, local_accelerator_ids): for env in self.visible_devices_envs(): current_env[env] = ",".join(map(str, local_accelerator_ids)) + + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. 
Supported Backends are {supported_backends}") diff --git a/accelerator/hpu_accelerator.py b/accelerator/hpu_accelerator.py index 326efc8fa01b..114f367e879d 100644 --- a/accelerator/hpu_accelerator.py +++ b/accelerator/hpu_accelerator.py @@ -16,6 +16,7 @@ class HPU_Accelerator(DeepSpeedAccelerator): def __init__(self): self._name = 'hpu' self._communication_backend_name = 'hccl' + self._compile_backend = "hpu_backend" try: import habana_frameworks.torch.hpu as hpu hpu.setDeterministic(True) @@ -301,3 +302,14 @@ def visible_devices_envs(self): def set_visible_devices_envs(self, current_env, local_accelerator_ids): for env in self.visible_devices_envs(): current_env[env] = ",".join(map(str, local_accelerator_ids)) + + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. 
Supported Backends are {supported_backends}") diff --git a/accelerator/mps_accelerator.py b/accelerator/mps_accelerator.py index ff70b860d7c7..5fc9b1c8cfb6 100644 --- a/accelerator/mps_accelerator.py +++ b/accelerator/mps_accelerator.py @@ -20,6 +20,7 @@ class MPS_Accelerator(DeepSpeedAccelerator): def __init__(self): self._name = "mps" self._communication_backend_name = None + self._compile_backend = "inductor" def is_synchronized_device(self): return False @@ -267,3 +268,14 @@ def visible_devices_envs(self): def set_visible_devices_envs(self, current_env, local_accelerator_ids): for env in self.visible_devices_envs(): current_env[env] = ",".join(map(str, local_accelerator_ids)) + + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") diff --git a/accelerator/npu_accelerator.py b/accelerator/npu_accelerator.py index 5d891ecb707d..b0e0ff948e52 100644 --- a/accelerator/npu_accelerator.py +++ b/accelerator/npu_accelerator.py @@ -20,6 +20,7 @@ def __init__(self): super().__init__() self._name = 'npu' self._communication_backend_name = 'hccl' + self._compile_backend = "inductor" # dict that holds class name <--> class type mapping i.e. 
# 'AsyncIOBuilder': # this dict will be filled at init stage @@ -285,3 +286,14 @@ def visible_devices_envs(self): def set_visible_devices_envs(self, current_env, local_accelerator_ids): for env in self.visible_devices_envs(): current_env[env] = ",".join(map(str, local_accelerator_ids)) + + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends }") diff --git a/accelerator/xpu_accelerator.py b/accelerator/xpu_accelerator.py index c59f60077d2f..9c4a9c903f96 100644 --- a/accelerator/xpu_accelerator.py +++ b/accelerator/xpu_accelerator.py @@ -14,6 +14,7 @@ class XPU_Accelerator(DeepSpeedAccelerator): def __init__(self): self._name = 'xpu' self._communication_backend_name = 'ccl' + self._compile_backend = "inductor" self.aligned_tensors = [] def is_synchronized_device(self): @@ -296,3 +297,14 @@ def visible_devices_envs(self): def set_visible_devices_envs(self, current_env, local_accelerator_ids): for env in self.visible_devices_envs(): current_env[env] = ",".join(map(str, local_accelerator_ids)) + + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. 
Supported Backends are {supported_backends}") diff --git a/tests/unit/runtime/compile/test_compile_wrapper.py b/tests/unit/runtime/compile/test_compile_wrapper.py index d1830534f6ea..62af25ac3ba4 100644 --- a/tests/unit/runtime/compile/test_compile_wrapper.py +++ b/tests/unit/runtime/compile/test_compile_wrapper.py @@ -31,11 +31,9 @@ def base_config(): }, "compile": { "enabled": True, - "backend": "inductor" + "backend": get_accelerator().get_compile_backend() } } - if get_accelerator().device_name() == 'hpu': - config_dict['compile']['backend'] = 'hpu_backend' return config_dict diff --git a/tests/unit/runtime/compile/test_compile_zero.py b/tests/unit/runtime/compile/test_compile_zero.py index 7568c27e3ed2..9890ea708eec 100644 --- a/tests/unit/runtime/compile/test_compile_zero.py +++ b/tests/unit/runtime/compile/test_compile_zero.py @@ -51,12 +51,10 @@ def test_compile_zero(self, tmpdir, zero_stage, dtype, offload_device): }, "compile": { "enabled": True, - "backend": "inductor" + "backend": get_accelerator().get_compile_backend() } } - if get_accelerator().device_name() == 'hpu': - config_dict['compile']['backend'] = 'hpu_backend' if offload_device == OffloadDeviceEnum.cpu: config_dict["zero_optimization"]["offload_optimizer"] = {"device": offload_device} elif offload_device == OffloadDeviceEnum.nvme: diff --git a/tests/unit/runtime/compile/test_load_config.py b/tests/unit/runtime/compile/test_load_config.py index 601adae58884..cee8d3b23f6b 100644 --- a/tests/unit/runtime/compile/test_load_config.py +++ b/tests/unit/runtime/compile/test_load_config.py @@ -47,12 +47,10 @@ def base_config(): }, "compile": { "enabled": True, - "backend": "inductor" + "backend": get_accelerator().get_compile_backend() } } - if get_accelerator().device_name() == 'hpu': - config_dict['compile']['backend'] = 'hpu_backend' return config_dict