Merge branch 'master' into loadams/transformers-inference

microsoft · Dec 17, 2024 · cc34e72 · cc34e72
2 parents 36bd4c7 + 2f32966
commit cc34e72
Show file tree

Hide file tree

Showing 38 changed files with 952 additions and 72 deletions.
diff --git a/.github/workflows/cpu-torch-latest.yml b/.github/workflows/cpu-torch-latest.yml
@@ -42,7 +42,7 @@ jobs:
           git clone https://github.com/huggingface/transformers
           cd transformers
           # if needed switch to the last known good SHA until transformers@master is fixed
-          git checkout 6c3f168b3
+          # git checkout 6c3f168b3
           git rev-parse --short HEAD
           pip install .
 

diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml
@@ -52,7 +52,5 @@ jobs:
           pip install .[testing]
           # force protobuf version due to issues
           pip install "protobuf<4.21.0"
-          # force pytest-subtests version due to issues
-          pip install pytest-subtests==0.13.1
           pip list
           pytest $PYTEST_OPTS --color=yes --durations=0 --verbose tests/deepspeed
diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml
@@ -38,7 +38,7 @@ jobs:
           git clone https://github.com/huggingface/transformers
           cd transformers
           # if needed switch to the last known good SHA until transformers@master is fixed
-          git checkout 6c3f168b3
+          # git checkout 6c3f168b3
           git rev-parse --short HEAD
           pip install .
 

diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml
@@ -2,6 +2,9 @@ name: nv-torch-nightly-v100
 
 on:
   workflow_dispatch:
+    pull_request:
+    paths:
+      - '.github/workflows/nv-torch-nightly-v100.yml'
   schedule:
     - cron: "0 0 * * *"
 
@@ -34,7 +37,7 @@ jobs:
           git clone https://github.com/huggingface/transformers
           cd transformers
           # if needed switch to the last known good SHA until transformers@master is fixed
-          # git checkout 1cc453d33
+          # git checkout 6c3f168b3
           git rev-parse --short HEAD
           pip install .
 

diff --git a/.github/workflows/xpu-compile.yml b/.github/workflows/xpu-compile.yml
@@ -31,10 +31,10 @@ jobs:
       run: |
         apt-get update
         apt-get install clinfo libaio-dev python3-pip -y
-        pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torch/
-        pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/intel-extension-for-pytorch/
-        pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/oneccl-bind-pt/
-        pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torchvision/
+        pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torch/
+        pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/intel-extension-for-pytorch/
+        pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/oneccl-bind-pt/
+        pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torchvision/
         pip install https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v3.0.0b2/triton_xpu-3.0.0b2-cp310-cp310-linux_x86_64.whl
         pip install py-cpuinfo numpy
         pip install .[dev,autotuning]

diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml
@@ -47,10 +47,10 @@ jobs:
       run: |
         apt-get update
         apt-get install clinfo libaio-dev python3-pip -y
-        pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torch/
-        pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/intel-extension-for-pytorch/
-        pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/oneccl-bind-pt/
-        pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torchvision/
+        pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torch/
+        pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/intel-extension-for-pytorch/
+        pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/oneccl-bind-pt/
+        pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torchvision/
         pip install py-cpuinfo numpy
         pip install .[dev,autotuning]
 

diff --git a/COMMITTERS.md b/COMMITTERS.md
@@ -5,5 +5,7 @@
 | Olatunji Ruwase | [tjruwase](https://github.com/tjruwase)     | Microsoft |
 | Logan Adams     | [loadams](https://github.com/loadams)      | Microsoft |
 | Masahiro Tanaka | [tohtana](https://github.com/tohtana)      | Microsoft |
-| Jeff Rasley     | [jeffra](https://github.com/jeffra)       | SnowFlake |
-| Minjia Zhang    | [minjiazhang](https://github.com/minjiazhang)  | UIUC      |
+| Jeff Rasley     | [jeffra](https://github.com/jeffra)       | SnowFlake  |
+| Minjia Zhang    | [minjiazhang](https://github.com/minjiazhang)  | UIUC  |
+| Ashwin Aji      | [ashwinma](https://github.com/ashwinma)        | AMD   |
+| Sam Foreman     | [saforem2](https://github.com/saforem2)        | Argonne National Laboratory |
diff --git a/README.md b/README.md
@@ -27,7 +27,7 @@
 * [2023/11] [DeepSpeed ZeRO-Offload++: 6x Higher Training Throughput via Collaborative CPU/GPU Twin-Flow](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-offloadpp)
 * [2023/11] [DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/japanese/README.md)]
 * [2023/10] [DeepSpeed-VisualChat: Improve Your Chat Experience with Multi-Round Multi-Image Inputs](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md)]
-* [2023/09] Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies [[DeepSpeed4Science website](https://deepspeed4science.ai/)] [[Tutorials](https://www.deepspeed.ai/deepspeed4science/)] [[White paper](https://arxiv.org/abs/2310.04610)] [[Blog](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md)]
+* [2023/09] Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies [[Tutorials](https://www.deepspeed.ai/deepspeed4science/)] [[White paper](https://arxiv.org/abs/2310.04610)] [[Blog](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md)]
 
 
 <!-- NOTE: we must use html for news items otherwise links will be broken in the 'more news' section -->
@@ -78,7 +78,7 @@ To further increase the inference efficiency, DeepSpeed offers easy-to-use and f
 
 ## DeepSpeed4Science
 
-In line with Microsoft's mission to solve humanity's most pressing challenges, the DeepSpeed team at Microsoft is responding to this opportunity by launching a new initiative called *DeepSpeed4Science*, aiming to build unique capabilities through AI system technology innovations to help domain experts to unlock today's biggest science mysteries. Learn more: [DeepSpeed4Science website](https://deepspeed4science.ai/) and [tutorials](https://www.deepspeed.ai/deepspeed4science/)
+In line with Microsoft's mission to solve humanity's most pressing challenges, the DeepSpeed team at Microsoft is responding to this opportunity by launching a new initiative called *DeepSpeed4Science*, aiming to build unique capabilities through AI system technology innovations to help domain experts to unlock today's biggest science mysteries. Learn more: [tutorials](https://www.deepspeed.ai/deepspeed4science/)
 
 ---
 

diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py
@@ -125,10 +125,9 @@ def get_accelerator():
         if accelerator_name is None:
             try:
                 import intel_extension_for_pytorch as ipex
+
                 if ipex._C._has_xpu():
                     accelerator_name = "xpu"
-                else:
-                    accelerator_name = "cpu"
             except ImportError as e:
                 pass
         if accelerator_name is None:
@@ -162,23 +161,27 @@ def get_accelerator():
             except ImportError as e:
                 pass
         if accelerator_name is None:
-            # borrow this log from PR#5084
             try:
                 import torch
 
                 # Determine if we are on a GPU or x86 CPU with torch.
-                if torch.cuda.is_available():  #ignore-cuda
+                # "torch.cuda.is_available()" provides a stronger guarantee,     #ignore-cuda
+                # ensuring that we are free from CUDA initialization errors.
+                # While "torch.cuda.device_count() > 0" check ensures that       #ignore-cuda
+                # we won't try to do any CUDA calls when no device is available
+                # For reference: https://github.com/microsoft/DeepSpeed/pull/6810
+                if torch.cuda.device_count() > 0 and torch.cuda.is_available():  #ignore-cuda
                     accelerator_name = "cuda"
-                else:
-                    if accel_logger is not None:
-                        accel_logger.warn(
-                            "Setting accelerator to CPU. If you have GPU or other accelerator, we were unable to detect it."
-                        )
-                    accelerator_name = "cpu"
             except (RuntimeError, ImportError) as e:
                 # TODO need a more decent way to detect which accelerator to use, consider using nvidia-smi command for detection
-                accelerator_name = "cuda"
                 pass
+        if accelerator_name is None:
+            # borrow this log from PR#5084
+            if accel_logger is not None:
+                accel_logger.warn(
+                    "Setting accelerator to CPU. If you have GPU or other accelerator, we were unable to detect it.")
+            # cpu added as catch-all when accelerator detection fails
+            accelerator_name = "cpu"
 
         ds_set_method = "auto detect"
 

diff --git a/csrc/includes/quantization.h b/csrc/includes/quantization.h
@@ -52,6 +52,36 @@ void launch_swizzled_quant(int8_t* q_data,
                            int devices_per_node,
                            cudaStream_t stream);
 
+void launch_loco_swizzled_quant(int8_t* quantized_data,  // ds_loco_swizzle_quant (pt_binding.cpp) passes a freshly allocated int8 tensor
+                                float* quantized_scales,  // ds_loco_swizzle_quant passes a freshly allocated float tensor
+                                const __half* uncompressed_data,  // read-only fp16 input
+                                __half* error_feedback,  // non-const; presumably updated in place - TODO confirm against the kernel
+                                const float err_beta,
+                                int num_bits,
+                                quantize::Type quant_type,
+                                int groups,
+                                int elems_per_group,  // caller computes numel(input) / groups
+                                int pipelining,  // receives pipeline_size from ds_loco_swizzle_quant
+                                int nodes,
+                                int devices_per_node,
+                                cudaStream_t stream);  // caller passes the current CUDA stream
+
+void launch_loco_dequant_reduce(int8_t* reduced_data,  // loco_quantized_reduction (pt_binding.cpp) passes a freshly allocated int8 tensor
+                                float* reduced_scales,  // loco_quantized_reduction passes a freshly allocated float tensor
+                                const int8_t* input_data,  // read-only quantized input
+                                const float* input_scales,  // read-only scales for input_data
+                                int num_gpus,  // receives devices_per_node from loco_quantized_reduction
+                                int num_bits,
+                                quantize::Type quant_type,
+                                int out_groups,
+                                int elems_per_out_group,
+                                int elems_per_in_tensor,  // caller computes numel(input) / devices_per_node
+                                int groups_per_in_tensor,  // caller passes in_groups / devices_per_node
+                                int elems_per_in_group,
+                                __half2* error_feedback,  // non-const; presumably updated in place - TODO confirm against the kernel
+                                const float err_beta,
+                                cudaStream_t stream);  // caller passes the current CUDA stream
+
 void launch_dequant_reduce(int8_t* reduced_data,
                            float* reduced_scales,
                            const int8_t* input_data,

diff --git a/csrc/includes/quantization_utils.h b/csrc/includes/quantization_utils.h
@@ -24,6 +24,7 @@ constexpr int max_threads = 1024;
 Class to hold the quantization parameters for a given tensor.
 Holds the implementation of the quantization operation.
 */
+
 template <Type qType, int numBits>
 class Params {
 public:

diff --git a/csrc/quantization/pt_binding.cpp b/csrc/quantization/pt_binding.cpp
@@ -176,6 +176,53 @@ at::Tensor dequantize_int8_to_half_experimental(at::Tensor& data_in,
     return output;
 }
 
+std::vector<at::Tensor> ds_loco_swizzle_quant(at::Tensor& input_vals,  // Python-facing wrapper: allocates outputs, then calls launch_loco_swizzled_quant
+                                              at::Tensor& error_feedback,  // data is reinterpreted as __half below; dtype is not checked here
+                                              float err_beta,
+                                              int groups,
+                                              int num_bits,
+                                              quantize::Type quant_type,
+                                              int pipeline_size,
+                                              int nodes,
+                                              int devices_per_node)
+{
+    auto scales_options = at::TensorOptions()
+                              .dtype(at::kFloat)
+                              .layout(at::kStrided)
+                              .device(at::kCUDA)
+                              .requires_grad(false);
+    const int scales_elems = (quantize::requires_offset(quant_type)) ? 2 : 1;  // two floats per group when the quant type requires an offset, otherwise one
+    auto scales = torch::empty({groups, scales_elems}, scales_options);  // uninitialized; handed to the launcher as quantized_scales
+
+    auto output_options = at::TensorOptions()
+                              .dtype(at::kChar)
+                              .layout(at::kStrided)
+                              .device(at::kCUDA)
+                              .requires_grad(false);
+
+    const int quantization_scalar = 8 / num_bits;  // NOTE(review): num_bits == 0 divides by zero here; num_bits > 8 yields 0 and divides by zero on the next line
+    const int compressed_vals = at::numel(input_vals) / quantization_scalar;  // length of the flat int8 output tensor
+
+    auto output = torch::empty({compressed_vals}, output_options);  // 1-D and uninitialized; handed to the launcher as quantized_data
+    const int elems_per_group = at::numel(input_vals) / groups;  // integer division: a remainder is dropped; groups == 0 is not guarded
+
+    launch_loco_swizzled_quant(reinterpret_cast<int8_t*>(output.data_ptr()),
+                               reinterpret_cast<float*>(scales.data_ptr()),
+                               reinterpret_cast<const __half*>(input_vals.data_ptr()),  // assumes input_vals holds fp16 data - TODO confirm callers guarantee this
+                               reinterpret_cast<__half*>(error_feedback.data_ptr()),  // passed as mutable; presumably updated by the kernel - TODO confirm
+                               err_beta,
+                               num_bits,
+                               quant_type,
+                               groups,
+                               elems_per_group,
+                               pipeline_size,
+                               nodes,
+                               devices_per_node,
+                               at::cuda::getCurrentCUDAStream());  // launch on the current CUDA stream
+
+    return {output, scales};  // {int8 output tensor, float scales tensor}
+}
+
 std::vector<at::Tensor> ds_swizzle_quant(at::Tensor& input_vals,
                                          int groups,
                                          int num_bits,
@@ -265,6 +312,61 @@ std::vector<at::Tensor> quantized_reduction(at::Tensor& input_vals,
     return {output, scales};
 }
 
+std::vector<at::Tensor> loco_quantized_reduction(at::Tensor& input_vals,  // Python-facing wrapper: allocates outputs, then calls launch_loco_dequant_reduce
+                                                 at::Tensor& input_scales,  // data is reinterpreted as float below; dtype is not checked here
+                                                 at::Tensor& error_feedback,  // data is reinterpreted as __half2 below; dtype is not checked here
+                                                 float err_beta,
+                                                 int in_groups,
+                                                 int out_groups,
+                                                 int num_bits,
+                                                 quantize::Type quant_type,
+                                                 int devices_per_node)
+{
+    auto scales_options = at::TensorOptions()
+                              .dtype(at::kFloat)
+                              .layout(at::kStrided)
+                              .device(at::kCUDA)
+                              .requires_grad(false);
+
+    const int scales_elems = (quantize::requires_offset(quant_type)) ? 2 : 1;  // two floats per group when the quant type requires an offset, otherwise one
+
+    auto scales = torch::empty({out_groups, scales_elems}, scales_options);  // uninitialized; handed to the launcher as reduced_scales
+
+    auto output_options = at::TensorOptions()
+                              .dtype(at::kChar)
+                              .layout(at::kStrided)
+                              .device(at::kCUDA)
+                              .requires_grad(false);
+
+    std::vector<int64_t> sz(input_vals.sizes().begin(), input_vals.sizes().end());  // start from the input shape
+    sz[sz.size() - 1] = sz.back() / devices_per_node;  // shrink the last dim (integer division); NOTE(review): out of bounds if input_vals is 0-dim
+
+    const int elems_per_in_tensor = at::numel(input_vals) / devices_per_node;  // devices_per_node == 0 is not guarded
+
+    auto output = torch::empty(sz, output_options);  // uninitialized int8 tensor; handed to the launcher as reduced_data
+
+    const int elems_per_in_group = elems_per_in_tensor / (in_groups / devices_per_node);  // NOTE(review): divisor is 0 when in_groups < devices_per_node
+    const int elems_per_out_group = elems_per_in_tensor / out_groups;  // integer division; out_groups == 0 is not guarded
+
+    launch_loco_dequant_reduce((int8_t*)output.data_ptr(),
+                               (float*)scales.data_ptr(),
+                               (const int8_t*)input_vals.data_ptr(),
+                               (const float*)input_scales.data_ptr(),
+                               devices_per_node,  // fills the launcher's num_gpus parameter
+                               num_bits,
+                               quant_type,
+                               out_groups,
+                               elems_per_out_group,
+                               elems_per_in_tensor,
+                               in_groups / devices_per_node,  // fills the launcher's groups_per_in_tensor parameter
+                               elems_per_in_group,
+                               (__half2*)error_feedback.data_ptr(),  // passed as mutable; presumably updated by the kernel - TODO confirm
+                               err_beta,
+                               at::cuda::getCurrentCUDAStream());  // launch on the current CUDA stream
+
+    return {output, scales};  // {int8 reduced tensor, float scales tensor}
+}
+
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
     m.def("ds_quantize_fp32", &ds_quantize<float>, "DeepSpeed Quantize with fp32 (CUDA)");
@@ -295,4 +397,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
           "Dequantize int8 to half (experimental)");
     m.def("swizzle_quant", &ds_swizzle_quant);
     m.def("quantized_reduction", &quantized_reduction);
+    m.def("loco_swizzle_quant", &ds_loco_swizzle_quant, "LoCo Swizzled Quantization Kernel");
+    m.def("loco_quantized_reduction",
+          &loco_quantized_reduction,
+          "LoCo Quantization and Reduction Kernel");
 }