diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml
index 94571eb101bb..cf8756fbd528 100644
--- a/.github/workflows/nv-ds-chat.yml
+++ b/.github/workflows/nv-ds-chat.yml
@@ -10,6 +10,10 @@ on:
         required: false
         default: 'master'
         type: string
+  pull_request:
+    paths:
+      - "deepspeed/runtime/zero/stage_1_and_2.py"
+      - "deepspeed/runtime/zero/stage3.py"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
diff --git a/build_win.bat b/build_win.bat
index ec8c8a362a78..af5c5103fa4b 100644
--- a/build_win.bat
+++ b/build_win.bat
@@ -1,6 +1,10 @@
 @echo off
 
 set DS_BUILD_AIO=0
+set DS_BUILD_CUTLASS_OPS=0
+set DS_BUILD_EVOFORMER_ATTN=0
+set DS_BUILD_FP_QUANTIZER=0
+set DS_BUILD_RAGGED_DEVICE_OPS=0
 set DS_BUILD_SPARSE_ATTN=0
 
 echo Administrative permissions required. Detecting permissions...
diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index 68cab13c4a93..c6ff216edfcb 100644
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -15,7 +15,7 @@
 from deepspeed.utils import logger
 from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler
 from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced, all_to_all_quant_reduce
-from deepspeed.runtime.utils import inf, is_model_parallel_parameter, get_only_unique_item
+from deepspeed.runtime.utils import inf, get_global_norm, is_model_parallel_parameter, get_only_unique_item
 from deepspeed.runtime.zero.partition_parameters import *
 from deepspeed.runtime.zero.config import ZeroStageEnum
 from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum
@@ -2027,7 +2027,7 @@ def step(self, closure=None):
             return
 
         norm_groups = self._get_norm_groups()
-        scaled_global_grad_norm = torch.norm(torch.stack(norm_groups))
+        scaled_global_grad_norm = get_global_norm(norm_list=norm_groups)
 
         # Stash unscaled gradient norm
         self._global_grad_norm = scaled_global_grad_norm / self.loss_scale
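
For context on the stage3.py hunk: the diff replaces `torch.norm(torch.stack(norm_groups))` with `get_global_norm(norm_list=norm_groups)`, imported from `deepspeed.runtime.utils`. The helper's body is not shown in this diff; the following is a minimal sketch of what it is assumed to compute, a plain L2 reduction over the per-group norms:

```python
# Sketch only -- assumed shape of deepspeed.runtime.utils.get_global_norm,
# which this diff imports but does not show. It folds a list of per-group
# gradient norms into one global L2 norm: sqrt(norm_0^2 + norm_1^2 + ...).
from math import sqrt

def get_global_norm(norm_list):
    total_norm = 0.0
    for norm in norm_list:
        total_norm += norm**2.0  # accumulate squared group norms
    return sqrt(total_norm)

# For finite inputs this agrees with torch.norm(torch.stack(norm_groups)):
assert abs(get_global_norm([3.0, 4.0]) - 5.0) < 1e-12
```

Using the shared helper keeps ZeRO stage 3 on the same norm-aggregation path used elsewhere in the runtime instead of re-deriving the global norm with a tensor stack, which fits the workflow change above that now triggers the nv-ds-chat job on edits to both `stage_1_and_2.py` and `stage3.py`.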