diff --git a/deepspeed/linear/optimized_linear.py b/deepspeed/linear/optimized_linear.py
index 138bd493ffc7..e982785a8122 100644
--- a/deepspeed/linear/optimized_linear.py
+++ b/deepspeed/linear/optimized_linear.py
@@ -85,7 +85,7 @@ def __init__(self,
         self.bias = bias
         self.lora_config = lora_config
         self.quantization_config = quantization_config
-        device = get_accelerator().current_device() if device is None else device
+        device = get_accelerator().current_device_name() if device is None else device
         assert self.lora_config is not None, "DSOptimizedLinear requires a LoRA config"
 
         self.zero_shards = self.lora_config.base_weight_sharding
diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py
index 88f7086518e8..4944e1954e37 100644
--- a/deepspeed/module_inject/auto_tp.py
+++ b/deepspeed/module_inject/auto_tp.py
@@ -13,7 +13,7 @@
 from deepspeed import comm as dist
 from .layers import LinearAllreduce, LinearLayer, LmHeadLinearAllreduce
 from deepspeed.accelerator import get_accelerator
-from .fusedqkv_utils import require_tp_fused_qkvw, prepare_tp_fused_qkvw
+from .fusedqkv_utils import require_tp_fused_qkvw, prepare_tp_fused_qkvw, shard_chunk_mlp
 from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list
 
 
@@ -133,7 +133,8 @@ def is_load_module(module):
         load_layers = [nn.Linear, nn.Embedding, nn.LayerNorm]
         load_layer_names = [
             "LPLayerNorm", "SharedEmbedding", "OPTLearnedPositionalEmbedding", "LlamaRMSNorm", "FalconLinear",
-            "MistralRMSNorm", "T5LayerNorm", "MixtralRMSNorm"
+            "MistralRMSNorm", "T5LayerNorm", "MixtralRMSNorm", "Phi3RotaryEmbedding", "Phi3SuScaledRotaryEmbedding",
+            "Phi3RMSNorm"
         ]
         return module.__class__ in load_layers or module._get_name() in load_layer_names
 
@@ -328,6 +329,10 @@ def _replace(self, child, name, conv_linear_layer):
         # For mixtral-7x8b, need to skip MoE gate linear replace.
         if name == "block_sparse_moe.gate":
             return child
+        # for phi3.
+        if 'gate_up_proj' in name:
+            weight, bias = shard_chunk_mlp(child.weight.data, child.bias, dist.get_rank(), dist.get_world_size())
+            return LinearLayer(weight=weight, bias=bias)
         if name in self.all_reduce_linears:
             # if conv_linear_layer [weight_shape[1], weight_shape[0] // mp_size]
             # else [weight_shape[0], weight_shape[1] // mp_size]
diff --git a/deepspeed/module_inject/fusedqkv_utils.py b/deepspeed/module_inject/fusedqkv_utils.py
index cf087c16da8a..33d36fbfae54 100644
--- a/deepspeed/module_inject/fusedqkv_utils.py
+++ b/deepspeed/module_inject/fusedqkv_utils.py
@@ -4,7 +4,7 @@
 # DeepSpeed Team
 import torch
 from deepspeed.utils.logging import warning_once
-from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list, get_num_kv_heads, get_n_embd
+from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list, get_num_kv_heads, get_n_embd, get_num_attention_heads
 
 
 def split_by_qkvlist_and_refuse(qkv_list, split_size, split_dim=0, cat_dim=0):
@@ -42,6 +42,7 @@ def prepare_tp_fused_qkvw(module, src, mp_size, gpu_index):
         "FalconDecoderLayer": 'bloomtype',
         "GPTBigCodeBlock": 'bigcodetype',
         "DecoderLayer": 'glmtype',
+        "Phi3DecoderLayer": "phi3type"
     }
 
     def _codegen_type_transpose(input, mp_size, codegen_mp_num=4):
@@ -93,6 +94,20 @@ def _bigcode_type_transpose(input, mp_size):
         split_q = q.split(get_shard_size_list(shape[0], mp_size), dim=0)
         return torch.cat((split_q[gpu_index], kv), dim=0)
 
+    def _phi3_type_transpose(input, mp_size):
+        num_kv_heads = get_num_kv_heads()
+        num_heads = get_num_attention_heads()
+        hidden_size = input.shape[1]
+        head_dim = hidden_size // num_heads
+        q_pos = input.shape[0] - 2 * num_kv_heads * head_dim
+        q = input[:q_pos]
+        k = input[q_pos:q_pos + num_kv_heads * head_dim]
+        v = input[q_pos + num_kv_heads * head_dim:]
+        split_q = q.split(get_shard_size_list(q.shape[0], mp_size), dim=0)
+        split_k = k.split(get_shard_size_list(k.shape[0], mp_size), dim=0)
+        split_v = v.split(get_shard_size_list(v.shape[0], mp_size), dim=0)
+        return torch.cat((split_q[gpu_index], split_k[gpu_index], split_v[gpu_index]), dim=0)
+
     def _transpose_fused_qkvw(src, mp_size, fused_qkv_type=None, module=None):
 
         # suppose num_heads=n, q(n)_w means the n-th q head linear weight, the weight format are as following
@@ -110,6 +125,8 @@ def _transpose_fused_qkvw(src, mp_size, fused_qkv_type=None, module=None):
             return _qwen_type_transpose(src, mp_size, module)
         elif fused_qkv_type == 'bigcodetype':
             return _bigcode_type_transpose(src, mp_size)
+        elif fused_qkv_type == 'phi3type':
+            return _phi3_type_transpose(src, mp_size)
 
         raise ValueError("unknown fused_qkv_type")
 
@@ -123,3 +140,24 @@ def _transpose_fused_qkvw(src, mp_size, fused_qkv_type=None, module=None):
     warning_once(f"Unrecognized fusedkqv weight type, default to using bloom type,"
                  f"please check in prepare_tp_fused_qkvw() to avoid potential calculation errors")
     return _bloom_type_transpose(src, mp_size)
+
+
+# For phi3 with chunk mlp, adjust the weight order.
+def shard_chunk_mlp(
+    weight,
+    bias,
+    rank,
+    world_size,
+):
+    weight_gate, weight_states = weight.chunk(2, dim=0)
+    total_size = weight_gate.shape[0]
+    split_weight_gate = weight_gate.split(get_shard_size_list(total_size, world_size, "mlp"), dim=0)
+    split_weight_states = weight_states.split(get_shard_size_list(total_size, world_size, "mlp"), dim=0)
+    shard_weight = torch.cat((split_weight_gate[rank], split_weight_states[rank]), dim=0)
+    if bias is not None:
+        bias_gate, bias_states = bias.chunk(2, dim=0)
+        split_bias_gate = bias_gate.split(get_shard_size_list(total_size, world_size, "mlp"), dim=0)
+        split_bias_states = bias_states.split(get_shard_size_list(total_size, world_size, "mlp"), dim=0)
+        return shard_weight, torch.cat((split_bias_gate[rank], split_bias_states[rank]), dim=0)
+
+    return shard_weight, None
diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py
index e1703562d180..3029a79698dc 100644
--- a/deepspeed/module_inject/replace_module.py
+++ b/deepspeed/module_inject/replace_module.py
@@ -16,7 +16,7 @@
 from .auto_tp import AutoTP, ReplaceWithTensorSlicing, Loading
 
 from deepspeed import comm as dist
-from deepspeed.module_inject.tp_shard import set_num_kv_heads, set_n_embd
+from deepspeed.module_inject.tp_shard import set_num_kv_heads, set_n_embd, set_num_attention_heads
 
 from .load_checkpoint import load_model_with_checkpoint
 import time
@@ -290,6 +290,10 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None):
         # 4.2 set n_embd
         set_n_embd(n_embd)
 
+        # 4.3 set attention_heads
+        if hasattr(model_config, 'num_attention_heads'):
+            set_num_attention_heads(getattr(model_config, 'num_attention_heads'))
+
         # 5. Set linear policies
         _autotp.update_linear_policies()
 
diff --git a/deepspeed/module_inject/tp_shard.py b/deepspeed/module_inject/tp_shard.py
index 79c19b5f1272..6758c7a657f6 100644
--- a/deepspeed/module_inject/tp_shard.py
+++ b/deepspeed/module_inject/tp_shard.py
@@ -12,6 +12,11 @@ def set_num_kv_heads(num):
     num_kv_heads = num
 
 
+def set_num_attention_heads(num):
+    global num_attention_heads
+    num_attention_heads = num
+
+
 def set_n_embd(num):
     global n_embd
     n_embd = num
@@ -22,6 +27,11 @@ def get_num_kv_heads():
     return num_kv_heads
 
 
+def get_num_attention_heads():
+    global num_attention_heads
+    return num_attention_heads
+
+
 def get_shard_size(total_size, mp_size, name=None, rank=None):
     global num_kv_heads
     last_linear = ["lm_head", "embed_out"]
diff --git a/deepspeed/runtime/compiler.py b/deepspeed/runtime/compiler.py
index b5e4e33425d0..66fe29fbbea2 100644
--- a/deepspeed/runtime/compiler.py
+++ b/deepspeed/runtime/compiler.py
@@ -83,84 +83,85 @@ def validate_enabled(cls, field_value, values):
         return field_value
 
 
-class CompiledModuleWrapper(torch.nn.Module):
-
-    def __init__(self, module, compile_config: Union[CompileConfig, None] = None):
-        super().__init__()
-
-        assert is_compile_supported(), "torch.compile is not supported on this version of PyTorch."
-
-        modules = self.__dict__.get('_modules')
-        modules['wrapped'] = module
-        self.__dict__['wrapped'] = module
-        self._is_compiled = False
-        self._backend = get_backend_fn(compile_config.backend)
-        self._compile_kwargs = compile_config.kwargs
-        self._compiler_fn = None
-
-    def __getattr__(self, name):
-        return getattr(self.__dict__['wrapped'], name)
-
-    def set_backend(self, backend: Union[str, Callable]):
-        """Set the backend for torch.compile.
-
-        Args:
-            backend (Union[str, Callable]): backend name or a function that takes a torch.nn.Module and returns a compiled module.
-            You can directly pass a function that works as a backend.
-            See also `backend` field in `CompileConfig` for more details.
-        """
-        self._backend = get_backend_fn(backend)
-
-    def set_torch_compile_kwargs(self, kwargs: Dict[str, Union[str, Any]]) -> None:
-        """Set kwargs for torch.compile. Kwargs that are set in DeepSpeed config will be overwritten.
-        You can also pass a backend name with "backend" key to change the backend.
-
-        Args:
-            kwargs (Dict[str, Union[str, Any]]): kwargs passed to torch.compile.
-        """
-
-        if "backend" in kwargs:
-            raise ValueError("backend cannot be set as compile kwargs. Use set_backend instead.")
-        self._compile_kwargs.update(kwargs)
-
-    def set_compiler_fn(self, compiler_fn: Callable) -> None:
-        """Set a function to be used for compiling the module.
-        This function should take a torch.nn.Module as input and return a compiled module.
-        Note that other compile options are ignored when a compiler_fn is set.
-
-        Example:
-        ```python
-            def my_compiler_fn(module: torch.nn.Module):
-                ...
-                return torch.compile(module, ...)
-
-            engine.set_compiler_fn(my_compiler_fn)
-        ```
-        """
-        self._compiler_fn = compiler_fn
-
-    def forward(self, *args, **kwargs) -> Any:
-        if not self.is_compiled:
-            if self._compiler_fn is None:
-                self.__dict__['wrapped'] = torch.compile(self.wrapped, backend=self._backend, **self._compile_kwargs)
-            else:
-                self.__dict__['wrapped'] = self._compiler_fn(self.wrapped)
-            self._is_compiled = True
-
-        return self.__dict__['wrapped'](*args, **kwargs)
-
-    @property
-    def is_compiled(self) -> bool:
-        return self._is_compiled
-
-    @property
-    def backend(self) -> Union[str, Callable]:
-        return self._backend
-
-    @property
-    def torch_compile_kwargs(self) -> Dict[str, Any]:
-        return self._compile_kwargs
-
-    @property
-    def compiler_fn(self) -> Union[Callable, None]:
-        return self._compiler_fn
+def CompiledModuleWrapper(mod, compile_config: Union[CompileConfig, None] = None):
+
+    class wrapper(mod.__class__):
+
+        def __init__(self, module, compile_config: Union[CompileConfig, None] = None):
+            self.__dict__ = module.__dict__.copy()
+
+            assert is_compile_supported(), "torch.compile is not supported on this version of PyTorch."
+
+            self.__dict__['wrapped'] = module
+            self._is_compiled = False
+            self._backend = get_backend_fn(compile_config.backend)
+            self._compile_kwargs = compile_config.kwargs
+            self._compiler_fn = None
+
+        def set_backend(self, backend: Union[str, Callable]):
+            """Set the backend for torch.compile.
+
+            Args:
+                backend (Union[str, Callable]): backend name or a function that takes a torch.nn.Module and returns a compiled module.
+                You can directly pass a function that works as a backend.
+                See also `backend` field in `CompileConfig` for more details.
+            """
+            self._backend = get_backend_fn(backend)
+
+        def set_torch_compile_kwargs(self, kwargs: Dict[str, Union[str, Any]]) -> None:
+            """Set kwargs for torch.compile. Kwargs that are set in DeepSpeed config will be overwritten.
+            You can also pass a backend name with "backend" key to change the backend.
+
+            Args:
+                kwargs (Dict[str, Union[str, Any]]): kwargs passed to torch.compile.
+            """
+
+            if "backend" in kwargs:
+                raise ValueError("backend cannot be set as compile kwargs. Use set_backend instead.")
+            self._compile_kwargs.update(kwargs)
+
+        def set_compiler_fn(self, compiler_fn: Callable) -> None:
+            """Set a function to be used for compiling the module.
+            This function should take a torch.nn.Module as input and return a compiled module.
+            Note that other compile options are ignored when a compiler_fn is set.
+
+            Example:
+            ```python
+                def my_compiler_fn(module: torch.nn.Module):
+                    ...
+                    return torch.compile(module, ...)
+
+                engine.set_compiler_fn(my_compiler_fn)
+            ```
+            """
+            self._compiler_fn = compiler_fn
+
+        def forward(self, *args, **kwargs) -> Any:
+            if not self.is_compiled:
+                if self._compiler_fn is None:
+                    self.__dict__['wrapped'] = torch.compile(self.wrapped,
+                                                             backend=self._backend,
+                                                             **self._compile_kwargs)
+                else:
+                    self.__dict__['wrapped'] = self._compiler_fn(self.wrapped)
+                self._is_compiled = True
+
+            return self.__dict__['wrapped'](*args, **kwargs)
+
+        @property
+        def is_compiled(self) -> bool:
+            return self._is_compiled
+
+        @property
+        def backend(self) -> Union[str, Callable]:
+            return self._backend
+
+        @property
+        def torch_compile_kwargs(self) -> Dict[str, Any]:
+            return self._compile_kwargs
+
+        @property
+        def compiler_fn(self) -> Union[Callable, None]:
+            return self._compiler_fn
+
+    return wrapper(mod, compile_config)
diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index 9a2b943b0992..34263444c1b7 100644
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -469,13 +469,6 @@ def __getattr__(self, name):
             return getattr(self, name)
         elif name in dir(_module):
             return getattr(_module, name)
-        elif isinstance(_module, CompiledModuleWrapper):
-            try:
-                return getattr(_module, name)
-            except AttributeError:
-                raise AttributeError(
-                    f"None of {type(self).__name__}, CompiledModuleWrapper, or the wrapped model has the attribute '{name}'"
-                )
         else:
             raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
 
diff --git a/deepspeed/runtime/fp16/fused_optimizer.py b/deepspeed/runtime/fp16/fused_optimizer.py
index bf1693307ea7..49093bb73c8f 100755
--- a/deepspeed/runtime/fp16/fused_optimizer.py
+++ b/deepspeed/runtime/fp16/fused_optimizer.py
@@ -241,7 +241,7 @@ def _get_norm_mask_idx(self, group):
                     group_mask_idx_list.append([grad_flat_st_idx, grad_flat_en_idx])
             grad_flat_st_idx = grad_flat_en_idx
 
-        return torch.tensor(group_mask_idx_list, device=get_accelerator().current_device())
+        return torch.tensor(group_mask_idx_list, device=get_accelerator().current_device_name())
 
     def step(self, closure=None):
         """
diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py
index 1dda7f1aad32..be8fe1a368c6 100644
--- a/deepspeed/runtime/pipe/engine.py
+++ b/deepspeed/runtime/pipe/engine.py
@@ -67,9 +67,7 @@ class PipelineEngine(DeepSpeedEngine):
 
     def __init__(self, has_bool_tensors=False, *super_args, **super_kwargs):
         super().__init__(*super_args, **super_kwargs)
-        assert isinstance(self.module, PipelineModule) \
-            or (hasattr(self.module, 'wrapped') and isinstance(self.module.wrapped, PipelineModule)), \
-            "model must base PipelineModule"
+        assert isinstance(self.module, PipelineModule), "model must base PipelineModule"
 
         assert self.zero_optimization_stage(
         ) < ZeroStageEnum.gradients, "ZeRO-2 and ZeRO-3 are incompatible with pipeline parallelism"
diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py
index 7744b2ee8b98..2c01c3475a70 100755
--- a/deepspeed/runtime/utils.py
+++ b/deepspeed/runtime/utils.py
@@ -171,7 +171,7 @@ def get_norm_with_moe_layers_fast(all_groups_norm, group):
     # This implementation standardizes the grad_norm across ranks. A more precise implementation can be found in 'get_norm_with_moe_layers'.
     # Need to allreduce (avg) the norms across different ranks because moe params will not be synced during allreduce
     scaled_norm = all_groups_norm * 1.0 / float(dist.get_world_size(group=group))
-    scaled_norm_tensor = torch.tensor(scaled_norm, device=get_accelerator().current_device(), dtype=torch.float)
+    scaled_norm_tensor = torch.tensor(scaled_norm, device=get_accelerator().current_device_name(), dtype=torch.float)
     dist.all_reduce(scaled_norm_tensor, group=group)
     all_groups_norm = scaled_norm_tensor.item()
     #print(f"old = {all_groups_norm_old} and new = {all_groups_norm} at rank: {deepspeed.comm.get_rank()}")
@@ -424,9 +424,11 @@ def get_flattened_grad_norm(parameters, norm_type=2, mpu=None, grad_norm_mask=No
                 # # mask_tensor_ = torch.zeros_like(p, device=p.device, dtype=bool)
                 # # for mask_idx in grad_norm_mask[idx]:
                 # #   mask_tensor_[mask_idx[0]:mask_idx[1]] = True
-                cum_sum_pairs = torch.tensor([1, -1], device=get_accelerator().current_device(),
+                cum_sum_pairs = torch.tensor([1, -1], device=get_accelerator().current_device_name(),
                                              dtype=p.dtype).repeat(grad_norm_mask[idx].shape[0], 1)
-                mask_tensor = torch.zeros(p.shape[0] + 1, device=get_accelerator().current_device(), dtype=p.dtype)
+                mask_tensor = torch.zeros(p.shape[0] + 1,
+                                          device=get_accelerator().current_device_name(),
+                                          dtype=p.dtype)
                 mask_tensor = mask_tensor.scatter_(0, grad_norm_mask[idx].view(-1),
                                                    cum_sum_pairs.view(-1)).cumsum(0).bool()[:-1]
 
diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index c6ff216edfcb..13ca29c9fceb 100644
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -1409,7 +1409,7 @@ def complete_grad_norm_calculation_for_cpu_offload(self, params):
         norm_is_nan = total_norm.isnan()
         inf_or_nan = norm_is_nan.logical_or(norm_is_inf)
 
-        err = torch.tensor(-1.0, device=self.device, dtype=torch.float)
+        err = torch.tensor(-1.0, device=inf_or_nan.device, dtype=torch.float)
         total_norm = inf_or_nan * err + inf_or_nan.logical_not() * total_norm
 
         return total_norm
diff --git a/tests/unit/moe/test_moe.py b/tests/unit/moe/test_moe.py
index d39f9fe3d651..fdff9430a4e6 100644
--- a/tests/unit/moe/test_moe.py
+++ b/tests/unit/moe/test_moe.py
@@ -177,7 +177,7 @@ class TestTopk(DistributedTest):
     world_size = 2
 
     def test(self):
-        device = get_accelerator().current_device()
+        device = get_accelerator().current_device_name()
         if dist.get_rank() == 0:
             logits = torch.rand(2, 2, device=device)
         elif dist.get_rank() == 1: