From 9e2fdf57c04bae65827b2b03ad2b696eb6e8dec7 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Tue, 25 Jun 2024 13:20:57 +0200
Subject: [PATCH] Removing IPEX_AVAIL. (#2115)

* Removing IPEX_AVAIL.

Chose to unify CPU and XPU under `ipex`. Most code is exactly similar
except for a very few spots.

The biggest number of spots is the kv-cache layout and the flash_xxx.py
files.
Since those files should be removed soon and factored away, we should
not need them.

* Forgot a few places.

* Unrelated change.

* Fixing HF_TOKEN.

* HF_TOKEN
---
 .github/workflows/build.yaml                      |  2 +-
 .github/workflows/client-tests.yaml               |  2 +-
 .github/workflows/integration_tests.yaml          |  2 +-
 .github/workflows/load_test.yaml                  |  2 +-
 .github/workflows/tests.yaml                      |  2 +-
 clients/python/text_generation/types.py           |  2 +-
 .../layers/attention/__init__.py                  |  6 ++---
 .../layers/attention/{xpu.py => ipex.py}          |  0
 .../layers/layernorm.py                           |  5 ++--
 .../text_generation_server/layers/rotary.py       |  6 ++---
 .../layers/tensor_parallel.py                     | 12 +++++-----
 .../custom_modeling/flash_dbrx_modeling.py        |  4 ++--
 .../custom_modeling/flash_mixtral_modeling.py     |  4 ++--
 .../models/flash_causal_lm.py                     |  9 +++-----
 .../models/flash_gpt2.py                          | 12 +++++-----
 .../models/flash_llama.py                         | 12 +++++-----
 .../models/flash_mistral.py                       | 12 +++++-----
 .../models/flash_neox.py                          | 12 +++++-----
 server/text_generation_server/models/flash_rw.py  | 12 +++++-----
 .../models/flash_santacoder.py                    | 12 +++++-----
 server/text_generation_server/utils/dist.py       |  4 ++--
 .../utils/import_utils.py                         | 23 +++++++++++--------
 22 files changed, 79 insertions(+), 78 deletions(-)
 rename server/text_generation_server/layers/attention/{xpu.py => ipex.py} (100%)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 90fb9d45f21..0eb198f4959 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -178,6 +178,6 @@ jobs:
         export DOCKER_VOLUME=/mnt/cache
         export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
         export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
-        export HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        export HF_TOKEN=${{ secrets.HF_TOKEN }}
         echo $DOCKER_IMAGE
         pytest -s -vv integration-tests
diff --git a/.github/workflows/client-tests.yaml b/.github/workflows/client-tests.yaml
index ef7c217cdc0..ff2928c4f42 100644
--- a/.github/workflows/client-tests.yaml
+++ b/.github/workflows/client-tests.yaml
@@ -22,5 +22,5 @@ jobs:
       - name: Run tests
         run: |
           pip install pytest pytest-asyncio
-          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          export HF_TOKEN=${{ secrets.HF_TOKEN }}
           make python-client-tests
diff --git a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml
index 4e111afe90c..59a8d304419 100644
--- a/.github/workflows/integration_tests.yaml
+++ b/.github/workflows/integration_tests.yaml
@@ -37,5 +37,5 @@ jobs:
         export DOCKER_VOLUME=/mnt/cache
         export DOCKER_IMAGE=${{ inputs.docker_image }}
         export DOCKER_DEVICES=${{ inputs.docker_devices }}
-        export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        export HF_TOKEN=${{ secrets.HF_TOKEN }}
         pytest -s -vv integration-tests
diff --git a/.github/workflows/load_test.yaml b/.github/workflows/load_test.yaml
index a10c942847c..637df4727e1 100644
--- a/.github/workflows/load_test.yaml
+++ b/.github/workflows/load_test.yaml
@@ -28,7 +28,7 @@ jobs:
 
     - name: Start starcoder
       run: |
-        docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v /mnt/cache:/data -e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
+        docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v /mnt/cache:/data -e HF_TOKEN=${{ secrets.HF_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
         sleep 10
         wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health
 
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index e21344d1041..f983b6ed85a 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -72,7 +72,7 @@ jobs:
       - name: Run server tests
         run: |
           pip install pytest
-          export HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          export HF_TOKEN=${{ secrets.HF_TOKEN }}
           pytest -s -vv server/tests
       - name: Pre-commit checks
         run: |
diff --git a/clients/python/text_generation/types.py b/clients/python/text_generation/types.py
index 497468d9f8c..a56edaca759 100644
--- a/clients/python/text_generation/types.py
+++ b/clients/python/text_generation/types.py
@@ -455,6 +455,6 @@ class DeployedModel(BaseModel):
     # Disable warning for use of `model_` prefix in `model_id`. Be mindful about adding members
     # with model_ prefixes, since this disables guardrails for colliding fields:
     # https://github.com/pydantic/pydantic/issues/9177
-    model_config = ConfigDict(protected_namespaces=())
+    model_config = ConfigDict(protected_namespaces=())
     model_id: str
     sha: str
diff --git a/server/text_generation_server/layers/attention/__init__.py b/server/text_generation_server/layers/attention/__init__.py
index a53c8e3b791..e74180e7a86 100644
--- a/server/text_generation_server/layers/attention/__init__.py
+++ b/server/text_generation_server/layers/attention/__init__.py
@@ -1,4 +1,4 @@
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 import os
 
 if os.getenv("USE_FLASH_ATTENTION", "").lower() == "false":
@@ -7,7 +7,7 @@
     from .cuda import attention, paged_attention, reshape_and_cache, SUPPORTS_WINDOWING
 elif SYSTEM == "rocm":
     from .rocm import attention, paged_attention, reshape_and_cache, SUPPORTS_WINDOWING
-elif IPEX_AVAIL:
-    from .xpu import attention, paged_attention, reshape_and_cache, SUPPORTS_WINDOWING
+elif SYSTEM == "ipex":
+    from .ipex import attention, paged_attention, reshape_and_cache, SUPPORTS_WINDOWING
 else:
     raise ImportError(f"System {SYSTEM} doesn't support flash/paged attention")
diff --git a/server/text_generation_server/layers/attention/xpu.py b/server/text_generation_server/layers/attention/ipex.py
similarity index 100%
rename from server/text_generation_server/layers/attention/xpu.py
rename to server/text_generation_server/layers/attention/ipex.py
diff --git a/server/text_generation_server/layers/layernorm.py b/server/text_generation_server/layers/layernorm.py
index 93e83dfa6f4..ce5289f9337 100644
--- a/server/text_generation_server/layers/layernorm.py
+++ b/server/text_generation_server/layers/layernorm.py
@@ -3,7 +3,6 @@
 from accelerate import init_empty_weights
 from text_generation_server.utils.import_utils import (
     SYSTEM,
-    IPEX_AVAIL,
 )
 
 
@@ -83,7 +82,7 @@ def forward(self, hidden_states, residual=None):
 
             return super().forward(hidden_states), residual
 
-elif IPEX_AVAIL:
+elif SYSTEM == "ipex":
     import intel_extension_for_pytorch as ipex
 
     class FastLayerNorm(nn.LayerNorm):
@@ -112,7 +111,7 @@ def load(cls, prefix, weights, eps=1e-6):
         return cls(weight, eps)
 
     def forward(self, hidden_states, residual=None):
-        if IPEX_AVAIL:
+        if SYSTEM == "ipex":
             out = ipex.llm.functional.add_rms_norm(
                 residual,
                 hidden_states,
diff --git a/server/text_generation_server/layers/rotary.py b/server/text_generation_server/layers/rotary.py
index 1892cf69eec..b14005e6751 100644
--- a/server/text_generation_server/layers/rotary.py
+++ b/server/text_generation_server/layers/rotary.py
@@ -2,14 +2,14 @@
 import torch
 from torch import nn
 
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
 if SYSTEM == "cuda":
     from flash_attn.layers.rotary import RotaryEmbedding
     import rotary_emb
 elif SYSTEM == "rocm":
     from vllm._C import ops
-elif IPEX_AVAIL:
+elif SYSTEM == "ipex":
     import intel_extension_for_pytorch as ipex
 
 
@@ -69,7 +69,7 @@ def forward(
 
             # Inplace operation, updating query and key.
             ops.rotary_embedding(query, key, head_size, cos, sin, True)
-        elif IPEX_AVAIL:
+        elif SYSTEM == "ipex":
             ipex.llm.functional.rotary_embedding(
                 query, key, sin, cos, query.size(-1), True
             )
diff --git a/server/text_generation_server/layers/tensor_parallel.py b/server/text_generation_server/layers/tensor_parallel.py
index 510dc2c6f56..038de25815a 100644
--- a/server/text_generation_server/layers/tensor_parallel.py
+++ b/server/text_generation_server/layers/tensor_parallel.py
@@ -3,9 +3,9 @@
 from typing import Iterable, List
 from text_generation_server.layers.linear import get_linear, FastLinear
 from text_generation_server.layers.exl2 import Exl2Weight
-from text_generation_server.utils.import_utils import IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
-if IPEX_AVAIL:
+if SYSTEM == "ipex":
     import intel_extension_for_pytorch as ipex
 
 
@@ -100,7 +100,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
                 local_out = gather_input.T
 
             torch.mm(input, self.linear.weight.T, out=local_out)
-            if IPEX_AVAIL:
+            if SYSTEM == "ipex":
                 ipex.distributed.all_gather_into_tensor(
                     world_out, gather_input, group=self.process_group
                 )
@@ -117,7 +117,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
         world_output = [
             torch.empty_like(output) for _ in range(self.process_group.size())
         ]
-        if IPEX_AVAIL:
+        if SYSTEM == "ipex":
             ipex.distributed.all_gather(world_output, output, group=self.process_group)
         else:
             torch.distributed.all_gather(world_output, output, group=self.process_group)
@@ -217,7 +217,7 @@ def load(cls, config, prefix: str, weights, bias: bool):
     def forward(self, input: torch.Tensor, reduce: bool = True) -> torch.Tensor:
         out = super().forward(input)
         if self.process_group.size() > 1 and reduce:
-            if IPEX_AVAIL:
+            if SYSTEM == "ipex":
                 ipex.distributed.all_reduce(out, group=self.process_group)
             else:
                 torch.distributed.all_reduce(out, group=self.process_group)
@@ -257,7 +257,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
         )
         out = torch.nn.functional.embedding(input, self.weight)
         if self.reduce and self.process_group.size() > 1:
-            if IPEX_AVAIL:
+            if SYSTEM == "ipex":
                 ipex.distributed.all_reduce(out, group=self.process_group)
             else:
                 torch.distributed.all_reduce(out, group=self.process_group)
diff --git a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
index 56292250b88..f81bfa101db 100644
--- a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
@@ -20,9 +20,9 @@
 from transformers.activations import ACT2FN
 from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple, Any
-from text_generation_server.utils.import_utils import IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
-if not IPEX_AVAIL:
+if SYSTEM != "ipex":
     from vllm.model_executor.layers.fused_moe import fused_moe
 
 from text_generation_server.layers.attention import (
diff --git a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
index 6ea954110ef..2f7619afe71 100644
--- a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
@@ -24,9 +24,9 @@
 import numpy as np
 
 from torch import nn
-from text_generation_server.utils.import_utils import IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
-if not IPEX_AVAIL:
+if SYSTEM != "ipex":
     from vllm.model_executor.layers.fused_moe import fused_moe
 from transformers.activations import ACT2FN
 from transformers.configuration_utils import PretrainedConfig
diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index 633b066b86a..aa43107f161 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -15,7 +15,7 @@
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 
 from text_generation_server.utils.chunks import concat_text_chunks
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 from text_generation_server.models import Model
 from text_generation_server.utils.tokens import batch_top_tokens
 from text_generation_server.utils.dist import RANK
@@ -768,12 +768,9 @@ def init_kv_cache(
         empty_cache()
 
         element_size = torch.tensor([], dtype=dtype).element_size()
-        if SYSTEM == "xpu":
-            x = 1
-        else:
-            x = BLOCK_SIZE // element_size
+        x = BLOCK_SIZE // element_size
 
-        if IPEX_AVAIL and SYSTEM == "cpu":
+        if SYSTEM == "ipex" and device == torch.device("cpu"):
             self.kv_cache = [
                 (
                     torch.empty(
diff --git a/server/text_generation_server/models/flash_gpt2.py b/server/text_generation_server/models/flash_gpt2.py
index 43f374e5ff5..75c7203ac02 100644
--- a/server/text_generation_server/models/flash_gpt2.py
+++ b/server/text_generation_server/models/flash_gpt2.py
@@ -15,7 +15,7 @@
     weight_files,
     Weights,
 )
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
 tracer = trace.get_tracer(__name__)
 
@@ -34,12 +34,12 @@ def __init__(
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
-        elif SYSTEM == "xpu":
-            device = torch.device(f"xpu:{rank}")
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+            else:
+                device = torch.device("cpu")
             dtype = torch.float16 if dtype is None else dtype
-        elif IPEX_AVAIL:
-            device = torch.device("cpu")
-            dtype = torch.bfloat16 if dtype is None else dtype
         else:
             raise NotImplementedError("FlashGPT2 is only available on GPU")
 
diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py
index e023c4e0617..76c522e392b 100644
--- a/server/text_generation_server/models/flash_llama.py
+++ b/server/text_generation_server/models/flash_llama.py
@@ -17,7 +17,7 @@
 
 tracer = trace.get_tracer(__name__)
 
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
 
 class FlashLlama(FlashCausalLM):
@@ -34,12 +34,12 @@ def __init__(
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
-        elif SYSTEM == "xpu":
-            device = torch.device(f"xpu:{rank}")
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+            else:
+                device = torch.device("cpu")
             dtype = torch.float16 if dtype is None else dtype
-        elif IPEX_AVAIL:
-            device = torch.device("cpu")
-            dtype = torch.bfloat16 if dtype is None else dtype
         else:
             raise NotImplementedError("FlashLlama is only available on GPU")
 
diff --git a/server/text_generation_server/models/flash_mistral.py b/server/text_generation_server/models/flash_mistral.py
index 0c57048776b..78a09cf5780 100644
--- a/server/text_generation_server/models/flash_mistral.py
+++ b/server/text_generation_server/models/flash_mistral.py
@@ -16,7 +16,7 @@
     weight_files,
     Weights,
 )
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
 tracer = trace.get_tracer(__name__)
 
@@ -38,12 +38,12 @@ def __init__(
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
-        elif SYSTEM == "xpu":
-            device = torch.device(f"xpu:{rank}")
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+            else:
+                device = torch.device("cpu")
             dtype = torch.float16 if dtype is None else dtype
-        elif IPEX_AVAIL:
-            device = torch.device("cpu")
-            dtype = torch.bfloat16 if dtype is None else dtype
         else:
             raise NotImplementedError("FlashMistral is only available on GPU")
 
diff --git a/server/text_generation_server/models/flash_neox.py b/server/text_generation_server/models/flash_neox.py
index 69d47e57c26..9c82bf523e8 100644
--- a/server/text_generation_server/models/flash_neox.py
+++ b/server/text_generation_server/models/flash_neox.py
@@ -14,7 +14,7 @@
     weight_files,
     Weights,
 )
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
 tracer = trace.get_tracer(__name__)
 
@@ -33,12 +33,12 @@ def __init__(
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
-        elif SYSTEM == "xpu":
-            device = torch.device(f"xpu:{rank}")
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+            else:
+                device = torch.device("cpu")
             dtype = torch.float16 if dtype is None else dtype
-        elif IPEX_AVAIL:
-            device = torch.device("cpu")
-            dtype = torch.bfloat16 if dtype is None else dtype
         else:
             raise NotImplementedError("FlashNeoX is only available on GPU")
 
diff --git a/server/text_generation_server/models/flash_rw.py b/server/text_generation_server/models/flash_rw.py
index e087dcf1062..e8087f230a0 100644
--- a/server/text_generation_server/models/flash_rw.py
+++ b/server/text_generation_server/models/flash_rw.py
@@ -15,7 +15,7 @@
     weight_files,
     Weights,
 )
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
 tracer = trace.get_tracer(__name__)
 
@@ -34,12 +34,12 @@ def __init__(
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
-        elif SYSTEM == "xpu":
-            device = torch.device(f"xpu:{rank}")
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+            else:
+                device = torch.device("cpu")
             dtype = torch.float16 if dtype is None else dtype
-        elif IPEX_AVAIL:
-            device = torch.device("cpu")
-            dtype = torch.bfloat16 if dtype is None else dtype
         else:
             raise NotImplementedError("FlashRW is only available on GPU")
 
diff --git a/server/text_generation_server/models/flash_santacoder.py b/server/text_generation_server/models/flash_santacoder.py
index 9626af601b6..83a6b92c97e 100644
--- a/server/text_generation_server/models/flash_santacoder.py
+++ b/server/text_generation_server/models/flash_santacoder.py
@@ -18,7 +18,7 @@
     Weights,
 )
 
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
 tracer = trace.get_tracer(__name__)
 
@@ -37,12 +37,12 @@ def __init__(
         if torch.cuda.is_available():
            device = torch.device(f"cuda:{rank}")
            dtype = torch.float16 if dtype is None else dtype
-        elif SYSTEM == "xpu":
-            device = torch.device(f"xpu:{rank}")
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+            else:
+                device = torch.device("cpu")
             dtype = torch.float16 if dtype is None else dtype
-        elif IPEX_AVAIL:
-            device = torch.device("cpu")
-            dtype = torch.bfloat16 if dtype is None else dtype
         else:
             raise NotImplementedError("FlashSantacoderSharded is only available on GPU")
 
diff --git a/server/text_generation_server/utils/dist.py b/server/text_generation_server/utils/dist.py
index 7d38756380f..36d63e86d2c 100644
--- a/server/text_generation_server/utils/dist.py
+++ b/server/text_generation_server/utils/dist.py
@@ -3,7 +3,7 @@
 from datetime import timedelta
 from loguru import logger
 
-from text_generation_server.utils.import_utils import IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
 # Tensor Parallelism settings
 RANK = int(os.getenv("RANK", "0"))
@@ -69,7 +69,7 @@ def initialize_torch_distributed():
 
     if not torch.distributed.is_initialized():
         # Call the init process.
-        if IPEX_AVAIL:
+        if SYSTEM == "ipex":
             import intel_extension_for_pytorch as ipex
 
             ipex.distributed.init_process_group(
diff --git a/server/text_generation_server/utils/import_utils.py b/server/text_generation_server/utils/import_utils.py
index a244417a46d..6d9217215af 100644
--- a/server/text_generation_server/utils/import_utils.py
+++ b/server/text_generation_server/utils/import_utils.py
@@ -37,7 +37,10 @@ def get_cpu_free_memory(device, memory_fraction):
     return free_memory
 
 
-IPEX_AVAIL = is_ipex_available()
+def noop(*args, **kwargs):
+    pass
+
+
 SYSTEM = None
 if torch.version.hip is not None:
     SYSTEM = "rocm"
@@ -49,17 +52,19 @@ def get_cpu_free_memory(device, memory_fraction):
     empty_cache = torch.cuda.empty_cache
     synchronize = torch.cuda.synchronize
     get_free_memory = get_cuda_free_memory
-elif IPEX_AVAIL and hasattr(torch, "xpu") and torch.xpu.is_available():
-    SYSTEM = "xpu"
-    empty_cache = torch.xpu.empty_cache
-    synchronize = torch.xpu.synchronize
-    get_free_memory = get_xpu_free_memory
+elif is_ipex_available():
+    SYSTEM = "ipex"
+    if hasattr(torch, "xpu") and torch.xpu.is_available():
+        empty_cache = torch.xpu.empty_cache
+        synchronize = torch.xpu.synchronize
+        get_free_memory = get_xpu_free_memory
+    else:
+        empty_cache = noop
+        synchronize = noop
+        get_free_memory = get_cpu_free_memory
 else:
     SYSTEM = "cpu"
 
-    def noop(*args, **kwargs):
-        pass
-
     empty_cache = noop
     synchronize = noop
     get_free_memory = get_cpu_free_memory