From 9e2fdf57c04bae65827b2b03ad2b696eb6e8dec7 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Tue, 25 Jun 2024 13:20:57 +0200
Subject: [PATCH] Removing IPEX_AVAIL. (#2115)

* Removing IPEX_AVAIL.

Chose to unify CPU and XPU under `ipex`. Most code is exactly similar
except for a very few spots.

The biggest number of spots is the kv-cache layout and the flash_xxx.py
files.
Since those files should be removed soon and factored away, we should
not need them.

* Forgot a few places.

* Unrelated change.

* Fixing HF_TOKEN.

* HF_TOKEN
---
 .github/workflows/build.yaml                      |  2 +-
 .github/workflows/client-tests.yaml               |  2 +-
 .github/workflows/integration_tests.yaml          |  2 +-
 .github/workflows/load_test.yaml                  |  2 +-
 .github/workflows/tests.yaml                      |  2 +-
 clients/python/text_generation/types.py           |  2 +-
 .../layers/attention/__init__.py                  |  6 ++---
 .../layers/attention/{xpu.py => ipex.py}          |  0
 .../layers/layernorm.py                           |  5 ++--
 .../text_generation_server/layers/rotary.py       |  6 ++---
 .../layers/tensor_parallel.py                     | 12 +++++-----
 .../custom_modeling/flash_dbrx_modeling.py        |  4 ++--
 .../custom_modeling/flash_mixtral_modeling.py     |  4 ++--
 .../models/flash_causal_lm.py                     |  9 +++-----
 .../models/flash_gpt2.py                          | 12 +++++-----
 .../models/flash_llama.py                         | 12 +++++-----
 .../models/flash_mistral.py                       | 12 +++++-----
 .../models/flash_neox.py                          | 12 +++++-----
 server/text_generation_server/models/flash_rw.py  | 12 +++++-----
 .../models/flash_santacoder.py                    | 12 +++++-----
 server/text_generation_server/utils/dist.py       |  4 ++--
 .../utils/import_utils.py                         | 23 +++++++++++--------
 22 files changed, 79 insertions(+), 78 deletions(-)
 rename server/text_generation_server/layers/attention/{xpu.py => ipex.py} (100%)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 90fb9d45f21..0eb198f4959 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -178,6 +178,6 @@ jobs:
         export DOCKER_VOLUME=/mnt/cache
         export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
         export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
-        export HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        export HF_TOKEN=${{ secrets.HF_TOKEN }}
         echo $DOCKER_IMAGE
         pytest -s -vv integration-tests
diff --git a/.github/workflows/client-tests.yaml b/.github/workflows/client-tests.yaml
index ef7c217cdc0..ff2928c4f42 100644
--- a/.github/workflows/client-tests.yaml
+++ b/.github/workflows/client-tests.yaml
@@ -22,5 +22,5 @@ jobs:
       - name: Run tests
         run: |
           pip install pytest pytest-asyncio
-          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          export HF_TOKEN=${{ secrets.HF_TOKEN }}
           make python-client-tests
diff --git a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml
index 4e111afe90c..59a8d304419 100644
--- a/.github/workflows/integration_tests.yaml
+++ b/.github/workflows/integration_tests.yaml
@@ -37,5 +37,5 @@ jobs:
         export DOCKER_VOLUME=/mnt/cache
         export DOCKER_IMAGE=${{ inputs.docker_image }}
         export DOCKER_DEVICES=${{ inputs.docker_devices }}
-        export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        export HF_TOKEN=${{ secrets.HF_TOKEN }}
         pytest -s -vv integration-tests
diff --git a/.github/workflows/load_test.yaml b/.github/workflows/load_test.yaml
index a10c942847c..637df4727e1 100644
--- a/.github/workflows/load_test.yaml
+++ b/.github/workflows/load_test.yaml
@@ -28,7 +28,7 @@ jobs:
 
     - name: Start starcoder
       run: |
-        docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v /mnt/cache:/data -e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
+        docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v /mnt/cache:/data -e HF_TOKEN=${{ secrets.HF_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
         sleep 10
         wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health
 
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index e21344d1041..f983b6ed85a 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -72,7 +72,7 @@ jobs:
       - name: Run server tests
         run: |
           pip install pytest
-          export HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          export HF_TOKEN=${{ secrets.HF_TOKEN }}
           pytest -s -vv server/tests
       - name: Pre-commit checks
         run: |
diff --git a/clients/python/text_generation/types.py b/clients/python/text_generation/types.py
index 497468d9f8c..a56edaca759 100644
--- a/clients/python/text_generation/types.py
+++ b/clients/python/text_generation/types.py
@@ -455,6 +455,6 @@ class DeployedModel(BaseModel):
     # Disable warning for use of `model_` prefix in `model_id`. Be mindful about adding members
     # with model_ prefixes, since this disables guardrails for colliding fields:
     # https://github.com/pydantic/pydantic/issues/9177
-    model_config = ConfigDict(protected_namespaces=())
+    model_config = ConfigDict(protected_namespaces=())
     model_id: str
     sha: str
diff --git a/server/text_generation_server/layers/attention/__init__.py b/server/text_generation_server/layers/attention/__init__.py
index a53c8e3b791..e74180e7a86 100644
--- a/server/text_generation_server/layers/attention/__init__.py
+++ b/server/text_generation_server/layers/attention/__init__.py
@@ -1,4 +1,4 @@
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 import os
 
 if os.getenv("USE_FLASH_ATTENTION", "").lower() == "false":
@@ -7,7 +7,7 @@
     from .cuda import attention, paged_attention, reshape_and_cache, SUPPORTS_WINDOWING
 elif SYSTEM == "rocm":
     from .rocm import attention, paged_attention, reshape_and_cache, SUPPORTS_WINDOWING
-elif IPEX_AVAIL:
-    from .xpu import attention, paged_attention, reshape_and_cache, SUPPORTS_WINDOWING
+elif SYSTEM == "ipex":
+    from .ipex import attention, paged_attention, reshape_and_cache, SUPPORTS_WINDOWING
 else:
     raise ImportError(f"System {SYSTEM} doesn't support flash/paged attention")
diff --git a/server/text_generation_server/layers/attention/xpu.py b/server/text_generation_server/layers/attention/ipex.py
similarity index 100%
rename from server/text_generation_server/layers/attention/xpu.py
rename to server/text_generation_server/layers/attention/ipex.py
diff --git a/server/text_generation_server/layers/layernorm.py b/server/text_generation_server/layers/layernorm.py
index 93e83dfa6f4..ce5289f9337 100644
--- a/server/text_generation_server/layers/layernorm.py
+++ b/server/text_generation_server/layers/layernorm.py
@@ -3,7 +3,6 @@
 from accelerate import init_empty_weights
 from text_generation_server.utils.import_utils import (
     SYSTEM,
-    IPEX_AVAIL,
 )
 
 
@@ -83,7 +82,7 @@ def forward(self, hidden_states, residual=None):
 
             return super().forward(hidden_states), residual
 
-elif IPEX_AVAIL:
+elif SYSTEM == "ipex":
     import intel_extension_for_pytorch as ipex
 
     class FastLayerNorm(nn.LayerNorm):
@@ -112,7 +111,7 @@ def load(cls, prefix, weights, eps=1e-6):
         return cls(weight, eps)
 
     def forward(self, hidden_states, residual=None):
-        if IPEX_AVAIL:
+        if SYSTEM == "ipex":
             out = ipex.llm.functional.add_rms_norm(
                 residual,
                 hidden_states,
diff --git a/server/text_generation_server/layers/rotary.py b/server/text_generation_server/layers/rotary.py
index 1892cf69eec..b14005e6751 100644
--- a/server/text_generation_server/layers/rotary.py
+++ b/server/text_generation_server/layers/rotary.py
@@ -2,14 +2,14 @@
 import torch
 from torch import nn
 
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
 if SYSTEM == "cuda":
     from flash_attn.layers.rotary import RotaryEmbedding
     import rotary_emb
 elif SYSTEM == "rocm":
     from vllm._C import ops
-elif IPEX_AVAIL:
+elif SYSTEM == "ipex":
     import intel_extension_for_pytorch as ipex
 
 
@@ -69,7 +69,7 @@ def forward(
 
             # Inplace operation, updating query and key.
             ops.rotary_embedding(query, key, head_size, cos, sin, True)
-        elif IPEX_AVAIL:
+        elif SYSTEM == "ipex":
             ipex.llm.functional.rotary_embedding(
                 query, key, sin, cos, query.size(-1), True
             )
diff --git a/server/text_generation_server/layers/tensor_parallel.py b/server/text_generation_server/layers/tensor_parallel.py
index 510dc2c6f56..038de25815a 100644
--- a/server/text_generation_server/layers/tensor_parallel.py
+++ b/server/text_generation_server/layers/tensor_parallel.py
@@ -3,9 +3,9 @@
 from typing import Iterable, List
 from text_generation_server.layers.linear import get_linear, FastLinear
 from text_generation_server.layers.exl2 import Exl2Weight
-from text_generation_server.utils.import_utils import IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
-if IPEX_AVAIL:
+if SYSTEM == "ipex":
     import intel_extension_for_pytorch as ipex
 
 
@@ -100,7 +100,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
                 local_out = gather_input.T
 
             torch.mm(input, self.linear.weight.T, out=local_out)
-            if IPEX_AVAIL:
+            if SYSTEM == "ipex":
                 ipex.distributed.all_gather_into_tensor(
                     world_out, gather_input, group=self.process_group
                 )
@@ -117,7 +117,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
         world_output = [
             torch.empty_like(output) for _ in range(self.process_group.size())
         ]
-        if IPEX_AVAIL:
+        if SYSTEM == "ipex":
             ipex.distributed.all_gather(world_output, output, group=self.process_group)
         else:
             torch.distributed.all_gather(world_output, output, group=self.process_group)
@@ -217,7 +217,7 @@ def load(cls, config, prefix: str, weights, bias: bool):
     def forward(self, input: torch.Tensor, reduce: bool = True) -> torch.Tensor:
         out = super().forward(input)
         if self.process_group.size() > 1 and reduce:
-            if IPEX_AVAIL:
+            if SYSTEM == "ipex":
                 ipex.distributed.all_reduce(out, group=self.process_group)
             else:
                 torch.distributed.all_reduce(out, group=self.process_group)
@@ -257,7 +257,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
         )
         out = torch.nn.functional.embedding(input, self.weight)
         if self.reduce and self.process_group.size() > 1:
-            if IPEX_AVAIL:
+            if SYSTEM == "ipex":
                 ipex.distributed.all_reduce(out, group=self.process_group)
             else:
                 torch.distributed.all_reduce(out, group=self.process_group)
diff --git a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
index 56292250b88..f81bfa101db 100644
--- a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
@@ -20,9 +20,9 @@
 from transformers.activations import ACT2FN
 from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple, Any
-from text_generation_server.utils.import_utils import IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
-if not IPEX_AVAIL:
+if SYSTEM != "ipex":
     from vllm.model_executor.layers.fused_moe import fused_moe
 
 from text_generation_server.layers.attention import (
diff --git a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
index 6ea954110ef..2f7619afe71 100644
--- a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
@@ -24,9 +24,9 @@
 import numpy as np
 
 from torch import nn
-from text_generation_server.utils.import_utils import IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
-if not IPEX_AVAIL:
+if SYSTEM != "ipex":
     from vllm.model_executor.layers.fused_moe import fused_moe
 from transformers.activations import ACT2FN
 from transformers.configuration_utils import PretrainedConfig
diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index 633b066b86a..aa43107f161 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -15,7 +15,7 @@
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 
 from text_generation_server.utils.chunks import concat_text_chunks
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 from text_generation_server.models import Model
 from text_generation_server.utils.tokens import batch_top_tokens
 from text_generation_server.utils.dist import RANK
@@ -768,12 +768,9 @@ def init_kv_cache(
         empty_cache()
 
         element_size = torch.tensor([], dtype=dtype).element_size()
-        if SYSTEM == "xpu":
-            x = 1
-        else:
-            x = BLOCK_SIZE // element_size
+        x = BLOCK_SIZE // element_size
 
-        if IPEX_AVAIL and SYSTEM == "cpu":
+        if SYSTEM == "ipex" and device == torch.device("cpu"):
             self.kv_cache = [
                 (
                     torch.empty(
diff --git a/server/text_generation_server/models/flash_gpt2.py b/server/text_generation_server/models/flash_gpt2.py
index 43f374e5ff5..75c7203ac02 100644
--- a/server/text_generation_server/models/flash_gpt2.py
+++ b/server/text_generation_server/models/flash_gpt2.py
@@ -15,7 +15,7 @@
     weight_files,
     Weights,
 )
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
 tracer = trace.get_tracer(__name__)
 
@@ -34,12 +34,12 @@ def __init__(
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
-        elif SYSTEM == "xpu":
-            device = torch.device(f"xpu:{rank}")
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+            else:
+                device = torch.device("cpu")
             dtype = torch.float16 if dtype is None else dtype
-        elif IPEX_AVAIL:
-            device = torch.device("cpu")
-            dtype = torch.bfloat16 if dtype is None else dtype
         else:
             raise NotImplementedError("FlashGPT2 is only available on GPU")
 
diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py
index e023c4e0617..76c522e392b 100644
--- a/server/text_generation_server/models/flash_llama.py
+++ b/server/text_generation_server/models/flash_llama.py
@@ -17,7 +17,7 @@
 
 tracer = trace.get_tracer(__name__)
 
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
 
 class FlashLlama(FlashCausalLM):
@@ -34,12 +34,12 @@ def __init__(
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
-        elif SYSTEM == "xpu":
-            device = torch.device(f"xpu:{rank}")
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+            else:
+                device = torch.device("cpu")
             dtype = torch.float16 if dtype is None else dtype
-        elif IPEX_AVAIL:
-            device = torch.device("cpu")
-            dtype = torch.bfloat16 if dtype is None else dtype
         else:
             raise NotImplementedError("FlashLlama is only available on GPU")
 
diff --git a/server/text_generation_server/models/flash_mistral.py b/server/text_generation_server/models/flash_mistral.py
index 0c57048776b..78a09cf5780 100644
--- a/server/text_generation_server/models/flash_mistral.py
+++ b/server/text_generation_server/models/flash_mistral.py
@@ -16,7 +16,7 @@
     weight_files,
     Weights,
 )
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
 tracer = trace.get_tracer(__name__)
 
@@ -38,12 +38,12 @@ def __init__(
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
-        elif SYSTEM == "xpu":
-            device = torch.device(f"xpu:{rank}")
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+            else:
+                device = torch.device("cpu")
             dtype = torch.float16 if dtype is None else dtype
-        elif IPEX_AVAIL:
-            device = torch.device("cpu")
-            dtype = torch.bfloat16 if dtype is None else dtype
         else:
             raise NotImplementedError("FlashMistral is only available on GPU")
 
diff --git a/server/text_generation_server/models/flash_neox.py b/server/text_generation_server/models/flash_neox.py
index 69d47e57c26..9c82bf523e8 100644
--- a/server/text_generation_server/models/flash_neox.py
+++ b/server/text_generation_server/models/flash_neox.py
@@ -14,7 +14,7 @@
     weight_files,
     Weights,
 )
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
 tracer = trace.get_tracer(__name__)
 
@@ -33,12 +33,12 @@ def __init__(
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
-        elif SYSTEM == "xpu":
-            device = torch.device(f"xpu:{rank}")
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+            else:
+                device = torch.device("cpu")
             dtype = torch.float16 if dtype is None else dtype
-        elif IPEX_AVAIL:
-            device = torch.device("cpu")
-            dtype = torch.bfloat16 if dtype is None else dtype
         else:
             raise NotImplementedError("FlashNeoX is only available on GPU")
 
diff --git a/server/text_generation_server/models/flash_rw.py b/server/text_generation_server/models/flash_rw.py
index e087dcf1062..e8087f230a0 100644
--- a/server/text_generation_server/models/flash_rw.py
+++ b/server/text_generation_server/models/flash_rw.py
@@ -15,7 +15,7 @@
     weight_files,
     Weights,
 )
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
 tracer = trace.get_tracer(__name__)
 
@@ -34,12 +34,12 @@ def __init__(
         if torch.cuda.is_available():
             device = torch.device(f"cuda:{rank}")
             dtype = torch.float16 if dtype is None else dtype
-        elif SYSTEM == "xpu":
-            device = torch.device(f"xpu:{rank}")
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+            else:
+                device = torch.device("cpu")
             dtype = torch.float16 if dtype is None else dtype
-        elif IPEX_AVAIL:
-            device = torch.device("cpu")
-            dtype = torch.bfloat16 if dtype is None else dtype
         else:
             raise NotImplementedError("FlashRW is only available on GPU")
 
diff --git a/server/text_generation_server/models/flash_santacoder.py b/server/text_generation_server/models/flash_santacoder.py
index 9626af601b6..83a6b92c97e 100644
--- a/server/text_generation_server/models/flash_santacoder.py
+++ b/server/text_generation_server/models/flash_santacoder.py
@@ -18,7 +18,7 @@
     Weights,
 )
 
-from text_generation_server.utils.import_utils import SYSTEM, IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
 tracer = trace.get_tracer(__name__)
 
@@ -37,12 +37,12 @@ def __init__(
         if torch.cuda.is_available():
            device = torch.device(f"cuda:{rank}")
            dtype = torch.float16 if dtype is None else dtype
-        elif SYSTEM == "xpu":
-            device = torch.device(f"xpu:{rank}")
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+            else:
+                device = torch.device("cpu")
             dtype = torch.float16 if dtype is None else dtype
-        elif IPEX_AVAIL:
-            device = torch.device("cpu")
-            dtype = torch.bfloat16 if dtype is None else dtype
         else:
             raise NotImplementedError("FlashSantacoderSharded is only available on GPU")
 
diff --git a/server/text_generation_server/utils/dist.py b/server/text_generation_server/utils/dist.py
index 7d38756380f..36d63e86d2c 100644
--- a/server/text_generation_server/utils/dist.py
+++ b/server/text_generation_server/utils/dist.py
@@ -3,7 +3,7 @@
 from datetime import timedelta
 from loguru import logger
 
-from text_generation_server.utils.import_utils import IPEX_AVAIL
+from text_generation_server.utils.import_utils import SYSTEM
 
 # Tensor Parallelism settings
 RANK = int(os.getenv("RANK", "0"))
@@ -69,7 +69,7 @@ def initialize_torch_distributed():
 
     if not torch.distributed.is_initialized():
         # Call the init process.
-        if IPEX_AVAIL:
+        if SYSTEM == "ipex":
             import intel_extension_for_pytorch as ipex
 
             ipex.distributed.init_process_group(
diff --git a/server/text_generation_server/utils/import_utils.py b/server/text_generation_server/utils/import_utils.py
index a244417a46d..6d9217215af 100644
--- a/server/text_generation_server/utils/import_utils.py
+++ b/server/text_generation_server/utils/import_utils.py
@@ -37,7 +37,10 @@ def get_cpu_free_memory(device, memory_fraction):
     return free_memory
 
 
-IPEX_AVAIL = is_ipex_available()
+def noop(*args, **kwargs):
+    pass
+
+
 SYSTEM = None
 if torch.version.hip is not None:
     SYSTEM = "rocm"
@@ -49,17 +52,19 @@ def get_cpu_free_memory(device, memory_fraction):
     empty_cache = torch.cuda.empty_cache
     synchronize = torch.cuda.synchronize
     get_free_memory = get_cuda_free_memory
-elif IPEX_AVAIL and hasattr(torch, "xpu") and torch.xpu.is_available():
-    SYSTEM = "xpu"
-    empty_cache = torch.xpu.empty_cache
-    synchronize = torch.xpu.synchronize
-    get_free_memory = get_xpu_free_memory
+elif is_ipex_available():
+    SYSTEM = "ipex"
+    if hasattr(torch, "xpu") and torch.xpu.is_available():
+        empty_cache = torch.xpu.empty_cache
+        synchronize = torch.xpu.synchronize
+        get_free_memory = get_xpu_free_memory
+    else:
+        empty_cache = noop
+        synchronize = noop
+        get_free_memory = get_cpu_free_memory
 else:
     SYSTEM = "cpu"
 
-    def noop(*args, **kwargs):
-        pass
-
     empty_cache = noop
     synchronize = noop
     get_free_memory = get_cpu_free_memory