Commit

Merge branch 'main' into feature/get-trace-id-from-req-headers
kozistr authored Dec 12, 2024
2 parents ba72c18 + bf59118 commit e4d7a67
Showing 191 changed files with 2,814 additions and 75,673 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yaml
@@ -137,7 +137,7 @@ jobs:
uses: docker/[email protected]
with:
flavor: |
latest=auto
latest=false
images: |
registry.internal.huggingface.tech/api-inference/community/text-generation-inference
ghcr.io/huggingface/text-generation-inference
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -4,6 +4,7 @@ repos:
hooks:
- id: check-yaml
- id: end-of-file-fixer
exclude: crate-hashes.json
- id: trailing-whitespace
exclude: docs/source/reference/launcher.md
- repo: https://github.com/psf/black
38 changes: 31 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -20,7 +20,7 @@ default-members = [
resolver = "2"

[workspace.package]
version = "2.4.2-dev0"
version = "3.0.2-dev0"
edition = "2021"
authors = ["Olivier Dehaene"]
homepage = "https://github.com/huggingface/text-generation-inference"
16 changes: 5 additions & 11 deletions Dockerfile_intel
@@ -45,7 +45,7 @@ RUN cargo build --profile release-opt --frozen

# Text Generation Inference base image for Intel

FROM intel/intel-extension-for-pytorch:2.3.110-xpu AS xpu
FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS xpu

USER root

@@ -87,7 +87,7 @@ RUN echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https:/

RUN mv /tmp/intel-for-pytorch-gpu-dev.list /etc/apt/sources.list.d

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y intel-basekit=2024.2.1-98 xpu-smi cmake ninja-build pciutils intel-pti-dev-0.9
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y xpu-smi cmake ninja-build pciutils intel-pti-dev-0.9

# Text Generation Inference base env
ENV HF_HOME=/data \
@@ -114,15 +114,8 @@ RUN cd server && \
pip install -r requirements_intel.txt && \
pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir

ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest
ENV FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric
ENV LIBRARY_PATH=/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mkl/latest/lib/:/opt/intel/oneapi/compiler/latest/lib
ENV LD_LIBRARY_PATH=/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/mkl/latest/lib:/opt/intel/oneapi/compiler/latest/opt/compiler/lib:/opt/intel/oneapi/compiler/latest/lib:/opt/intel/oneapi/lib:/opt/intel/oneapi/lib/intel64:/opt/intel/oneapi/pti/0.9/lib:/opt/conda/lib
ENV PATH=/opt/conda/bin:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mpi/latest/bin:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mkl/latest/bin/:/opt/intel/oneapi/compiler/latest/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/oneapi/pti/0.9/lib:/opt/conda/lib
ENV CCL_ZE_IPC_EXCHANGE=sockets
ENV CMAKE_PREFIX_PATH=/opt/intel/oneapi/mkl/latest/lib/cmake:/opt/intel/oneapi/compiler/latest
ENV CPATH=/opt/intel/oneapi/mpi/latest/include:/opt/intel/oneapi/ccl/latest/include:/opt/intel/oneapi/mkl/latest/include
#ENV TORCH_LLM_ALLREDUCE=1
#ENV CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK=0

@@ -197,9 +190,10 @@ RUN pip install triton py-libnuma

WORKDIR /usr/src

RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout 2e1c98f74ec1b35ad8dd1ebe7dd4b25470f2fd41
RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout b7b552baf64283b594665b8687430fe92990e497
RUN git clone https://github.com/intel/torch-ccl.git && cd torch-ccl && git checkout v2.4.0+cpu+rc0

RUN sed -i 's/VERSION_MINOR 6/VERSION_MINOR 5/' intel-extension-for-pytorch/version.txt
RUN cd intel-extension-for-pytorch && git submodule sync && git submodule update --init --recursive && python setup.py install

RUN cd torch-ccl && git submodule sync && git submodule update --init --recursive && pip install .
22 changes: 17 additions & 5 deletions README.md
@@ -84,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta
volume=$PWD/data

docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:2.4.1 --model-id $model
ghcr.io/huggingface/text-generation-inference:3.0.0 --model-id $model
```

And then you can make requests like
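
As a hedged illustration, such a request uses the OpenAI-compatible `/v1/chat/completions` endpoint; the payload values below are placeholders:

```shell
curl localhost:8080/v1/chat/completions \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{
      "model": "tgi",
      "messages": [
        {"role": "user", "content": "What is deep learning?"}
      ],
      "stream": false,
      "max_tokens": 32
    }'
```
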
@@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \

**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. To run the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`; note that CPU is not the intended platform for this project, so performance might be subpar.
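
A minimal sketch of that CPU-only variant, assuming the same `$model` and `$volume` shell variables and the `3.0.0` image tag used elsewhere in this README:

```shell
docker run --shm-size 1g -p 8080:80 -v $volume:/data \
    ghcr.io/huggingface/text-generation-inference:3.0.0 \
    --model-id $model --disable-custom-kernels
```
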

**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.1-rocm --model-id $model` instead of the command above.
**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.0-rocm --model-id $model` instead of the command above.

To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
```
@@ -151,7 +151,7 @@ model=meta-llama/Meta-Llama-3.1-8B-Instruct
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
token=<your cli READ token>

docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.1 --model-id $model
docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.0 --model-id $model
```

### A note on Shared Memory (shm)
@@ -196,14 +196,26 @@ Detailed blogpost by Adyen on TGI inner workings: [LLM inference at scale with T

You can also opt to install `text-generation-inference` locally.

First [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
Python 3.9, e.g. using `conda`:
First clone the repository and change directory into it:

```shell
git clone https://github.com/huggingface/text-generation-inference
cd text-generation-inference
```

Then [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
Python 3.9, e.g. using `conda` or Python's built-in `venv`:

```shell
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
# using conda
conda create -n text-generation-inference python=3.11
conda activate text-generation-inference
# using Python venv
python3 -m venv .venv
source .venv/bin/activate
```

You may also need to install Protoc.
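
One way to do that on Linux is to install a prebuilt `protoc` release from the protobuf repository (a sketch; the version below is only an example):

```shell
PROTOC_ZIP=protoc-21.12-linux-x86_64.zip
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP
sudo unzip -o $PROTOC_ZIP -d /usr/local bin/protoc
sudo unzip -o $PROTOC_ZIP -d /usr/local 'include/*'
rm -f $PROTOC_ZIP
```
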
Binary file added assets/v3_benchmarks.png
4 changes: 4 additions & 0 deletions backends/v2/src/backend.rs
@@ -104,6 +104,10 @@ impl Backend for BackendV2 {
}
.is_ok()
}

fn start_health(&self) -> bool {
true
}
}

/// Batching logic
1 change: 1 addition & 0 deletions backends/v2/src/queue.rs
@@ -436,6 +436,7 @@ mod tests {
stopping_parameters: ValidStoppingParameters {
ignore_eos_token: false,
max_new_tokens: 1,
max_total_new_tokens: 1024,
stop_sequences: vec![],
},
top_n_tokens: 0,
4 changes: 4 additions & 0 deletions backends/v3/src/backend.rs
@@ -111,6 +111,10 @@ impl Backend for BackendV3 {
}
.is_ok()
}

fn start_health(&self) -> bool {
true
}
}

/// Batching logic
6 changes: 3 additions & 3 deletions backends/v3/src/client/sharded_client.rs
@@ -217,8 +217,8 @@ impl Health for ShardedClient {
input_chunks: Some(Input {
chunks: vec![Chunk::Text("liveness".into()).into()],
}),
truncate: 10,
add_special_tokens: true,
truncate: 1,
add_special_tokens: false,
prefill_logprobs: false,
parameters: Some(NextTokenChooserParameters {
temperature: 1.0,
@@ -241,7 +241,7 @@
top_n_tokens: 0,
// Block 0 is reserved for health checks
blocks: vec![0],
slots: (0..16).collect(),
slots: vec![0],
cache_len: 0,
adapter_id: None,
chunk_len: None,
1 change: 1 addition & 0 deletions backends/v3/src/queue.rs
@@ -573,6 +573,7 @@ mod tests {
stopping_parameters: ValidStoppingParameters {
ignore_eos_token: false,
max_new_tokens: 1,
max_total_new_tokens: 1024,
stop_sequences: vec![],
},
top_n_tokens: 0,
3 changes: 3 additions & 0 deletions crate-hashes.json
@@ -0,0 +1,3 @@
{
"git+https://github.com/dottxt-ai/outlines-core.git?rev=ba10c619fc9bf3c487e43f49bdecb95a24bb465c#[email protected]": "1j9dcd831b0bmmjk2n4aag3x47qnqmkpg4gqpvwwyic7744llbfm"
}
8 changes: 5 additions & 3 deletions docs/openapi.json
@@ -10,7 +10,7 @@
"name": "Apache 2.0",
"url": "https://www.apache.org/licenses/LICENSE-2.0"
},
"version": "2.4.2-dev0"
"version": "3.0.1-dev0"
},
"paths": {
"/": {
@@ -1013,6 +1013,7 @@
"type": "integer",
"format": "int32",
"description": "The maximum number of tokens that can be generated in the chat completion.",
"default": "1024",
"example": "32",
"nullable": true,
"minimum": 0
@@ -1329,7 +1330,8 @@
"type": "integer",
"format": "int32",
"description": "The maximum number of tokens that can be generated in the chat completion.",
"default": "32",
"default": "1024",
"example": "32",
"nullable": true,
"minimum": 0
},
@@ -1591,7 +1593,7 @@
"type": "integer",
"format": "int32",
"description": "Maximum number of tokens to generate.",
"default": "100",
"default": "1024",
"example": "20",
"nullable": true,
"minimum": 0
2 changes: 2 additions & 0 deletions docs/source/_toctree.yml
@@ -54,6 +54,8 @@
title: API Reference
title: Reference
- sections:
- local: conceptual/chunking
title: V3 update, caching and chunking
- local: conceptual/streaming
title: Streaming
- local: conceptual/quantization
2 changes: 1 addition & 1 deletion docs/source/basic_tutorials/gated_model_access.md
@@ -19,6 +19,6 @@ docker run --gpus all \
--shm-size 1g \
-e HF_TOKEN=$token \
-p 8080:80 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.1 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.1 \
--model-id $model
```
