Commit

Merge branch 'main' into feature/get-trace-id-from-req-headers
kozistr authored Dec 12, 2024
2 parents ba72c18 + bf59118 commit e4d7a67
Showing 191 changed files with 2,814 additions and 75,673 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yaml
@@ -137,7 +137,7 @@ jobs:
uses: docker/[email protected]
with:
flavor: |
latest=auto
latest=false
images: |
registry.internal.huggingface.tech/api-inference/community/text-generation-inference
ghcr.io/huggingface/text-generation-inference
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -4,6 +4,7 @@ repos:
hooks:
- id: check-yaml
- id: end-of-file-fixer
exclude: crate-hashes.json
- id: trailing-whitespace
exclude: docs/source/reference/launcher.md
- repo: https://github.com/psf/black
38 changes: 31 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -20,7 +20,7 @@ default-members = [
resolver = "2"

[workspace.package]
version = "2.4.2-dev0"
version = "3.0.2-dev0"
edition = "2021"
authors = ["Olivier Dehaene"]
homepage = "https://github.com/huggingface/text-generation-inference"
16 changes: 5 additions & 11 deletions Dockerfile_intel
@@ -45,7 +45,7 @@ RUN cargo build --profile release-opt --frozen

# Text Generation Inference base image for Intel

FROM intel/intel-extension-for-pytorch:2.3.110-xpu AS xpu
FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS xpu

USER root

@@ -87,7 +87,7 @@ RUN echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https:/

RUN mv /tmp/intel-for-pytorch-gpu-dev.list /etc/apt/sources.list.d

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y intel-basekit=2024.2.1-98 xpu-smi cmake ninja-build pciutils intel-pti-dev-0.9
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y xpu-smi cmake ninja-build pciutils intel-pti-dev-0.9

# Text Generation Inference base env
ENV HF_HOME=/data \
@@ -114,15 +114,8 @@ RUN cd server && \
pip install -r requirements_intel.txt && \
pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir

ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest
ENV FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric
ENV LIBRARY_PATH=/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mkl/latest/lib/:/opt/intel/oneapi/compiler/latest/lib
ENV LD_LIBRARY_PATH=/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/mkl/latest/lib:/opt/intel/oneapi/compiler/latest/opt/compiler/lib:/opt/intel/oneapi/compiler/latest/lib:/opt/intel/oneapi/lib:/opt/intel/oneapi/lib/intel64:/opt/intel/oneapi/pti/0.9/lib:/opt/conda/lib
ENV PATH=/opt/conda/bin:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mpi/latest/bin:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mkl/latest/bin/:/opt/intel/oneapi/compiler/latest/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/intel/oneapi/pti/0.9/lib:/opt/conda/lib
ENV CCL_ZE_IPC_EXCHANGE=sockets
ENV CMAKE_PREFIX_PATH=/opt/intel/oneapi/mkl/latest/lib/cmake:/opt/intel/oneapi/compiler/latest
ENV CPATH=/opt/intel/oneapi/mpi/latest/include:/opt/intel/oneapi/ccl/latest/include:/opt/intel/oneapi/mkl/latest/include
#ENV TORCH_LLM_ALLREDUCE=1
#ENV CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK=0

@@ -197,9 +190,10 @@ RUN pip install triton py-libnuma

WORKDIR /usr/src

RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout 2e1c98f74ec1b35ad8dd1ebe7dd4b25470f2fd41
RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout b7b552baf64283b594665b8687430fe92990e497
RUN git clone https://github.com/intel/torch-ccl.git && cd torch-ccl && git checkout v2.4.0+cpu+rc0

RUN sed -i 's/VERSION_MINOR 6/VERSION_MINOR 5/' intel-extension-for-pytorch/version.txt
RUN cd intel-extension-for-pytorch && git submodule sync && git submodule update --init --recursive && python setup.py install

RUN cd torch-ccl && git submodule sync && git submodule update --init --recursive && pip install .
22 changes: 17 additions & 5 deletions README.md
@@ -84,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta
volume=$PWD/data

docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:2.4.1 --model-id $model
ghcr.io/huggingface/text-generation-inference:3.0.0 --model-id $model
```

And then you can make requests like
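
As a hedged illustration, such a request uses the OpenAI-compatible `/v1/chat/completions` endpoint; the payload values below are placeholders:

```shell
curl localhost:8080/v1/chat/completions \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{
      "model": "tgi",
      "messages": [
        {"role": "user", "content": "What is deep learning?"}
      ],
      "stream": false,
      "max_tokens": 32
    }'
```
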
@@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \

**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. To run the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`; note that CPU is not the intended platform for this project, so performance might be subpar.
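
A minimal sketch of that CPU-only variant, assuming the same `$model` and `$volume` shell variables and the `3.0.0` image tag used elsewhere in this README:

```shell
docker run --shm-size 1g -p 8080:80 -v $volume:/data \
    ghcr.io/huggingface/text-generation-inference:3.0.0 \
    --model-id $model --disable-custom-kernels
```
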

**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.1-rocm --model-id $model` instead of the command above.
**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.0-rocm --model-id $model` instead of the command above.

To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
```
@@ -151,7 +151,7 @@ model=meta-llama/Meta-Llama-3.1-8B-Instruct
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
token=<your cli READ token>

docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.1 --model-id $model
docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.0 --model-id $model
```

### A note on Shared Memory (shm)
@@ -196,14 +196,26 @@ Detailed blogpost by Adyen on TGI inner workings: [LLM inference at scale with T

You can also opt to install `text-generation-inference` locally.

First [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
Python 3.9, e.g. using `conda`:
First clone the repository and change directory into it:

```shell
git clone https://github.com/huggingface/text-generation-inference
cd text-generation-inference
```

Then [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
Python 3.9, e.g. using `conda` or Python's built-in `venv`:

```shell
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
# using conda
conda create -n text-generation-inference python=3.11
conda activate text-generation-inference
# using Python venv
python3 -m venv .venv
source .venv/bin/activate
```

You may also need to install Protoc.
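
One way to do that on Linux is to install a prebuilt `protoc` release from the protobuf repository (a sketch; the version below is only an example):

```shell
PROTOC_ZIP=protoc-21.12-linux-x86_64.zip
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP
sudo unzip -o $PROTOC_ZIP -d /usr/local bin/protoc
sudo unzip -o $PROTOC_ZIP -d /usr/local 'include/*'
rm -f $PROTOC_ZIP
```
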
Binary file added assets/v3_benchmarks.png
4 changes: 4 additions & 0 deletions backends/v2/src/backend.rs
@@ -104,6 +104,10 @@ impl Backend for BackendV2 {
}
.is_ok()
}

fn start_health(&self) -> bool {
true
}
}

/// Batching logic
1 change: 1 addition & 0 deletions backends/v2/src/queue.rs
@@ -436,6 +436,7 @@ mod tests {
stopping_parameters: ValidStoppingParameters {
ignore_eos_token: false,
max_new_tokens: 1,
max_total_new_tokens: 1024,
stop_sequences: vec![],
},
top_n_tokens: 0,
4 changes: 4 additions & 0 deletions backends/v3/src/backend.rs
@@ -111,6 +111,10 @@ impl Backend for BackendV3 {
}
.is_ok()
}

fn start_health(&self) -> bool {
true
}
}

/// Batching logic
6 changes: 3 additions & 3 deletions backends/v3/src/client/sharded_client.rs
@@ -217,8 +217,8 @@ impl Health for ShardedClient {
input_chunks: Some(Input {
chunks: vec![Chunk::Text("liveness".into()).into()],
}),
truncate: 10,
add_special_tokens: true,
truncate: 1,
add_special_tokens: false,
prefill_logprobs: false,
parameters: Some(NextTokenChooserParameters {
temperature: 1.0,
@@ -241,7 +241,7 @@
top_n_tokens: 0,
// Block 0 is reserved for health checks
blocks: vec![0],
slots: (0..16).collect(),
slots: vec![0],
cache_len: 0,
adapter_id: None,
chunk_len: None,
1 change: 1 addition & 0 deletions backends/v3/src/queue.rs
@@ -573,6 +573,7 @@ mod tests {
stopping_parameters: ValidStoppingParameters {
ignore_eos_token: false,
max_new_tokens: 1,
max_total_new_tokens: 1024,
stop_sequences: vec![],
},
top_n_tokens: 0,
3 changes: 3 additions & 0 deletions crate-hashes.json
@@ -0,0 +1,3 @@
{
"git+https://github.com/dottxt-ai/outlines-core.git?rev=ba10c619fc9bf3c487e43f49bdecb95a24bb465c#[email protected]": "1j9dcd831b0bmmjk2n4aag3x47qnqmkpg4gqpvwwyic7744llbfm"
}
8 changes: 5 additions & 3 deletions docs/openapi.json
@@ -10,7 +10,7 @@
"name": "Apache 2.0",
"url": "https://www.apache.org/licenses/LICENSE-2.0"
},
"version": "2.4.2-dev0"
"version": "3.0.1-dev0"
},
"paths": {
"/": {
@@ -1013,6 +1013,7 @@
"type": "integer",
"format": "int32",
"description": "The maximum number of tokens that can be generated in the chat completion.",
"default": "1024",
"example": "32",
"nullable": true,
"minimum": 0
@@ -1329,7 +1330,8 @@
"type": "integer",
"format": "int32",
"description": "The maximum number of tokens that can be generated in the chat completion.",
"default": "32",
"default": "1024",
"example": "32",
"nullable": true,
"minimum": 0
},
@@ -1591,7 +1593,7 @@
"type": "integer",
"format": "int32",
"description": "Maximum number of tokens to generate.",
"default": "100",
"default": "1024",
"example": "20",
"nullable": true,
"minimum": 0
2 changes: 2 additions & 0 deletions docs/source/_toctree.yml
@@ -54,6 +54,8 @@
title: API Reference
title: Reference
- sections:
- local: conceptual/chunking
title: V3 update, caching and chunking
- local: conceptual/streaming
title: Streaming
- local: conceptual/quantization
2 changes: 1 addition & 1 deletion docs/source/basic_tutorials/gated_model_access.md
@@ -19,6 +19,6 @@ docker run --gpus all \
--shm-size 1g \
-e HF_TOKEN=$token \
-p 8080:80 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:2.4.1 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:3.0.1 \
--model-id $model
```
