feat: add mistral model #1071

Merged: 8 commits, Sep 28, 2023

1 change: 1 addition & 0 deletions README.md
@@ -68,6 +68,7 @@ to power Hugging Chat, the Inference API and Inference Endpoint.
- [MPT](https://huggingface.co/mosaicml/mpt-30b)
- [Llama V2](https://huggingface.co/meta-llama)
- [Code Llama](https://huggingface.co/codellama)
- [Mistral](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)

Other architectures are supported on a best effort basis using:

8 changes: 8 additions & 0 deletions clients/python/README.md
@@ -140,6 +140,8 @@ class Parameters:
    watermark: bool
    # Get decoder input token logprobs and ids
    decoder_input_details: bool
    # Return the N most likely tokens at each step
    top_n_tokens: Optional[int]

# Decoder input tokens
class InputToken:
@@ -189,6 +191,8 @@ class BestOfSequence:
    prefill: List[InputToken]
    # Generated tokens
    tokens: List[Token]
    # Most likely tokens
    top_tokens: Optional[List[List[Token]]]


# `generate` details
@@ -203,6 +207,8 @@ class Details:
    prefill: List[InputToken]
    # Generated tokens
    tokens: List[Token]
    # Most likely tokens
    top_tokens: Optional[List[List[Token]]]
    # Additional sequences when using the `best_of` parameter
    best_of_sequences: Optional[List[BestOfSequence]]

@@ -229,6 +235,8 @@ class StreamDetails:
class StreamResponse:
    # Generated token
    token: Token
    # Most likely tokens
    top_tokens: Optional[List[Token]]
    # Complete generated text
    # Only available when the generation is finished
    generated_text: Optional[str]
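The new `top_n_tokens` request parameter and the `top_tokens` response fields documented above can be exercised together. A minimal sketch, not part of this diff, assuming a TGI server is already listening on `http://127.0.0.1:8080` and that `Client.generate` forwards `top_n_tokens`:

```python
from text_generation import Client

# Assumed local endpoint; point this at your own TGI deployment.
client = Client("http://127.0.0.1:8080")

# Ask the server to also return the 5 most likely tokens at every decoding step.
response = client.generate(
    "What is Deep Learning?",
    max_new_tokens=20,
    top_n_tokens=5,
)

print(response.generated_text)

# `details.top_tokens` holds one list of candidate Tokens per generated token.
if response.details.top_tokens is not None:
    for step, candidates in enumerate(response.details.top_tokens):
        print(step, [(t.text, t.logprob) for t in candidates])
```

Each inner list corresponds to one generation step and contains at most `top_n_tokens` candidates, each with its own id, text, and logprob.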
201 changes: 155 additions & 46 deletions clients/python/poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion clients/python/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "text-generation"
version = "0.6.0"
version = "0.6.1"
description = "Hugging Face Text Generation Python Client"
license = "Apache-2.0"
authors = ["Olivier Dehaene <[email protected]>"]
1 change: 0 additions & 1 deletion clients/python/text_generation/client.py
@@ -482,7 +482,6 @@ async def generate_stream(
            headers=self.headers, cookies=self.cookies, timeout=self.timeout
        ) as session:
            async with session.post(self.base_url, json=request.dict()) as resp:

                if resp.status != 200:
                    raise parse_error(resp.status, await resp.json())

8 changes: 4 additions & 4 deletions clients/python/text_generation/types.py
@@ -40,7 +40,7 @@ class Parameters(BaseModel):
    # Get decoder input token logprobs and ids
    decoder_input_details: bool = False
    # Return the N most likely tokens at each step
    top_n_tokens: Optional[int]
    top_n_tokens: Optional[int] = None

    @validator("best_of")
    def valid_best_of(cls, field_value, values):
@@ -188,7 +188,7 @@ class BestOfSequence(BaseModel):
    # Generated tokens
    tokens: List[Token]
    # Most likely tokens
    top_tokens: Optional[List[List[Token]]]
    top_tokens: Optional[List[List[Token]]] = None


# `generate` details
@@ -204,7 +204,7 @@ class Details(BaseModel):
    # Generated tokens
    tokens: List[Token]
    # Most likely tokens
    top_tokens: Optional[List[List[Token]]]
    top_tokens: Optional[List[List[Token]]] = None
    # Additional sequences when using the `best_of` parameter
    best_of_sequences: Optional[List[BestOfSequence]] = None

@@ -232,7 +232,7 @@ class StreamResponse(BaseModel):
    # Generated token
    token: Token
    # Most likely tokens
    top_tokens: Optional[List[Token]]
    top_tokens: Optional[List[Token]] = None
    # Complete generated text
    # Only available when the generation is finished
    generated_text: Optional[str] = None
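The only substantive change in this file is that the `Optional[...]` fields now carry an explicit `= None` default, so omitting them stays valid even under stricter pydantic validation (pydantic v2 treats an `Optional` annotation without a default as a required field). A standalone sketch, not taken from the repository, showing the effect with simplified models:

```python
from typing import List, Optional

from pydantic import BaseModel


class Token(BaseModel):
    id: int
    text: str
    logprob: float
    special: bool


class StreamResponse(BaseModel):
    # Generated token
    token: Token
    # Most likely tokens; explicit default so the field may be omitted
    top_tokens: Optional[List[Token]] = None
    # Complete generated text, only present once generation has finished
    generated_text: Optional[str] = None


# Validates even though `top_tokens` and `generated_text` are absent.
chunk = StreamResponse(token=Token(id=28747, text=":", logprob=-0.55, special=False))
print(chunk.top_tokens)  # None
```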
11 changes: 9 additions & 2 deletions docs/source/basic_tutorials/launcher.md
@@ -34,10 +34,17 @@ Options:
[env: NUM_SHARD=]

--quantize <QUANTIZE>
Whether you want the model to be quantized. This will use `bitsandbytes` for quantization on the fly, or `gptq`. 4bit quantization is available through `bitsandbytes` by providing the `bitsandbytes-fp4` or `bitsandbytes-nf4` options
Whether you want the model to be quantized

[env: QUANTIZE=]
[possible values: bitsandbytes, bitsandbytes-nf4, bitsandbytes-fp4, gptq, awq]

Possible values:
- awq: 4 bit quantization. Requires a specific AWQ quantized model: https://hf.co/models?search=awq. Should replace GPTQ models wherever possible because of the better latency
- eetq: 8 bit quantization, doesn't require a specific model. Should be a drop-in replacement for bitsandbytes with much better performance. Kernels are from https://github.com/NetEase-FuXi/EETQ.git
- gptq: 4 bit quantization. Requires a specific GPTQ quantized model: https://hf.co/models?search=gptq. text-generation-inference will use the exllama (faster) kernels wherever possible, and the triton kernel (wider support) when it is not. AWQ has faster kernels
- bitsandbytes: Bitsandbytes 8bit. Can be applied to any model; it will cut the memory requirement in half, but the model will run much slower than in native f16
- bitsandbytes-nf4: Bitsandbytes 4bit. Can be applied to any model; it will cut the memory requirement by 4x, but the model will run much slower than in native f16
- bitsandbytes-fp4: Bitsandbytes 4bit. nf4 should be preferred in most cases, but fp4 may give better perplexity for your model

--dtype <DTYPE>
The dtype to be forced upon the model. This option cannot be used with `--quantize`
2 changes: 2 additions & 0 deletions docs/source/supported_models.md
@@ -18,6 +18,8 @@ The following models are optimized and can be served with TGI, which uses custom
- [Falcon 40B](https://huggingface.co/tiiuae/falcon-40b)
- [MPT](https://huggingface.co/mosaicml/mpt-30b)
- [Llama V2](https://huggingface.co/meta-llama)
- [Code Llama](https://huggingface.co/codellama)
- [Mistral](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)

If the above list lacks the model you would like to serve, depending on the model's pipeline type, you can try to initialize and serve the model anyway to see how well it performs, but performance isn't guaranteed for non-optimized models:

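With Mistral in the optimized list, it is queried like any other supported model. A hedged sketch using the Python client's streaming API, assuming TGI was launched with `--model-id mistralai/Mistral-7B-Instruct-v0.1` and is reachable at `http://127.0.0.1:8080`:

```python
from text_generation import Client

# Assumed local endpoint; adjust to wherever your TGI instance runs.
client = Client("http://127.0.0.1:8080")

# Mistral-7B-Instruct expects the [INST] ... [/INST] prompt format.
prompt = "[INST] Explain token streaming in one sentence. [/INST]"

text = ""
for chunk in client.generate_stream(prompt, max_new_tokens=64):
    # Skip special tokens such as the end-of-sequence marker.
    if not chunk.token.special:
        text += chunk.token.text

print(text)
```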
@@ -0,0 +1,89 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 3735,
"logprob": -12.9140625,
"text": "Test"
},
{
"id": 2159,
"logprob": -10.7578125,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 28747,
"logprob": -0.54785156,
"special": false,
"text": ":"
},
{
"id": 3169,
"logprob": -1.4091797,
"special": false,
"text": " Let"
},
{
"id": 307,
"logprob": -3.0273438,
"special": false,
"text": " n"
},
{
"id": 327,
"logprob": -0.94433594,
"special": false,
"text": " ="
},
{
"id": 28705,
"logprob": -0.81347656,
"special": false,
"text": " "
},
{
"id": 28740,
"logprob": -1.2958984,
"special": false,
"text": "1"
},
{
"id": 28734,
"logprob": -2.0644531,
"special": false,
"text": "0"
},
{
"id": 387,
"logprob": -1.9580078,
"special": false,
"text": " -"
},
{
"id": 28705,
"logprob": -0.5073242,
"special": false,
"text": " "
},
{
"id": 28740,
"logprob": -1.1816406,
"special": false,
"text": "1"
}
],
"top_tokens": null
},
"generated_text": ": Let n = 10 - 1"
}
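Snapshots like the one above mirror the `details` payload returned by `generate`, so the fields can be inspected directly. A short sketch of reading one back, assuming the JSON above is saved locally as `snapshot.json` (an illustrative file name, not a path from this PR):

```python
import json

# Illustrative path; in the repository these snapshots live alongside the integration tests.
with open("snapshot.json") as f:
    snapshot = json.load(f)

details = snapshot["details"]
print(details["finish_reason"], details["generated_tokens"])  # length 10

# Prefill entries describe the prompt tokens; the very first one has no logprob.
for tok in details["prefill"]:
    print(tok["id"], repr(tok["text"]), tok["logprob"])

# For this snapshot the generated tokens concatenate to `generated_text`.
generated = "".join(t["text"] for t in details["tokens"])
print(generated == snapshot["generated_text"])  # True
```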
@@ -0,0 +1,89 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 3735,
"logprob": -12.9140625,
"text": "Test"
},
{
"id": 2159,
"logprob": -10.7578125,
"text": "request"
}
],
"seed": 0,
"tokens": [
{
"id": 28747,
"logprob": 0.0,
"special": false,
"text": ":"
},
{
"id": 3169,
"logprob": -0.1307373,
"special": false,
"text": " Let"
},
{
"id": 332,
"logprob": -2.3359375,
"special": false,
"text": " u"
},
{
"id": 347,
"logprob": 0.0,
"special": false,
"text": " be"
},
{
"id": 325,
"logprob": -1.0234375,
"special": false,
"text": " ("
},
{
"id": 28734,
"logprob": -2.0292969,
"special": false,
"text": "0"
},
{
"id": 648,
"logprob": -1.0439453,
"special": false,
"text": " +"
},
{
"id": 28705,
"logprob": -0.24499512,
"special": false,
"text": " "
},
{
"id": 28770,
"logprob": -0.5073242,
"special": false,
"text": "3"
},
{
"id": 387,
"logprob": -1.5507812,
"special": false,
"text": " -"
}
],
"top_tokens": null
},
"generated_text": "Test request: Let u be (0 + 3 -"
}