diff --git a/.github/workflows/trufflehog.yml b/.github/workflows/trufflehog.yml index 8bc60eff6ff..b406d43b8f0 100644 --- a/.github/workflows/trufflehog.yml +++ b/.github/workflows/trufflehog.yml @@ -16,4 +16,3 @@ jobs: fetch-depth: 0 - name: Secret Scanning uses: trufflesecurity/trufflehog@main - diff --git a/integration-tests/models/__snapshots__/test_grammar_response_format_llama/test_grammar_response_format_llama_json.json b/integration-tests/models/__snapshots__/test_grammar_response_format_llama/test_grammar_response_format_llama_json.json new file mode 100644 index 00000000000..8339083215b --- /dev/null +++ b/integration-tests/models/__snapshots__/test_grammar_response_format_llama/test_grammar_response_format_llama_json.json @@ -0,0 +1,23 @@ +{ + "choices": [ + { + "finish_reason": "eos_token", + "index": 0, + "logprobs": null, + "message": { + "content": "{\n \"temperature\": [\n 35,\n 34,\n 36\n ],\n \"unit\": \"°c\"\n}", + "role": "assistant" + } + } + ], + "created": 1718044128, + "id": "", + "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "object": "text_completion", + "system_fingerprint": "2.0.5-dev0-native", + "usage": { + "completion_tokens": 39, + "prompt_tokens": 136, + "total_tokens": 175 + } +} diff --git a/integration-tests/models/test_grammar_response_format_llama.py b/integration-tests/models/test_grammar_response_format_llama.py new file mode 100644 index 00000000000..9c4c048e81c --- /dev/null +++ b/integration-tests/models/test_grammar_response_format_llama.py @@ -0,0 +1,101 @@ +import pytest +import requests +from pydantic import BaseModel +from typing import List + + +@pytest.fixture(scope="module") +def llama_grammar_handle(launcher): + with launcher( + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + num_shard=1, + disable_grammar_support=False, + use_flash_attention=False, + max_batch_prefill_tokens=3000, + ) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def llama_grammar(llama_grammar_handle): + await llama_grammar_handle.health(300) + return llama_grammar_handle.client + + +@pytest.mark.asyncio +async def test_grammar_response_format_llama_json(llama_grammar, response_snapshot): + + class Weather(BaseModel): + unit: str + temperature: List[int] + + # send the request + response = requests.post( + f"{llama_grammar.base_url}/v1/chat/completions", + headers=llama_grammar.headers, + json={ + "model": "tgi", + "messages": [ + { + "role": "system", + "content": f"Respond to the users questions and answer them in the following format: {Weather.schema()}", + }, + { + "role": "user", + "content": "What's the weather like the next 3 days in San Francisco, CA?", + }, + ], + "seed": 42, + "max_tokens": 500, + "response_format": {"type": "json_object", "value": Weather.schema()}, + }, + ) + + chat_completion = response.json() + called = chat_completion["choices"][0]["message"]["content"] + + assert response.status_code == 200 + assert ( + called + == '{\n "temperature": [\n 35,\n 34,\n 36\n ],\n "unit": "°c"\n}' + ) + assert chat_completion == response_snapshot + + +@pytest.mark.asyncio +async def test_grammar_response_format_llama_error_if_tools_not_installed( + llama_grammar, +): + class Weather(BaseModel): + unit: str + temperature: List[int] + + # send the request + response = requests.post( + f"{llama_grammar.base_url}/v1/chat/completions", + headers=llama_grammar.headers, + json={ + "model": "tgi", + "messages": [ + { + "role": "system", + "content": f"Respond to the users questions and answer them in the following format: {Weather.schema()}", + }, + { + "role": "user", + "content": "What's the weather like the next 3 days in San Francisco, CA?", + }, + ], + "seed": 42, + "max_tokens": 500, + "tools": [], + "response_format": {"type": "json_object", "value": Weather.schema()}, + }, + ) + + # 422 means the server was unable to process the request because it contains invalid data. + assert response.status_code == 422 + assert response.json() == { + "error": "Grammar and tools are mutually exclusive", + "error_type": "grammar and tools", + } diff --git a/router/src/lib.rs b/router/src/lib.rs index b6902c497c1..1016019d92d 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -89,6 +89,7 @@ pub(crate) enum GrammarType { /// JSON Schema is a declarative language that allows to annotate JSON documents /// with types and descriptions. #[serde(rename = "json")] + #[serde(alias = "json_object")] #[schema(example = json ! ({"properties": {"location":{"type": "string"}}}))] Json(serde_json::Value), #[serde(rename = "regex")] @@ -791,6 +792,13 @@ pub(crate) struct ChatRequest { #[schema(nullable = true, example = "null")] #[serde(deserialize_with = "deserialize_tool_choice::deserialize")] pub tool_choice: Option, + + /// Response format constraints for the generation. + /// + /// NOTE: A request can use `response_format` OR `tools` but not both. + #[serde(default)] + #[schema(nullable = true, default = "null", example = "null")] + pub response_format: Option, } fn default_tool_prompt() -> Option { diff --git a/router/src/server.rs b/router/src/server.rs index 30479b0e4dc..e3c2c4f95e3 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1016,6 +1016,7 @@ async fn chat_completions( tool_choice, tool_prompt, temperature, + response_format, .. } = req; @@ -1030,6 +1031,18 @@ async fn chat_completions( other => (true, other), }; + // response_format and tools are mutually exclusive + if response_format.is_some() && tools.as_ref().is_some() { + metrics::increment_counter!("tgi_request_failure", "err" => "validation"); + return Err(( + StatusCode::UNPROCESSABLE_ENTITY, + Json(ErrorResponse { + error: "Grammar and tools are mutually exclusive".to_string(), + error_type: "grammar and tools".to_string(), + }), + )); + } + // extract tool grammar if present let tool_grammar = match ToolGrammar::apply(tools, tool_choice) { Ok(grammar) => grammar, @@ -1046,16 +1059,21 @@ async fn chat_completions( } }; - let grammar_with_prompt = tool_grammar + // determine the appropriate arguments for apply_chat_template + let tools_grammar_prompt = tool_grammar .as_ref() .map(|t| (GrammarType::Json(serde_json::json!(t)), tool_prompt)); - let typed_grammar = grammar_with_prompt - .as_ref() - .map(|(grammar, _)| grammar.clone()); + let (tools_grammar_prompt, grammar) = match response_format { + Some(response_format) => (None, Some(response_format)), + None => ( + tools_grammar_prompt.clone(), + tools_grammar_prompt.map(|(grammar, _)| grammar.clone()), + ), + }; // apply chat template to flatten the request into a single input - let inputs = match infer.apply_chat_template(messages, grammar_with_prompt) { + let inputs = match infer.apply_chat_template(messages, tools_grammar_prompt) { Ok(inputs) => inputs, Err(err) => { metrics::increment_counter!("tgi_request_failure", "err" => "validation"); @@ -1091,7 +1109,7 @@ async fn chat_completions( decoder_input_details: !stream, seed, top_n_tokens: req.top_logprobs, - grammar: typed_grammar, + grammar, }, };