Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mllama #2568

Closed
wants to merge 9 commits into from
Closed

Mllama #2568

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 7 additions & 1 deletion clients/python/text_generation/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,17 @@ class ToolCall(BaseModel):
function: dict


class Chunk(BaseModel):
    """One element of a multimodal message content list (text or image)."""

    # Chunk discriminator; the server expects "text" or "image_url"
    # (see the test payloads elsewhere in this PR).
    type: str
    # Text payload; only set when type == "text".
    text: Optional[str] = None
    # Image payload; presumably a {"url": ...} mapping when type == "image_url"
    # — typed Any here, so the server-side schema is authoritative. TODO confirm.
    image_url: Any = None


class Message(BaseModel):
# Role of the message sender
role: str
# Content of the message
content: Optional[str] = None
content: Optional[Union[str, List[Chunk]]] = None
# Optional name of the message sender
name: Optional[str] = None
# Tool calls associated with the chat completion
Expand Down
12 changes: 6 additions & 6 deletions flake.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
[
{
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"message": {
"content": "In a small village, a rooster named Cluck Norris ruled the coop with an iron beak",
"name": null,
"role": "assistant",
"tool_calls": null
},
"usage": null
}
],
"created": 1727097740,
"id": "",
"model": "s0409/model-3",
"object": "chat.completion",
"system_fingerprint": "2.2.1-dev0-native",
"usage": {
"completion_tokens": 20,
"prompt_tokens": 24,
"total_tokens": 44
}
},
{
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"message": {
"content": "In a small village, a rooster named Cluck Norris ruled the coop with an iron beak",
"name": null,
"role": "assistant",
"tool_calls": null
},
"usage": null
}
],
"created": 1727097740,
"id": "",
"model": "s0409/model-3",
"object": "chat.completion",
"system_fingerprint": "2.2.1-dev0-native",
"usage": {
"completion_tokens": 20,
"prompt_tokens": 24,
"total_tokens": 44
}
},
{
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"message": {
"content": "In a small village, a rooster named Cluck Norris ruled the coop with an iron beak",
"name": null,
"role": "assistant",
"tool_calls": null
},
"usage": null
}
],
"created": 1727097740,
"id": "",
"model": "s0409/model-3",
"object": "chat.completion",
"system_fingerprint": "2.2.1-dev0-native",
"usage": {
"completion_tokens": 20,
"prompt_tokens": 24,
"total_tokens": 44
}
},
{
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"message": {
"content": "In a small village, a rooster named Cluck Norris ruled the coop with an iron beak",
"name": null,
"role": "assistant",
"tool_calls": null
},
"usage": null
}
],
"created": 1727097740,
"id": "",
"model": "s0409/model-3",
"object": "chat.completion",
"system_fingerprint": "2.2.1-dev0-native",
"usage": {
"completion_tokens": 20,
"prompt_tokens": 24,
"total_tokens": 44
}
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"message": {
"content": "In a small village, a rooster named Cluck Norris ruled the coop with an iron beak",
"name": null,
"role": "assistant",
"tool_calls": null
},
"usage": null
}
],
"created": 1727090615,
"id": "",
"model": "s0409/model-3",
"object": "chat.completion",
"system_fingerprint": "2.2.1-dev0-native",
"usage": {
"completion_tokens": 20,
"prompt_tokens": 24,
"total_tokens": 44
}
}
108 changes: 108 additions & 0 deletions integration-tests/models/test_mllama.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import pytest
import base64
import asyncio


@pytest.fixture(scope="module")
def mllama_handle(launcher):
    """Module-scoped launcher handle for the mllama model under test."""
    # Keep the launcher context alive for the whole module; tear down on exit.
    launcher_ctx = launcher("s0409/model-3", num_shard=2)
    with launcher_ctx as handle:
        yield handle


@pytest.fixture(scope="module")
async def mllama(mllama_handle):
    """Client for the launched server, returned only once it reports healthy."""
    # Block (up to 300s) until the server's health endpoint responds.
    await mllama_handle.health(300)
    client = mllama_handle.client
    return client


# TODO fix the server parser to count inline image tokens correctly
def get_chicken():
    """Return the chicken_on_money test image encoded as a base64 data URI."""
    image_path = "integration-tests/images/chicken_on_money.png"
    with open(image_path, "rb") as image_file:
        payload = base64.b64encode(image_file.read()).decode("utf-8")
    return "data:image/png;base64," + payload


def get_cow_beach():
    """Return the cow_beach test image encoded as a base64 data URI."""
    image_path = "integration-tests/images/cow_beach.png"
    with open(image_path, "rb") as image_file:
        payload = base64.b64encode(image_file.read()).decode("utf-8")
    return "data:image/png;base64," + payload


@pytest.mark.asyncio
async def test_mllama_simpl(mllama, response_snapshot):
    """Single chat completion with a text chunk plus an image_url chunk.

    Checks the exact token usage, the generated text, and the stored snapshot.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Can you tell me a very short story based on the image?",
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://raw.githubusercontent.com/huggingface/text-generation-inference/main/integration-tests/images/chicken_on_money.png"
                    },
                },
            ],
        },
    ]
    response = await mllama.chat(
        max_tokens=20,
        temperature=0.0,
        messages=messages,
    )

    expected_usage = {
        "completion_tokens": 20,
        "prompt_tokens": 24,
        "total_tokens": 44,
    }
    assert response.usage == expected_usage
    expected_text = (
        "In a small village, a rooster named Cluck Norris ruled the coop with an iron beak"
    )
    assert response.choices[0].message.content == expected_text
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_mllama_load(mllama, generate_load, response_snapshot):
    """Fire 4 identical chat requests concurrently and verify deterministic output.

    Bug fix: the original check was written as
    ``assert generated_texts, all([...])`` — Python parses ``all(...)`` as the
    assertion *message*, so the all-equal comparison was never actually
    enforced. It is now a real assertion.
    """
    # NOTE(review): `generate_load` is not used in the body; kept in the
    # signature so the fixture (and any side effects it has) still runs.
    futures = [
        mllama.chat(
            max_tokens=20,
            temperature=0.0,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Can you tell me a very short story based on the image?",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": "https://raw.githubusercontent.com/huggingface/text-generation-inference/main/integration-tests/images/chicken_on_money.png"
                            },
                        },
                    ],
                },
            ],
        )
        for _ in range(4)
    ]
    responses = await asyncio.gather(*futures)

    generated_texts = [response.choices[0].message.content for response in responses]

    assert len(generated_texts) == 4
    assert (
        generated_texts[0]
        == "In a small village, a rooster named Cluck Norris ruled the coop with an iron beak"
    )
    # temperature=0.0 should make every concurrent response identical.
    assert all(text == generated_texts[0] for text in generated_texts)

    assert responses == response_snapshot
1 change: 1 addition & 0 deletions router/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ pub enum Config {
ClipVisionModel(ClipVisionModel),
Mistral,
Idefics,
Mllama,
Idefics2(Idefics2),
Ssm,
GptBigcode,
Expand Down
3 changes: 2 additions & 1 deletion router/src/validation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,7 @@ fn image_tokens(
use HubPreprocessorConfig::*;
match config {
Idefics => "<image>".to_string(),
Mllama => "<|image|>".to_string(),
Idefics2(config) => {
const FAKE: &str = "<fake_token_around_image>";
const IMAGE: &str = "<image>";
Expand Down Expand Up @@ -618,7 +619,7 @@ fn prepare_input(
use Config::*;
static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
let (tokenizer_query, input_chunks) = match config {
Some(config @ (Idefics | Idefics2(_) | Paligemma(_) | LlavaNext(_))) => {
Some(config @ (Idefics | Mllama | Idefics2(_) | Paligemma(_) | LlavaNext(_))) => {
let mut input_chunks = Vec::new();
let mut tokenizer_query = String::with_capacity(inputs.len());
let mut start = 0;
Expand Down
Loading
Loading