Deepseek V2 is a MoE model from Deepseek. Relevant variations compared to other models:

- Grouped top-K in expert selection (see the sketch below).
- mscale in yarn is calculated using the `mscale` and `mscale_all_dim` configuration options.
- `mscale_all_dim` is also used in scaling attention softmax.
- Permuting of the query/key representations before applying rotary embeddings.
- Some projections cannot be sharded (`q_a_proj`, `kv_a_proj_with_mqa`), so we need weight loading that supports quantized weights. To this end, `{Weights,WeightLoader}.get_weight` was added.
- The query/key head dimensionality differs from that of the value, so we need to pad during attention.
- Heads of size 192 need an extension to our paged attention fork, and we need to ensure that the KV cache is allocated with the correct size.
- Shared experts.
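To make the first point concrete, here is a minimal PyTorch sketch of group-limited top-K routing, written from the description above rather than taken from this commit; the function name `grouped_topk` and the parameter names `n_group` and `topk_group` are assumptions, and the actual implementation in the model file may differ.

```python
import torch


def grouped_topk(
    scores: torch.Tensor,  # [num_tokens, n_experts] router scores
    n_group: int,          # number of expert groups (assumed config name)
    topk_group: int,       # groups a token may draw experts from (assumed)
    top_k: int,            # experts selected per token
):
    # Sketch only: rank groups by their best expert, mask out experts in
    # the losing groups, then take an ordinary top-k over what remains.
    num_tokens, n_experts = scores.shape

    # Score each group by the maximum expert score it contains.
    group_scores = scores.view(num_tokens, n_group, -1).max(dim=-1).values

    # Keep only the best `topk_group` groups per token.
    group_idx = torch.topk(group_scores, k=topk_group, dim=-1).indices
    group_mask = torch.zeros_like(group_scores)
    group_mask.scatter_(1, group_idx, 1.0)

    # Broadcast the group mask back to per-expert granularity.
    expert_mask = (
        group_mask.unsqueeze(-1)
        .expand(num_tokens, n_group, n_experts // n_group)
        .reshape(num_tokens, n_experts)
    )
    masked_scores = scores.masked_fill(expert_mask == 0, float("-inf"))

    # Plain top-k over the surviving experts.
    weights, indices = torch.topk(masked_scores, k=top_k, dim=-1)
    return weights, indices
```

The idea is that each token may only draw its `top_k` experts from the `topk_group` groups whose best expert scored highest, bounding how many expert groups a single token can touch.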
Showing 14 changed files with 1,826 additions and 41 deletions.
integration-tests/models/__snapshots__/test_flash_deepseek_v2/test_flash_deepseek_v2.json (89 additions, 0 deletions)
@@ -0,0 +1,89 @@
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      {
        "id": 100000,
        "logprob": null,
        "text": "<|begin▁of▁sentence|>"
      },
      {
        "id": 3533,
        "logprob": -9.625,
        "text": "Test"
      },
      {
        "id": 3102,
        "logprob": -11.1875,
        "text": " request"
      }
    ],
    "seed": null,
    "tokens": [
      {
        "id": 185,
        "logprob": -1.5546875,
        "special": false,
        "text": "\n"
      },
      {
        "id": 549,
        "logprob": -2.84375,
        "special": false,
        "text": "The"
      },
      {
        "id": 1727,
        "logprob": -2.34375,
        "special": false,
        "text": " test"
      },
      {
        "id": 3102,
        "logprob": -0.8359375,
        "special": false,
        "text": " request"
      },
      {
        "id": 317,
        "logprob": -1.0859375,
        "special": false,
        "text": " is"
      },
      {
        "id": 254,
        "logprob": -1.5390625,
        "special": false,
        "text": " the"
      },
      {
        "id": 1022,
        "logprob": -1.1875,
        "special": false,
        "text": " first"
      },
      {
        "id": 3458,
        "logprob": -0.35546875,
        "special": false,
        "text": " step"
      },
      {
        "id": 279,
        "logprob": -0.8828125,
        "special": false,
        "text": " in"
      },
      {
        "id": 254,
        "logprob": -0.71484375,
        "special": false,
        "text": " the"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\nThe test request is the first step in the"
}
...-tests/models/__snapshots__/test_flash_deepseek_v2/test_flash_deepseek_v2_all_params.json (89 additions, 0 deletions)
@@ -0,0 +1,89 @@
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      {
        "id": 100000,
        "logprob": null,
        "text": "<|begin▁of▁sentence|>"
      },
      {
        "id": 3533,
        "logprob": -9.625,
        "text": "Test"
      },
      {
        "id": 3102,
        "logprob": -11.1875,
        "text": " request"
      }
    ],
    "seed": 0,
    "tokens": [
      {
        "id": 2143,
        "logprob": -1.828125,
        "special": false,
        "text": " sent"
      },
      {
        "id": 10081,
        "logprob": -0.36914062,
        "special": false,
        "text": " successfully"
      },
      {
        "id": 13,
        "logprob": 0.0,
        "special": false,
        "text": "."
      },
      {
        "id": 185,
        "logprob": 0.0,
        "special": false,
        "text": "\n"
      },
      {
        "id": 1380,
        "logprob": -0.38671875,
        "special": false,
        "text": "We"
      },
      {
        "id": 543,
        "logprob": -0.12695312,
        "special": false,
        "text": " will"
      },
      {
        "id": 752,
        "logprob": -0.20117188,
        "special": false,
        "text": " get"
      },
      {
        "id": 279,
        "logprob": 0.0,
        "special": false,
        "text": " in"
      },
      {
        "id": 5402,
        "logprob": 0.0,
        "special": false,
        "text": " touch"
      },
      {
        "id": 366,
        "logprob": 0.0,
        "special": false,
        "text": " with"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "Test request sent successfully.\nWe will get in touch with"
}