Skip to content

Commit

Permalink
ultravox
Browse files Browse the repository at this point in the history
Signed-off-by: Roger Wang <[email protected]>
  • Loading branch information
ywang96 committed Jan 4, 2025
1 parent 3d60be0 commit 0a1d1e2
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 7 deletions.
2 changes: 1 addition & 1 deletion docs/source/models/supported_models.md
Original file line number Diff line number Diff line change
Expand Up @@ -724,7 +724,7 @@ See [this page](#generative-models) for more information on how to use generativ
- `fixie-ai/ultravox-v0_3`
-
- ✅︎
-
- ✅︎
```

<sup>E</sup> Pre-computed embeddings can be inputted for this modality.
Expand Down
18 changes: 12 additions & 6 deletions vllm/model_executor/models/ultravox.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py
"""PyTorch Ultravox model."""

import math
import os
from functools import cached_property
from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
TypedDict, Union)
Expand Down Expand Up @@ -36,8 +36,10 @@
from .interfaces import SupportsMultiModal, SupportsPP
from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
init_vllm_registered_model, maybe_prefix,
merge_multimodal_embeddings,
merge_multimodal_embeddings_from_map)

_AUDIO_PLACEHOLDER_TOKEN = 128002
_AUDIO_TOKENS_PER_SECOND = 6.25


Expand Down Expand Up @@ -449,11 +451,15 @@ def get_input_embeddings(
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:

# TODO(ywang96): use merge_multimodal_embeddings after
# v0 is deprecated
merge_multimodal_embeddings_from_map(
inputs_embeds, multimodal_embeddings,
attn_metadata.multi_modal_placeholder_index_maps["audio"])
# TODO(ywang96): remove this block after v0 is deprecated.
if os.environ.get("VLLM_USE_V1") == "0":
merge_multimodal_embeddings_from_map(
inputs_embeds, multimodal_embeddings,
attn_metadata.multi_modal_placeholder_index_maps["audio"])
else:
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
_AUDIO_PLACEHOLDER_TOKEN)
return inputs_embeds

def forward(self,
Expand Down

0 comments on commit 0a1d1e2

Please sign in to comment.