Skip to content

Commit

Permalink
feat: expose user_speech_committed and agent_speech_committed for multimodal agents (#1001)
Browse files Browse the repository at this point in the history
  • Loading branch information
longcw authored Oct 30, 2024
1 parent dcef5f1 commit 8404cf3
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 6 deletions.
5 changes: 5 additions & 0 deletions .changeset/proud-wombats-perform.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"livekit-agents": patch
---

expose transcriptions for multimodal agents
14 changes: 9 additions & 5 deletions livekit-agents/livekit/agents/multimodal/agent_playout.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,11 +165,15 @@ async def _capture_task():

await utils.aio.gracefully_cancel(read_text_task)

if not first_frame:
if not handle.interrupted:
handle._tr_fwd.segment_playout_finished()
# make sure the text_data.sentence_stream is closed
handle._tr_fwd.mark_text_segment_end()

self.emit("playout_stopped", handle.interrupted)
if not first_frame and not handle.interrupted:
handle._tr_fwd.segment_playout_finished()

handle._done_fut.set_result(None)
await handle._tr_fwd.aclose()
handle._done_fut.set_result(None)

# emit playout_stopped after the transcription forwarder has been closed
if not first_frame:
self.emit("playout_stopped", handle.interrupted)
34 changes: 33 additions & 1 deletion livekit-agents/livekit/agents/multimodal/multimodal_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import aiohttp
from livekit import rtc
from livekit.agents import llm, stt, tokenize, transcription, utils, vad
from livekit.agents.llm import ChatMessage

from .._constants import ATTRIBUTE_AGENT_STATE
from .._types import AgentState
Expand All @@ -18,6 +19,9 @@
"user_stopped_speaking",
"agent_started_speaking",
"agent_stopped_speaking",
"user_speech_committed",
"agent_speech_committed",
"agent_speech_interrupted",
]


Expand Down Expand Up @@ -172,9 +176,16 @@ def _input_speech_transcription_completed(
alternatives=[stt.SpeechData(language="", text=ev.transcript)],
)
)
user_msg = ChatMessage.create(text=ev.transcript, role="user")
self.emit("user_speech_committed", user_msg)
logger.debug(
"committed user speech",
extra={"user_transcript": ev.transcript},
)

@self._session.on("input_speech_started")
def _input_speech_started():
self.emit("user_started_speaking")
self._update_state("listening")
if self._playing_handle is not None and not self._playing_handle.done():
self._playing_handle.interrupt()
Expand All @@ -185,7 +196,9 @@ def _input_speech_started():
audio_end_ms=int(self._playing_handle.audio_samples / 24000 * 1000),
)

self._playing_handle = None
@self._session.on("input_speech_stopped")
def _input_speech_stopped():
self.emit("user_stopped_speaking")

def _update_state(self, state: AgentState, delay: float = 0.0):
"""Set the current state of the agent"""
Expand Down Expand Up @@ -220,6 +233,25 @@ def _on_playout_stopped(interrupted: bool) -> None:
self.emit("agent_stopped_speaking")
self._update_state("listening")

if self._playing_handle is not None:
collected_text = self._playing_handle._tr_fwd.played_text
if interrupted:
collected_text += "..."

msg = ChatMessage.create(text=collected_text, role="assistant")
if interrupted:
self.emit("agent_speech_interrupted", msg)
else:
self.emit("agent_speech_committed", msg)

logger.debug(
"committed agent speech",
extra={
"agent_transcript": collected_text,
"interrupted": interrupted,
},
)

self._agent_playout.on("playout_started", _on_playout_started)
self._agent_playout.on("playout_stopped", _on_playout_stopped)

Expand Down

0 comments on commit 8404cf3

Please sign in to comment.