feat: expose user_speech_committed and agent_speech_committed for multimodal agents (#1001)
livekit · Oct 30, 2024 · 8404cf3 · 8404cf3
1 parent dcef5f1
commit 8404cf3
Show file tree

Hide file tree

Showing 3 changed files with 47 additions and 6 deletions.
diff --git a/.changeset/proud-wombats-perform.md b/.changeset/proud-wombats-perform.md
@@ -0,0 +1,5 @@
+---
+"livekit-agents": patch
+---
+
+expose transcriptions for multimodal agents
diff --git a/livekit-agents/livekit/agents/multimodal/agent_playout.py b/livekit-agents/livekit/agents/multimodal/agent_playout.py
@@ -165,11 +165,15 @@ async def _capture_task():
 
             await utils.aio.gracefully_cancel(read_text_task)
 
-            if not first_frame:
-                if not handle.interrupted:
-                    handle._tr_fwd.segment_playout_finished()
+            # make sure the text_data.sentence_stream is closed
+            handle._tr_fwd.mark_text_segment_end()
 
-                self.emit("playout_stopped", handle.interrupted)
+            if not first_frame and not handle.interrupted:
+                handle._tr_fwd.segment_playout_finished()
 
-            handle._done_fut.set_result(None)
             await handle._tr_fwd.aclose()
+            handle._done_fut.set_result(None)
+
+            # emit playout_stopped after the transcription forwarder has been closed
+            if not first_frame:
+                self.emit("playout_stopped", handle.interrupted)
diff --git a/livekit-agents/livekit/agents/multimodal/multimodal_agent.py b/livekit-agents/livekit/agents/multimodal/multimodal_agent.py
@@ -7,6 +7,7 @@
 import aiohttp
 from livekit import rtc
 from livekit.agents import llm, stt, tokenize, transcription, utils, vad
+from livekit.agents.llm import ChatMessage
 
 from .._constants import ATTRIBUTE_AGENT_STATE
 from .._types import AgentState
@@ -18,6 +19,9 @@
     "user_stopped_speaking",
     "agent_started_speaking",
     "agent_stopped_speaking",
+    "user_speech_committed",
+    "agent_speech_committed",
+    "agent_speech_interrupted",
 ]
 
 
@@ -172,9 +176,16 @@ def _input_speech_transcription_completed(
                     alternatives=[stt.SpeechData(language="", text=ev.transcript)],
                 )
             )
+            user_msg = ChatMessage.create(text=ev.transcript, role="user")
+            self.emit("user_speech_committed", user_msg)
+            logger.debug(
+                "committed user speech",
+                extra={"user_transcript": ev.transcript},
+            )
 
         @self._session.on("input_speech_started")
         def _input_speech_started():
+            self.emit("user_started_speaking")
             self._update_state("listening")
             if self._playing_handle is not None and not self._playing_handle.done():
                 self._playing_handle.interrupt()
@@ -185,7 +196,9 @@ def _input_speech_started():
                     audio_end_ms=int(self._playing_handle.audio_samples / 24000 * 1000),
                 )
 
-                self._playing_handle = None
+        @self._session.on("input_speech_stopped")
+        def _input_speech_stopped():
+            self.emit("user_stopped_speaking")
 
     def _update_state(self, state: AgentState, delay: float = 0.0):
         """Set the current state of the agent"""
@@ -220,6 +233,25 @@ def _on_playout_stopped(interrupted: bool) -> None:
             self.emit("agent_stopped_speaking")
             self._update_state("listening")
 
+            if self._playing_handle is not None:
+                collected_text = self._playing_handle._tr_fwd.played_text
+                if interrupted:
+                    collected_text += "..."
+
+                msg = ChatMessage.create(text=collected_text, role="assistant")
+                if interrupted:
+                    self.emit("agent_speech_interrupted", msg)
+                else:
+                    self.emit("agent_speech_committed", msg)
+
+                logger.debug(
+                    "committed agent speech",
+                    extra={
+                        "agent_transcript": collected_text,
+                        "interrupted": interrupted,
+                    },
+                )
+
         self._agent_playout.on("playout_started", _on_playout_started)
         self._agent_playout.on("playout_stopped", _on_playout_stopped)