Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 15 additions & 5 deletions examples/voice_agents/timed_agent_transcript.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@

from dotenv import load_dotenv

from livekit.agents import Agent, AgentServer, AgentSession, JobContext, cli, inference
from livekit.agents import Agent, AgentServer, AgentSession, JobContext, cli, inference, room_io
from livekit.agents.types import TimedString
from livekit.agents.voice.agent import ModelSettings
from livekit.agents.voice.io import TimedString
from livekit.plugins import silero
from livekit.plugins import cartesia, silero

logger = logging.getLogger("my-worker")
logger.setLevel(logging.INFO)
Expand Down Expand Up @@ -43,13 +43,23 @@ async def entrypoint(ctx: JobContext):
session = AgentSession(
stt=inference.STT("deepgram/nova-3"),
llm=inference.LLM("google/gemini-2.5-flash"),
tts=inference.TTS("cartesia/sonic-3"),
tts=cartesia.TTS(),
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does inference not support this? If not, we should let the team know.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do have these options to enable timestamps in TTS inference (added in #4949), but it seems no timestamps are returned when the option is enabled. Will forward this to the team.

vad=silero.VAD.load(),
# enable TTS-aligned transcript, can be configured at the Agent level as well
use_tts_aligned_transcript=True,
)

await session.start(agent=MyAgent(), room=ctx.room)
await session.start(
agent=MyAgent(),
room=ctx.room,
room_options=room_io.RoomOptions(
text_output=room_io.TextOutputOptions(
# Optional: get the timed transcript from the `lk.transcription` datastream topic as JSON dict
json_format=True,
sync_transcription=False,
)
),
)

session.generate_reply(instructions="say hello to the user")

Expand Down
15 changes: 15 additions & 0 deletions livekit-agents/livekit/agents/voice/room_io/_output.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import asyncio
import json
import time

from livekit import rtc
Expand All @@ -13,6 +14,7 @@
ATTRIBUTE_TRANSCRIPTION_SEGMENT_ID,
ATTRIBUTE_TRANSCRIPTION_TRACK_ID,
TOPIC_TRANSCRIPTION,
TimedString,
)
from .. import io
from ..transcription import find_micro_track_id
Expand Down Expand Up @@ -365,13 +367,15 @@ def __init__(
is_delta_stream: bool = True,
participant: rtc.Participant | str | None = None,
attributes: dict[str, str] | None = None,
json_format: bool = False,
):
self._room, self._is_delta_stream = room, is_delta_stream
self._track_id: str | None = None
self._participant_identity: str | None = None
self._additional_attributes = attributes or {}

self._writer: rtc.TextStreamWriter | None = None
self._json_format = json_format

self._room.on("track_published", self._on_track_published)
self._room.on("local_track_published", self._on_local_track_published)
Expand Down Expand Up @@ -440,6 +444,15 @@ async def capture_text(self, text: str) -> None:
self._reset_state()
self._capturing = True

if self._json_format:
text_dict = {"text": str(text)}
if isinstance(text, TimedString):
if utils.is_given(text.start_time):
text_dict["start_time"] = text.start_time
if utils.is_given(text.end_time):
text_dict["end_time"] = text.end_time
text = json.dumps(text_dict) + "\n"

self._latest_text = text

try:
Expand Down Expand Up @@ -530,6 +543,7 @@ def __init__(
is_delta_stream: bool = True,
participant: rtc.Participant | str | None = None,
next_in_chain: io.TextOutput | None = None,
json_format: bool = False,
) -> None:
super().__init__(label="RoomIO", next_in_chain=next_in_chain)

Expand All @@ -545,6 +559,7 @@ def __init__(
room=room,
is_delta_stream=is_delta_stream,
participant=participant,
json_format=json_format,
),
]
self.__closed = False
Expand Down
1 change: 1 addition & 0 deletions livekit-agents/livekit/agents/voice/room_io/room_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ async def start(self) -> None:
is_delta_stream=True,
participant=None,
next_in_chain=output_text_options.next_in_chain,
json_format=output_text_options.json_format,
)

# use the RoomIO's audio output if available, otherwise use the agent's audio output
Expand Down
2 changes: 2 additions & 0 deletions livekit-agents/livekit/agents/voice/room_io/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ class TextOutputOptions:
Only effective if `sync_transcription` is True."""
next_in_chain: TextOutput | None = None
"""The next text output in the chain for the agent. If provided, the agent's transcription will be passed to it."""
json_format: bool = False
"""Send the transcription as JSON dict for each chunk, including start and end timestamps if it's a TimedString."""


@dataclass
Expand Down
Loading