Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 15 additions & 5 deletions examples/voice_agents/timed_agent_transcript.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@

from dotenv import load_dotenv

from livekit.agents import Agent, AgentServer, AgentSession, JobContext, cli, inference
from livekit.agents import Agent, AgentServer, AgentSession, JobContext, cli, inference, room_io
from livekit.agents.types import TimedString
from livekit.agents.voice.agent import ModelSettings
from livekit.agents.voice.io import TimedString
from livekit.plugins import silero
from livekit.plugins import cartesia, silero

logger = logging.getLogger("my-worker")
logger.setLevel(logging.INFO)
Expand Down Expand Up @@ -43,13 +43,23 @@ async def entrypoint(ctx: JobContext):
session = AgentSession(
stt=inference.STT("deepgram/nova-3"),
llm=inference.LLM("google/gemini-2.5-flash"),
tts=inference.TTS("cartesia/sonic-3"),
tts=cartesia.TTS(),
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does inference not support this? If not, we should let the team know.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do have these options to enable timestamps in TTS inference (added in #4949), but it seems no timestamps are returned when the option is enabled. Will forward this to the team.

vad=silero.VAD.load(),
# enable TTS-aligned transcript, can be configured at the Agent level as well
use_tts_aligned_transcript=True,
)

await session.start(agent=MyAgent(), room=ctx.room)
await session.start(
agent=MyAgent(),
room=ctx.room,
room_options=room_io.RoomOptions(
text_output=room_io.TextOutputOptions(
# Optional: get the timed transcript from the `lk.transcription` datastream topic as JSON dict
json_format=True,
sync_transcription=False,
)
),
)

session.generate_reply(instructions="say hello to the user")

Expand Down
15 changes: 15 additions & 0 deletions livekit-agents/livekit/agents/voice/room_io/_output.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import asyncio
import json
import time

from livekit import rtc
Expand All @@ -13,6 +14,7 @@
ATTRIBUTE_TRANSCRIPTION_SEGMENT_ID,
ATTRIBUTE_TRANSCRIPTION_TRACK_ID,
TOPIC_TRANSCRIPTION,
TimedString,
)
from .. import io
from ..transcription import find_micro_track_id
Expand Down Expand Up @@ -365,13 +367,15 @@ def __init__(
is_delta_stream: bool = True,
participant: rtc.Participant | str | None = None,
attributes: dict[str, str] | None = None,
json_format: bool = False,
):
self._room, self._is_delta_stream = room, is_delta_stream
self._track_id: str | None = None
self._participant_identity: str | None = None
self._additional_attributes = attributes or {}

self._writer: rtc.TextStreamWriter | None = None
self._json_format = json_format

self._room.on("track_published", self._on_track_published)
self._room.on("local_track_published", self._on_local_track_published)
Expand Down Expand Up @@ -440,6 +444,15 @@ async def capture_text(self, text: str) -> None:
self._reset_state()
self._capturing = True

if self._json_format:
text_dict = {"text": str(text)}
if isinstance(text, TimedString):
if utils.is_given(text.start_time):
text_dict["start_time"] = text.start_time
if utils.is_given(text.end_time):
text_dict["end_time"] = text.end_time
text = json.dumps(text_dict) + "\n"

self._latest_text = text

try:
Expand Down Expand Up @@ -530,6 +543,7 @@ def __init__(
is_delta_stream: bool = True,
participant: rtc.Participant | str | None = None,
next_in_chain: io.TextOutput | None = None,
json_format: bool = False,
) -> None:
super().__init__(label="RoomIO", next_in_chain=next_in_chain)

Expand All @@ -545,6 +559,7 @@ def __init__(
room=room,
is_delta_stream=is_delta_stream,
participant=participant,
json_format=json_format,
),
]
self.__closed = False
Expand Down
1 change: 1 addition & 0 deletions livekit-agents/livekit/agents/voice/room_io/room_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ async def start(self) -> None:
is_delta_stream=True,
participant=None,
next_in_chain=output_text_options.next_in_chain,
json_format=output_text_options.json_format,
)

# use the RoomIO's audio output if available, otherwise use the agent's audio output
Expand Down
2 changes: 2 additions & 0 deletions livekit-agents/livekit/agents/voice/room_io/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ class TextOutputOptions:
Only effective if `sync_transcription` is True."""
next_in_chain: TextOutput | None = None
"""The next text output in the chain for the agent. If provided, the agent's transcription will be passed to it."""
json_format: bool = False
"""Send the transcription as JSON dict for each chunk, including start and end timestamps if it's a TimedString."""


@dataclass
Expand Down
Loading