livekit · theomonnom · May 4, 2026 · May 4, 2026 · May 4, 2026 · May 4, 2026
diff --git a/examples/voice_agents/email_example.py b/examples/voice_agents/email_example.py
@@ -41,7 +41,7 @@ async def register_for_event(self, context: RunContext):
         "Start the registration process for the event."
 
         email_result = await beta.workflows.GetEmailTask(
-            instructions=beta.workflows.InstructionParts(
+            instructions=beta.workflows.WorkflowInstructions(
                 persona=(
                     "You are capturing the email address of the user for the event registration. "
                     "You are only a single step in a broader system responsible solely for capturing an email address."

diff --git a/livekit-agents/livekit/agents/__init__.py b/livekit-agents/livekit/agents/__init__.py
@@ -74,6 +74,7 @@
     AgentSession,
     AgentStateChangedEvent,
     AgentTask,
+    AudioRecognition,
     CloseEvent,
     CloseReason,
     ConversationItemAddedEvent,
@@ -183,6 +184,7 @@ def __getattr__(name: str) -> typing.Any:
     "RunContext",
     "Plugin",
     "AgentSession",
+    "AudioRecognition",
     "RecordingOptions",
     "text_transforms",
     "AgentEvent",

diff --git a/livekit-agents/livekit/agents/beta/workflows/__init__.py b/livekit-agents/livekit/agents/beta/workflows/__init__.py
@@ -6,7 +6,7 @@
 from .name import GetNameResult, GetNameTask
 from .phone_number import GetPhoneNumberResult, GetPhoneNumberTask
 from .task_group import TaskCompletedEvent, TaskGroup, TaskGroupResult
-from .utils import InstructionParts
+from .utils import WorkflowInstructions
 from .warm_transfer import WarmTransferResult, WarmTransferTask
 
 __all__ = [
@@ -18,7 +18,7 @@
     "GetDOBResult",
     "GetDOBTask",
     "GetDtmfResult",
-    "InstructionParts",
+    "WorkflowInstructions",
     "GetCreditCardResult",
     "GetCreditCardTask",
     "GetNameTask",

diff --git a/livekit-agents/livekit/agents/beta/workflows/address.py b/livekit-agents/livekit/agents/beta/workflows/address.py
@@ -11,7 +11,7 @@
 from ...utils import is_given
 from ...voice.agent import AgentTask
 from ...voice.events import RunContext
-from .utils import InstructionParts
+from .utils import WorkflowInstructions
 
 if TYPE_CHECKING:
     from ...voice.turn import TurnDetectionMode
@@ -26,7 +26,7 @@ class GetAddressTask(AgentTask[GetAddressResult]):
     def __init__(
         self,
         *,
-        instructions: NotGivenOr[InstructionParts | Instructions | str] = NOT_GIVEN,
+        instructions: NotGivenOr[WorkflowInstructions | Instructions | str] = NOT_GIVEN,
         chat_ctx: NotGivenOr[llm.ChatContext] = NOT_GIVEN,
         turn_detection: NotGivenOr[TurnDetectionMode | None] = NOT_GIVEN,
         tools: NotGivenOr[list[llm.Tool | llm.Toolset]] = NOT_GIVEN,
@@ -40,23 +40,22 @@ def __init__(
         extra_instructions: str = "",
     ) -> None:
         if not is_given(instructions):
-            instructions = InstructionParts(persona=PERSONA, extra=extra_instructions)
+            instructions = WorkflowInstructions(persona=PERSONA, extra=extra_instructions)
         elif extra_instructions:
             logger.warning("`extra_instructions` will be ignored when `instructions` is provided")
 
-        if isinstance(instructions, InstructionParts):
-            instructions = Instructions(INSTRUCTIONS_TEMPLATE).format(
-                persona=instructions.persona if is_given(instructions.persona) else PERSONA,
-                extra=instructions.extra,
+        if isinstance(instructions, WorkflowInstructions):
+            instructions = instructions.resolve(
+                template=INSTRUCTIONS_TEMPLATE,
+                default_persona=PERSONA,
                 _modality_specific=Instructions(audio=AUDIO_SPECIFIC, text=TEXT_SPECIFIC),
                 _confirmation=Instructions(
-                    # confirmation is enabled by default for audio, disabled by default for text
                     audio=CONFIRMATION_INSTRUCTION if require_confirmation is not False else "",
                     text=CONFIRMATION_INSTRUCTION if require_confirmation is True else "",
                 ),
             )
 
-        assert is_given(instructions)  # for type checking
+        assert isinstance(instructions, (str, Instructions))  # for type checking
         super().__init__(
             instructions=instructions,
             chat_ctx=chat_ctx,

diff --git a/livekit-agents/livekit/agents/beta/workflows/email_address.py b/livekit-agents/livekit/agents/beta/workflows/email_address.py
@@ -12,7 +12,7 @@
 from ...utils import is_given
 from ...voice.agent import AgentTask
 from ...voice.events import RunContext
-from .utils import InstructionParts
+from .utils import WorkflowInstructions
 
 if TYPE_CHECKING:
     from ...voice.turn import TurnDetectionMode
@@ -27,7 +27,7 @@ class GetEmailTask(AgentTask[GetEmailResult]):
     def __init__(
         self,
         *,
-        instructions: NotGivenOr[InstructionParts | Instructions | str] = NOT_GIVEN,
+        instructions: NotGivenOr[WorkflowInstructions | Instructions | str] = NOT_GIVEN,
         chat_ctx: NotGivenOr[llm.ChatContext] = NOT_GIVEN,
         turn_detection: NotGivenOr[TurnDetectionMode | None] = NOT_GIVEN,
         tools: NotGivenOr[list[llm.Tool | llm.Toolset]] = NOT_GIVEN,
@@ -41,23 +41,22 @@ def __init__(
         extra_instructions: str = "",
     ) -> None:
         if not is_given(instructions):
-            instructions = InstructionParts(persona=PERSONA, extra=extra_instructions)
+            instructions = WorkflowInstructions(persona=PERSONA, extra=extra_instructions)
         elif extra_instructions:
             logger.warning("`extra_instructions` will be ignored when `instructions` is provided")
 
-        if isinstance(instructions, InstructionParts):
-            instructions = Instructions(INSTRUCTIONS_TEMPLATE).format(
-                persona=instructions.persona if is_given(instructions.persona) else PERSONA,
-                extra=instructions.extra,
+        if isinstance(instructions, WorkflowInstructions):
+            instructions = instructions.resolve(
+                template=INSTRUCTIONS_TEMPLATE,
+                default_persona=PERSONA,
                 _modality_specific=Instructions(audio=AUDIO_SPECIFIC, text=TEXT_SPECIFIC),
                 _confirmation=Instructions(
-                    # confirmation is enabled by default for audio, disabled by default for text
                     audio=CONFIRMATION_INSTRUCTION if require_confirmation is not False else "",
                     text=CONFIRMATION_INSTRUCTION if require_confirmation is True else "",
                 ),
             )
 
-        assert is_given(instructions)  # for type checking
+        assert isinstance(instructions, (str, Instructions))  # for type checking
         super().__init__(
             instructions=instructions,
             chat_ctx=chat_ctx,

diff --git a/livekit-agents/livekit/agents/beta/workflows/utils.py b/livekit-agents/livekit/agents/beta/workflows/utils.py
@@ -1,10 +1,11 @@
 from __future__ import annotations
 
-from dataclasses import dataclass
 from enum import Enum
+from typing import Any
 
 from ...llm.chat_context import Instructions
 from ...types import NOT_GIVEN, NotGivenOr
+from ...utils import is_given
 
 
 class DtmfEvent(str, Enum):
@@ -44,19 +45,40 @@ def format_dtmf(events: list[DtmfEvent]) -> str:
     return " ".join(event.value for event in events)
 
 
-@dataclass
-class InstructionParts:
+class WorkflowInstructions(Instructions):
     """Customizable instruction sections for built-in workflow tasks.
 
+    Extends :class:`Instructions` with ``persona`` and ``extra`` fields
+    that workflow tasks resolve against their own templates and defaults.
+
     Each field overrides that section when set; leave as ``NOT_GIVEN`` to
     preserve the workflow's built-in default. Set to ``""`` to remove a
     section entirely.
-
-    Args:
-        persona: Agent persona/identity — who the agent is and how it behaves.
-        extra: Extra instructions appended to the prompt. The simplest hook for
-            adding domain context without touching defaults.
     """
 
-    persona: NotGivenOr[Instructions | str] = NOT_GIVEN
-    extra: Instructions | str = ""
+    def __init__(
+        self,
+        audio: str = "",
+        *,
+        text: str | None = None,
+        persona: NotGivenOr[Instructions | str] = NOT_GIVEN,
+        extra: Instructions | str = "",
+    ) -> None:
+        super().__init__(audio, text=text)
+        self.persona: NotGivenOr[Instructions | str] = persona
+        self.extra: Instructions | str = extra
+
+    def resolve(
+        self,
+        *,
+        template: str,
+        default_persona: str,
+        **format_kwargs: Any,
+    ) -> Instructions:
+        """Resolve into a final :class:`Instructions` by formatting the template."""
+        return Instructions.resolve_template(
+            template,
+            persona=self.persona if is_given(self.persona) else default_persona,
+            extra=self.extra,
+            **format_kwargs,
+        )
diff --git a/livekit-agents/livekit/agents/beta/workflows/warm_transfer.py b/livekit-agents/livekit/agents/beta/workflows/warm_transfer.py
@@ -25,7 +25,7 @@
     BuiltinAudioClip,
     PlayHandle,
 )
-from .utils import InstructionParts
+from .utils import WorkflowInstructions
 
 if TYPE_CHECKING:
     from ...voice.turn import TurnDetectionMode
@@ -46,7 +46,7 @@ def __init__(
         sip_number: NotGivenOr[str] = NOT_GIVEN,
         sip_headers: NotGivenOr[dict[str, str]] = NOT_GIVEN,
         hold_audio: NotGivenOr[AudioSource | AudioConfig | list[AudioConfig] | None] = NOT_GIVEN,
-        instructions: NotGivenOr[InstructionParts | Instructions | str] = NOT_GIVEN,
+        instructions: NotGivenOr[WorkflowInstructions | Instructions | str] = NOT_GIVEN,
         chat_ctx: NotGivenOr[llm.ChatContext] = NOT_GIVEN,
         turn_detection: NotGivenOr[TurnDetectionMode | None] = NOT_GIVEN,
         tools: NotGivenOr[list[llm.Tool | llm.Toolset]] = NOT_GIVEN,
@@ -79,19 +79,19 @@ def __init__(
         """
 
         if not is_given(instructions):
-            instructions = InstructionParts(persona=PERSONA, extra=extra_instructions)
+            instructions = WorkflowInstructions(persona=PERSONA, extra=extra_instructions)
         elif extra_instructions:
             logger.warning("`extra_instructions` will be ignored when `instructions` is provided")
 
-        if isinstance(instructions, InstructionParts):
+        if isinstance(instructions, WorkflowInstructions):
             conversation_history = self._format_conversation_history(chat_ctx)
-            instructions = Instructions(INSTRUCTIONS_TEMPLATE).format(
-                persona=instructions.persona if is_given(instructions.persona) else PERSONA,
-                extra=instructions.extra,
+            instructions = instructions.resolve(
+                template=INSTRUCTIONS_TEMPLATE,
+                default_persona=PERSONA,
                 _conversation_history=conversation_history,
             )
 
-        assert is_given(instructions)  # for type checking
+        assert isinstance(instructions, (str, Instructions))  # for type checking
         super().__init__(
             instructions=instructions,
             chat_ctx=NOT_GIVEN,  # don't pass the chat_ctx

diff --git a/livekit-agents/livekit/agents/evals/judge.py b/livekit-agents/livekit/agents/evals/judge.py
@@ -78,7 +78,7 @@ def _get_latest_instructions(chat_ctx: ChatContext) -> str | None:
     """
     for item in reversed(chat_ctx.items):
         if item.type == "agent_config_update" and item.instructions:
-            return item.instructions
+            return str(item.instructions)
     return None
 
 

diff --git a/livekit-agents/livekit/agents/inference/tts.py b/livekit-agents/livekit/agents/inference/tts.py
@@ -417,6 +417,29 @@ def __init__(
         )
         self._streams = weakref.WeakSet[SynthesizeStream]()
 
+    class Markup(tts.TTS.Markup):
+        def __init__(self, gateway_tts: TTS) -> None:
+            super().__init__(gateway_tts)
+            self._gateway_tts = gateway_tts
+
+        def _upstream_provider(self) -> str:
+            return self._gateway_tts._opts.model.split("/")[0]
+
+        def llm_instructions(self) -> str | None:
+            from ..tts._provider_format import llm_instructions
+
+            return llm_instructions(self._upstream_provider())
+
+        def to_text(self, text: str) -> str:
+            from ..tts._provider_format import strip_markup
+
+            return strip_markup(self._upstream_provider(), text)
+
+        def convert(self, text: str) -> str:
+            from ..tts._provider_format import convert_markup
+
+            return convert_markup(self._upstream_provider(), text)
+
     @classmethod
     def from_model_string(cls, model: str) -> TTS:
         """Create a TTS instance from a model string

diff --git a/livekit-agents/livekit/agents/llm/__init__.py b/livekit-agents/livekit/agents/llm/__init__.py
@@ -38,6 +38,7 @@
     RealtimeSessionReconnectedEvent,
     RemoteItemAddedEvent,
 )
+from .response_field import Response, ResponseField
 from .tool_context import (
     FunctionTool,
     ProviderTool,
@@ -108,6 +109,8 @@
     "RealtimeSessionRestoredEvent",
     "LLMError",
     "RemoteItemAddedEvent",
+    "Response",
+    "ResponseField",
 ]
 
 # Cleanup docs of unexported modules

diff --git a/livekit-agents/livekit/agents/llm/_provider_format/anthropic.py b/livekit-agents/livekit/agents/llm/_provider_format/anthropic.py
@@ -52,10 +52,13 @@ def to_chat_ctx(
 
         if msg.type == "message":
             for c in msg.content:
-                if c and isinstance(c, str):
-                    content.append({"text": c, "type": "text"})
-                elif isinstance(c, llm.ImageContent):
+                if isinstance(c, llm.ImageContent):
                     content.append(_to_image_content(c))
+                elif isinstance(c, llm.AudioContent):
+                    pass
+                elif c:
+                    # str or Instructions
+                    content.append({"text": str(c), "type": "text"})
         elif msg.type == "function_call":
             content.append(
                 {

diff --git a/livekit-agents/livekit/agents/llm/_provider_format/aws.py b/livekit-agents/livekit/agents/llm/_provider_format/aws.py
@@ -46,10 +46,13 @@ def to_chat_ctx(
 
         if msg.type == "message":
             for content in msg.content:
-                if content and isinstance(content, str):
-                    current_content.append({"text": content})
-                elif isinstance(content, llm.ImageContent):
+                if isinstance(content, llm.ImageContent):
                     current_content.append(_build_image(content))
+                elif isinstance(content, llm.AudioContent):
+                    pass
+                elif content:
+                    # str or Instructions
+                    current_content.append({"text": str(content)})
         elif msg.type == "function_call":
             current_content.append(
                 {

diff --git a/livekit-agents/livekit/agents/llm/_provider_format/google.py b/livekit-agents/livekit/agents/llm/_provider_format/google.py
@@ -51,12 +51,15 @@ def to_chat_ctx(
 
         if msg.type == "message":
             for content in msg.content:
-                if content and isinstance(content, str):
-                    parts.append({"text": content})
+                if isinstance(content, llm.ImageContent):
+                    parts.append(_to_image_part(content))
+                elif isinstance(content, llm.AudioContent):
+                    pass
                 elif content and isinstance(content, dict):
                     parts.append({"text": json.dumps(content)})
-                elif isinstance(content, llm.ImageContent):
-                    parts.append(_to_image_part(content))
+                elif content:
+                    # str or Instructions
+                    parts.append({"text": str(content)})
         elif msg.type == "function_call":
             fc_part: dict[str, Any] = {
                 "function_call": {

diff --git a/livekit-agents/livekit/agents/llm/_provider_format/mistralai.py b/livekit-agents/livekit/agents/llm/_provider_format/mistralai.py
@@ -34,7 +34,11 @@ def to_conversations_ctx(
         if group.message:
             item = group.message
             if isinstance(item, llm.ChatMessage) and item.role in ("system", "developer"):
-                text_parts = [c for c in item.content if isinstance(c, str)]
+                text_parts = [
+                    str(c)
+                    for c in item.content
+                    if not isinstance(c, (llm.ImageContent, llm.AudioContent))
+                ]
                 instructions = "\n".join(text_parts) if text_parts else None
                 continue
 
@@ -83,12 +87,15 @@ def _build_content(msg: llm.ChatMessage) -> str | list[dict[str, Any]]:
     text_content = ""
 
     for content in msg.content:
-        if isinstance(content, str):
+        if isinstance(content, llm.ImageContent):
+            list_content.append(_to_image_content(content))
+        elif isinstance(content, llm.AudioContent):
+            pass
+        else:
+            # str or Instructions
             if text_content:
                 text_content += "\n"
-            text_content += content
-        elif isinstance(content, llm.ImageContent):
-            list_content.append(_to_image_content(content))
+            text_content += str(content)
 
     if not list_content:
         return text_content