GetStream · Nash0x7E2 · Apr 16, 2026 · Mar 20, 2026 · Apr 15, 2026 · Apr 15, 2026
diff --git a/agents-core/vision_agents/core/agents/agents.py b/agents-core/vision_agents/core/agents/agents.py
@@ -1460,8 +1460,12 @@ async def forward_audio(event: RealtimeAudioOutputEvent):
                     await self._audio_track.write(event.data)
 
             @self.events.subscribe
-            async def on_audio_done(_: RealtimeAudioOutputDoneEvent):
-                if self._audio_track is not None and not self.audio_publishers:
+            async def on_audio_done(event: RealtimeAudioOutputDoneEvent):
+                if (
+                    event.interrupted  # flush only when the done event is flagged as interrupted
+                    and self._audio_track is not None
+                    and not self.audio_publishers
+                ):
                     await self._audio_track.flush()
 
         # Set up video track if video publishers are available

diff --git a/plugins/xai/.env.example b/plugins/xai/.env.example
@@ -0,0 +1,10 @@
+# xAI API key — used for LLM, Realtime, and TTS
+XAI_API_KEY=...
+
+# Stream API credentials (for Realtime and TTS examples)
+STREAM_API_KEY=...
+STREAM_API_SECRET=...
+EXAMPLE_BASE_URL=https://demo.visionagents.ai
+
+# Required by the TTS example for STT
+DEEPGRAM_API_KEY=...
diff --git a/plugins/xai/README.md b/plugins/xai/README.md
@@ -145,11 +145,88 @@ Create a response with full control over parameters.
 
 - `XAI_API_KEY`: Your xAI API key (required if not provided in constructor)
 
+## Text-to-Speech (TTS)
+
+The plugin also ships an `xai.TTS` class powered by [xAI's Grok Voice API](https://docs.x.ai/docs/guides/voice/tts). It provides five expressive voices and supports inline and wrapping speech tags for fine-grained delivery control.
+
+### Usage
+
+```python
+from vision_agents.plugins import xai
+
+# Default voice (eve) — energetic, upbeat
+tts = xai.TTS()
+
+# Specify a voice
+tts = xai.TTS(voice="ara")   # warm, friendly
+tts = xai.TTS(voice="leo")   # authoritative, strong
+tts = xai.TTS(voice="rex")   # confident, clear
+tts = xai.TTS(voice="sal")   # smooth, balanced
+
+# Custom output format
+tts = xai.TTS(
+    voice="rex",
+    codec="mp3",
+    sample_rate=44100,
+    bit_rate=192000,
+)
+
+# Explicit API key (otherwise reads XAI_API_KEY env var)
+tts = xai.TTS(api_key="xai-your-key-here")
+```
+
+### Configuration
+
+| Parameter     | Type   | Default   | Description                                                           |
+|---------------|--------|-----------|-----------------------------------------------------------------------|
+| `api_key`     | str    | env var   | xAI API key. Falls back to `XAI_API_KEY` environment variable.        |
+| `voice`       | str    | `"eve"`   | Voice ID: `"eve"`, `"ara"`, `"leo"`, `"rex"`, or `"sal"`.            |
+| `language`    | str    | `"en"`    | BCP-47 language code or `"auto"` for detection.                       |
+| `codec`       | str    | `"pcm"`   | Output codec: `"pcm"`, `"mp3"`, `"wav"`, `"mulaw"`, `"alaw"`.       |
+| `sample_rate` | int    | `24000`   | Sample rate: `8000`–`48000` Hz.                                       |
+| `bit_rate`    | int    | `None`    | MP3 bit rate (only used with `codec="mp3"`).                          |
+| `base_url`    | str    | `None`    | Override the xAI TTS API endpoint.                                    |
+| `session`     | object | `None`    | Optional pre-existing `aiohttp.ClientSession`.                        |
+
+### Voices
+
+| Voice | Tone                     | Best For                                       |
+|-------|--------------------------|------------------------------------------------|
+| `eve` | Energetic, upbeat        | Demos, announcements, upbeat content (default) |
+| `ara` | Warm, friendly           | Conversational interfaces, hospitality         |
+| `leo` | Authoritative, strong    | Instructional, educational, healthcare         |
+| `rex` | Confident, clear         | Business, corporate, customer support          |
+| `sal` | Smooth, balanced         | Versatile — works for any context              |
+
+### Speech tags
+
+Add expressiveness to synthesized speech with inline and wrapping tags:
+
+**Inline tags** (placed where the expression should occur):
+- Pauses and humming: `[pause]` `[long-pause]` `[hum-tune]`
+- Laughter and crying: `[laugh]` `[chuckle]` `[giggle]` `[cry]`
+- Mouth sounds: `[tsk]` `[tongue-click]` `[lip-smack]`
+- Breathing: `[breath]` `[inhale]` `[exhale]` `[sigh]`
+
+**Wrapping tags** (wrap text to change delivery):
+- Volume: `<soft>text</soft>` `<loud>text</loud>` `<shout>text</shout>`
+- Pitch/speed: `<high-pitch>text</high-pitch>` `<low-pitch>text</low-pitch>` `<slow>text</slow>` `<fast>text</fast>`
+- Style: `<whisper>text</whisper>` `<sing>text</sing>`
+
+### MP3 output
+
+MP3 decoding requires `pydub`, which in turn needs `ffmpeg` (or `libav`) available on your `PATH`. Install `pydub` via the `mp3` extra:
+
+```bash
+uv add "vision-agents-plugins-xai[mp3]"
+```
+
 ## Requirements
 
 - Python 3.10+
 - `xai-sdk`
 - `vision-agents-core`
+- Optional: `pydub` (for MP3 decoding via the `mp3` extra)
 
 ## License
 

diff --git a/plugins/xai/example/README.md b/plugins/xai/example/README.md
@@ -1,4 +1,21 @@
-# xAI Realtime Voice Agent Example
+# xAI Examples
+
+This directory contains examples for the xAI plugin. Two integration paths are demonstrated:
+
+1. **Realtime voice agent** — `xai_realtime_example.py`: end-to-end voice conversation using xAI's realtime WebSocket API.
+2. **TTS-based pipelined agent** — `xai_tts_customer_support_example.py`: Deepgram STT + xAI Grok LLM + xAI TTS, configured as a SaaS customer support agent with the `rex` voice.
+
+## TTS example
+
+```bash
+uv run plugins/xai/example/xai_tts_customer_support_example.py
+```
+
+In addition to `XAI_API_KEY`, `STREAM_API_KEY`, and `STREAM_API_SECRET`, the TTS example requires `DEEPGRAM_API_KEY` for STT.
+
+---
+
+## Realtime voice agent
 
 This example demonstrates how to build a real-time voice conversation AI using xAI's Grok Voice Agent API with Vision Agents.
 

diff --git a/plugins/xai/example/xai_tts_customer_support_example.py b/plugins/xai/example/xai_tts_customer_support_example.py
@@ -0,0 +1,106 @@
+"""xAI TTS — Customer Support Example
+
+A voice agent that handles customer support for a SaaS product.
+Uses the 'rex' voice (confident, clear) for a professional support experience,
+backed by Grok for the LLM.
+
+Requirements (environment variables):
+    XAI_API_KEY          — xAI / Grok API key (used for both LLM and TTS)
+    DEEPGRAM_API_KEY     — Deepgram STT key
+    STREAM_API_KEY       — Stream API key
+    STREAM_API_SECRET    — Stream API secret
+"""
+
+import asyncio
+import logging
+
+from dotenv import load_dotenv
+from vision_agents.core import Agent, Runner, User
+from vision_agents.core.agents import AgentLauncher
+from vision_agents.plugins import deepgram, getstream, smart_turn, xai
+
+logger = logging.getLogger(__name__)
+
+load_dotenv()  # read the API keys listed in the module docstring from a local .env file, if present
+
+CUSTOMER_SUPPORT_INSTRUCTIONS = """\
+You are Alex, a Tier-1 support agent for "CloudSync Pro," a cloud storage
+and collaboration platform.
+
+Your personality:
+- Professional, solution-oriented, and patient
+- Empathetic when customers are frustrated, but focused on resolution
+- Technically competent without being condescending
+
+Your responsibilities:
+- Help customers troubleshoot common issues (login problems, sync errors,
+  file sharing permissions, billing questions)
+- Walk customers through step-by-step solutions clearly
+- Escalate complex issues to Tier-2 support when needed
+- Collect relevant details: account email, error messages, device/OS info
+- Confirm the issue is resolved before ending the call
+
+Troubleshooting knowledge base:
+- Sync errors: Check internet connection → restart app → clear cache →
+  re-authenticate → escalate
+- Login issues: Verify email → reset password → check 2FA → escalate
+- File sharing: Check permissions → verify recipient email → resend invite
+- Billing: Explain plan tiers (Free / Pro $9.99/mo / Team $24.99/mo) →
+  process upgrades/downgrades → refund requests go to billing team
+
+Tone guidelines:
+- Acknowledge the frustration: "I understand how inconvenient that must be."
+- Be direct with solutions: "Here's what we can do right now."
+- End with confirmation: "Is there anything else I can help you with?"
+- Never blame the customer for the issue
+
+Response style (voice channel — keep it tight):
+- Reply in 1-2 short sentences. Never more than 30 words per turn.
+- Ask one clarifying question at a time.
+- Skip preambles ("Sure, I can help with that"). Get straight to the answer.
+"""
+
+
+async def create_agent(**kwargs) -> Agent:  # kwargs are accepted but not used here
+    """Build the support agent: Deepgram STT, Grok LLM, and xAI TTS ('rex' voice)."""
+    agent = Agent(
+        edge=getstream.Edge(),
+        agent_user=User(name="Alex - CloudSync Support", id="agent"),
+        instructions=CUSTOMER_SUPPORT_INSTRUCTIONS,
+        tts=xai.TTS(voice="rex"),
+        stt=deepgram.STT(eager_turn_detection=True),
+        llm=xai.LLM(model="grok-4-fast-non-reasoning"),
+        turn_detection=smart_turn.TurnDetection(
+            silence_duration_ms=2000,
+            speech_probability_threshold=0.5,
+        ),
+        # Stream LLM output to TTS sentence-by-sentence so the first audio
+        # plays well before the LLM finishes generating.
+        streaming_tts=True,
+    )
+    return agent
+
+
+async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
+    """Join the call and greet the customer."""
+    call = await agent.create_call(call_type, call_id)
+
+    logger.info("Starting Customer Support Agent...")
+
+    async with agent.join(call):
+        logger.info("Agent joined call")
+
+        await asyncio.sleep(3)
+        await agent.llm.simple_response(
+            text=(
+                "Thank you for contacting CloudSync Pro support. "
+                "My name is Alex. I'm here to help you get things sorted out. "
+                "Could you start by telling me what issue you're experiencing?"
+            )
+        )
+
+        await agent.finish()
+
+
+if __name__ == "__main__":  # script entry point: hand both callbacks to the Runner CLI
+    Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli()
diff --git a/plugins/xai/pyproject.toml b/plugins/xai/pyproject.toml
@@ -16,6 +16,9 @@ dependencies = [
     "aiohttp>=3.13.3,<4",
 ]
 
+[project.optional-dependencies]
+mp3 = ["pydub>=0.25"]
+
 [project.urls]
 Documentation = "https://visionagents.ai/"
 Website = "https://visionagents.ai/"

diff --git a/plugins/xai/tests/test_xai_realtime.py b/plugins/xai/tests/test_xai_realtime.py
@@ -109,9 +109,12 @@ async def test_default_configuration(self):
         """Test that default configuration is set correctly."""
         realtime = Realtime(api_key="test-key")
         assert realtime.voice == "Ara"
-        assert realtime.sample_rate == 48000
+        # xAI realtime emits PCM at 24 kHz natively.
+        assert realtime.sample_rate == 24000
         assert realtime.turn_detection == "server_vad"
         assert realtime.provider_name == "xai"
+        # VAD interrupt defaults to False so that mic echo of the agent's own audio does not cancel the response.
+        assert realtime.vad_interrupt_response is False
         # Web search and X search enabled by default
         assert realtime.web_search is True
         assert realtime.x_search is True
@@ -123,10 +126,12 @@ async def test_custom_configuration(self):
             api_key="test-key",
             voice="Rex",
             turn_detection=None,
+            vad_interrupt_response=True,
         )
         assert realtime.voice == "Rex"
-        assert realtime.sample_rate == 48000  # Always 48kHz
+        assert realtime.sample_rate == 24000
         assert realtime.turn_detection is None
+        assert realtime.vad_interrupt_response is True
 
     async def test_search_tools_can_be_disabled(self):
         """Test that web_search and x_search can be disabled."""