livekit · shizhigu · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026
diff --git a/examples/.env.example b/examples/.env.example
@@ -1,3 +1,90 @@
-LIVEKIT_API_SECRET="<your livekit api secret>"
-LIVEKIT_API_KEY="<your livekit api key>"
-LIVEKIT_URL="<your livekit ws url>"
+# LiveKit connection. Required for examples that connect to a LiveKit room.
+# Everything below this first block is optional and only needed by specific examples.
+LIVEKIT_URL="wss://your-project.livekit.cloud"
+LIVEKIT_API_KEY="your_livekit_api_key"
+LIVEKIT_API_SECRET="your_livekit_api_secret"
+
+# Common model providers used by examples that instantiate provider plugins directly.
+# Examples that use LiveKit Inference route through LiveKit Cloud instead, so they
+# only need the LiveKit credentials above.
+# OPENAI_API_KEY="sk-..."
+# DEEPGRAM_API_KEY="..."
+# CARTESIA_API_KEY="..."
+# ELEVEN_API_KEY="..."
+# GOOGLE_API_KEY="..."
+# XAI_API_KEY="..."
+# PHONIC_API_KEY="..."
+# ULTRAVOX_API_KEY="..."
+# NVIDIA_API_KEY="..."
+# SPEECHMATICS_API_KEY="..."
+# SPEECHMATICS_RT_URL="wss://eu2.rt.speechmatics.com/v2"
+# RIME_API_KEY="..."
+# NEUPHONIC_API_KEY="..."
+# INWORLD_API_KEY="..."
+
+# AWS examples use the standard AWS SDK credential chain.
+# AWS_ACCESS_KEY_ID="..."
+# AWS_SECRET_ACCESS_KEY="..."
+# AWS_SESSION_TOKEN="..."
+# AWS_REGION="us-east-1"
+
+# Tracing and external tool integrations
+# LANGFUSE_PUBLIC_KEY="..."
+# LANGFUSE_SECRET_KEY="..."
+# LANGFUSE_HOST="https://cloud.langfuse.com"
+# ZAPIER_MCP_SERVER="..."
+
+# Avatar provider examples
+# ANAM_API_KEY="..."
+# ANAM_AVATAR_ID="..."
+# AVATARIO_API_KEY="..."
+# AVATARIO_AVATAR_ID="..."
+# AVATARTALK_API_KEY="..."
+# AVATARTALK_API_URL="..."
+# AVATARTALK_AVATAR="..."
+# AVATARTALK_EMOTION="..."
+# BEY_API_KEY="..."
+# BEY_AVATAR_ID="..."
+# BITHUMAN_API_SECRET="..."
+# BITHUMAN_MODEL_PATH="/path/to/model.imx"
+# DID_API_KEY="..."
+# DID_AGENT_ID="..."
+# KEYFRAME_API_KEY="..."
+# KEYFRAME_PERSONA_ID="..."
+# KEYFRAME_PERSONA_SLUG="public:luna"
+# LEMONSLICE_API_KEY="..."
+# LEMONSLICE_IMAGE_URL="https://example.com/avatar.png"
+# LIVEAVATAR_API_KEY="..."
+# LIVEAVATAR_AVATAR_ID="..."
+# RUNWAYML_API_SECRET="..."
+# RUNWAY_AVATAR_PRESET_ID="..."
+# RUNWAY_AVATAR_ID="..."
+# SIMLI_API_KEY="..."
+# SIMLI_FACE_ID="..."
+# TAVUS_API_KEY="..."
+# TAVUS_PERSONA_ID="..."
+# TAVUS_REPLICA_ID="..."
+# TRUGEN_API_KEY="..."
+# TRUGEN_AVATAR_ID="..."
+
+# Audio wave avatar local runner
+# AVATAR_DISPATCHER_URL="http://localhost:8089/launch"
+# Room access token generated for a specific room by the LiveKit CLI or SDK.
+# LIVEKIT_TOKEN="..."
+
+# SIP and telephony examples
+# LIVEKIT_SIP_OUTBOUND_TRUNK="ST_..."
+# LIVEKIT_SUPERVISOR_PHONE_NUMBER="+12003004000"
+# LIVEKIT_SIP_NUMBER="+15005006000"
+# SIP_OUTBOUND_TRUNK_ID="ST_..."
+
+# Optional dispatch-name overrides used by telephony examples
+# DTMF_AGENT_DISPATCH_NAME="my-telephony-agent"
+# BANK_IVR_DISPATCH_NAME="bank-ivr-agent"
+# PHONE_TREE_AGENT_DISPATCH_NAME="my-telephony-agent"
+
+# End-to-end encryption primitive example. Use a strong, random shared key, not this placeholder.
+# LIVEKIT_E2EE_KEY="shared-secret"
+
+# Front desk example
+# CAL_API_KEY="..."
diff --git a/examples/README.md b/examples/README.md
@@ -54,7 +54,7 @@ To run the examples, you'll need:
 
 - A [LiveKit Cloud](https://cloud.livekit.io) account or a local [LiveKit server](https://github.com/livekit/livekit)
 - API keys for the model providers you want to use in a `.env` file
-- Python 3.9 or higher
+- Python 3.10 or higher
 - [uv](https://docs.astral.sh/uv/)
 
 ### Environment file

diff --git a/examples/avatar_agents/audio_wave/README.md b/examples/avatar_agents/audio_wave/README.md
@@ -16,10 +16,10 @@ This example demonstrates how to create an animated avatar that responds to audi
 
 1. Start the avatar dispatcher server:
 ```bash
-python examples/avatar/dispatcher.py [--port 8089]
+uv run python examples/avatar_agents/audio_wave/dispatcher.py [--port 8089]
 ```
 
 2. Start the agent worker:
 ```bash
-python examples/avatar/agent_worker.py dev [--avatar-url http://localhost:8089/launch]
+uv run python examples/avatar_agents/audio_wave/agent_worker.py dev [--avatar-url http://localhost:8089/launch]
 ```
diff --git a/examples/telephony/bank-ivr/README.md b/examples/telephony/bank-ivr/README.md
@@ -36,24 +36,24 @@ The example consists of three parts:
 You need a LiveKit configured SIP trunk to allow the agents to receive inbound calls and make outbound calls. See the [LiveKit telephony integration guide](https://docs.livekit.io/agents/start/telephony/) for instructions on setting up SIP trunks.
 
 1.  **Verify the dataset**
-    Edit `examples/bank-ivr/data.json` if you want to customize the mock banking data.
+    Edit `examples/telephony/bank-ivr/data.json` if you want to customize the mock banking data.
 
 2.  **Start the Mock Bank (The Target)**
     ```bash
-    uv run python examples/bank-ivr/ivr_system_agent.py dev
+    uv run python examples/telephony/bank-ivr/ivr_system_agent.py dev
     ```
     This agent acts as the IVR system waiting for calls.
 
 3.  **Start the Navigator Agent (The Caller)**
     Open a new terminal. This agent will wait for a dispatch job to tell it to call.
     ```bash
-    uv run python examples/bank-ivr/ivr_navigator_agent.py dev
+    uv run python examples/telephony/bank-ivr/ivr_navigator_agent.py dev
     ```
 
 4.  **Trigger the Call**
     Open a third terminal. This script tells the Navigator to call the Bank with a specific goal.
     ```bash
-    uv run python examples/bank-ivr/dial_bank_agent.py --phone "+1234567890" --request "check balance for all accounts I have"
+    uv run python examples/telephony/bank-ivr/dial_bank_agent.py --phone "+1234567890" --request "check balance for all accounts I have"
     ```
     *Note: Replace the phone number with the number that routes to your `ivr_system_agent.py` via your SIP setup.*
 

diff --git a/examples/voice_agents/README.md b/examples/voice_agents/README.md
@@ -36,7 +36,6 @@ session = AgentSession(
 
 > **Note:** Realtime models use provider plugins directly as they are not supported by LiveKit Inference. These examples require provider-specific API keys (e.g., `OPENAI_API_KEY`).
 
-- [`getting_started.py`](./getting_started.py) - OpenAI Realtime model with noise cancellation
 - [`weather_agent.py`](./weather_agent.py) - OpenAI Realtime API with function calls for weather information
 - [`realtime_video_agent.py`](./realtime_video_agent.py) - Google Gemini with multimodal video and voice capabilities
 - [`realtime_joke_teller.py`](./realtime_joke_teller.py) - Amazon Nova Sonic real-time model with function calls
@@ -81,7 +80,6 @@ session = AgentSession(
 
 - [`background_audio.py`](./background_audio.py) - Playing background audio or ambient sounds during conversations
 - [`push_to_talk.py`](./push_to_talk.py) - Push-to-talk interaction
-- [`tts_text_pacing.py`](./tts_text_pacing.py) - Pacing control for TTS requests
 - [`speaker_id_multi_speaker.py`](./speaker_id_multi_speaker.py) - Multi-speaker identification
 
 ### 📊 Tracing & Error Handling

diff --git a/tests/test_agent_session.py b/tests/test_agent_session.py
@@ -76,6 +76,8 @@ async def on_user_turn_completed(self, turn_ctx: ChatContext, new_message: ChatM
 
 
 SESSION_TIMEOUT = 60.0
+EVENT_TIMESTAMP_EARLY_TOLERANCE = 0.75
+EVENT_TIMESTAMP_LATE_TOLERANCE = 1.25
 
 
 async def test_events_and_metrics() -> None:
@@ -110,35 +112,51 @@ async def test_events_and_metrics() -> None:
     assert conversation_events[1].item.type == "message"
     assert conversation_events[1].item.role == "user"
     assert conversation_events[1].item.text_content == "Hello, how are you?"
-    check_timestamp(conversation_events[1].created_at - t_origin, 3.0, speed_factor=speed)
+    check_wallclock_event_timestamp(
+        conversation_events[1].created_at - t_origin, 3.0, speed_factor=speed
+    )
     assert conversation_events[2].item.type == "message"
     assert conversation_events[2].item.role == "assistant"
     assert conversation_events[2].item.text_content == "I'm doing well, thank you!"
-    check_timestamp(conversation_events[2].created_at - t_origin, 5.5, speed_factor=speed)
+    check_wallclock_event_timestamp(
+        conversation_events[2].created_at - t_origin, 5.5, speed_factor=speed
+    )
 
     # user_input_transcribed
     assert len(user_transcription_events) >= 1
     assert user_transcription_events[-1].transcript == "Hello, how are you?"
     assert user_transcription_events[-1].is_final is True
-    check_timestamp(user_transcription_events[-1].created_at - t_origin, 2.7, speed_factor=speed)
+    check_wallclock_event_timestamp(
+        user_transcription_events[-1].created_at - t_origin, 2.7, speed_factor=speed
+    )
 
     # user_state_changed
     assert len(user_state_events) == 2
-    check_timestamp(user_state_events[0].created_at - t_origin, 0.5, speed_factor=speed)
+    check_wallclock_event_timestamp(
+        user_state_events[0].created_at - t_origin, 0.5, speed_factor=speed
+    )
     assert user_state_events[0].new_state == "speaking"
-    check_timestamp(user_state_events[1].created_at - t_origin, 3.0, speed_factor=speed)
+    check_wallclock_event_timestamp(
+        user_state_events[1].created_at - t_origin, 3.0, speed_factor=speed
+    )
     assert user_state_events[1].new_state == "listening"
 
     # agent_state_changed
     assert len(agent_state_events) == 4
     assert agent_state_events[0].old_state == "initializing"
     assert agent_state_events[0].new_state == "listening"
     assert agent_state_events[1].new_state == "thinking"
-    check_timestamp(agent_state_events[1].created_at - t_origin, 3.0, speed_factor=speed)
+    check_wallclock_event_timestamp(
+        agent_state_events[1].created_at - t_origin, 3.0, speed_factor=speed
+    )
     assert agent_state_events[2].new_state == "speaking"
-    check_timestamp(agent_state_events[2].created_at - t_origin, 3.5, speed_factor=speed)
+    check_wallclock_event_timestamp(
+        agent_state_events[2].created_at - t_origin, 3.5, speed_factor=speed
+    )
     assert agent_state_events[3].new_state == "listening"
-    check_timestamp(agent_state_events[3].created_at - t_origin, 5.5, speed_factor=speed)
+    check_wallclock_event_timestamp(
+        agent_state_events[3].created_at - t_origin, 5.5, speed_factor=speed
+    )
 
     # metrics
     metrics_events = [ev for ev in metrics_events if ev.metrics.type != "vad_metrics"]
@@ -196,7 +214,9 @@ async def test_tool_call() -> None:
     assert (
         agent_state_events[3].new_state == "thinking"
     )  # from speaking to thinking when tool call is executed
-    check_timestamp(agent_state_events[3].created_at - t_origin, 5.5, speed_factor=speed)
+    check_wallclock_event_timestamp(
+        agent_state_events[3].created_at - t_origin, 5.5, speed_factor=speed
+    )
     assert agent_state_events[4].new_state == "speaking"
     assert agent_state_events[5].new_state == "listening"
 
@@ -271,13 +291,17 @@ async def test_interruption(
     assert agent_state_events[1].new_state == "thinking"
     assert agent_state_events[2].new_state == "speaking"
     assert agent_state_events[3].new_state == "listening"
-    check_timestamp(
+    check_wallclock_event_timestamp(
         agent_state_events[3].created_at - t_origin, expected_interruption_time, speed_factor=speed
     )
     assert agent_state_events[4].new_state == "thinking"
-    check_timestamp(agent_state_events[4].created_at - t_origin, 6.5, speed_factor=speed)
+    check_wallclock_event_timestamp(
+        agent_state_events[4].created_at - t_origin, 6.5, speed_factor=speed
+    )
     assert agent_state_events[5].new_state == "listening"
-    check_timestamp(agent_state_events[5].created_at - t_origin, 6.5, speed_factor=speed)
+    check_wallclock_event_timestamp(
+        agent_state_events[5].created_at - t_origin, 6.5, speed_factor=speed
+    )
 
     assert len(playback_finished_events) == 1
     assert playback_finished_events[0].interrupted is True
@@ -423,7 +447,7 @@ async def test_interruption_before_speaking(
     assert agent_state_events[0].new_state == "listening"
     assert agent_state_events[1].new_state == "thinking"  # without speaking state
     assert agent_state_events[2].new_state == "listening"
-    check_timestamp(
+    check_wallclock_event_timestamp(
         agent_state_events[2].created_at - t_origin, expected_interruption_time, speed_factor=speed
     )  # interrupted at 3.5s
     assert agent_state_events[3].new_state == "thinking"
@@ -480,7 +504,9 @@ async def test_interrupt_before_speaking_with_pausable_audio() -> None:
     assert agent_state_events[0].new_state == "listening"
     assert agent_state_events[1].new_state == "thinking"
     assert agent_state_events[2].new_state == "listening"
-    check_timestamp(agent_state_events[2].created_at - t_origin, 3.5, speed_factor=speed)
+    check_wallclock_event_timestamp(
+        agent_state_events[2].created_at - t_origin, 3.5, speed_factor=speed
+    )
 
     # nothing audible reached the transport — the pause cleanup emits a single
     # playback_finished with interrupted=True and playback_position=0
@@ -528,7 +554,9 @@ async def test_false_interruption_before_speaking_resumes() -> None:
 
     # playout was postponed: the noise ran 3.0–3.3s, so "speaking" should fire at
     # ~3.8s (resume on VAD EOS=3.3s + 0.5s min_silence_duration)
-    check_timestamp(speaking_events[0].created_at - t_origin, 3.8, speed_factor=speed)
+    check_wallclock_event_timestamp(
+        speaking_events[0].created_at - t_origin, 3.8, speed_factor=speed
+    )
 
     # the reply plays to completion (not interrupted); playback_position covers the
     # full audio duration
@@ -592,22 +620,34 @@ async def test_generate_reply() -> None:
     assert conversation_events[1].item.type == "message"
     assert conversation_events[1].item.role == "assistant"
     assert conversation_events[1].item.text_content == "What can I do for you!"
-    check_timestamp(conversation_events[1].created_at - t_origin, 2.5, speed_factor=speed)
+    check_wallclock_event_timestamp(
+        conversation_events[1].created_at - t_origin, 2.5, speed_factor=speed
+    )
     assert conversation_events[2].item.type == "message"
     assert conversation_events[2].item.role == "user"
     assert conversation_events[2].item.text_content == "bye"
-    check_timestamp(conversation_events[2].created_at - t_origin, 4.5, speed_factor=speed)
+    check_wallclock_event_timestamp(
+        conversation_events[2].created_at - t_origin, 4.5, speed_factor=speed
+    )
     assert conversation_events[3].item.type == "message"
     assert conversation_events[3].item.role == "assistant"
     assert conversation_events[3].item.text_content == "session.say from on_user_turn_completed"
-    check_timestamp(
-        conversation_events[3].created_at - t_origin, 5.5, speed_factor=speed, max_abs_diff=1.0
+    check_wallclock_event_timestamp(
+        conversation_events[3].created_at - t_origin,
+        5.5,
+        speed_factor=speed,
+        max_early_diff=1.0,
+        max_late_diff=1.0,
     )
     assert conversation_events[4].item.type == "message"
     assert conversation_events[4].item.role == "assistant"
     assert conversation_events[4].item.text_content == "Goodbye! have a nice day!"
-    check_timestamp(
-        conversation_events[4].created_at - t_origin, 9.0, speed_factor=speed, max_abs_diff=1.0
+    check_wallclock_event_timestamp(
+        conversation_events[4].created_at - t_origin,
+        9.0,
+        speed_factor=speed,
+        max_early_diff=1.0,
+        max_late_diff=1.0,
     )
 
     # chat context
@@ -671,7 +711,9 @@ async def test_aec_warmup() -> None:
     assert agent_state_events[2].new_state == "speaking"
     # interruption delayed to 5.5s (EOU), not 4.5s (VAD was blocked by warmup)
     speaking_to_listening = next(e for e in agent_state_events[3:] if e.new_state == "listening")
-    check_timestamp(speaking_to_listening.created_at - t_origin, 5.5, speed_factor=speed)
+    check_wallclock_event_timestamp(
+        speaking_to_listening.created_at - t_origin, 5.5, speed_factor=speed
+    )
 
 
 async def test_start_boundary_does_not_block_vad_interruption() -> None:
@@ -833,6 +875,8 @@ async def test_preemptive_generation(preemptive_generation: dict, expected_laten
     assert agent_state_events[1].new_state == "thinking"
     assert agent_state_events[2].new_state == "speaking"
     t_agent_start_speaking = agent_state_events[2].created_at
+    # This compares two event timestamps from the same turn, so scheduler
+    # latency largely cancels out and the latency invariant can stay strict.
     check_timestamp(
         t_agent_start_speaking - t_user_stop_speaking,
         t_target=expected_latency,
@@ -1032,6 +1076,38 @@ def check_timestamp(
     )
 
 
+def check_wallclock_event_timestamp(
+    t_event: float,
+    t_target: float,
+    *,
+    speed_factor: float = 1.0,
+    max_early_diff: float = EVENT_TIMESTAMP_EARLY_TOLERANCE,
+    max_late_diff: float = EVENT_TIMESTAMP_LATE_TOLERANCE,
+) -> None:
+    """
+    Check a wall-clock event timestamp with separate early and late bounds.
+
+    t_event is multiplied by speed_factor before it is compared with t_target. These
+    tests run the fake pipeline faster than real time, so under CI load the delay of a
+    late asyncio callback is multiplied too; the default late bound is therefore larger
+    than the default early bound. An event that fires too early is still a behavioral
+    regression. Use this for created_at event timestamps; keep check_timestamp for
+    deterministic pipeline durations, playback positions, and same-turn deltas.
+    """
+    t_event = t_event * speed_factor
+    diff = t_event - t_target
+    print(
+        "check_wallclock_event_timestamp: "
+        f"t_event: {t_event}, t_target: {t_target}, "
+        f"max_early_diff: {max_early_diff}, "
+        f"max_late_diff: {max_late_diff}"
+    )
+    assert -max_early_diff <= diff <= max_late_diff, (
+        f"event timestamp {t_event} differs from target {t_target} by {diff}; "
+        f"allowed range is -{max_early_diff}/+{max_late_diff}"
+    )
+
+
 async def test_silent_tool_call_pause_state_does_not_leak_into_tool_reply() -> None:
     speed = 5.0
     actions = FakeActions()