Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 90 additions & 3 deletions examples/.env.example
Original file line number Diff line number Diff line change
@@ -1,3 +1,90 @@
LIVEKIT_API_SECRET="<your livekit api secret>"
LIVEKIT_API_KEY="<your livekit api key>"
LIVEKIT_URL="<your livekit ws url>"
# LiveKit connection. Required for examples that connect to a LiveKit room.
# Everything below this first block is optional and only needed by specific examples.
LIVEKIT_URL="wss://your-project.livekit.cloud"
LIVEKIT_API_KEY="your_livekit_api_key"
LIVEKIT_API_SECRET="your_livekit_api_secret"

# Common model providers used by examples that instantiate provider plugins directly.
# LiveKit Inference routes through LiveKit Cloud, so those examples only need the
# LiveKit credentials above.
# OPENAI_API_KEY="sk-..."
# DEEPGRAM_API_KEY="..."
# CARTESIA_API_KEY="..."
# ELEVEN_API_KEY="..."
# GOOGLE_API_KEY="..."
# XAI_API_KEY="..."
# PHONIC_API_KEY="..."
# ULTRAVOX_API_KEY="..."
# NVIDIA_API_KEY="..."
# SPEECHMATICS_API_KEY="..."
# SPEECHMATICS_RT_URL="wss://eu2.rt.speechmatics.com/v2"
# RIME_API_KEY="..."
# NEUPHONIC_API_KEY="..."
# INWORLD_API_KEY="..."

# AWS examples use the standard AWS SDK credential chain.
# AWS_ACCESS_KEY_ID="..."
# AWS_SECRET_ACCESS_KEY="..."
# AWS_SESSION_TOKEN="..."
# AWS_REGION="us-east-1"

# Tracing and external tool integrations
# LANGFUSE_PUBLIC_KEY="..."
# LANGFUSE_SECRET_KEY="..."
# LANGFUSE_HOST="https://cloud.langfuse.com"
# ZAPIER_MCP_SERVER="..."

# Avatar provider examples
# ANAM_API_KEY="..."
# ANAM_AVATAR_ID="..."
# AVATARIO_API_KEY="..."
# AVATARIO_AVATAR_ID="..."
# AVATARTALK_API_KEY="..."
# AVATARTALK_API_URL="..."
# AVATARTALK_AVATAR="..."
# AVATARTALK_EMOTION="..."
# BEY_API_KEY="..."
# BEY_AVATAR_ID="..."
# BITHUMAN_API_SECRET="..."
# BITHUMAN_MODEL_PATH="/path/to/model.imx"
# DID_API_KEY="..."
# DID_AGENT_ID="..."
# KEYFRAME_API_KEY="..."
# KEYFRAME_PERSONA_ID="..."
# KEYFRAME_PERSONA_SLUG="public:luna"
# LEMONSLICE_API_KEY="..."
# LEMONSLICE_IMAGE_URL="https://example.com/avatar.png"
# LIVEAVATAR_API_KEY="..."
# LIVEAVATAR_AVATAR_ID="..."
# RUNWAYML_API_SECRET="..."
# RUNWAY_AVATAR_PRESET_ID="..."
# RUNWAY_AVATAR_ID="..."
# SIMLI_API_KEY="..."
# SIMLI_FACE_ID="..."
# TAVUS_API_KEY="..."
# TAVUS_PERSONA_ID="..."
# TAVUS_REPLICA_ID="..."
# TRUGEN_API_KEY="..."
# TRUGEN_AVATAR_ID="..."

# Audio wave avatar local runner
# AVATAR_DISPATCHER_URL="http://localhost:8089/launch"
# Room access token generated for a specific room by the LiveKit CLI or SDK.
# LIVEKIT_TOKEN="..."

# SIP and telephony examples
# LIVEKIT_SIP_OUTBOUND_TRUNK="ST_..."
# LIVEKIT_SUPERVISOR_PHONE_NUMBER="+12003004000"
# LIVEKIT_SIP_NUMBER="+15005006000"
# SIP_OUTBOUND_TRUNK_ID="ST_..."

# Optional dispatch-name overrides used by telephony examples
# DTMF_AGENT_DISPATCH_NAME="my-telephony-agent"
# BANK_IVR_DISPATCH_NAME="bank-ivr-agent"
# PHONE_TREE_AGENT_DISPATCH_NAME="my-telephony-agent"

# End-to-end encryption primitive example. Generate a strong shared value.
# LIVEKIT_E2EE_KEY="shared-secret"

# Front desk example
# CAL_API_KEY="..."
2 changes: 1 addition & 1 deletion examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ To run the examples, you'll need:

- A [LiveKit Cloud](https://cloud.livekit.io) account or a local [LiveKit server](https://github.com/livekit/livekit)
- API keys for the model providers you want to use in a `.env` file
- Python 3.9 or higher
- Python 3.10 or higher
- [uv](https://docs.astral.sh/uv/)

### Environment file
Expand Down
4 changes: 2 additions & 2 deletions examples/avatar_agents/audio_wave/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ This example demonstrates how to create an animated avatar that responds to audi

1. Start the avatar dispatcher server:
```bash
python examples/avatar/dispatcher.py [--port 8089]
uv run python examples/avatar_agents/audio_wave/dispatcher.py [--port 8089]
```

2. Start the agent worker:
```bash
python examples/avatar/agent_worker.py dev [--avatar-url http://localhost:8089/launch]
uv run python examples/avatar_agents/audio_wave/agent_worker.py dev [--avatar-url http://localhost:8089/launch]
```
8 changes: 4 additions & 4 deletions examples/telephony/bank-ivr/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,24 +36,24 @@ The example consists of three parts:
You need a LiveKit-configured SIP trunk to allow the agents to receive inbound calls and make outbound calls. See the [LiveKit telephony integration guide](https://docs.livekit.io/agents/start/telephony/) for instructions on setting up SIP trunks.

1. **Verify the dataset**
Edit `examples/bank-ivr/data.json` if you want to customize the mock banking data.
Edit `examples/telephony/bank-ivr/data.json` if you want to customize the mock banking data.

2. **Start the Mock Bank (The Target)**
```bash
uv run python examples/bank-ivr/ivr_system_agent.py dev
uv run python examples/telephony/bank-ivr/ivr_system_agent.py dev
```
This agent acts as the IVR system waiting for calls.

3. **Start the Navigator Agent (The Caller)**
Open a new terminal. This agent will wait for a dispatch job to tell it to call.
```bash
uv run python examples/bank-ivr/ivr_navigator_agent.py dev
uv run python examples/telephony/bank-ivr/ivr_navigator_agent.py dev
```

4. **Trigger the Call**
Open a third terminal. This script tells the Navigator to call the Bank with a specific goal.
```bash
uv run python examples/bank-ivr/dial_bank_agent.py --phone "+1234567890" --request "check balance for all accounts I have"
uv run python examples/telephony/bank-ivr/dial_bank_agent.py --phone "+1234567890" --request "check balance for all accounts I have"
```
*Note: Replace the phone number with the number that routes to your `ivr_system_agent.py` via your SIP setup.*

Expand Down
2 changes: 0 additions & 2 deletions examples/voice_agents/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ session = AgentSession(

> **Note:** Realtime models use provider plugins directly because they are not supported by LiveKit Inference. These examples require provider-specific API keys (e.g., `OPENAI_API_KEY`).

- [`getting_started.py`](./getting_started.py) - OpenAI Realtime model with noise cancellation
- [`weather_agent.py`](./weather_agent.py) - OpenAI Realtime API with function calls for weather information
- [`realtime_video_agent.py`](./realtime_video_agent.py) - Google Gemini with multimodal video and voice capabilities
- [`realtime_joke_teller.py`](./realtime_joke_teller.py) - Amazon Nova Sonic real-time model with function calls
Expand Down Expand Up @@ -81,7 +80,6 @@ session = AgentSession(

- [`background_audio.py`](./background_audio.py) - Playing background audio or ambient sounds during conversations
- [`push_to_talk.py`](./push_to_talk.py) - Push-to-talk interaction
- [`tts_text_pacing.py`](./tts_text_pacing.py) - Pacing control for TTS requests
- [`speaker_id_multi_speaker.py`](./speaker_id_multi_speaker.py) - Multi-speaker identification

### 📊 Tracing & Error Handling
Expand Down
120 changes: 98 additions & 22 deletions tests/test_agent_session.py
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Incomplete transformation: wall-clock timestamp check not converted in test_start_boundary_does_not_block_vad_interruption

The PR systematically converts all check_timestamp(*.created_at - t_origin, ...) calls to check_wallclock_event_timestamp(...) so that they use an asymmetric tolerance (tighter early, looser late) and reduce CI flakiness. However, line 757 in test_start_boundary_does_not_block_vad_interruption still uses the old check_timestamp with the symmetric max_abs_diff=0.75 for a wall-clock created_at - t_origin comparison. The new function's docstring at tests/test_agent_session.py:1094 explicitly states: "Use this for created_at event timestamps; keep check_timestamp for deterministic pipeline durations, playback positions, and same-turn deltas." This test remains susceptible to the same CI flakiness the PR aims to fix.

(Refers to line 757)

Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ async def on_user_turn_completed(self, turn_ctx: ChatContext, new_message: ChatM


SESSION_TIMEOUT = 60.0
# Default bounds for check_wallclock_event_timestamp: how far the speed-scaled
# event time may fall before (early) or after (late) its target. The late
# bound is the looser of the two.
EVENT_TIMESTAMP_EARLY_TOLERANCE = 0.75
EVENT_TIMESTAMP_LATE_TOLERANCE = 1.25


async def test_events_and_metrics() -> None:
Expand Down Expand Up @@ -110,35 +112,51 @@ async def test_events_and_metrics() -> None:
assert conversation_events[1].item.type == "message"
assert conversation_events[1].item.role == "user"
assert conversation_events[1].item.text_content == "Hello, how are you?"
check_timestamp(conversation_events[1].created_at - t_origin, 3.0, speed_factor=speed)
check_wallclock_event_timestamp(
conversation_events[1].created_at - t_origin, 3.0, speed_factor=speed
)
assert conversation_events[2].item.type == "message"
assert conversation_events[2].item.role == "assistant"
assert conversation_events[2].item.text_content == "I'm doing well, thank you!"
check_timestamp(conversation_events[2].created_at - t_origin, 5.5, speed_factor=speed)
check_wallclock_event_timestamp(
conversation_events[2].created_at - t_origin, 5.5, speed_factor=speed
)

# user_input_transcribed
assert len(user_transcription_events) >= 1
assert user_transcription_events[-1].transcript == "Hello, how are you?"
assert user_transcription_events[-1].is_final is True
check_timestamp(user_transcription_events[-1].created_at - t_origin, 2.7, speed_factor=speed)
check_wallclock_event_timestamp(
user_transcription_events[-1].created_at - t_origin, 2.7, speed_factor=speed
)

# user_state_changed
assert len(user_state_events) == 2
check_timestamp(user_state_events[0].created_at - t_origin, 0.5, speed_factor=speed)
check_wallclock_event_timestamp(
user_state_events[0].created_at - t_origin, 0.5, speed_factor=speed
)
assert user_state_events[0].new_state == "speaking"
check_timestamp(user_state_events[1].created_at - t_origin, 3.0, speed_factor=speed)
check_wallclock_event_timestamp(
user_state_events[1].created_at - t_origin, 3.0, speed_factor=speed
)
assert user_state_events[1].new_state == "listening"

# agent_state_changed
assert len(agent_state_events) == 4
assert agent_state_events[0].old_state == "initializing"
assert agent_state_events[0].new_state == "listening"
assert agent_state_events[1].new_state == "thinking"
check_timestamp(agent_state_events[1].created_at - t_origin, 3.0, speed_factor=speed)
check_wallclock_event_timestamp(
agent_state_events[1].created_at - t_origin, 3.0, speed_factor=speed
)
assert agent_state_events[2].new_state == "speaking"
check_timestamp(agent_state_events[2].created_at - t_origin, 3.5, speed_factor=speed)
check_wallclock_event_timestamp(
agent_state_events[2].created_at - t_origin, 3.5, speed_factor=speed
)
assert agent_state_events[3].new_state == "listening"
check_timestamp(agent_state_events[3].created_at - t_origin, 5.5, speed_factor=speed)
check_wallclock_event_timestamp(
agent_state_events[3].created_at - t_origin, 5.5, speed_factor=speed
)

# metrics
metrics_events = [ev for ev in metrics_events if ev.metrics.type != "vad_metrics"]
Expand Down Expand Up @@ -196,7 +214,9 @@ async def test_tool_call() -> None:
assert (
agent_state_events[3].new_state == "thinking"
) # from speaking to thinking when tool call is executed
check_timestamp(agent_state_events[3].created_at - t_origin, 5.5, speed_factor=speed)
check_wallclock_event_timestamp(
agent_state_events[3].created_at - t_origin, 5.5, speed_factor=speed
)
assert agent_state_events[4].new_state == "speaking"
assert agent_state_events[5].new_state == "listening"

Expand Down Expand Up @@ -271,13 +291,17 @@ async def test_interruption(
assert agent_state_events[1].new_state == "thinking"
assert agent_state_events[2].new_state == "speaking"
assert agent_state_events[3].new_state == "listening"
check_timestamp(
check_wallclock_event_timestamp(
agent_state_events[3].created_at - t_origin, expected_interruption_time, speed_factor=speed
)
assert agent_state_events[4].new_state == "thinking"
check_timestamp(agent_state_events[4].created_at - t_origin, 6.5, speed_factor=speed)
check_wallclock_event_timestamp(
agent_state_events[4].created_at - t_origin, 6.5, speed_factor=speed
)
assert agent_state_events[5].new_state == "listening"
check_timestamp(agent_state_events[5].created_at - t_origin, 6.5, speed_factor=speed)
check_wallclock_event_timestamp(
agent_state_events[5].created_at - t_origin, 6.5, speed_factor=speed
)

assert len(playback_finished_events) == 1
assert playback_finished_events[0].interrupted is True
Expand Down Expand Up @@ -423,7 +447,7 @@ async def test_interruption_before_speaking(
assert agent_state_events[0].new_state == "listening"
assert agent_state_events[1].new_state == "thinking" # without speaking state
assert agent_state_events[2].new_state == "listening"
check_timestamp(
check_wallclock_event_timestamp(
agent_state_events[2].created_at - t_origin, expected_interruption_time, speed_factor=speed
) # interrupted at 3.5s
assert agent_state_events[3].new_state == "thinking"
Expand Down Expand Up @@ -480,7 +504,9 @@ async def test_interrupt_before_speaking_with_pausable_audio() -> None:
assert agent_state_events[0].new_state == "listening"
assert agent_state_events[1].new_state == "thinking"
assert agent_state_events[2].new_state == "listening"
check_timestamp(agent_state_events[2].created_at - t_origin, 3.5, speed_factor=speed)
check_wallclock_event_timestamp(
agent_state_events[2].created_at - t_origin, 3.5, speed_factor=speed
)

# nothing audible reached the transport — the pause cleanup emits a single
# playback_finished with interrupted=True and playback_position=0
Expand Down Expand Up @@ -528,7 +554,9 @@ async def test_false_interruption_before_speaking_resumes() -> None:

# playout was postponed: the noise ran 3.0–3.3s, so "speaking" should fire at
# ~3.8s (resume on VAD EOS=3.3s + 0.5s min_silence_duration)
check_timestamp(speaking_events[0].created_at - t_origin, 3.8, speed_factor=speed)
check_wallclock_event_timestamp(
speaking_events[0].created_at - t_origin, 3.8, speed_factor=speed
)

# the reply plays to completion (not interrupted); playback_position covers the
# full audio duration
Expand Down Expand Up @@ -592,22 +620,34 @@ async def test_generate_reply() -> None:
assert conversation_events[1].item.type == "message"
assert conversation_events[1].item.role == "assistant"
assert conversation_events[1].item.text_content == "What can I do for you!"
check_timestamp(conversation_events[1].created_at - t_origin, 2.5, speed_factor=speed)
check_wallclock_event_timestamp(
conversation_events[1].created_at - t_origin, 2.5, speed_factor=speed
)
assert conversation_events[2].item.type == "message"
assert conversation_events[2].item.role == "user"
assert conversation_events[2].item.text_content == "bye"
check_timestamp(conversation_events[2].created_at - t_origin, 4.5, speed_factor=speed)
check_wallclock_event_timestamp(
conversation_events[2].created_at - t_origin, 4.5, speed_factor=speed
)
assert conversation_events[3].item.type == "message"
assert conversation_events[3].item.role == "assistant"
assert conversation_events[3].item.text_content == "session.say from on_user_turn_completed"
check_timestamp(
conversation_events[3].created_at - t_origin, 5.5, speed_factor=speed, max_abs_diff=1.0
check_wallclock_event_timestamp(
conversation_events[3].created_at - t_origin,
5.5,
speed_factor=speed,
max_early_diff=1.0,
max_late_diff=1.0,
)
assert conversation_events[4].item.type == "message"
assert conversation_events[4].item.role == "assistant"
assert conversation_events[4].item.text_content == "Goodbye! have a nice day!"
check_timestamp(
conversation_events[4].created_at - t_origin, 9.0, speed_factor=speed, max_abs_diff=1.0
check_wallclock_event_timestamp(
conversation_events[4].created_at - t_origin,
9.0,
speed_factor=speed,
max_early_diff=1.0,
max_late_diff=1.0,
)

# chat context
Expand Down Expand Up @@ -671,7 +711,9 @@ async def test_aec_warmup() -> None:
assert agent_state_events[2].new_state == "speaking"
# interruption delayed to 5.5s (EOU), not 4.5s (VAD was blocked by warmup)
speaking_to_listening = next(e for e in agent_state_events[3:] if e.new_state == "listening")
check_timestamp(speaking_to_listening.created_at - t_origin, 5.5, speed_factor=speed)
check_wallclock_event_timestamp(
speaking_to_listening.created_at - t_origin, 5.5, speed_factor=speed
)


async def test_start_boundary_does_not_block_vad_interruption() -> None:
Expand Down Expand Up @@ -833,6 +875,8 @@ async def test_preemptive_generation(preemptive_generation: dict, expected_laten
assert agent_state_events[1].new_state == "thinking"
assert agent_state_events[2].new_state == "speaking"
t_agent_start_speaking = agent_state_events[2].created_at
# This compares two event timestamps from the same turn, so scheduler
# latency largely cancels out and the latency invariant can stay strict.
check_timestamp(
t_agent_start_speaking - t_user_stop_speaking,
t_target=expected_latency,
Expand Down Expand Up @@ -1032,6 +1076,38 @@ def check_timestamp(
)


def check_wallclock_event_timestamp(
    t_event: float,
    t_target: float,
    *,
    speed_factor: float = 1.0,
    max_early_diff: float = EVENT_TIMESTAMP_EARLY_TOLERANCE,
    max_late_diff: float = EVENT_TIMESTAMP_LATE_TOLERANCE,
) -> None:
    """
    Assert that a wall-clock event timestamp lies within an asymmetric window.

    The fake pipeline in these tests runs faster than real time, so the measured
    offset is first multiplied by ``speed_factor`` before being compared with
    ``t_target``. Under CI load, asyncio callbacks can be dispatched slightly
    late after scaling, which is why the default late bound is looser than the
    early bound. An event that fires too early is still treated as a behavioral
    regression.

    Use this for created_at event timestamps; keep check_timestamp for
    deterministic pipeline durations, playback positions, and same-turn deltas.

    Args:
        t_event: Measured event offset, before speed scaling.
        t_target: Expected offset on the scaled timeline.
        speed_factor: Multiplier applied to ``t_event`` before comparison.
        max_early_diff: Largest allowed amount by which the scaled event may
            precede the target.
        max_late_diff: Largest allowed amount by which the scaled event may
            trail the target.

    Raises:
        AssertionError: If the scaled event falls outside the allowed window.
    """
    scaled_event = t_event * speed_factor
    offset = scaled_event - t_target

    # Always print the inputs so a failing CI run shows what was compared.
    print(
        f"check_wallclock_event_timestamp: t_event: {scaled_event}, t_target: {t_target}, "
        f"max_early_diff: {max_early_diff}, max_late_diff: {max_late_diff}"
    )

    not_too_early = offset >= -max_early_diff
    not_too_late = offset <= max_late_diff
    assert not_too_early and not_too_late, (
        f"event timestamp {scaled_event} differs from target {t_target} by {offset}; "
        f"allowed range is -{max_early_diff}/+{max_late_diff}"
    )


async def test_silent_tool_call_pause_state_does_not_leak_into_tool_reply() -> None:
speed = 5.0
actions = FakeActions()
Expand Down