diff --git a/README.md b/README.md index d9ecdedae6..a7dc56c2c4 100644 --- a/README.md +++ b/README.md @@ -144,6 +144,9 @@ A detailed setup guide for Windows, macOS, and Linux can be found in the Agent Z - The Web UI output is very clean, fluid, colorful, readable, and interactive; nothing is hidden. - You can load or save chats directly within the Web UI. - The same output you see in the terminal is automatically saved to an HTML file in **logs/** folder for every session. +- Voice is provided by the built-in `_kokoro_tts` and `_whisper_stt` plugins. +- Docker/bootstrap remains responsible for installing Kokoro, Whisper, `ffmpeg`, and related speech dependencies. +- If `_kokoro_tts` is disabled, spoken output falls back to the browser's native speech synthesis.  diff --git a/api/synthesize.py b/api/synthesize.py deleted file mode 100644 index 7957ef0d87..0000000000 --- a/api/synthesize.py +++ /dev/null @@ -1,96 +0,0 @@ -# api/synthesize.py - -from helpers.api import ApiHandler, Request, Response - -from helpers import runtime, settings, kokoro_tts - -class Synthesize(ApiHandler): - async def process(self, input: dict, request: Request) -> dict | Response: - text = input.get("text", "") - ctxid = input.get("ctxid", "") - - if ctxid: - context = self.use_context(ctxid) - - # if not await kokoro_tts.is_downloaded(): - # context.log.log(type="info", content="Kokoro TTS model is currently being initialized, please wait...") - - try: - # # Clean and chunk text for long responses - # cleaned_text = self._clean_text(text) - # chunks = self._chunk_text(cleaned_text) - - # if len(chunks) == 1: - # # Single chunk - return as before - # audio = await kokoro_tts.synthesize_sentences(chunks) - # return {"audio": audio, "success": True} - # else: - # # Multiple chunks - return as sequence - # audio_parts = [] - # for chunk in chunks: - # chunk_audio = await kokoro_tts.synthesize_sentences([chunk]) - # audio_parts.append(chunk_audio) - # return {"audio_parts": audio_parts, 
"success": True} - - # audio is chunked on the frontend for better flow - audio = await kokoro_tts.synthesize_sentences([text]) - return {"audio": audio, "success": True} - except Exception as e: - return {"error": str(e), "success": False} - - # def _clean_text(self, text: str) -> str: - # """Clean text by removing markdown, tables, code blocks, and other formatting""" - # # Remove code blocks - # text = re.sub(r'```[\s\S]*?```', '', text) - # text = re.sub(r'`[^`]*`', '', text) - - # # Remove markdown links - # text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text) - - # # Remove markdown formatting - # text = re.sub(r'[*_#]+', '', text) - - # # Remove tables (basic cleanup) - # text = re.sub(r'\|[^\n]*\|', '', text) - - # # Remove extra whitespace and newlines - # text = re.sub(r'\n+', ' ', text) - # text = re.sub(r'\s+', ' ', text) - - # # Remove URLs - # text = re.sub(r'https?://[^\s]+', '', text) - - # # Remove email addresses - # text = re.sub(r'\S+@\S+', '', text) - - # return text.strip() - - # def _chunk_text(self, text: str) -> list[str]: - # """Split text into manageable chunks for TTS""" - # # If text is short enough, return as single chunk - # if len(text) <= 300: - # return [text] - - # # Split into sentences first - # sentences = re.split(r'(?<=[.!?])\s+', text) - - # chunks = [] - # current_chunk = "" - - # for sentence in sentences: - # sentence = sentence.strip() - # if not sentence: - # continue - - # # If adding this sentence would make chunk too long, start new chunk - # if current_chunk and len(current_chunk + " " + sentence) > 300: - # chunks.append(current_chunk.strip()) - # current_chunk = sentence - # else: - # current_chunk += (" " if current_chunk else "") + sentence - - # # Add the last chunk if it has content - # if current_chunk.strip(): - # chunks.append(current_chunk.strip()) - - # return chunks if chunks else [text] \ No newline at end of file diff --git a/api/transcribe.py b/api/transcribe.py deleted file mode 100644 index 
93e1cd5927..0000000000 --- a/api/transcribe.py +++ /dev/null @@ -1,18 +0,0 @@ -from helpers.api import ApiHandler, Request, Response - -from helpers import runtime, settings, whisper - -class Transcribe(ApiHandler): - async def process(self, input: dict, request: Request) -> dict | Response: - audio = input.get("audio") - ctxid = input.get("ctxid", "") - - if ctxid: - context = self.use_context(ctxid) - - # if not await whisper.is_downloaded(): - # context.log.log(type="info", content="Whisper STT model is currently being initialized, please wait...") - - set = settings.get_settings() - result = await whisper.transcribe(set["stt_model_size"], audio) # type: ignore - return result diff --git a/docs/guides/usage.md b/docs/guides/usage.md index c77e672f11..99c53daa02 100644 --- a/docs/guides/usage.md +++ b/docs/guides/usage.md @@ -748,12 +748,19 @@ If you encounter issues with the tunnel feature: > Combine tunneling with authentication for secure remote access to your Agent Zero instance from any device, including mobile phones and tablets. ## Voice Interface -Agent Zero provides both Text-to-Speech (TTS) and Speech-to-Text (STT) capabilities for natural voice interaction: +Agent Zero provides both Text-to-Speech (TTS) and Speech-to-Text (STT) capabilities for natural voice interaction through built-in plugins: + +- `_kokoro_tts` handles server-side Kokoro speech synthesis when enabled +- `_whisper_stt` handles server-side Whisper transcription and injects the microphone UI when enabled +- Browser-native `speechSynthesis` remains the fallback output path when `_kokoro_tts` is disabled + +Use the Agent Plugins section in Settings to enable or disable either plugin independently. 
### Text-to-Speech Enable voice responses from agents: * Toggle the "Speech" switch in the Preferences section of the sidebar -* Agents will use your system's built-in voice synthesizer to speak their messages +* If `_kokoro_tts` is enabled, agents will use Kokoro for spoken output +* If `_kokoro_tts` is disabled, agents will use your browser's built-in voice synthesizer * Click the "Stop Speech" button above the input area to immediately stop any ongoing speech * You can also click the speech button when hovering over messages to speak individual messages or their parts @@ -761,7 +768,7 @@ Enable voice responses from agents: - The interface allows users to stop speech at any time if a response is too lengthy or if they wish to intervene during the conversation. -The TTS uses a standard voice interface provided by modern browsers, which may sound robotic but is effective and does not require complex AI models. This ensures low latency and quick responses across various platforms, including mobile devices. +Kokoro gives you a local container-side TTS path when the plugin is enabled. When it is disabled, Agent Zero falls back to the browser voice stack, which is lower-friction and works well across devices. > [!TIP] @@ -771,19 +778,20 @@ The TTS uses a standard voice interface provided by modern browsers, which may s > - Creating a more interactive experience ### Speech-to-Text -Send voice messages to agents using OpenAI's Whisper model (does not require OpenAI API key!): +Send voice messages to agents using Whisper (does not require an OpenAI API key): 1. Click the microphone button in the input area to start recording + - The microphone button only appears when `_whisper_stt` is enabled 2. 
The button color indicates the current status: - Grey: Inactive - - Red: Listening - - Green: Recording - - Teal: Waiting - - Cyan (pulsing): Processing + - Teal: Listening + - Red: Recording + - Amber: Waiting + - Purple: Processing or activating Users can adjust settings such as silence threshold and message duration before sending to optimize their interaction experience. -Configure STT settings in the Settings page: +Configure Whisper STT from the plugin settings screen in the Voice section or from Agent Plugins: * **Model Size:** Choose between Base (74M, English) or other models - Note: Only Large and Turbo models support multiple languages * **Language Code:** Set your preferred language (e.g., 'en', 'fr', 'it', 'cz') @@ -795,9 +803,8 @@ Configure STT settings in the Settings page:  > [!IMPORTANT] -> All STT and TTS functionalities operate locally within the Docker container, -> ensuring that no data is transmitted to external servers or OpenAI APIs. This -> enhances user privacy while maintaining functionality. +> Whisper STT and Kokoro TTS operate locally within the Docker/container runtime when their plugins are enabled. +> Browser fallback TTS runs locally in the browser. No voice path requires OpenAI APIs. ## Mathematical Expressions * **Complex Mathematics:** Supports full KaTeX syntax for: diff --git a/docs/setup/installation.md b/docs/setup/installation.md index 25ed3e141b..dfc973ef21 100644 --- a/docs/setup/installation.md +++ b/docs/setup/installation.md @@ -352,11 +352,13 @@ Use `claude-sonnet-4-5` for Anthropic, but use `anthropic/claude-sonnet-4-5` for > [!NOTE] > Agent Zero uses a local embedding model by default (runs on CPU), but you can switch to OpenAI embeddings like `text-embedding-3-small` or `text-embedding-3-large` if preferred. 
-### Speech to Text Options +### Built-in Voice Plugins -- **Model Size:** Choose the speech recognition model size -- **Language Code:** Set the primary language for voice recognition -- **Silence Settings:** Configure silence threshold, duration, and timeout parameters for voice input +- Agent Zero ships Whisper STT as the built-in `_whisper_stt` plugin and Kokoro TTS as the built-in `_kokoro_tts` plugin. +- Docker/bootstrap remains responsible for installing the required speech dependencies such as `ffmpeg`, Kokoro, Whisper, and `soundfile`. +- Both plugins can be enabled or disabled independently from the Agent Plugins section in the Web UI. +- Whisper model size, language, and silence behavior are configured from the plugin settings screen. +- If `_kokoro_tts` is disabled, spoken output falls back to the browser's native speech synthesis instead of the container runtime. ### API Keys diff --git a/helpers/kokoro_tts.py b/helpers/kokoro_tts.py deleted file mode 100644 index 7dd7c12f63..0000000000 --- a/helpers/kokoro_tts.py +++ /dev/null @@ -1,127 +0,0 @@ -# kokoro_tts.py - -import base64 -import io -import warnings -import asyncio -import soundfile as sf -from helpers import runtime -from helpers.print_style import PrintStyle -from helpers.notification import NotificationManager, NotificationType, NotificationPriority - -warnings.filterwarnings("ignore", category=FutureWarning) -warnings.filterwarnings("ignore", category=UserWarning) - -_pipeline = None -_voice = "am_puck,am_onyx" -_speed = 1.1 -is_updating_model = False - - -async def preload(): - try: - # return await runtime.call_development_function(_preload) - return await _preload() - except Exception as e: - # if not runtime.is_development(): - raise e - # Fallback to direct execution if RFC fails in development - # PrintStyle.standard("RFC failed, falling back to direct execution...") - # return await _preload() - - -async def _preload(): - global _pipeline, is_updating_model - - while 
is_updating_model: - await asyncio.sleep(0.1) - - try: - is_updating_model = True - if not _pipeline: - NotificationManager.send_notification( - NotificationType.INFO, - NotificationPriority.NORMAL, - "Loading Kokoro TTS model...", - display_time=99, - group="kokoro-preload") - PrintStyle.standard("Loading Kokoro TTS model...") - from kokoro import KPipeline - _pipeline = KPipeline(lang_code="a", repo_id="hexgrad/Kokoro-82M") - NotificationManager.send_notification( - NotificationType.INFO, - NotificationPriority.NORMAL, - "Kokoro TTS model loaded.", - display_time=2, - group="kokoro-preload") - finally: - is_updating_model = False - - -async def is_downloading(): - try: - # return await runtime.call_development_function(_is_downloading) - return _is_downloading() - except Exception as e: - # if not runtime.is_development(): - raise e - # Fallback to direct execution if RFC fails in development - # return _is_downloading() - - -def _is_downloading(): - return is_updating_model - -async def is_downloaded(): - try: - # return await runtime.call_development_function(_is_downloaded) - return _is_downloaded() - except Exception as e: - # if not runtime.is_development(): - raise e - # Fallback to direct execution if RFC fails in development - # return _is_downloaded() - -def _is_downloaded(): - return _pipeline is not None - - -async def synthesize_sentences(sentences: list[str]): - """Generate audio for multiple sentences and return concatenated base64 audio""" - try: - # return await runtime.call_development_function(_synthesize_sentences, sentences) - return await _synthesize_sentences(sentences) - except Exception as e: - # if not runtime.is_development(): - raise e - # Fallback to direct execution if RFC fails in development - # return await _synthesize_sentences(sentences) - - -async def _synthesize_sentences(sentences: list[str]): - await _preload() - - combined_audio = [] - - try: - for sentence in sentences: - if sentence.strip(): - segments = 
_pipeline(sentence.strip(), voice=_voice, speed=_speed) # type: ignore - segment_list = list(segments) - - for segment in segment_list: - audio_tensor = segment.audio - audio_numpy = audio_tensor.detach().cpu().numpy() # type: ignore - combined_audio.extend(audio_numpy) - - # Convert combined audio to bytes - buffer = io.BytesIO() - sf.write(buffer, combined_audio, 24000, format="WAV") - audio_bytes = buffer.getvalue() - - # Return base64 encoded audio - return base64.b64encode(audio_bytes).decode("utf-8") - - except Exception as e: - PrintStyle.error(f"Error in Kokoro TTS synthesis: {e}") - raise \ No newline at end of file diff --git a/helpers/mcp_handler.py b/helpers/mcp_handler.py index 439127b218..cd9869364b 100644 --- a/helpers/mcp_handler.py +++ b/helpers/mcp_handler.py @@ -42,6 +42,7 @@ from helpers import dirty_json from helpers.print_style import PrintStyle from helpers.tool import Tool, Response +from helpers.extension import call_extensions_async def normalize_name(name: str) -> str: @@ -1105,10 +1106,21 @@ async def _create_stdio_transport( # Check if this is a streaming HTTP type if _is_streaming_http_type(server.type): # Use streamable HTTP client + # Before passing headers to httpx, allow extensions to resolve placeholders + resolved_headers = await call_extensions_async( + "resolve_mcp_server_headers", + agent=None, + server_name=server.name, + headers=dict(server.headers or {}), + ) + if resolved_headers is not None: + headers_to_use = resolved_headers + else: + headers_to_use = server.headers transport_result = await current_exit_stack.enter_async_context( streamablehttp_client( url=server.url, - headers=server.headers, + headers=headers_to_use, timeout=timedelta(seconds=init_timeout), sse_read_timeout=timedelta(seconds=tool_timeout), httpx_client_factory=client_factory, @@ -1123,10 +1135,21 @@ async def _create_stdio_transport( return read_stream, write_stream else: # Use traditional SSE client (default behavior) + # Before passing headers to 
httpx, allow extensions to resolve placeholders + resolved_headers = await call_extensions_async( + "resolve_mcp_server_headers", + agent=None, + server_name=server.name, + headers=dict(server.headers or {}), + ) + if resolved_headers is not None: + headers_to_use = resolved_headers + else: + headers_to_use = server.headers stdio_transport = await current_exit_stack.enter_async_context( sse_client( url=server.url, - headers=server.headers, + headers=headers_to_use, timeout=init_timeout, sse_read_timeout=tool_timeout, httpx_client_factory=client_factory, diff --git a/helpers/settings.py b/helpers/settings.py index cfa5873479..4eac5a8005 100644 --- a/helpers/settings.py +++ b/helpers/settings.py @@ -7,13 +7,14 @@ from typing import Any, Literal, TypedDict, cast, TypeVar import models -from helpers import runtime, whisper, defer, git, subagents +from helpers import runtime, defer, git, subagents from . import files, dotenv from helpers.print_style import PrintStyle from helpers.providers import get_providers, FieldOption as ProvidersFO from helpers.secrets import get_default_secrets_manager from helpers import dirty_json from helpers.notification import NotificationManager, NotificationType, NotificationPriority +from helpers.extension import extensible T = TypeVar('T') @@ -78,14 +79,6 @@ class Settings(TypedDict): websocket_server_restart_enabled: bool uvicorn_access_logs_enabled: bool - stt_model_size: str - stt_language: str - stt_silence_threshold: float - stt_silence_duration: int - stt_waiting_timeout: int - - tts_kokoro: bool - mcp_servers: str mcp_client_init_timeout: int mcp_client_tool_timeout: int @@ -151,7 +144,6 @@ class SettingsOutputAdditional(TypedDict): embedding_providers: list[ModelProvider] agent_subdirs: list[FieldOption] knowledge_subdirs: list[FieldOption] - stt_models: list[FieldOption] is_dockerized: bool runtime_settings: dict[str, Any] @@ -196,14 +188,6 @@ def convert_out(settings: Settings) -> SettingsOutput: if item["key"] != "_example"], 
knowledge_subdirs=[{"value": subdir, "label": subdir} for subdir in files.get_subdirectories("knowledge", exclude="default")], - stt_models=[ - {"value": "tiny", "label": "Tiny (39M, English)"}, - {"value": "base", "label": "Base (74M, English)"}, - {"value": "small", "label": "Small (244M, English)"}, - {"value": "medium", "label": "Medium (769M, English)"}, - {"value": "large", "label": "Large (1.5B, Multilingual)"}, - {"value": "turbo", "label": "Turbo (Multilingual)"}, - ], runtime_settings={}, ), ) @@ -225,7 +209,6 @@ def convert_out(settings: Settings) -> SettingsOutput: additional["agent_subdirs"] = _ensure_option_present(additional.get("agent_subdirs"), current.get("agent_profile")) additional["knowledge_subdirs"] = _ensure_option_present(additional.get("knowledge_subdirs"), current.get("agent_knowledge_subdir")) - additional["stt_models"] = _ensure_option_present(additional.get("stt_models"), current.get("stt_model_size")) # masked api keys providers = get_providers("chat") + get_providers("embedding") @@ -312,6 +295,7 @@ def set_runtime_settings_snapshot(settings: Settings) -> None: _runtime_settings_snapshot = settings.copy() +@extensible def set_settings(settings: Settings, apply: bool = True): global _settings previous = _settings @@ -322,6 +306,7 @@ def set_settings(settings: Settings, apply: bool = True): return reload_settings() +@extensible def set_settings_delta(delta: dict, apply: bool = True): current = get_settings() new = {**current, **delta} @@ -470,12 +455,6 @@ def get_default_settings() -> Settings: rfc_port_http=get_default_value("rfc_port_http", 55080), websocket_server_restart_enabled=get_default_value("websocket_server_restart_enabled", True), uvicorn_access_logs_enabled=get_default_value("uvicorn_access_logs_enabled", False), - stt_model_size=get_default_value("stt_model_size", "base"), - stt_language=get_default_value("stt_language", "en"), - stt_silence_threshold=get_default_value("stt_silence_threshold", 0.3), - 
stt_silence_duration=get_default_value("stt_silence_duration", 1000), - stt_waiting_timeout=get_default_value("stt_waiting_timeout", 2000), - tts_kokoro=get_default_value("tts_kokoro", True), mcp_servers=get_default_value("mcp_servers", '{\n "mcpServers": {}\n}'), mcp_client_init_timeout=get_default_value("mcp_client_init_timeout", 10), mcp_client_tool_timeout=get_default_value("mcp_client_tool_timeout", 120), @@ -505,12 +484,6 @@ def _apply_settings(previous: Settings | None): agent.config = ctx.config agent = agent.get_data(agent.DATA_NAME_SUBORDINATE) - # reload whisper model if necessary - if not previous or _settings["stt_model_size"] != previous["stt_model_size"]: - task = defer.DeferredTask().start_task( - whisper.preload, _settings["stt_model_size"] - ) # TODO overkill, replace with background task - # update mcp settings if necessary if not previous or _settings["mcp_servers"] != previous["mcp_servers"]: from helpers.mcp_handler import MCPConfig diff --git a/helpers/whisper.py b/helpers/whisper.py deleted file mode 100644 index 8a0b7fc794..0000000000 --- a/helpers/whisper.py +++ /dev/null @@ -1,96 +0,0 @@ -import base64 -import warnings -import whisper -import tempfile -import asyncio -from helpers import runtime, rfc, settings, files -from helpers.print_style import PrintStyle -from helpers.notification import NotificationManager, NotificationType, NotificationPriority - -# Suppress FutureWarning from torch.load -warnings.filterwarnings("ignore", category=FutureWarning) - -_model = None -_model_name = "" -is_updating_model = False # Tracks whether the model is currently updating - -async def preload(model_name:str): - try: - # return await runtime.call_development_function(_preload, model_name) - return await _preload(model_name) - except Exception as e: - # if not runtime.is_development(): - raise e - -async def _preload(model_name:str): - global _model, _model_name, is_updating_model - - while is_updating_model: - await asyncio.sleep(0.1) - - try: - 
is_updating_model = True - if not _model or _model_name != model_name: - NotificationManager.send_notification( - NotificationType.INFO, - NotificationPriority.NORMAL, - "Loading Whisper model...", - display_time=99, - group="whisper-preload") - PrintStyle.standard(f"Loading Whisper model: {model_name}") - _model = whisper.load_model(name=model_name, download_root=files.get_abs_path("/tmp/models/whisper")) # type: ignore - _model_name = model_name - NotificationManager.send_notification( - NotificationType.INFO, - NotificationPriority.NORMAL, - "Whisper model loaded.", - display_time=2, - group="whisper-preload") - finally: - is_updating_model = False - -async def is_downloading(): - # return await runtime.call_development_function(_is_downloading) - return _is_downloading() - -def _is_downloading(): - return is_updating_model - -async def is_downloaded(): - try: - # return await runtime.call_development_function(_is_downloaded) - return _is_downloaded() - except Exception as e: - # if not runtime.is_development(): - raise e - # Fallback to direct execution if RFC fails in development - # return _is_downloaded() - -def _is_downloaded(): - return _model is not None - -async def transcribe(model_name:str, audio_bytes_b64: str): - # return await runtime.call_development_function(_transcribe, model_name, audio_bytes_b64) - return await _transcribe(model_name, audio_bytes_b64) - - -async def _transcribe(model_name:str, audio_bytes_b64: str): - await _preload(model_name) - - # Decode audio bytes if encoded as a base64 string - audio_bytes = base64.b64decode(audio_bytes_b64) - - # Create temp audio file - import os - with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as audio_file: - audio_file.write(audio_bytes) - temp_path = audio_file.name - try: - # Transcribe the audio file - result = _model.transcribe(temp_path, fp16=False) # type: ignore - return result - finally: - try: - os.remove(temp_path) - except Exception: - pass # ignore errors during cleanup 
diff --git a/plugins/_browser_agent/extensions/webui/get_message_handler/browser-agent-handler.js b/plugins/_browser_agent/extensions/webui/get_message_handler/browser-agent-handler.js index 5177c13945..82767f2620 100644 --- a/plugins/_browser_agent/extensions/webui/get_message_handler/browser-agent-handler.js +++ b/plugins/_browser_agent/extensions/webui/get_message_handler/browser-agent-handler.js @@ -3,7 +3,7 @@ import { copyToClipboard, } from "/components/messages/action-buttons/simple-action-buttons.js"; import { store as stepDetailStore } from "/components/modals/process-step-detail/step-detail-store.js"; -import { store as speechStore } from "/components/chat/speech/speech-store.js"; +import { ttsService } from "/js/tts-service.js"; import { buildDetailPayload, cleanStepTitle, @@ -36,7 +36,7 @@ function drawMessageBrowserAgent({ buildDetailPayload(arguments[0], { headerLabels: [] }), ), ), - createActionButton("speak", "", () => speechStore.speak(answerText)), + createActionButton("speak", "", () => ttsService.speak(answerText)), createActionButton("copy", "", () => copyToClipboard(answerText)), ].filter(Boolean) : []; diff --git a/plugins/_kokoro_tts/README.md b/plugins/_kokoro_tts/README.md new file mode 100644 index 0000000000..28b3e61b90 --- /dev/null +++ b/plugins/_kokoro_tts/README.md @@ -0,0 +1,19 @@ +# Kokoro TTS + +Built-in speech synthesis plugin backed by Kokoro. + +## Behavior + +- Registers Kokoro as the active TTS provider when the plugin is enabled. +- Keeps browser-native `speechSynthesis` as the fallback path when disabled. +- Keeps Python dependencies on the core Docker/bootstrap path. This plugin does not install packages or binaries on demand. 
+ +## Config + +- `voice`: Kokoro voice identifier +- `speed`: Kokoro playback speed multiplier + +## Routes + +- `POST /api/plugins/_kokoro_tts/synthesize` +- `POST /api/plugins/_kokoro_tts/status` diff --git a/plugins/_kokoro_tts/api/status.py b/plugins/_kokoro_tts/api/status.py new file mode 100644 index 0000000000..3b1321972e --- /dev/null +++ b/plugins/_kokoro_tts/api/status.py @@ -0,0 +1,31 @@ +import importlib.metadata + +from helpers.api import ApiHandler, Request, Response +from plugins._kokoro_tts.helpers import migration, runtime + + +class Status(ApiHandler): + async def process(self, input: dict, request: Request) -> dict | Response: + migration.ensure_migrated() + + package_version = "" + package_error = "" + try: + package_version = importlib.metadata.version("kokoro") + except Exception as e: + package_error = str(e) + + return { + "plugin": "_kokoro_tts", + "enabled": runtime.is_globally_enabled(), + "config": runtime.get_config(), + "model": { + "ready": await runtime.is_downloaded(), + "loading": await runtime.is_downloading(), + }, + "package": { + "version": package_version, + "error": package_error, + }, + "fallback": "Browser-native speechSynthesis remains the fallback when Kokoro is disabled.", + } diff --git a/plugins/_kokoro_tts/api/synthesize.py b/plugins/_kokoro_tts/api/synthesize.py new file mode 100644 index 0000000000..5530f90039 --- /dev/null +++ b/plugins/_kokoro_tts/api/synthesize.py @@ -0,0 +1,22 @@ +from helpers.api import ApiHandler, Request, Response +from plugins._kokoro_tts.helpers import runtime + + +class Synthesize(ApiHandler): + async def process(self, input: dict, request: Request) -> dict | Response: + if not runtime.is_globally_enabled(): + return Response(status=409, response="Kokoro TTS plugin is disabled") + + text = str(input.get("text") or "").strip() + if not text: + return Response(status=400, response="Missing text") + + try: + audio = await runtime.synthesize_sentences([text]) + return { + "success": True, + 
"audio": audio, + "mime_type": "audio/wav", + } + except Exception as e: + return {"success": False, "error": str(e)} diff --git a/plugins/_kokoro_tts/default_config.yaml b/plugins/_kokoro_tts/default_config.yaml new file mode 100644 index 0000000000..85be3ad699 --- /dev/null +++ b/plugins/_kokoro_tts/default_config.yaml @@ -0,0 +1,2 @@ +voice: am_puck,am_onyx +speed: 1.1 diff --git a/plugins/_kokoro_tts/extensions/webui/page-head/runtime.html b/plugins/_kokoro_tts/extensions/webui/page-head/runtime.html new file mode 100644 index 0000000000..f4142ada17 --- /dev/null +++ b/plugins/_kokoro_tts/extensions/webui/page-head/runtime.html @@ -0,0 +1,5 @@ + diff --git a/plugins/_kokoro_tts/extensions/webui/voice-settings-main/kokoro-card.html b/plugins/_kokoro_tts/extensions/webui/voice-settings-main/kokoro-card.html new file mode 100644 index 0000000000..ff4a2ebb02 --- /dev/null +++ b/plugins/_kokoro_tts/extensions/webui/voice-settings-main/kokoro-card.html @@ -0,0 +1,105 @@ +
auto to let Whisper detect it.
+ and tags with placeholder before extracting text
- doc.querySelectorAll('pre, code').forEach(el => {
- el.textContent = codePlaceholder;
- });
-
- // Extract text content (this strips all HTML tags properly)
- text = doc.body.textContent || "";
- } catch (e) {
- // Fallback: simple tag stripping if DOMParser fails
- console.warn("[Speech Store] DOMParser failed, using fallback:", e);
- text = text.replace(/]*>[\s\S]*?<\/pre>/gi, codePlaceholder);
- text = text.replace(/]*>[\s\S]*?<\/code>/gi, codePlaceholder);
- text = text.replace(/<[^>]+>/g, ''); // strip remaining tags
- }
-
- // Remove markdown links: [label](url) → label
- text = text.replace(/\[([^\]]+)\]\([^\)]+\)/g, "$1");
-
- // Remove markdown formatting: *, _, #
- text = text.replace(/[*_#]+/g, "");
-
- // Handle tables - both complete and partial
- // Check if text contains a table-like pattern
- if (text.includes("|")) {
- // Find consecutive lines with | characters (table rows)
- const tableLines = text
- .split("\n")
- .filter((line) => line.includes("|") && line.trim().startsWith("|"));
- if (tableLines.length > 0) {
- // Replace each table line with a placeholder
- for (const line of tableLines) {
- text = text.replace(line, tablePlaceholder);
- }
- } else {
- // Just handle individual table rows
- text = text.replace(/\|[^\n]*\|/g, tablePlaceholder);
- }
- }
-
- // Remove emojis and private unicode blocks
- text = text.replace(
- /([\u2700-\u27BF]|[\uE000-\uF8FF]|\uD83C[\uDC00-\uDFFF]|\uD83D[\uDC00-\uDFFF]|[\u2011-\u26FF]|\uD83E[\uDD10-\uDDFF])/g,
- ""
- );
-
- // Replace URLs with just the domain name
- text = text.replace(/https?:\/\/[^\s]+/g, (match) => {
- try {
- return new URL(match).hostname;
- } catch {
- return "";
- }
- });
-
- // Remove email addresses
- // text = text.replace(/\S+@\S+/g, "");
-
- // Replace UUIDs with 'UUID'
- text = text.replace(
- /[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/g,
- "UUID"
- );
-
- // Collapse multiple spaces/tabs to a single space, but preserve newlines
- text = text.replace(/[ \t]+/g, " ");
-
- // Function to merge consecutive placeholders of any type
- function mergePlaceholders(txt, placeholder, replacement) {
- // Create regex for consecutive placeholders (with possible whitespace between)
- const regex = new RegExp(placeholder + "\\s*" + placeholder, "g");
- // Merge consecutive placeholders until no more found
- while (regex.test(txt)) {
- txt = txt.replace(regex, placeholder);
- }
- // Replace all remaining placeholders with human-readable text
- return txt.replace(new RegExp(placeholder, "g"), replacement);
- }
-
- // Apply placeholder merging for both types
- text = mergePlaceholders(text, codePlaceholder, "See code attached ...");
- text = mergePlaceholders(text, tablePlaceholder, "See table attached ...");
-
- // Trim leading/trailing whitespace
- text = text.trim();
-
- return text;
- },
-
- // Initialize microphone input
- async initMicrophone() {
- if (this.microphoneInput) return this.microphoneInput;
-
- this.microphoneInput = new MicrophoneInput(async (text, isFinal) => {
- if (isFinal) {
- this.sendMessage(text);
- }
- });
-
- const initialized = await this.microphoneInput.initialize();
- return initialized ? this.microphoneInput : null;
- },
-
- async sendMessage(text) {
- text = "(voice) " + text;
- updateChatInput(text);
- if (!this.microphoneInput.messageSent) {
- this.microphoneInput.messageSent = true;
- await sendMessage();
- }
- },
-
- // Request microphone permission - delegate to MicrophoneInput
- async requestMicrophonePermission() {
- return this.microphoneInput
- ? this.microphoneInput.requestPermission()
- : MicrophoneInput.prototype.requestPermission.call(null);
- },
-};
-
-// Microphone Input Class (simplified for store integration)
// Microphone Input Class (simplified for store integration)
//
// Implements a small state machine over Status: an AnalyserNode watches
// the input level, a MediaRecorder collects audio chunks while the user
// speaks, and each finished utterance is POSTed to /transcribe for
// speech-to-text. Transitions: LISTENING -> RECORDING (level above
// threshold) -> WAITING (silence) -> PROCESSING (timeout) -> LISTENING.
class MicrophoneInput {
  // updateCallback(text, isFinal) receives each accepted transcription.
  constructor(updateCallback) {
    this.mediaRecorder = null;
    this.audioChunks = [];
    // Most recent chunk captured while merely LISTENING, kept so the
    // onset of speech is not lost. Fix: initialize to null (not []) —
    // an empty array is truthy and would be pushed into audioChunks as
    // a bogus Blob part by the merge branch in ondataavailable.
    this.lastChunk = null;
    this.updateCallback = updateCallback;
    this.messageSent = false;
    this.audioContext = null;
    this.mediaStreamSource = null;
    this.analyserNode = null;
    this._status = Status.INACTIVE;
    this.lastAudioTime = null;
    this.waitingTimer = null;
    this.silenceStartTime = null;
    this.hasStartedRecording = false;
    this.analysisFrame = null;
  }

  get status() {
    return this._status;
  }

  // Assigning status runs the transition side effects exactly once per
  // actual change.
  set status(newStatus) {
    if (this._status === newStatus) return;

    const oldStatus = this._status;
    this._status = newStatus;
    console.log(`Mic status changed from ${oldStatus} to ${newStatus}`);

    this.handleStatusChange(oldStatus, newStatus);
  }

  // Open the microphone (honoring the device selected in settings) and
  // wire up the MediaRecorder plus level analysis. Returns true on
  // success, false when access fails (a toast is shown).
  async initialize() {
    // Set status to activating at the start of initialization
    this.status = Status.ACTIVATING;
    try {
      // get selected device from microphone settings
      const selectedDevice = microphoneSettingStore.getSelectedDevice();

      const stream = await navigator.mediaDevices.getUserMedia({
        audio: {
          deviceId:
            selectedDevice && selectedDevice.deviceId
              ? { exact: selectedDevice.deviceId }
              : undefined,
          echoCancellation: true,
          noiseSuppression: true,
          channelCount: 1,
        },
      });

      this.mediaRecorder = new MediaRecorder(stream);
      this.mediaRecorder.ondataavailable = (event) => {
        if (
          event.data.size > 0 &&
          (this.status === Status.RECORDING || this.status === Status.WAITING)
        ) {
          // Prepend the chunk buffered while LISTENING so the first
          // moments of speech are included in the transcription.
          if (this.lastChunk) {
            this.audioChunks.push(this.lastChunk);
            this.lastChunk = null;
          }
          this.audioChunks.push(event.data);
        } else if (this.status === Status.LISTENING) {
          this.lastChunk = event.data;
        }
      };

      this.setupAudioAnalysis(stream);
      return true;
    } catch (error) {
      console.error("Microphone initialization error:", error);
      toast("Failed to access microphone. Please check permissions.", "error");
      return false;
    }
  }

  // Dispatch the side effects of a state transition.
  handleStatusChange(oldStatus, newStatus) {
    // The buffered pre-speech chunk is only meaningful while recording.
    if (newStatus != Status.RECORDING) {
      this.lastChunk = null;
    }

    switch (newStatus) {
      case Status.INACTIVE:
        this.handleInactiveState();
        break;
      case Status.LISTENING:
        this.handleListeningState();
        break;
      case Status.RECORDING:
        this.handleRecordingState();
        break;
      case Status.WAITING:
        this.handleWaitingState();
        break;
      case Status.PROCESSING:
        this.handleProcessingState();
        break;
    }
  }

  // Stop everything: recording, analysis loop, and any pending timer.
  handleInactiveState() {
    this.stopRecording();
    this.stopAudioAnalysis();
    if (this.waitingTimer) {
      clearTimeout(this.waitingTimer);
      this.waitingTimer = null;
    }
  }

  // Reset per-utterance state and start watching the audio level.
  handleListeningState() {
    this.stopRecording();
    this.audioChunks = [];
    this.hasStartedRecording = false;
    this.silenceStartTime = null;
    this.lastAudioTime = null;
    this.messageSent = false;
    this.startAudioAnalysis();
  }

  // Begin capturing; cancel a pending WAITING timeout (speech resumed).
  handleRecordingState() {
    if (!this.hasStartedRecording && this.mediaRecorder.state !== "recording") {
      this.hasStartedRecording = true;
      // 1000 ms timeslice: emit chunks every second while recording.
      this.mediaRecorder.start(1000);
      console.log("Speech started");
    }
    if (this.waitingTimer) {
      clearTimeout(this.waitingTimer);
      this.waitingTimer = null;
    }
  }

  // After silence, wait stt_waiting_timeout ms before processing; the
  // timer is cancelled if speech resumes (back to RECORDING).
  handleWaitingState() {
    this.waitingTimer = setTimeout(() => {
      if (this.status === Status.WAITING) {
        this.status = Status.PROCESSING;
      }
    }, store.stt_waiting_timeout);
  }

  handleProcessingState() {
    this.stopRecording();
    this.process();
  }

  // Build the WebAudio graph used only for level measurement.
  setupAudioAnalysis(stream) {
    this.audioContext = new (window.AudioContext ||
      window.webkitAudioContext)();
    this.mediaStreamSource = this.audioContext.createMediaStreamSource(stream);
    this.analyserNode = this.audioContext.createAnalyser();
    this.analyserNode.fftSize = 2048;
    this.analyserNode.minDecibels = -90;
    this.analyserNode.maxDecibels = -10;
    this.analyserNode.smoothingTimeConstant = 0.85;
    this.mediaStreamSource.connect(this.analyserNode);
  }

  // Per-animation-frame RMS level check that drives the state machine.
  startAudioAnalysis() {
    const analyzeFrame = () => {
      if (this.status === Status.INACTIVE) return;

      const dataArray = new Uint8Array(this.analyserNode.fftSize);
      this.analyserNode.getByteTimeDomainData(dataArray);

      // RMS of the time-domain samples, normalized to [-1, 1].
      let sum = 0;
      for (let i = 0; i < dataArray.length; i++) {
        const amplitude = (dataArray[i] - 128) / 128;
        sum += amplitude * amplitude;
      }
      const rms = Math.sqrt(sum / dataArray.length);
      const now = Date.now();

      // Update status based on audio level (ignore if TTS is speaking)
      if (rms > this.densify(store.stt_silence_threshold)) {
        this.lastAudioTime = now;
        this.silenceStartTime = null;

        if (
          (this.status === Status.LISTENING ||
            this.status === Status.WAITING) &&
          !store.isSpeaking
        ) {
          this.status = Status.RECORDING;
        }
      } else if (this.status === Status.RECORDING) {
        if (!this.silenceStartTime) {
          this.silenceStartTime = now;
        }

        const silenceDuration = now - this.silenceStartTime;
        if (silenceDuration >= store.stt_silence_duration) {
          this.status = Status.WAITING;
        }
      }

      this.analysisFrame = requestAnimationFrame(analyzeFrame);
    };

    this.analysisFrame = requestAnimationFrame(analyzeFrame);
  }

  stopAudioAnalysis() {
    if (this.analysisFrame) {
      cancelAnimationFrame(this.analysisFrame);
      this.analysisFrame = null;
    }
  }

  stopRecording() {
    if (this.mediaRecorder?.state === "recording") {
      this.mediaRecorder.stop();
      this.hasStartedRecording = false;
    }
  }

  // Map the linear [0,1] threshold slider onto an exponential curve so
  // small slider values correspond to usefully small RMS thresholds.
  densify(x) {
    return Math.exp(-5 * (1 - x));
  }

  // Send the recorded chunks to /transcribe and hand the accepted text
  // to the update callback; always returns to LISTENING afterwards.
  async process() {
    if (this.audioChunks.length === 0) {
      this.status = Status.LISTENING;
      return;
    }

    const audioBlob = new Blob(this.audioChunks, { type: "audio/wav" });
    const base64 = await this.convertBlobToBase64Wav(audioBlob);

    try {
      const result = await sendJsonData("/transcribe", { audio: base64 });
      const text = this.filterResult(result.text || "");

      if (text) {
        console.log("Transcription:", text);
        // Fix: forward the filtered/trimmed text; the original passed
        // the raw result.text, discarding filterResult's cleanup.
        await this.updateCallback(text, true);
      }
    } catch (error) {
      window.toastFetchError("Transcription error", error);
      console.error("Transcription error:", error);
    } finally {
      this.audioChunks = [];
      this.status = Status.LISTENING;
    }
  }

  // Resolve to the base64 payload of the blob (data-URL body only).
  convertBlobToBase64Wav(audioBlob) {
    return new Promise((resolve, reject) => {
      const reader = new FileReader();
      reader.onloadend = () => {
        const base64Data = reader.result.split(",")[1];
        resolve(base64Data);
      };
      reader.onerror = (error) => reject(error);
      reader.readAsDataURL(audioBlob);
    });
  }

  // Discard empty transcriptions and ones entirely wrapped in {}, ()
  // or [] — presumably STT noise annotations like "(music)"; returns
  // the trimmed text, or undefined when discarded.
  filterResult(text) {
    text = text.trim();
    let ok = false;
    while (!ok) {
      if (!text) break;
      if (text[0] === "{" && text[text.length - 1] === "}") break;
      if (text[0] === "(" && text[text.length - 1] === ")") break;
      if (text[0] === "[" && text[text.length - 1] === "]") break;
      ok = true;
    }
    if (ok) return text;
    else console.log(`Discarding transcription: ${text}`);
  }

  // Toggle microphone between active and inactive states
  async toggle() {
    const hasPermission = await this.requestPermission();
    if (!hasPermission) return;

    // Toggle between listening and inactive
    if (this.status === Status.INACTIVE || this.status === Status.ACTIVATING) {
      this.status = Status.LISTENING;
    } else {
      this.status = Status.INACTIVE;
    }
  }

  // Request microphone permission; shows a toast and returns false on
  // denial. Uses no instance state, so it is safe to call on a null
  // receiver via the prototype.
  async requestPermission() {
    try {
      await navigator.mediaDevices.getUserMedia({ audio: true });
      return true;
    } catch (err) {
      console.error("Error accessing microphone:", err);
      toast(
        "Microphone access denied. Please enable microphone access in your browser settings.",
        "error"
      );
      return false;
    }
  }
}
-
// Register the speech model under the shared "speech" store name so the
// rest of the UI can reach it. NOTE(review): createStore comes from
// earlier in this file/module — presumably the app's store registry;
// confirm against the import.
export const store = createStore("speech", model);

// Initialize speech store
// window.speechStore = speechStore;

// Reload speech-related settings whenever the settings dialog reports a
// save, keeping thresholds/timeouts in sync without a page reload.
document.addEventListener("settings-updated", () => store.loadSettings());
// document.addEventListener("DOMContentLoaded", () => speechStore.init());
diff --git a/webui/components/settings/agent/agent-settings.html b/webui/components/settings/agent/agent-settings.html
index e860d65a3c..b4bdd33faa 100644
--- a/webui/components/settings/agent/agent-settings.html
+++ b/webui/components/settings/agent/agent-settings.html
@@ -22,9 +22,9 @@
-
-
- Speech
+
+
+ Voice
@@ -44,8 +44,8 @@
]*>[\s\S]*?<\/pre>/gi, codePlaceholder);
+ text = text.replace(/]*>[\s\S]*?<\/code>/gi, codePlaceholder);
+ text = text.replace(/<[^>]+>/g, "");
+ }
+
+ text = text.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1");
+ text = text.replace(/[*_#]+/g, "");
+
+ if (text.includes("|")) {
+ const tableLines = text
+ .split("\n")
+ .filter((line) => line.includes("|") && line.trim().startsWith("|"));
+ if (tableLines.length > 0) {
+ for (const line of tableLines) {
+ text = text.replace(line, tablePlaceholder);
+ }
+ } else {
+ text = text.replace(/\|[^\n]*\|/g, tablePlaceholder);
+ }
+ }
+
+ text = text.replace(
+ /([\u2700-\u27BF]|[\uE000-\uF8FF]|\uD83C[\uDC00-\uDFFF]|\uD83D[\uDC00-\uDFFF]|[\u2011-\u26FF]|\uD83E[\uDD10-\uDDFF])/g,
+ "",
+ );
+
+ text = text.replace(/https?:\/\/[^\s]+/g, (match) => {
+ try {
+ return new URL(match).hostname;
+ } catch {
+ return "";
+ }
+ });
+
+ text = text.replace(
+ /[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/g,
+ "UUID",
+ );
+ text = text.replace(/[ \t]+/g, " ");
+
+ const mergePlaceholders = (value, placeholder, replacement) => {
+ const pattern = new RegExp(`${placeholder}\\s*${placeholder}`, "g");
+ while (pattern.test(value)) {
+ value = value.replace(pattern, placeholder);
+ }
+ return value.replace(new RegExp(placeholder, "g"), replacement);
+ };
+
+ text = mergePlaceholders(text, codePlaceholder, "See code attached ...");
+ text = mergePlaceholders(text, tablePlaceholder, "See table attached ...");
+
+ return text.trim();
+ }
+}
+
// Module-level TTS singleton; also attached to globalThis so non-module
// scripts (and debugging from the console) can reach the same instance.
export const ttsService = new TtsService();
globalThis.ttsService = ttsService;