diff --git a/README.md b/README.md index d9ecdedae6..a7dc56c2c4 100644 --- a/README.md +++ b/README.md @@ -144,6 +144,9 @@ A detailed setup guide for Windows, macOS, and Linux can be found in the Agent Z - The Web UI output is very clean, fluid, colorful, readable, and interactive; nothing is hidden. - You can load or save chats directly within the Web UI. - The same output you see in the terminal is automatically saved to an HTML file in **logs/** folder for every session. +- Voice is provided by the built-in `_kokoro_tts` and `_whisper_stt` plugins. +- Docker/bootstrap remains responsible for installing Kokoro, Whisper, `ffmpeg`, and related speech dependencies. +- If `_kokoro_tts` is disabled, spoken output falls back to the browser's native speech synthesis. ![Time example](/docs/res/time_example.jpg) diff --git a/api/synthesize.py b/api/synthesize.py deleted file mode 100644 index 7957ef0d87..0000000000 --- a/api/synthesize.py +++ /dev/null @@ -1,96 +0,0 @@ -# api/synthesize.py - -from helpers.api import ApiHandler, Request, Response - -from helpers import runtime, settings, kokoro_tts - -class Synthesize(ApiHandler): - async def process(self, input: dict, request: Request) -> dict | Response: - text = input.get("text", "") - ctxid = input.get("ctxid", "") - - if ctxid: - context = self.use_context(ctxid) - - # if not await kokoro_tts.is_downloaded(): - # context.log.log(type="info", content="Kokoro TTS model is currently being initialized, please wait...") - - try: - # # Clean and chunk text for long responses - # cleaned_text = self._clean_text(text) - # chunks = self._chunk_text(cleaned_text) - - # if len(chunks) == 1: - # # Single chunk - return as before - # audio = await kokoro_tts.synthesize_sentences(chunks) - # return {"audio": audio, "success": True} - # else: - # # Multiple chunks - return as sequence - # audio_parts = [] - # for chunk in chunks: - # chunk_audio = await kokoro_tts.synthesize_sentences([chunk]) - # audio_parts.append(chunk_audio) - # 
return {"audio_parts": audio_parts, "success": True} - - # audio is chunked on the frontend for better flow - audio = await kokoro_tts.synthesize_sentences([text]) - return {"audio": audio, "success": True} - except Exception as e: - return {"error": str(e), "success": False} - - # def _clean_text(self, text: str) -> str: - # """Clean text by removing markdown, tables, code blocks, and other formatting""" - # # Remove code blocks - # text = re.sub(r'```[\s\S]*?```', '', text) - # text = re.sub(r'`[^`]*`', '', text) - - # # Remove markdown links - # text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text) - - # # Remove markdown formatting - # text = re.sub(r'[*_#]+', '', text) - - # # Remove tables (basic cleanup) - # text = re.sub(r'\|[^\n]*\|', '', text) - - # # Remove extra whitespace and newlines - # text = re.sub(r'\n+', ' ', text) - # text = re.sub(r'\s+', ' ', text) - - # # Remove URLs - # text = re.sub(r'https?://[^\s]+', '', text) - - # # Remove email addresses - # text = re.sub(r'\S+@\S+', '', text) - - # return text.strip() - - # def _chunk_text(self, text: str) -> list[str]: - # """Split text into manageable chunks for TTS""" - # # If text is short enough, return as single chunk - # if len(text) <= 300: - # return [text] - - # # Split into sentences first - # sentences = re.split(r'(?<=[.!?])\s+', text) - - # chunks = [] - # current_chunk = "" - - # for sentence in sentences: - # sentence = sentence.strip() - # if not sentence: - # continue - - # # If adding this sentence would make chunk too long, start new chunk - # if current_chunk and len(current_chunk + " " + sentence) > 300: - # chunks.append(current_chunk.strip()) - # current_chunk = sentence - # else: - # current_chunk += (" " if current_chunk else "") + sentence - - # # Add the last chunk if it has content - # if current_chunk.strip(): - # chunks.append(current_chunk.strip()) - - # return chunks if chunks else [text] \ No newline at end of file diff --git a/api/transcribe.py b/api/transcribe.py 
deleted file mode 100644 index 93e1cd5927..0000000000 --- a/api/transcribe.py +++ /dev/null @@ -1,18 +0,0 @@ -from helpers.api import ApiHandler, Request, Response - -from helpers import runtime, settings, whisper - -class Transcribe(ApiHandler): - async def process(self, input: dict, request: Request) -> dict | Response: - audio = input.get("audio") - ctxid = input.get("ctxid", "") - - if ctxid: - context = self.use_context(ctxid) - - # if not await whisper.is_downloaded(): - # context.log.log(type="info", content="Whisper STT model is currently being initialized, please wait...") - - set = settings.get_settings() - result = await whisper.transcribe(set["stt_model_size"], audio) # type: ignore - return result diff --git a/docs/guides/usage.md b/docs/guides/usage.md index c77e672f11..99c53daa02 100644 --- a/docs/guides/usage.md +++ b/docs/guides/usage.md @@ -748,12 +748,19 @@ If you encounter issues with the tunnel feature: > Combine tunneling with authentication for secure remote access to your Agent Zero instance from any device, including mobile phones and tablets. ## Voice Interface -Agent Zero provides both Text-to-Speech (TTS) and Speech-to-Text (STT) capabilities for natural voice interaction: +Agent Zero provides both Text-to-Speech (TTS) and Speech-to-Text (STT) capabilities for natural voice interaction through built-in plugins: + +- `_kokoro_tts` handles server-side Kokoro speech synthesis when enabled +- `_whisper_stt` handles server-side Whisper transcription and injects the microphone UI when enabled +- Browser-native `speechSynthesis` remains the fallback output path when `_kokoro_tts` is disabled + +Use the Agent Plugins section in Settings to enable or disable either plugin independently. 
### Text-to-Speech Enable voice responses from agents: * Toggle the "Speech" switch in the Preferences section of the sidebar -* Agents will use your system's built-in voice synthesizer to speak their messages +* If `_kokoro_tts` is enabled, agents will use Kokoro for spoken output +* If `_kokoro_tts` is disabled, agents will use your browser's built-in voice synthesizer * Click the "Stop Speech" button above the input area to immediately stop any ongoing speech * You can also click the speech button when hovering over messages to speak individual messages or their parts @@ -761,7 +768,7 @@ Enable voice responses from agents: - The interface allows users to stop speech at any time if a response is too lengthy or if they wish to intervene during the conversation. -The TTS uses a standard voice interface provided by modern browsers, which may sound robotic but is effective and does not require complex AI models. This ensures low latency and quick responses across various platforms, including mobile devices. +Kokoro gives you a local container-side TTS path when the plugin is enabled. When it is disabled, Agent Zero falls back to the browser voice stack, which is lower-friction and works well across devices. > [!TIP] @@ -771,19 +778,20 @@ The TTS uses a standard voice interface provided by modern browsers, which may s > - Creating a more interactive experience ### Speech-to-Text -Send voice messages to agents using OpenAI's Whisper model (does not require OpenAI API key!): +Send voice messages to agents using Whisper (does not require an OpenAI API key): 1. Click the microphone button in the input area to start recording + - The microphone button only appears when `_whisper_stt` is enabled 2. 
The button color indicates the current status: - Grey: Inactive - - Red: Listening - - Green: Recording - - Teal: Waiting - - Cyan (pulsing): Processing + - Teal: Listening + - Red: Recording + - Amber: Waiting + - Purple: Processing or activating Users can adjust settings such as silence threshold and message duration before sending to optimize their interaction experience. -Configure STT settings in the Settings page: +Configure Whisper STT from the plugin settings screen in the Voice section or from Agent Plugins: * **Model Size:** Choose between Base (74M, English) or other models - Note: Only Large and Turbo models support multiple languages * **Language Code:** Set your preferred language (e.g., 'en', 'fr', 'it', 'cz') @@ -795,9 +803,8 @@ Configure STT settings in the Settings page: ![Speech to Text Settings](../res/usage/ui-settings-5-speech-to-text.png) > [!IMPORTANT] -> All STT and TTS functionalities operate locally within the Docker container, -> ensuring that no data is transmitted to external servers or OpenAI APIs. This -> enhances user privacy while maintaining functionality. +> Whisper STT and Kokoro TTS operate locally within the Docker/container runtime when their plugins are enabled. +> Browser fallback TTS runs locally in the browser. No voice path requires OpenAI APIs. ## Mathematical Expressions * **Complex Mathematics:** Supports full KaTeX syntax for: diff --git a/docs/setup/installation.md b/docs/setup/installation.md index 25ed3e141b..dfc973ef21 100644 --- a/docs/setup/installation.md +++ b/docs/setup/installation.md @@ -352,11 +352,13 @@ Use `claude-sonnet-4-5` for Anthropic, but use `anthropic/claude-sonnet-4-5` for > [!NOTE] > Agent Zero uses a local embedding model by default (runs on CPU), but you can switch to OpenAI embeddings like `text-embedding-3-small` or `text-embedding-3-large` if preferred. 
-### Speech to Text Options +### Built-in Voice Plugins -- **Model Size:** Choose the speech recognition model size -- **Language Code:** Set the primary language for voice recognition -- **Silence Settings:** Configure silence threshold, duration, and timeout parameters for voice input +- Agent Zero ships Whisper STT as the built-in `_whisper_stt` plugin and Kokoro TTS as the built-in `_kokoro_tts` plugin. +- Docker/bootstrap remains responsible for installing the required speech dependencies such as `ffmpeg`, Kokoro, Whisper, and `soundfile`. +- Both plugins can be enabled or disabled independently from the Agent Plugins section in the Web UI. +- Whisper model size, language, and silence behavior are configured from the plugin settings screen. +- If `_kokoro_tts` is disabled, spoken output falls back to the browser's native speech synthesis instead of the container runtime. ### API Keys diff --git a/helpers/kokoro_tts.py b/helpers/kokoro_tts.py deleted file mode 100644 index 7dd7c12f63..0000000000 --- a/helpers/kokoro_tts.py +++ /dev/null @@ -1,127 +0,0 @@ -# kokoro_tts.py - -import base64 -import io -import warnings -import asyncio -import soundfile as sf -from helpers import runtime -from helpers.print_style import PrintStyle -from helpers.notification import NotificationManager, NotificationType, NotificationPriority - -warnings.filterwarnings("ignore", category=FutureWarning) -warnings.filterwarnings("ignore", category=UserWarning) - -_pipeline = None -_voice = "am_puck,am_onyx" -_speed = 1.1 -is_updating_model = False - - -async def preload(): - try: - # return await runtime.call_development_function(_preload) - return await _preload() - except Exception as e: - # if not runtime.is_development(): - raise e - # Fallback to direct execution if RFC fails in development - # PrintStyle.standard("RFC failed, falling back to direct execution...") - # return await _preload() - - -async def _preload(): - global _pipeline, is_updating_model - - while 
is_updating_model: - await asyncio.sleep(0.1) - - try: - is_updating_model = True - if not _pipeline: - NotificationManager.send_notification( - NotificationType.INFO, - NotificationPriority.NORMAL, - "Loading Kokoro TTS model...", - display_time=99, - group="kokoro-preload") - PrintStyle.standard("Loading Kokoro TTS model...") - from kokoro import KPipeline - _pipeline = KPipeline(lang_code="a", repo_id="hexgrad/Kokoro-82M") - NotificationManager.send_notification( - NotificationType.INFO, - NotificationPriority.NORMAL, - "Kokoro TTS model loaded.", - display_time=2, - group="kokoro-preload") - finally: - is_updating_model = False - - -async def is_downloading(): - try: - # return await runtime.call_development_function(_is_downloading) - return _is_downloading() - except Exception as e: - # if not runtime.is_development(): - raise e - # Fallback to direct execution if RFC fails in development - # return _is_downloading() - - -def _is_downloading(): - return is_updating_model - -async def is_downloaded(): - try: - # return await runtime.call_development_function(_is_downloaded) - return _is_downloaded() - except Exception as e: - # if not runtime.is_development(): - raise e - # Fallback to direct execution if RFC fails in development - # return _is_downloaded() - -def _is_downloaded(): - return _pipeline is not None - - -async def synthesize_sentences(sentences: list[str]): - """Generate audio for multiple sentences and return concatenated base64 audio""" - try: - # return await runtime.call_development_function(_synthesize_sentences, sentences) - return await _synthesize_sentences(sentences) - except Exception as e: - # if not runtime.is_development(): - raise e - # Fallback to direct execution if RFC fails in development - # return await _synthesize_sentences(sentences) - - -async def _synthesize_sentences(sentences: list[str]): - await _preload() - - combined_audio = [] - - try: - for sentence in sentences: - if sentence.strip(): - segments = 
_pipeline(sentence.strip(), voice=_voice, speed=_speed) # type: ignore - segment_list = list(segments) - - for segment in segment_list: - audio_tensor = segment.audio - audio_numpy = audio_tensor.detach().cpu().numpy() # type: ignore - combined_audio.extend(audio_numpy) - - # Convert combined audio to bytes - buffer = io.BytesIO() - sf.write(buffer, combined_audio, 24000, format="WAV") - audio_bytes = buffer.getvalue() - - # Return base64 encoded audio - return base64.b64encode(audio_bytes).decode("utf-8") - - except Exception as e: - PrintStyle.error(f"Error in Kokoro TTS synthesis: {e}") - raise \ No newline at end of file diff --git a/helpers/mcp_handler.py b/helpers/mcp_handler.py index 439127b218..cd9869364b 100644 --- a/helpers/mcp_handler.py +++ b/helpers/mcp_handler.py @@ -42,6 +42,7 @@ from helpers import dirty_json from helpers.print_style import PrintStyle from helpers.tool import Tool, Response +from helpers.extension import call_extensions_async def normalize_name(name: str) -> str: @@ -1105,10 +1106,21 @@ async def _create_stdio_transport( # Check if this is a streaming HTTP type if _is_streaming_http_type(server.type): # Use streamable HTTP client + # Before passing headers to httpx, allow extensions to resolve placeholders + resolved_headers = await call_extensions_async( + "resolve_mcp_server_headers", + agent=None, + server_name=server.name, + headers=dict(server.headers or {}), + ) + if resolved_headers is not None: + headers_to_use = resolved_headers + else: + headers_to_use = server.headers transport_result = await current_exit_stack.enter_async_context( streamablehttp_client( url=server.url, - headers=server.headers, + headers=headers_to_use, timeout=timedelta(seconds=init_timeout), sse_read_timeout=timedelta(seconds=tool_timeout), httpx_client_factory=client_factory, @@ -1123,10 +1135,21 @@ async def _create_stdio_transport( return read_stream, write_stream else: # Use traditional SSE client (default behavior) + # Before passing headers to 
httpx, allow extensions to resolve placeholders + resolved_headers = await call_extensions_async( + "resolve_mcp_server_headers", + agent=None, + server_name=server.name, + headers=dict(server.headers or {}), + ) + if resolved_headers is not None: + headers_to_use = resolved_headers + else: + headers_to_use = server.headers stdio_transport = await current_exit_stack.enter_async_context( sse_client( url=server.url, - headers=server.headers, + headers=headers_to_use, timeout=init_timeout, sse_read_timeout=tool_timeout, httpx_client_factory=client_factory, diff --git a/helpers/settings.py b/helpers/settings.py index cfa5873479..4eac5a8005 100644 --- a/helpers/settings.py +++ b/helpers/settings.py @@ -7,13 +7,14 @@ from typing import Any, Literal, TypedDict, cast, TypeVar import models -from helpers import runtime, whisper, defer, git, subagents +from helpers import runtime, defer, git, subagents from . import files, dotenv from helpers.print_style import PrintStyle from helpers.providers import get_providers, FieldOption as ProvidersFO from helpers.secrets import get_default_secrets_manager from helpers import dirty_json from helpers.notification import NotificationManager, NotificationType, NotificationPriority +from helpers.extension import extensible T = TypeVar('T') @@ -78,14 +79,6 @@ class Settings(TypedDict): websocket_server_restart_enabled: bool uvicorn_access_logs_enabled: bool - stt_model_size: str - stt_language: str - stt_silence_threshold: float - stt_silence_duration: int - stt_waiting_timeout: int - - tts_kokoro: bool - mcp_servers: str mcp_client_init_timeout: int mcp_client_tool_timeout: int @@ -151,7 +144,6 @@ class SettingsOutputAdditional(TypedDict): embedding_providers: list[ModelProvider] agent_subdirs: list[FieldOption] knowledge_subdirs: list[FieldOption] - stt_models: list[FieldOption] is_dockerized: bool runtime_settings: dict[str, Any] @@ -196,14 +188,6 @@ def convert_out(settings: Settings) -> SettingsOutput: if item["key"] != "_example"], 
knowledge_subdirs=[{"value": subdir, "label": subdir} for subdir in files.get_subdirectories("knowledge", exclude="default")], - stt_models=[ - {"value": "tiny", "label": "Tiny (39M, English)"}, - {"value": "base", "label": "Base (74M, English)"}, - {"value": "small", "label": "Small (244M, English)"}, - {"value": "medium", "label": "Medium (769M, English)"}, - {"value": "large", "label": "Large (1.5B, Multilingual)"}, - {"value": "turbo", "label": "Turbo (Multilingual)"}, - ], runtime_settings={}, ), ) @@ -225,7 +209,6 @@ def convert_out(settings: Settings) -> SettingsOutput: additional["agent_subdirs"] = _ensure_option_present(additional.get("agent_subdirs"), current.get("agent_profile")) additional["knowledge_subdirs"] = _ensure_option_present(additional.get("knowledge_subdirs"), current.get("agent_knowledge_subdir")) - additional["stt_models"] = _ensure_option_present(additional.get("stt_models"), current.get("stt_model_size")) # masked api keys providers = get_providers("chat") + get_providers("embedding") @@ -312,6 +295,7 @@ def set_runtime_settings_snapshot(settings: Settings) -> None: _runtime_settings_snapshot = settings.copy() +@extensible def set_settings(settings: Settings, apply: bool = True): global _settings previous = _settings @@ -322,6 +306,7 @@ def set_settings(settings: Settings, apply: bool = True): return reload_settings() +@extensible def set_settings_delta(delta: dict, apply: bool = True): current = get_settings() new = {**current, **delta} @@ -470,12 +455,6 @@ def get_default_settings() -> Settings: rfc_port_http=get_default_value("rfc_port_http", 55080), websocket_server_restart_enabled=get_default_value("websocket_server_restart_enabled", True), uvicorn_access_logs_enabled=get_default_value("uvicorn_access_logs_enabled", False), - stt_model_size=get_default_value("stt_model_size", "base"), - stt_language=get_default_value("stt_language", "en"), - stt_silence_threshold=get_default_value("stt_silence_threshold", 0.3), - 
stt_silence_duration=get_default_value("stt_silence_duration", 1000), - stt_waiting_timeout=get_default_value("stt_waiting_timeout", 2000), - tts_kokoro=get_default_value("tts_kokoro", True), mcp_servers=get_default_value("mcp_servers", '{\n "mcpServers": {}\n}'), mcp_client_init_timeout=get_default_value("mcp_client_init_timeout", 10), mcp_client_tool_timeout=get_default_value("mcp_client_tool_timeout", 120), @@ -505,12 +484,6 @@ def _apply_settings(previous: Settings | None): agent.config = ctx.config agent = agent.get_data(agent.DATA_NAME_SUBORDINATE) - # reload whisper model if necessary - if not previous or _settings["stt_model_size"] != previous["stt_model_size"]: - task = defer.DeferredTask().start_task( - whisper.preload, _settings["stt_model_size"] - ) # TODO overkill, replace with background task - # update mcp settings if necessary if not previous or _settings["mcp_servers"] != previous["mcp_servers"]: from helpers.mcp_handler import MCPConfig diff --git a/helpers/whisper.py b/helpers/whisper.py deleted file mode 100644 index 8a0b7fc794..0000000000 --- a/helpers/whisper.py +++ /dev/null @@ -1,96 +0,0 @@ -import base64 -import warnings -import whisper -import tempfile -import asyncio -from helpers import runtime, rfc, settings, files -from helpers.print_style import PrintStyle -from helpers.notification import NotificationManager, NotificationType, NotificationPriority - -# Suppress FutureWarning from torch.load -warnings.filterwarnings("ignore", category=FutureWarning) - -_model = None -_model_name = "" -is_updating_model = False # Tracks whether the model is currently updating - -async def preload(model_name:str): - try: - # return await runtime.call_development_function(_preload, model_name) - return await _preload(model_name) - except Exception as e: - # if not runtime.is_development(): - raise e - -async def _preload(model_name:str): - global _model, _model_name, is_updating_model - - while is_updating_model: - await asyncio.sleep(0.1) - - try: - 
is_updating_model = True - if not _model or _model_name != model_name: - NotificationManager.send_notification( - NotificationType.INFO, - NotificationPriority.NORMAL, - "Loading Whisper model...", - display_time=99, - group="whisper-preload") - PrintStyle.standard(f"Loading Whisper model: {model_name}") - _model = whisper.load_model(name=model_name, download_root=files.get_abs_path("/tmp/models/whisper")) # type: ignore - _model_name = model_name - NotificationManager.send_notification( - NotificationType.INFO, - NotificationPriority.NORMAL, - "Whisper model loaded.", - display_time=2, - group="whisper-preload") - finally: - is_updating_model = False - -async def is_downloading(): - # return await runtime.call_development_function(_is_downloading) - return _is_downloading() - -def _is_downloading(): - return is_updating_model - -async def is_downloaded(): - try: - # return await runtime.call_development_function(_is_downloaded) - return _is_downloaded() - except Exception as e: - # if not runtime.is_development(): - raise e - # Fallback to direct execution if RFC fails in development - # return _is_downloaded() - -def _is_downloaded(): - return _model is not None - -async def transcribe(model_name:str, audio_bytes_b64: str): - # return await runtime.call_development_function(_transcribe, model_name, audio_bytes_b64) - return await _transcribe(model_name, audio_bytes_b64) - - -async def _transcribe(model_name:str, audio_bytes_b64: str): - await _preload(model_name) - - # Decode audio bytes if encoded as a base64 string - audio_bytes = base64.b64decode(audio_bytes_b64) - - # Create temp audio file - import os - with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as audio_file: - audio_file.write(audio_bytes) - temp_path = audio_file.name - try: - # Transcribe the audio file - result = _model.transcribe(temp_path, fp16=False) # type: ignore - return result - finally: - try: - os.remove(temp_path) - except Exception: - pass # ignore errors during cleanup 
diff --git a/plugins/_browser_agent/extensions/webui/get_message_handler/browser-agent-handler.js b/plugins/_browser_agent/extensions/webui/get_message_handler/browser-agent-handler.js index 5177c13945..82767f2620 100644 --- a/plugins/_browser_agent/extensions/webui/get_message_handler/browser-agent-handler.js +++ b/plugins/_browser_agent/extensions/webui/get_message_handler/browser-agent-handler.js @@ -3,7 +3,7 @@ import { copyToClipboard, } from "/components/messages/action-buttons/simple-action-buttons.js"; import { store as stepDetailStore } from "/components/modals/process-step-detail/step-detail-store.js"; -import { store as speechStore } from "/components/chat/speech/speech-store.js"; +import { ttsService } from "/js/tts-service.js"; import { buildDetailPayload, cleanStepTitle, @@ -36,7 +36,7 @@ function drawMessageBrowserAgent({ buildDetailPayload(arguments[0], { headerLabels: [] }), ), ), - createActionButton("speak", "", () => speechStore.speak(answerText)), + createActionButton("speak", "", () => ttsService.speak(answerText)), createActionButton("copy", "", () => copyToClipboard(answerText)), ].filter(Boolean) : []; diff --git a/plugins/_kokoro_tts/README.md b/plugins/_kokoro_tts/README.md new file mode 100644 index 0000000000..28b3e61b90 --- /dev/null +++ b/plugins/_kokoro_tts/README.md @@ -0,0 +1,19 @@ +# Kokoro TTS + +Built-in speech synthesis plugin backed by Kokoro. + +## Behavior + +- Registers Kokoro as the active TTS provider when the plugin is enabled. +- Keeps browser-native `speechSynthesis` as the fallback path when disabled. +- Keeps Python dependencies on the core Docker/bootstrap path. This plugin does not install packages or binaries on demand. 
+ +## Config + +- `voice`: Kokoro voice identifier +- `speed`: Kokoro playback speed multiplier + +## Routes + +- `POST /api/plugins/_kokoro_tts/synthesize` +- `POST /api/plugins/_kokoro_tts/status` diff --git a/plugins/_kokoro_tts/api/status.py b/plugins/_kokoro_tts/api/status.py new file mode 100644 index 0000000000..3b1321972e --- /dev/null +++ b/plugins/_kokoro_tts/api/status.py @@ -0,0 +1,31 @@ +import importlib.metadata + +from helpers.api import ApiHandler, Request, Response +from plugins._kokoro_tts.helpers import migration, runtime + + +class Status(ApiHandler): + async def process(self, input: dict, request: Request) -> dict | Response: + migration.ensure_migrated() + + package_version = "" + package_error = "" + try: + package_version = importlib.metadata.version("kokoro") + except Exception as e: + package_error = str(e) + + return { + "plugin": "_kokoro_tts", + "enabled": runtime.is_globally_enabled(), + "config": runtime.get_config(), + "model": { + "ready": await runtime.is_downloaded(), + "loading": await runtime.is_downloading(), + }, + "package": { + "version": package_version, + "error": package_error, + }, + "fallback": "Browser-native speechSynthesis remains the fallback when Kokoro is disabled.", + } diff --git a/plugins/_kokoro_tts/api/synthesize.py b/plugins/_kokoro_tts/api/synthesize.py new file mode 100644 index 0000000000..5530f90039 --- /dev/null +++ b/plugins/_kokoro_tts/api/synthesize.py @@ -0,0 +1,22 @@ +from helpers.api import ApiHandler, Request, Response +from plugins._kokoro_tts.helpers import runtime + + +class Synthesize(ApiHandler): + async def process(self, input: dict, request: Request) -> dict | Response: + if not runtime.is_globally_enabled(): + return Response(status=409, response="Kokoro TTS plugin is disabled") + + text = str(input.get("text") or "").strip() + if not text: + return Response(status=400, response="Missing text") + + try: + audio = await runtime.synthesize_sentences([text]) + return { + "success": True, + 
"audio": audio, + "mime_type": "audio/wav", + } + except Exception as e: + return {"success": False, "error": str(e)} diff --git a/plugins/_kokoro_tts/default_config.yaml b/plugins/_kokoro_tts/default_config.yaml new file mode 100644 index 0000000000..85be3ad699 --- /dev/null +++ b/plugins/_kokoro_tts/default_config.yaml @@ -0,0 +1,2 @@ +voice: am_puck,am_onyx +speed: 1.1 diff --git a/plugins/_kokoro_tts/extensions/webui/page-head/runtime.html b/plugins/_kokoro_tts/extensions/webui/page-head/runtime.html new file mode 100644 index 0000000000..f4142ada17 --- /dev/null +++ b/plugins/_kokoro_tts/extensions/webui/page-head/runtime.html @@ -0,0 +1,5 @@ + diff --git a/plugins/_kokoro_tts/extensions/webui/voice-settings-main/kokoro-card.html b/plugins/_kokoro_tts/extensions/webui/voice-settings-main/kokoro-card.html new file mode 100644 index 0000000000..ff4a2ebb02 --- /dev/null +++ b/plugins/_kokoro_tts/extensions/webui/voice-settings-main/kokoro-card.html @@ -0,0 +1,105 @@ +
+
+
+
+
Kokoro TTS
+
+ Server-side speech synthesis plugin. When disabled, Agent Zero falls back to browser-native TTS. +
+
+ +
+ +
+
+ Voice + +
+
+ Speed + +
+
+ +
+ + +
+
+ + +
diff --git a/plugins/_kokoro_tts/helpers/__init__.py b/plugins/_kokoro_tts/helpers/__init__.py
new file mode 100644
index 0000000000..487ef2c748
--- /dev/null
+++ b/plugins/_kokoro_tts/helpers/__init__.py
@@ -0,0 +1,3 @@
+from . import runtime
+
+__all__ = ["runtime"]
diff --git a/plugins/_kokoro_tts/helpers/migration.py b/plugins/_kokoro_tts/helpers/migration.py
new file mode 100644
index 0000000000..20a69b36b1
--- /dev/null
+++ b/plugins/_kokoro_tts/helpers/migration.py
@@ -0,0 +1,70 @@
+from __future__ import annotations
+
+import json
+
+from helpers import files, plugins
+
+
+PLUGIN_NAME = "_kokoro_tts"
+LEGACY_SETTINGS_FILE = files.get_abs_path("usr/settings.json")
+
+
+def ensure_migrated() -> bool:
+    """Carry a legacy ``tts_kokoro: false`` setting over to the plugin toggle.
+
+    Writes the plugin's disabled marker only when the legacy settings file
+    turned Kokoro off and no enabled/disabled marker exists yet.
+
+    Returns:
+        True if the disabled marker was written by this call, else False.
+    """
+    legacy_settings = _read_legacy_settings()
+    legacy_enabled = _coerce_bool(legacy_settings.get("tts_kokoro"), default=True)
+    if legacy_enabled or _has_explicit_toggle():
+        return False
+
+    disabled_path = plugins.determine_plugin_asset_path(
+        PLUGIN_NAME, "", "", plugins.DISABLED_FILE_NAME
+    )
+    files.write_file(disabled_path, "")
+    plugins.clear_plugin_cache([PLUGIN_NAME])
+    return True
+
+
+def _has_explicit_toggle() -> bool:
+    """Return True if any plugin root already holds an enabled/disabled marker."""
+    for root in plugins.get_plugin_roots(PLUGIN_NAME):
+        if files.exists(files.get_abs_path(root, plugins.ENABLED_FILE_NAME)):
+            return True
+        if files.exists(files.get_abs_path(root, plugins.DISABLED_FILE_NAME)):
+            return True
+    return False
+
+
+def _read_legacy_settings() -> dict:
+    """Load the legacy settings file, returning ``{}`` when it is unusable."""
+    if not files.exists(LEGACY_SETTINGS_FILE):
+        return {}
+
+    try:
+        loaded = json.loads(files.read_file(LEGACY_SETTINGS_FILE))
+    except Exception:
+        # Best-effort migration: an unreadable file counts as "no legacy data".
+        return {}
+
+    # Valid JSON may still be a list, string, number or null; callers use
+    # ``.get``, so anything that is not an object is treated as empty.
+    return loaded if isinstance(loaded, dict) else {}
+
+
+def _coerce_bool(value: object, default: bool) -> bool:
+    """Interpret ``value`` as a bool; fall back to ``default`` if unrecognized."""
+    if isinstance(value, bool):
+        return value
+    if isinstance(value, str):
+        lowered = value.strip().lower()
+        if lowered in {"true", "1", "yes", "on"}:
+            return True
+        if lowered in {"false", "0", "no", "off"}:
+            return False
+    return default
diff --git a/plugins/_kokoro_tts/helpers/runtime.py b/plugins/_kokoro_tts/helpers/runtime.py
new file mode 100644
index 0000000000..365886a2e0
--- /dev/null
+++ b/plugins/_kokoro_tts/helpers/runtime.py
@@ -0,0 +1,174 @@
+from __future__ import annotations
+
+import asyncio
+import base64
+import io
+import math
+import warnings
+from typing import Any
+
+import soundfile as sf
+
+from helpers import plugins
+from helpers.notification import (
+    NotificationManager,
+    NotificationPriority,
+    NotificationType,
+)
+from helpers.print_style import PrintStyle
+from plugins._kokoro_tts.helpers import migration
+
+
+warnings.filterwarnings("ignore", category=FutureWarning)
+warnings.filterwarnings("ignore", category=UserWarning)
+
+
+PLUGIN_NAME = "_kokoro_tts"
+DEFAULT_CONFIG = {
+    "voice": "am_puck,am_onyx",
+    "speed": 1.1,
+}
+
+_pipeline = None
+is_updating_model = False
+
+
+def normalize_config(config: dict[str, Any] | None) -> dict[str, Any]:
+    """Return a config holding valid ``voice`` and ``speed`` values.
+
+    Starts from ``DEFAULT_CONFIG`` and overrides a key only when ``config``
+    supplies a non-blank voice or a finite, positive speed.
+    """
+    normalized = dict(DEFAULT_CONFIG)
+    if not isinstance(config, dict):
+        return normalized
+
+    voice = str(config.get("voice", normalized["voice"]) or "").strip()
+    if voice:
+        normalized["voice"] = voice
+
+    try:
+        speed = float(config.get("speed", normalized["speed"]))
+    except (TypeError, ValueError, OverflowError):
+        return normalized
+
+    # float() accepts "inf" and "nan"; neither is a usable playback speed.
+    if math.isfinite(speed) and speed > 0:
+        normalized["speed"] = speed
+
+    return normalized
+
+
+def get_config() -> dict[str, Any]:
+    """Return the saved plugin config, normalized."""
+    config = plugins.get_plugin_config(PLUGIN_NAME) or {}
+    return normalize_config(config)
+
+
+def is_globally_enabled() -> bool:
+    """Run the legacy migration, then resolve the toggle across plugin roots."""
+    migration.ensure_migrated()
+    return plugins.determined_toggle_from_paths(
+        True, reversed(plugins.get_plugin_roots(PLUGIN_NAME))
+    )
+
+
+async def preload(config: dict[str, Any] | None = None):
+    """Load the Kokoro pipeline if needed; ``config`` is accepted but unused."""
+    return await _preload()
+
+
+async def _preload():
+    """Create the module-level pipeline once, notifying the UI around the load."""
+    global _pipeline, is_updating_model
+
+    # Wait for a load that is already in progress before deciding to load.
+    while is_updating_model:
+        await asyncio.sleep(0.1)
+
+    try:
+        is_updating_model = True
+        if not _pipeline:
+            NotificationManager.send_notification(
+                NotificationType.INFO,
+                NotificationPriority.NORMAL,
+                "Loading Kokoro TTS model...",
+                display_time=99,
+                group="kokoro-preload",
+            )
+            PrintStyle.standard("Loading Kokoro TTS model...")
+            # Deferred import: kokoro is imported only when the model is loaded.
+            from kokoro import KPipeline
+
+            _pipeline = KPipeline(lang_code="a", repo_id="hexgrad/Kokoro-82M")
+            NotificationManager.send_notification(
+                NotificationType.INFO,
+                NotificationPriority.NORMAL,
+                "Kokoro TTS model loaded.",
+                display_time=2,
+                group="kokoro-preload",
+            )
+    finally:
+        is_updating_model = False
+
+
+async def is_downloading() -> bool:
+    """Return True while ``_preload`` is inside its loading section."""
+    return is_updating_model
+
+
+async def is_downloaded() -> bool:
+    """Return True once the pipeline object exists."""
+    return _pipeline is not None
+
+
+async def synthesize_sentences(
+    sentences: list[str], config: dict[str, Any] | None = None
+) -> str:
+    """Synthesize ``sentences`` and return base64-encoded WAV data.
+
+    Uses ``config`` when given (and non-empty), otherwise the saved plugin
+    config. Returns ``""`` when no audio was produced.
+    """
+    cfg = normalize_config(config or get_config())
+    return await _synthesize_sentences(
+        sentences,
+        voice=str(cfg["voice"]),
+        speed=float(cfg["speed"]),
+    )
+
+
+async def _synthesize_sentences(
+    sentences: list[str], *, voice: str, speed: float
+) -> str:
+    """Run each non-blank sentence through the pipeline and join the audio.
+
+    Logs and re-raises any synthesis error.
+    """
+    await _preload()
+
+    combined_audio: list[float] = []
+
+    try:
+        for sentence in sentences:
+            if not sentence.strip():
+                continue
+
+            segments = _pipeline(sentence.strip(), voice=voice, speed=speed)  # type: ignore[misc]
+            for segment in segments:
+                audio_tensor = segment.audio
+                if audio_tensor is None:
+                    # A segment without audio has nothing to contribute.
+                    continue
+                audio_numpy = audio_tensor.detach().cpu().numpy()
+                combined_audio.extend(audio_numpy.tolist())
+
+        if not combined_audio:
+            return ""
+
+        buffer = io.BytesIO()
+        sf.write(buffer, combined_audio, 24000, format="WAV")
+        return base64.b64encode(buffer.getvalue()).decode("utf-8")
+    except Exception as e:
+        PrintStyle.error(f"Error in Kokoro TTS synthesis: {e}")
+        raise
diff --git a/plugins/_kokoro_tts/hooks.py b/plugins/_kokoro_tts/hooks.py
new file mode 100644
index 0000000000..036cc75ff8
--- /dev/null
+++ b/plugins/_kokoro_tts/hooks.py
@@ -0,0 +1,14 @@
+from __future__ import annotations
+
+from plugins._kokoro_tts.helpers import migration, runtime
+
+
+def get_plugin_config(default=None, **kwargs):
+    """Run the legacy migration and return ``default`` normalized."""
+    migration.ensure_migrated()
+    return runtime.normalize_config(default or {})
+
+
+def save_plugin_config(default=None, settings=None, **kwargs):
+    """Return the normalized ``settings``, falling back to ``default``."""
+    return runtime.normalize_config(settings or default or {})
diff --git a/plugins/_kokoro_tts/plugin.yaml b/plugins/_kokoro_tts/plugin.yaml
new file mode 100644
index 0000000000..8c810fafc5
--- /dev/null
+++ b/plugins/_kokoro_tts/plugin.yaml
@@ -0,0 +1,9 @@
+name: _kokoro_tts
+title: Kokoro TTS
+description: Built-in Kokoro text-to-speech plugin with browser TTS fallback when disabled.
+version: 1.0.0
+always_enabled: false
+settings_sections:
+  - agent
+per_project_config: false
+per_agent_config: false
diff --git a/plugins/_kokoro_tts/webui/config.html b/plugins/_kokoro_tts/webui/config.html
new file mode 100644
index 0000000000..755258cc26
--- /dev/null
+++ b/plugins/_kokoro_tts/webui/config.html
@@ -0,0 +1,39 @@
+
+
+ Kokoro TTS
+
+
+
+
+ +
+
+
diff --git a/plugins/_kokoro_tts/webui/kokoro-tts-store.js b/plugins/_kokoro_tts/webui/kokoro-tts-store.js
new file mode 100644
index 0000000000..715836ae48
--- /dev/null
+++ b/plugins/_kokoro_tts/webui/kokoro-tts-store.js
@@ -0,0 +1,129 @@
+import { createStore } from "/js/AlpineStore.js";
+import { toastFrontendError } from "/components/notifications/notification-store.js";
+import { callJsonApi } from "/js/api.js";
+import { ttsService } from "/js/tts-service.js";
+
+const PLUGIN_NAME = "_kokoro_tts";
+
+// State and actions for the Kokoro TTS plugin; registered below as "kokoroTts".
+const model = {
+  runtimeInitialized: false,
+  statusLoaded: false,
+  loading: false,
+  error: "",
+  enabled: false,
+  config: {
+    voice: "",
+    speed: 1.1,
+  },
+  modelReady: false,
+  modelLoading: false,
+  packageVersion: "",
+  // Function returned by ttsService.registerProvider; null while unregistered.
+  providerCleanup: null,
+
+  // Fetches the status once; later calls are no-ops.
+  async initRuntime() {
+    if (this.runtimeInitialized) return;
+    this.runtimeInitialized = true;
+    await this.refreshStatus({ suppressError: true });
+  },
+
+  // Fetches the status unless it is already loaded or a fetch is in flight.
+  async ensureStatusLoaded() {
+    if (this.statusLoaded || this.loading) return;
+    await this.refreshStatus({ suppressError: true });
+  },
+
+  // Reloads plugin status from the backend and (un)registers the provider to
+  // match. On failure the provider is unregistered and, unless suppressError
+  // is set, the error is shown as a toast.
+  async refreshStatus({ suppressError = false } = {}) {
+    this.loading = true;
+    this.error = "";
+
+    try {
+      const status = await callJsonApi(`/plugins/${PLUGIN_NAME}/status`, {});
+      this.statusLoaded = true;
+      this.enabled = !!status?.enabled;
+      this.config = {
+        voice: status?.config?.voice || "",
+        speed: Number(status?.config?.speed || 1.1),
+      };
+      this.modelReady = !!status?.model?.ready;
+      this.modelLoading = !!status?.model?.loading;
+      this.packageVersion = status?.package?.version || "";
+
+      if (this.enabled) {
+        this.registerProvider();
+      } else {
+        this.unregisterProvider();
+      }
+    } catch (error) {
+      this.error = error instanceof Error ? error.message : String(error);
+      this.unregisterProvider();
+      if (!suppressError) {
+        void toastFrontendError(this.error, "Kokoro TTS");
+      }
+    } finally {
+      this.loading = false;
+    }
+  },
+
+  // Registers Kokoro with ttsService; no-op if already registered or disabled.
+  registerProvider() {
+    if (this.providerCleanup || !this.enabled) return;
+
+    this.providerCleanup = ttsService.registerProvider(PLUGIN_NAME, {
+      synthesize: async (text) => {
+        const result = await callJsonApi(`/plugins/${PLUGIN_NAME}/synthesize`, {
+          text,
+        });
+        if (!result?.success) {
+          throw new Error(result?.error || "Kokoro TTS synthesis failed.");
+        }
+
+        return {
+          audioBase64: result.audio || "",
+          mimeType: result.mime_type || "audio/wav",
+        };
+      },
+    });
+  },
+
+  // Runs the stored cleanup function, if any, and clears it.
+  unregisterProvider() {
+    if (!this.providerCleanup) return;
+    this.providerCleanup();
+    this.providerCleanup = null;
+  },
+
+  // Short status label derived from the enabled/loading/ready flags.
+  get statusText() {
+    if (!this.enabled) return "Disabled";
+    if (this.modelLoading) return "Loading";
+    if (this.modelReady) return "Ready";
+    return "Idle";
+  },
+
+  // Style class for the status label; only the "Ready" state maps to "ok".
+  get statusClass() {
+    if (!this.enabled) return "warn";
+    if (this.modelLoading) return "warn";
+    if (this.modelReady) return "ok";
+    return "warn";
+  },
+
+  // Opens this plugin's settings via the shared plugin settings store.
+  async openConfig() {
+    const { store } = await import("/components/plugins/plugin-settings-store.js");
+    await store.openConfig(PLUGIN_NAME);
+  },
+
+  // Opens the plugin panel in a modal when window.openModal is available.
+  openPanel() {
+    window.openModal?.(`/plugins/${PLUGIN_NAME}/webui/main.html`);
+  },
+};
+
+export const store = createStore("kokoroTts", model);
diff --git a/plugins/_kokoro_tts/webui/main.html b/plugins/_kokoro_tts/webui/main.html
new file mode 100644
index 0000000000..5002d9ee6e
--- /dev/null
+++ b/plugins/_kokoro_tts/webui/main.html
@@ -0,0 +1,135 @@
+
+
+ Kokoro TTS
+
+
+
+
+
+ +
+
+
+
+
diff --git a/plugins/_whisper_stt/README.md b/plugins/_whisper_stt/README.md
new file mode 100644
index 0000000000..9035ecbb54
--- /dev/null
+++ b/plugins/_whisper_stt/README.md
@@ -0,0 +1,22 @@
+# Whisper STT
+
+Built-in speech-to-text plugin backed by Whisper.
+
+## Responsibilities
+
+- Registers Whisper as the active STT provider when the plugin is enabled.
+- Owns the microphone runtime, device selector UI, plugin prompt injection, and plugin APIs.
+- Keeps dependency installation and model bootstrap on the Docker/bootstrap path.
+
+## Config
+
+- `model_size`: Whisper model name
+- `language`: language hint or `auto`
+- `silence_threshold`: frontend threshold before recording starts
+- `silence_duration`: silence window before waiting state
+- `waiting_timeout`: delay before transcription dispatch
+
+## API
+
+- `POST /api/plugins/_whisper_stt/transcribe`
+- `POST /api/plugins/_whisper_stt/status`
diff --git a/plugins/_whisper_stt/api/status.py b/plugins/_whisper_stt/api/status.py
new file mode 100644
index 0000000000..40c89e4aca
--- /dev/null
+++ b/plugins/_whisper_stt/api/status.py
@@ -0,0 +1,35 @@
+import importlib.metadata
+
+from helpers.api import ApiHandler, Request, Response
+from plugins._whisper_stt.helpers import migration, runtime
+
+
+class Status(ApiHandler):
+    """Report the plugin toggle, config, model state and package version."""
+
+    async def process(self, input: dict, request: Request) -> dict | Response:
+        """Seed the config if missing and return the status payload."""
+        migration.ensure_config_seeded()
+
+        package_version = ""
+        package_error = ""
+        try:
+            package_version = importlib.metadata.version("openai-whisper")
+        except Exception as e:
+            # Reported to the client instead of failing the whole request.
+            package_error = str(e)
+
+        return {
+            "plugin": "_whisper_stt",
+            "enabled": runtime.is_globally_enabled(),
+            "config": runtime.get_config(),
+            "model": {
+                "ready": await runtime.is_downloaded(),
+                "loading": await runtime.is_downloading(),
+                "loaded_model": runtime.get_loaded_model_name(),
+            },
+            "package": {
+                "version": package_version,
+                "error": package_error,
+            },
+        }
diff --git a/plugins/_whisper_stt/api/transcribe.py b/plugins/_whisper_stt/api/transcribe.py
new file mode 100644
index 0000000000..5c0b4f30e7
--- /dev/null
+++ b/plugins/_whisper_stt/api/transcribe.py
@@ -0,0 +1,33 @@
+from helpers.api import ApiHandler, Request, Response
+from plugins._whisper_stt.helpers import runtime
+
+
+class Transcribe(ApiHandler):
+    """Transcribe base64-encoded audio with the Whisper runtime."""
+
+    async def process(self, input: dict, request: Request) -> dict | Response:
+        """Return the transcript, or an error payload if transcription fails.
+
+        Responds 409 when the plugin is disabled and 400 when ``audio`` is
+        missing or blank.
+        """
+        if not runtime.is_globally_enabled():
+            return Response(status=409, response="Whisper STT plugin is disabled")
+
+        audio = str(input.get("audio") or "").strip()
+        if not audio:
+            return Response(status=400, response="Missing audio")
+
+        ctxid = str(input.get("ctxid") or "").strip()
+        if ctxid:
+            self.use_context(ctxid)
+
+        try:
+            result = await runtime.transcribe(audio)
+            return {
+                "success": True,
+                "text": str(result.get("text") or "").strip(),
+                "language": str(result.get("language") or "").strip(),
+            }
+        except Exception as e:
+            return {"success": False, "error": str(e), "text": ""}
diff --git a/plugins/_whisper_stt/default_config.yaml b/plugins/_whisper_stt/default_config.yaml
new file mode 100644
index 0000000000..5141c6a171
--- /dev/null
+++ b/plugins/_whisper_stt/default_config.yaml
@@ -0,0 +1,5 @@
+model_size: base
+language: en
+silence_threshold: 0.3
+silence_duration: 1000
+waiting_timeout: 2000
diff --git a/plugins/_whisper_stt/extensions/python/system_prompt/_20_voice_transcription.py b/plugins/_whisper_stt/extensions/python/system_prompt/_20_voice_transcription.py
new file mode 100644
index 0000000000..528acf1fe4
--- /dev/null
+++ b/plugins/_whisper_stt/extensions/python/system_prompt/_20_voice_transcription.py
@@ -0,0 +1,23 @@
+from agent import LoopData
+from helpers.extension import Extension
+
+
+class VoiceTranscriptionPrompt(Extension):
+    """Append the voice-transcription note to the system prompt."""
+
+    async def execute(
+        self,
+        system_prompt: list[str] | None = None,
+        loop_data: LoopData | None = None,
+        **kwargs,
+    ):
+        """Append the plugin prompt to ``system_prompt`` in place.
+
+        The defaults are ``None`` rather than ``[]`` / ``LoopData()``: a shared
+        default list would accumulate one prompt per call, and a default
+        ``LoopData`` would be built once at import time.
+        """
+        if not self.agent or system_prompt is None:
+            return
+
+        system_prompt.append(self.agent.read_prompt("agent.system.voice_transcription.md"))
diff --git a/plugins/_whisper_stt/extensions/webui/chat-input-box-end/microphone-button.html b/plugins/_whisper_stt/extensions/webui/chat-input-box-end/microphone-button.html new file mode 100644 index 0000000000..3f1ba97fea --- /dev/null +++ b/plugins/_whisper_stt/extensions/webui/chat-input-box-end/microphone-button.html @@ -0,0 +1,18 @@ +
+ +
diff --git a/plugins/_whisper_stt/extensions/webui/page-head/runtime.html b/plugins/_whisper_stt/extensions/webui/page-head/runtime.html new file mode 100644 index 0000000000..5fe07196b2 --- /dev/null +++ b/plugins/_whisper_stt/extensions/webui/page-head/runtime.html @@ -0,0 +1,6 @@ + + diff --git a/plugins/_whisper_stt/extensions/webui/voice-settings-main/whisper-card.html b/plugins/_whisper_stt/extensions/webui/voice-settings-main/whisper-card.html new file mode 100644 index 0000000000..15841750fd --- /dev/null +++ b/plugins/_whisper_stt/extensions/webui/voice-settings-main/whisper-card.html @@ -0,0 +1,103 @@ +
+
+
+
+
Whisper STT
+
+ Browser microphone input routed into the built-in Whisper transcription backend. +
+
+ +
+ +
+
+ Model size + +
+
+ Language + +
+
+ Microphone + +
+
+ +
+ + +
+
+ + +
diff --git a/plugins/_whisper_stt/helpers/__init__.py b/plugins/_whisper_stt/helpers/__init__.py
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/plugins/_whisper_stt/helpers/__init__.py
@@ -0,0 +1 @@
+
diff --git a/plugins/_whisper_stt/helpers/migration.py b/plugins/_whisper_stt/helpers/migration.py
new file mode 100644
index 0000000000..4ccd59157f
--- /dev/null
+++ b/plugins/_whisper_stt/helpers/migration.py
@@ -0,0 +1,120 @@
+from __future__ import annotations
+
+import json
+import math
+from typing import Any
+
+from helpers import files, plugins
+
+
+PLUGIN_NAME = "_whisper_stt"
+LEGACY_SETTINGS_FILE = files.get_abs_path("usr/settings.json")
+DEFAULT_CONFIG = {
+    "model_size": "base",
+    "language": "en",
+    "silence_threshold": 0.3,
+    "silence_duration": 1000,
+    "waiting_timeout": 2000,
+}
+
+
+def ensure_config_seeded() -> bool:
+    """Create the plugin config from legacy settings if none exists yet.
+
+    Returns:
+        True if a config file was written by this call, else False.
+    """
+    config_path = get_config_path()
+    if files.exists(config_path):
+        return False
+
+    config = build_seed_config(_read_legacy_settings())
+    files.write_file(config_path, json.dumps(config, indent=2))
+    return True
+
+
+def get_config_path() -> str:
+    """Return the path of this plugin's config file."""
+    return plugins.determine_plugin_asset_path(
+        PLUGIN_NAME, "", "", plugins.CONFIG_FILE_NAME
+    )
+
+
+def read_saved_config() -> dict[str, Any]:
+    """Return the saved plugin config, or ``{}`` if missing or unusable."""
+    return _read_json_object(get_config_path())
+
+
+def build_seed_config(legacy_settings: dict[str, Any]) -> dict[str, Any]:
+    """Build an initial config from the legacy ``stt_*`` settings.
+
+    Any key that is absent, blank or not parseable keeps its value from
+    ``DEFAULT_CONFIG``.
+    """
+    seeded = dict(DEFAULT_CONFIG)
+    if not isinstance(legacy_settings, dict):
+        return seeded
+
+    model_size = str(
+        legacy_settings.get("stt_model_size", seeded["model_size"]) or ""
+    ).strip()
+    if model_size:
+        seeded["model_size"] = model_size
+
+    language = str(legacy_settings.get("stt_language", seeded["language"]) or "").strip()
+    if language:
+        seeded["language"] = language
+
+    seeded["silence_threshold"] = _coerce_float(
+        legacy_settings.get("stt_silence_threshold"),
+        seeded["silence_threshold"],
+    )
+    seeded["silence_duration"] = _coerce_int(
+        legacy_settings.get("stt_silence_duration"),
+        seeded["silence_duration"],
+    )
+    seeded["waiting_timeout"] = _coerce_int(
+        legacy_settings.get("stt_waiting_timeout"),
+        seeded["waiting_timeout"],
+    )
+
+    return seeded
+
+
+def _read_legacy_settings() -> dict[str, Any]:
+    """Return the legacy settings, or ``{}`` if missing or unusable."""
+    return _read_json_object(LEGACY_SETTINGS_FILE)
+
+
+def _read_json_object(path: str) -> dict[str, Any]:
+    """Load ``path`` as a JSON object; anything else yields ``{}``."""
+    if not files.exists(path):
+        return {}
+
+    try:
+        loaded = json.loads(files.read_file(path))
+    except Exception:
+        # Best-effort read: an unreadable file is treated as empty.
+        return {}
+
+    # Valid JSON may be a list, string, number or null; callers use ``.get``.
+    return loaded if isinstance(loaded, dict) else {}
+
+
+def _coerce_float(value: Any, default: float) -> float:
+    """Return ``value`` as a finite float, else ``default``."""
+    try:
+        number = float(value)
+    except (TypeError, ValueError, OverflowError):
+        return default
+    # NaN and infinity parse fine but are not valid JSON once written back.
+    return number if math.isfinite(number) else default
+
+
+def _coerce_int(value: Any, default: int) -> int:
+    """Return ``value`` as an int, else ``default``."""
+    try:
+        return int(value)
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError covers int(float("inf")).
+        return default
diff --git a/plugins/_whisper_stt/helpers/runtime.py b/plugins/_whisper_stt/helpers/runtime.py
new file mode 100644
index 0000000000..4b9a6b61d5
--- /dev/null
+++ b/plugins/_whisper_stt/helpers/runtime.py
@@ -0,0 +1,200 @@
+from __future__ import annotations
+
+import asyncio
+import base64
+import math
+import os
+import tempfile
+import warnings
+from typing import Any
+
+import whisper
+
+from helpers import files, plugins
+from helpers.notification import (
+    NotificationManager,
+    NotificationPriority,
+    NotificationType,
+)
+from helpers.print_style import PrintStyle
+from plugins._whisper_stt.helpers import migration
+
+
+warnings.filterwarnings("ignore", category=FutureWarning)
+
+
+PLUGIN_NAME = "_whisper_stt"
+DEFAULT_CONFIG = {
+    "model_size": "base",
+    "language": "en",
+    "silence_threshold": 0.3,
+    "silence_duration": 1000,
+    "waiting_timeout": 2000,
+}
+VALID_MODEL_SIZES = {"tiny", "base", "small", "medium", "large", "turbo"}
+
+_model = None
+_model_name = ""
+is_updating_model = False
+
+
+def normalize_config(config: dict[str, Any] | None) -> dict[str, Any]:
+    """Return a config in which every key holds a valid value.
+
+    Starts from ``DEFAULT_CONFIG``; a key is overridden only when ``config``
+    supplies a known model size, a non-blank language, a numeric threshold
+    (clamped to 0..1) or a positive integer duration/timeout.
+    """
+    normalized = dict(DEFAULT_CONFIG)
+    if not isinstance(config, dict):
+        return normalized
+
+    model_size = str(config.get("model_size", normalized["model_size"]) or "").strip()
+    if model_size in VALID_MODEL_SIZES:
+        normalized["model_size"] = model_size
+
+    language = str(config.get("language", normalized["language"]) or "").strip()
+    if language:
+        normalized["language"] = language
+
+    try:
+        silence_threshold = float(
+            config.get("silence_threshold", normalized["silence_threshold"])
+        )
+    except (TypeError, ValueError, OverflowError):
+        silence_threshold = math.nan
+    # NaN passes through min/max unchanged, so it has to be rejected here.
+    if not math.isnan(silence_threshold):
+        normalized["silence_threshold"] = min(max(silence_threshold, 0.0), 1.0)
+
+    for key in ("silence_duration", "waiting_timeout"):
+        try:
+            value = int(config.get(key, normalized[key]))
+        except (TypeError, ValueError, OverflowError):
+            # OverflowError covers int(float("inf")).
+            continue
+        if value > 0:
+            normalized[key] = value
+
+    return normalized
+
+
+def get_config() -> dict[str, Any]:
+    """Seed the config if missing, then return it normalized."""
+    migration.ensure_config_seeded()
+    config = plugins.get_plugin_config(PLUGIN_NAME) or {}
+    return normalize_config(config)
+
+
+def get_loaded_model_name() -> str:
+    """Return the name of the loaded model, or ``""`` if none is loaded."""
+    return _model_name
+
+
+def is_globally_enabled() -> bool:
+    """Resolve the plugin toggle across the plugin roots."""
+    return plugins.determined_toggle_from_paths(
+        True, reversed(plugins.get_plugin_roots(PLUGIN_NAME))
+    )
+
+
+async def preload(model_name: str | None = None):
+    """Load ``model_name``, or the configured model size when omitted."""
+    cfg = get_config()
+    resolved_model = str(model_name or cfg["model_size"])
+    return await _preload(resolved_model)
+
+
+async def _preload(model_name: str):
+    """Load ``model_name`` unless it is already the loaded model."""
+    global _model, _model_name, is_updating_model
+
+    # Wait for a load that is already in progress before deciding to load.
+    while is_updating_model:
+        await asyncio.sleep(0.1)
+
+    try:
+        is_updating_model = True
+        if not _model or _model_name != model_name:
+            NotificationManager.send_notification(
+                NotificationType.INFO,
+                NotificationPriority.NORMAL,
+                "Loading Whisper model...",
+                display_time=99,
+                group="whisper-preload",
+            )
+            PrintStyle.standard(f"Loading Whisper model: {model_name}")
+            _model = whisper.load_model(
+                name=model_name,
+                download_root=files.get_abs_path("/tmp/models/whisper"),
+            )
+            _model_name = model_name
+            NotificationManager.send_notification(
+                NotificationType.INFO,
+                NotificationPriority.NORMAL,
+                "Whisper model loaded.",
+                display_time=2,
+                group="whisper-preload",
+            )
+    finally:
+        is_updating_model = False
+
+
+async def is_downloading() -> bool:
+    """Return True while ``_preload`` is inside its loading section."""
+    return is_updating_model
+
+
+async def is_downloaded() -> bool:
+    """Return True once a model object exists."""
+    return _model is not None
+
+
+async def transcribe(
+    audio_bytes_b64: str, config: dict[str, Any] | None = None
+) -> dict[str, Any]:
+    """Transcribe base64-encoded audio using ``config`` or the saved config."""
+    cfg = normalize_config(config or get_config())
+    return await _transcribe(
+        str(cfg["model_size"]),
+        audio_bytes_b64,
+        language=_resolve_language(str(cfg["language"])),
+    )
+
+
+async def _transcribe(
+    model_name: str, audio_bytes_b64: str, *, language: str | None = None
+) -> dict[str, Any]:
+    """Decode the audio to a temporary file and run the model on it.
+
+    The temporary file is removed afterwards, whether or not transcription
+    succeeded.
+    """
+    await _preload(model_name)
+
+    audio_bytes = base64.b64decode(audio_bytes_b64)
+
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as audio_file:
+        audio_file.write(audio_bytes)
+        temp_path = audio_file.name
+
+    try:
+        kwargs: dict[str, Any] = {"fp16": False}
+        if language:
+            kwargs["language"] = language
+
+        result = _model.transcribe(temp_path, **kwargs)  # type: ignore[union-attr]
+        return result if isinstance(result, dict) else {}
+    finally:
+        try:
+            os.remove(temp_path)
+        except Exception:
+            pass
+
+
+def _resolve_language(language: str) -> str | None:
+    """Return the lowercased language, or None for blank/``auto``."""
+    value = language.strip().lower()
+    if not value or value == "auto":
+        return None
+    return value
diff --git a/plugins/_whisper_stt/hooks.py b/plugins/_whisper_stt/hooks.py
new file mode 100644
index 0000000000..feeb9feae1
--- /dev/null
+++ b/plugins/_whisper_stt/hooks.py
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+from helpers.defer import DeferredTask
+from plugins._whisper_stt.helpers import migration, runtime
+
+
+def get_plugin_config(default=None, **kwargs):
+    """Seed the config if missing and return ``default`` normalized."""
+    migration.ensure_config_seeded()
+    return runtime.normalize_config(default or {})
+
+
+def save_plugin_config(default=None, settings=None, **kwargs):
+    """Normalize the settings and preload the model if its size changed."""
+    migration.ensure_config_seeded()
+
+    normalized = runtime.normalize_config(settings or default or {})
+    previous = runtime.normalize_config(migration.read_saved_config())
+
+    previous_model = str(previous.get("model_size") or "")
+    next_model = str(normalized.get("model_size") or "")
+    if next_model and next_model != previous_model:
+        DeferredTask().start_task(runtime.preload, next_model)
+
+    return normalized
diff --git a/plugins/_whisper_stt/plugin.yaml b/plugins/_whisper_stt/plugin.yaml
new file mode 100644
index 0000000000..8de98e800a
--- /dev/null
+++ b/plugins/_whisper_stt/plugin.yaml
@@ -0,0 +1,9 @@
+name: _whisper_stt
+title: Whisper STT
+description: Built-in Whisper speech-to-text plugin.
+version: 1.0.0
+always_enabled: false
+settings_sections:
+  - agent
+per_project_config: false
+per_agent_config: false
diff --git a/plugins/_whisper_stt/prompts/agent.system.voice_transcription.md b/plugins/_whisper_stt/prompts/agent.system.voice_transcription.md
new file mode 100644
index 0000000000..9084dfa28a
--- /dev/null
+++ b/plugins/_whisper_stt/prompts/agent.system.voice_transcription.md
@@ -0,0 +1 @@
+if starts (voice) then transcribed can contain errors consider compensation
diff --git a/plugins/_whisper_stt/webui/config.html b/plugins/_whisper_stt/webui/config.html
new file mode 100644
index 0000000000..a413f3ab4b
--- /dev/null
+++ b/plugins/_whisper_stt/webui/config.html
@@ -0,0 +1,87 @@
+
+
+ Whisper STT
+
+
+
+
+ +
+ + diff --git a/plugins/_whisper_stt/webui/main.html b/plugins/_whisper_stt/webui/main.html new file mode 100644 index 0000000000..847e31d344 --- /dev/null +++ b/plugins/_whisper_stt/webui/main.html @@ -0,0 +1,176 @@ + + + Whisper STT + + + + +
+ +
+ + + + diff --git a/plugins/_whisper_stt/webui/whisper-stt-store.js b/plugins/_whisper_stt/webui/whisper-stt-store.js new file mode 100644 index 0000000000..586bf390af --- /dev/null +++ b/plugins/_whisper_stt/webui/whisper-stt-store.js @@ -0,0 +1,685 @@ +import { createStore } from "/js/AlpineStore.js"; +import { toastFrontendError } from "/components/notifications/notification-store.js"; +import { callJsonApi } from "/js/api.js"; +import { sttService } from "/js/stt-service.js"; +import { ttsService } from "/js/tts-service.js"; +import { sendMessage, updateChatInput } from "/index.js"; + +const PLUGIN_NAME = "_whisper_stt"; + +const Status = { + INACTIVE: "inactive", + ACTIVATING: "activating", + LISTENING: "listening", + RECORDING: "recording", + WAITING: "waiting", + PROCESSING: "processing", +}; + +const model = { + runtimeInitialized: false, + statusLoaded: false, + loading: false, + error: "", + enabled: false, + config: { + model_size: "base", + language: "en", + silence_threshold: 0.3, + silence_duration: 1000, + waiting_timeout: 2000, + }, + modelReady: false, + modelLoading: false, + loadedModel: "", + packageVersion: "", + providerCleanup: null, + microphoneInput: null, + isProcessingClick: false, + devices: [], + selectedDevice: "", + requestingPermission: false, + _ttsListener: null, + _deviceChangeListenerBound: false, + + async initRuntime() { + if (this.runtimeInitialized) return; + + this.runtimeInitialized = true; + await this.loadDevices(); + await this.refreshStatus({ suppressError: true }); + + if (!this._deviceChangeListenerBound) { + navigator.mediaDevices?.addEventListener?.("devicechange", () => { + void this.loadDevices(); + }); + this._deviceChangeListenerBound = true; + } + + if (!this._ttsListener) { + this._ttsListener = (event) => { + if (event?.detail?.isSpeaking && this.micStatus !== Status.INACTIVE) { + this.stop(); + } + }; + ttsService.addEventListener("statechange", this._ttsListener); + } + }, + + async ensureStatusLoaded({ 
force = false, suppressError = true } = {}) { + if ((!this.statusLoaded || force) && !this.loading) { + await this.refreshStatus({ suppressError }); + } + }, + + async refreshStatus({ suppressError = false } = {}) { + this.loading = true; + this.error = ""; + + try { + const status = await callJsonApi(`/plugins/${PLUGIN_NAME}/status`, {}); + this.statusLoaded = true; + this.enabled = !!status?.enabled; + this.config = { + model_size: status?.config?.model_size || "base", + language: status?.config?.language || "en", + silence_threshold: Number(status?.config?.silence_threshold ?? 0.3), + silence_duration: Number(status?.config?.silence_duration ?? 1000), + waiting_timeout: Number(status?.config?.waiting_timeout ?? 2000), + }; + this.modelReady = !!status?.model?.ready; + this.modelLoading = !!status?.model?.loading; + this.loadedModel = status?.model?.loaded_model || ""; + this.packageVersion = status?.package?.version || ""; + + if (this.enabled) { + this.registerProvider(); + } else { + this.unregisterProvider(); + } + } catch (error) { + this.error = error instanceof Error ? 
error.message : String(error); + this.unregisterProvider(); + if (!suppressError) { + void toastFrontendError(this.error, "Whisper STT"); + } + } finally { + this.loading = false; + } + }, + + registerProvider() { + if (this.providerCleanup || !this.enabled) return; + + this.providerCleanup = sttService.registerProvider(PLUGIN_NAME, { + handleMicrophoneClick: async () => await this.handleMicrophoneClick(), + requestMicrophonePermission: async () => + await this.requestMicrophonePermission(), + updateMicrophoneButtonUI: () => {}, + stop: () => this.stop(), + getStatus: () => this.micStatus, + }); + + sttService.emitStatusChange(this.micStatus); + }, + + unregisterProvider() { + if (!this.providerCleanup) return; + + this.stop(); + this.providerCleanup(); + this.providerCleanup = null; + }, + + async openConfig() { + const { store } = await import("/components/plugins/plugin-settings-store.js"); + await store.openConfig(PLUGIN_NAME); + }, + + openPanel() { + window.openModal?.(`/plugins/${PLUGIN_NAME}/webui/main.html`); + }, + + async loadDevices() { + try { + const devices = await navigator.mediaDevices.enumerateDevices(); + this.devices = devices.filter( + (device) => device.kind === "audioinput" && device.deviceId, + ); + + const saved = localStorage.getItem("whisperSttSelectedDevice") || ""; + const savedStillExists = this.devices.some( + (device) => device.deviceId === saved, + ); + + if (savedStillExists) { + this.selectedDevice = saved; + return; + } + + const defaultDevice = + this.devices.find((device) => device.deviceId === "default") || + this.devices[0]; + this.selectedDevice = defaultDevice?.deviceId || ""; + } catch (error) { + console.error("[Whisper STT] Failed to enumerate audio devices", error); + this.devices = []; + this.selectedDevice = ""; + } + }, + + async selectDevice(deviceId) { + this.selectedDevice = deviceId || ""; + localStorage.setItem("whisperSttSelectedDevice", this.selectedDevice); + + if (this.microphoneInput?.selectedDeviceId !== 
this.selectedDevice) { + this.stop(); + this.microphoneInput = null; + } + }, + + getSelectedDevice() { + let device = this.devices.find( + (candidate) => candidate.deviceId === this.selectedDevice, + ); + + if (!device && this.devices.length > 0) { + device = + this.devices.find((candidate) => candidate.deviceId === "default") || + this.devices[0]; + } + + return device || null; + }, + + async requestMicrophonePermission() { + this.requestingPermission = true; + + try { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + stream.getTracks().forEach((track) => track.stop()); + await this.loadDevices(); + return true; + } catch (error) { + console.error("[Whisper STT] Microphone permission denied", error); + globalThis.toast?.( + "Microphone access denied. Please enable microphone access in your browser settings.", + "error", + ); + return false; + } finally { + this.requestingPermission = false; + } + }, + + async handleMicrophoneClick() { + if (this.isProcessingClick) return; + + this.isProcessingClick = true; + try { + await this.ensureStatusLoaded({ force: true, suppressError: false }); + if (!this.enabled) { + globalThis.justToast?.("Whisper STT is disabled.", "info"); + return; + } + + ttsService.stop(); + + const selectedDevice = this.getSelectedDevice(); + if ( + this.microphoneInput && + this.microphoneInput.selectedDeviceId !== (selectedDevice?.deviceId || "") + ) { + this.stop(); + this.microphoneInput = null; + } + + if (!this.microphoneInput) { + await this.initMicrophone(); + } + + if (this.microphoneInput) { + await this.microphoneInput.toggle(); + } + } finally { + setTimeout(() => { + this.isProcessingClick = false; + }, 300); + } + }, + + async initMicrophone() { + if (this.microphoneInput) return this.microphoneInput; + + const input = new MicrophoneInput(this, async (text, isFinal) => { + if (isFinal) { + await this.sendVoiceMessage(text); + } + }); + + const initialized = await input.initialize(); + this.microphoneInput 
= initialized ? input : null; + return this.microphoneInput; + }, + + async sendVoiceMessage(text) { + const message = `(voice) ${text}`; + updateChatInput(message); + + if (!this.microphoneInput?.messageSent) { + this.microphoneInput.messageSent = true; + await sendMessage(); + } + }, + + notifyStatusChange() { + sttService.emitStatusChange(this.micStatus); + }, + + stop() { + if (this.microphoneInput) { + this.microphoneInput.status = Status.INACTIVE; + this.microphoneInput.dispose(); + this.microphoneInput = null; + } + + this.notifyStatusChange(); + }, + + get micStatus() { + return this.microphoneInput?.status || Status.INACTIVE; + }, + + get statusText() { + if (!this.enabled) return "Disabled"; + if (this.modelLoading) return "Loading"; + if (this.modelReady) return "Ready"; + return "Idle"; + }, + + get statusClass() { + if (!this.enabled) return "warn"; + if (this.modelLoading) return "warn"; + if (this.modelReady) return "ok"; + return "warn"; + }, + + get buttonIcon() { + switch (this.micStatus) { + case Status.ACTIVATING: + return "progress_activity"; + case Status.LISTENING: + return "hearing"; + case Status.RECORDING: + return "radio_button_checked"; + case Status.WAITING: + return "hourglass_top"; + case Status.PROCESSING: + return "progress_activity"; + case Status.INACTIVE: + default: + return "mic"; + } + }, + + get buttonTitle() { + switch (this.micStatus) { + case Status.ACTIVATING: + return "Starting microphone"; + case Status.LISTENING: + return "Stop voice input"; + case Status.RECORDING: + return "Recording voice input"; + case Status.WAITING: + return "Waiting for more speech"; + case Status.PROCESSING: + return "Transcribing voice input"; + case Status.INACTIVE: + default: + return "Start voice input"; + } + }, + + get selectedDeviceLabel() { + const device = this.getSelectedDevice(); + if (!device) return "System default"; + return device.label || "System default"; + }, +}; + +class MicrophoneInput { + constructor(owner, updateCallback) { 
+ this.owner = owner; + this.updateCallback = updateCallback; + this.mediaStream = null; + this.mediaRecorder = null; + this.audioContext = null; + this.mediaStreamSource = null; + this.analyserNode = null; + this.audioChunks = []; + this.lastChunk = null; + this.messageSent = false; + this.lastAudioTime = null; + this.waitingTimer = null; + this.silenceStartTime = null; + this.hasStartedRecording = false; + this.analysisFrame = null; + this.selectedDeviceId = ""; + this._status = Status.INACTIVE; + } + + get status() { + return this._status; + } + + set status(nextStatus) { + if (this._status === nextStatus) return; + + const previousStatus = this._status; + this._status = nextStatus; + this.handleStatusChange(previousStatus, nextStatus); + this.owner.notifyStatusChange(); + } + + async initialize() { + this.status = Status.ACTIVATING; + + try { + const selectedDevice = this.owner.getSelectedDevice(); + const stream = await navigator.mediaDevices.getUserMedia({ + audio: { + deviceId: + selectedDevice?.deviceId + ? { exact: selectedDevice.deviceId } + : undefined, + echoCancellation: true, + noiseSuppression: true, + channelCount: 1, + }, + }); + + this.selectedDeviceId = selectedDevice?.deviceId || ""; + this.mediaStream = stream; + this.mediaRecorder = new MediaRecorder(stream); + this.mediaRecorder.ondataavailable = (event) => { + if ( + event.data.size > 0 && + (this.status === Status.RECORDING || this.status === Status.WAITING) + ) { + if (this.lastChunk) { + this.audioChunks.push(this.lastChunk); + this.lastChunk = null; + } + this.audioChunks.push(event.data); + } else if (this.status === Status.LISTENING) { + this.lastChunk = event.data; + } + }; + + this.setupAudioAnalysis(stream); + return true; + } catch (error) { + console.error("[Whisper STT] Microphone initialization failed", error); + globalThis.toast?.( + "Failed to access the microphone. 
Please check browser permissions.", + "error", + ); + this.status = Status.INACTIVE; + this.dispose(); + return false; + } + } + + handleStatusChange(previousStatus, nextStatus) { + if (nextStatus !== Status.RECORDING) { + this.lastChunk = null; + } + + switch (nextStatus) { + case Status.INACTIVE: + this.handleInactiveState(); + break; + case Status.LISTENING: + this.handleListeningState(); + break; + case Status.RECORDING: + this.handleRecordingState(); + break; + case Status.WAITING: + this.handleWaitingState(); + break; + case Status.PROCESSING: + this.handleProcessingState(); + break; + } + } + + handleInactiveState() { + this.stopRecording(); + this.stopAudioAnalysis(); + clearTimeout(this.waitingTimer); + this.waitingTimer = null; + } + + handleListeningState() { + this.stopRecording(); + this.audioChunks = []; + this.hasStartedRecording = false; + this.silenceStartTime = null; + this.lastAudioTime = null; + this.messageSent = false; + this.startAudioAnalysis(); + } + + handleRecordingState() { + if (!this.mediaRecorder) return; + + if (!this.hasStartedRecording && this.mediaRecorder.state !== "recording") { + this.hasStartedRecording = true; + this.mediaRecorder.start(1000); + } + + clearTimeout(this.waitingTimer); + this.waitingTimer = null; + } + + handleWaitingState() { + clearTimeout(this.waitingTimer); + this.waitingTimer = setTimeout(() => { + if (this.status === Status.WAITING) { + this.status = Status.PROCESSING; + } + }, this.owner.config.waiting_timeout); + } + + handleProcessingState() { + this.stopRecording(); + void this.process(); + } + + setupAudioAnalysis(stream) { + this.audioContext = new (window.AudioContext || window.webkitAudioContext)(); + this.mediaStreamSource = this.audioContext.createMediaStreamSource(stream); + this.analyserNode = this.audioContext.createAnalyser(); + this.analyserNode.fftSize = 2048; + this.analyserNode.minDecibels = -90; + this.analyserNode.maxDecibels = -10; + this.analyserNode.smoothingTimeConstant = 0.85; + 
this.mediaStreamSource.connect(this.analyserNode); + } + + startAudioAnalysis() { + const analyzeFrame = () => { + if (this.status === Status.INACTIVE || !this.analyserNode) return; + + const dataArray = new Uint8Array(this.analyserNode.fftSize); + this.analyserNode.getByteTimeDomainData(dataArray); + + let sum = 0; + for (let index = 0; index < dataArray.length; index += 1) { + const amplitude = (dataArray[index] - 128) / 128; + sum += amplitude * amplitude; + } + + const rms = Math.sqrt(sum / dataArray.length); + const now = Date.now(); + const silenceThreshold = this.densify(this.owner.config.silence_threshold); + + if (rms > silenceThreshold) { + this.lastAudioTime = now; + this.silenceStartTime = null; + + if ( + (this.status === Status.LISTENING || this.status === Status.WAITING) && + !ttsService.isSpeaking() + ) { + this.status = Status.RECORDING; + } + } else if (this.status === Status.RECORDING) { + if (!this.silenceStartTime) { + this.silenceStartTime = now; + } + + const silenceDuration = now - this.silenceStartTime; + if (silenceDuration >= this.owner.config.silence_duration) { + this.status = Status.WAITING; + } + } + + this.analysisFrame = requestAnimationFrame(analyzeFrame); + }; + + this.stopAudioAnalysis(); + this.analysisFrame = requestAnimationFrame(analyzeFrame); + } + + stopAudioAnalysis() { + if (this.analysisFrame) { + cancelAnimationFrame(this.analysisFrame); + this.analysisFrame = null; + } + } + + stopRecording() { + if (this.mediaRecorder?.state === "recording") { + this.mediaRecorder.stop(); + this.hasStartedRecording = false; + } + } + + densify(value) { + return Math.exp(-5 * (1 - value)); + } + + async process() { + if (this.audioChunks.length === 0) { + if (this.status === Status.PROCESSING) { + this.status = Status.LISTENING; + } + return; + } + + const audioBlob = new Blob(this.audioChunks, { type: "audio/wav" }); + const audio = await this.convertBlobToBase64(audioBlob); + + try { + const result = await 
callJsonApi(`/plugins/${PLUGIN_NAME}/transcribe`, { + audio, + }); + const text = this.filterResult(result?.text || ""); + if (text) { + await this.updateCallback(text, true); + } + } catch (error) { + console.error("[Whisper STT] Transcription failed", error); + window.toastFetchError?.("Transcription error", error); + } finally { + this.audioChunks = []; + if (this.status === Status.PROCESSING) { + this.status = Status.LISTENING; + } + } + } + + convertBlobToBase64(audioBlob) { + return new Promise((resolve, reject) => { + const reader = new FileReader(); + reader.onloadend = () => { + const result = String(reader.result || ""); + resolve(result.split(",")[1] || ""); + }; + reader.onerror = (error) => reject(error); + reader.readAsDataURL(audioBlob); + }); + } + + filterResult(text) { + const normalized = String(text || "").trim(); + if (!normalized) return ""; + + const wrapped = + (normalized.startsWith("{") && normalized.endsWith("}")) || + (normalized.startsWith("(") && normalized.endsWith(")")) || + (normalized.startsWith("[") && normalized.endsWith("]")); + + if (wrapped) { + console.log(`[Whisper STT] Discarding transcription: ${normalized}`); + return ""; + } + + return normalized; + } + + async toggle() { + const hasPermission = await this.requestPermission(); + if (!hasPermission) return; + + if ( + this.status === Status.INACTIVE || + this.status === Status.ACTIVATING + ) { + this.status = Status.LISTENING; + } else { + this.owner.stop(); + } + } + + async requestPermission() { + return await this.owner.requestMicrophonePermission(); + } + + dispose() { + clearTimeout(this.waitingTimer); + this.waitingTimer = null; + this.stopAudioAnalysis(); + + try { + this.mediaRecorder?.stream?.getTracks?.().forEach((track) => track.stop()); + } catch (_error) { + // Ignore media cleanup failures. + } + + try { + this.mediaStream?.getTracks?.().forEach((track) => track.stop()); + } catch (_error) { + // Ignore media cleanup failures. 
+ } + + try { + this.audioContext?.close?.(); + } catch (_error) { + // Ignore audio context cleanup failures. + } + + this.mediaStream = null; + this.mediaRecorder = null; + this.mediaStreamSource = null; + this.analyserNode = null; + this.audioContext = null; + this.audioChunks = []; + this.lastChunk = null; + this.hasStartedRecording = false; + } +} + +export const store = createStore("whisperStt", model); diff --git a/plugins/_whisper_stt/webui/whisper-stt.css b/plugins/_whisper_stt/webui/whisper-stt.css new file mode 100644 index 0000000000..fb39d1f7af --- /dev/null +++ b/plugins/_whisper_stt/webui/whisper-stt.css @@ -0,0 +1,49 @@ +.microphone-button { + position: relative; + background: #0f766e; + color: #f8fafc; +} + +.microphone-button:hover { + background: #115e59; +} + +.microphone-button:active { + background: #134e4a; +} + +.microphone-button.mic-activating, +.microphone-button.mic-processing { + background: #6d28d9; +} + +.microphone-button.mic-listening { + background: #0f766e; +} + +.microphone-button.mic-recording { + background: #c1121f; + animation: whisper-mic-pulse 1.2s ease-in-out infinite; +} + +.microphone-button.mic-waiting { + background: #9a6700; +} + +.microphone-button:disabled { + opacity: 0.7; + cursor: wait; +} + +@keyframes whisper-mic-pulse { + 0%, + 100% { + transform: scale(1); + box-shadow: 0 0 0 0 rgba(193, 18, 31, 0.35); + } + + 50% { + transform: scale(1.05); + box-shadow: 0 0 0 8px rgba(193, 18, 31, 0); + } +} diff --git a/preload.py b/preload.py index 84639b43dd..b0287a160b 100644 --- a/preload.py +++ b/preload.py @@ -1,18 +1,20 @@ import asyncio -from helpers import runtime, whisper, settings +from helpers import runtime from helpers.print_style import PrintStyle -from helpers import kokoro_tts import models +from plugins._kokoro_tts.helpers import runtime as kokoro_tts_runtime +from plugins._whisper_stt.helpers import runtime as whisper_stt_runtime async def preload(): try: - set = settings.get_default_settings() - # 
preload whisper model async def preload_whisper(): + if not whisper_stt_runtime.is_globally_enabled(): + return None try: - return await whisper.preload(set["stt_model_size"]) + config = whisper_stt_runtime.get_config() + return await whisper_stt_runtime.preload(str(config["model_size"])) except Exception as e: PrintStyle().error(f"Error in preload_whisper: {e}") @@ -32,11 +34,12 @@ async def preload_embedding(): # preload kokoro tts model if enabled async def preload_kokoro(): - if set["tts_kokoro"]: - try: - return await kokoro_tts.preload() - except Exception as e: - PrintStyle().error(f"Error in preload_kokoro: {e}") + if not kokoro_tts_runtime.is_globally_enabled(): + return None + try: + return await kokoro_tts_runtime.preload() + except Exception as e: + PrintStyle().error(f"Error in preload_kokoro: {e}") # async tasks to preload tasks = [ diff --git a/tests/test_speech_plugin_split.py b/tests/test_speech_plugin_split.py new file mode 100644 index 0000000000..7d51a08521 --- /dev/null +++ b/tests/test_speech_plugin_split.py @@ -0,0 +1,104 @@ +from __future__ import annotations + +import sys +from pathlib import Path + + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + + +from helpers import plugins, settings + + +def test_builtin_speech_plugins_are_discoverable_and_toggleable() -> None: + discovered = { + item.name: item + for item in plugins.get_enhanced_plugins_list( + custom=True, + builtin=True, + plugin_names=["_kokoro_tts", "_whisper_stt"], + ) + } + + assert "_kokoro_tts" in discovered + assert "_whisper_stt" in discovered + + assert discovered["_kokoro_tts"].always_enabled is False + assert discovered["_whisper_stt"].always_enabled is False + assert "agent" in discovered["_kokoro_tts"].settings_sections + assert "agent" in discovered["_whisper_stt"].settings_sections + + +def test_legacy_core_speech_artifacts_are_removed() -> None: + removed_paths = [ + 
"api/synthesize.py", + "api/transcribe.py", + "helpers/kokoro_tts.py", + "helpers/whisper.py", + "webui/components/chat/speech/speech-store.js", + "webui/components/settings/agent/speech.html", + "webui/components/settings/speech/microphone-setting-store.js", + "webui/components/settings/speech/microphone.html", + "webui/css/speech.css", + "webui/js/speech_browser.js", + ] + + for relative_path in removed_paths: + assert not (PROJECT_ROOT / relative_path).exists(), relative_path + + +def test_plugin_owned_voice_files_exist() -> None: + expected_paths = [ + "plugins/_kokoro_tts/plugin.yaml", + "plugins/_kokoro_tts/api/synthesize.py", + "plugins/_kokoro_tts/extensions/webui/page-head/runtime.html", + "plugins/_kokoro_tts/extensions/webui/voice-settings-main/kokoro-card.html", + "plugins/_whisper_stt/plugin.yaml", + "plugins/_whisper_stt/api/transcribe.py", + "plugins/_whisper_stt/extensions/python/system_prompt/_20_voice_transcription.py", + "plugins/_whisper_stt/extensions/webui/page-head/runtime.html", + "plugins/_whisper_stt/extensions/webui/chat-input-box-end/microphone-button.html", + "plugins/_whisper_stt/extensions/webui/voice-settings-main/whisper-card.html", + "plugins/_whisper_stt/webui/whisper-stt-store.js", + ] + + for relative_path in expected_paths: + assert (PROJECT_ROOT / relative_path).exists(), relative_path + + +def test_core_settings_no_longer_expose_legacy_speech_keys() -> None: + defaults = settings.get_default_settings() + output = settings.convert_out(defaults) + + legacy_keys = { + "tts_kokoro", + "stt_model_size", + "stt_language", + "stt_silence_threshold", + "stt_silence_duration", + "stt_waiting_timeout", + } + + assert legacy_keys.isdisjoint(defaults.keys()) + assert legacy_keys.isdisjoint(output["settings"].keys()) + assert "stt_models" not in output["additional"] + + +def test_voice_prompt_rule_moves_to_whisper_plugin() -> None: + core_prompt = (PROJECT_ROOT / "prompts/agent.system.main.communication_additions.md").read_text( + 
encoding="utf-8" + ) + whisper_prompt = (PROJECT_ROOT / "plugins/_whisper_stt/prompts/agent.system.voice_transcription.md").read_text( + encoding="utf-8" + ) + voice_surface = (PROJECT_ROOT / "webui/components/settings/agent/voice.html").read_text( + encoding="utf-8" + ) + + assert "if starts (voice) then transcribed can contain errors consider compensation" not in core_prompt + assert "if starts (voice) then transcribed can contain errors consider compensation" in whisper_prompt + assert '' in voice_surface + assert '' in voice_surface + assert '' in voice_surface diff --git a/webui/components/chat/input/chat-bar-input.html b/webui/components/chat/input/chat-bar-input.html index 0f41dd3b2a..f9eea0616f 100644 --- a/webui/components/chat/input/chat-bar-input.html +++ b/webui/components/chat/input/chat-bar-input.html @@ -1,7 +1,6 @@ @@ -13,8 +12,29 @@
-

- +

+ volume_off

diff --git a/webui/components/chat/speech/speech-store.js b/webui/components/chat/speech/speech-store.js deleted file mode 100644 index 82b3bee98b..0000000000 --- a/webui/components/chat/speech/speech-store.js +++ /dev/null @@ -1,962 +0,0 @@ -import { createStore } from "/js/AlpineStore.js"; -import { updateChatInput, sendMessage } from "/index.js"; -import { sleep } from "/js/sleep.js"; -import { store as microphoneSettingStore } from "/components/settings/speech/microphone-setting-store.js"; -import * as shortcuts from "/js/shortcuts.js"; - -const Status = { - INACTIVE: "inactive", - ACTIVATING: "activating", - LISTENING: "listening", - RECORDING: "recording", - WAITING: "waiting", - PROCESSING: "processing", -}; - -// Create the speech store -const model = { - // Initialization guard - _initialized: false, - - // STT Settings - stt_model_size: "tiny", - stt_language: "en", - stt_silence_threshold: 0.05, - stt_silence_duration: 1000, - stt_waiting_timeout: 2000, - - // TTS Settings - tts_kokoro: false, - - // TTS State - isSpeaking: false, - speakingId: "", - speakingText: "", - currentAudio: null, - audioEl: null, - audioContext: null, - userHasInteracted: false, - stopSpeechChain: false, - ttsStream: null, - - // STT State - microphoneInput: null, - isProcessingClick: false, - selectedDevice: null, - - // Getter for micStatus - delegates to microphoneInput - get micStatus() { - return this.microphoneInput?.status || Status.INACTIVE; - }, - - updateMicrophoneButtonUI() { - const microphoneButton = document.getElementById("microphone-button"); - if (!microphoneButton) return; - const status = this.micStatus; - microphoneButton.classList.remove( - "mic-inactive", - "mic-activating", - "mic-listening", - "mic-recording", - "mic-waiting", - "mic-processing" - ); - microphoneButton.classList.add(`mic-${status.toLowerCase()}`); - microphoneButton.setAttribute("data-status", status); - }, - - async handleMicrophoneClick() { - if (this.isProcessingClick) return; - 
this.isProcessingClick = true; - try { - // reset mic input if device has changed in settings - const device = microphoneSettingStore.getSelectedDevice(); - if (device != this.selectedDevice) { - this.selectedDevice = device; - this.microphoneInput = null; - console.log("Device changed, microphoneInput reset"); - } - - if (!this.microphoneInput) { - await this.initMicrophone(); - } - - if (this.microphoneInput) { - await this.microphoneInput.toggle(); - } - } finally { - setTimeout(() => { - this.isProcessingClick = false; - }, 300); - } - }, - - // Initialize speech functionality - async init() { - // Guard against multiple initializations - if (this._initialized) { - console.log( - "[Speech Store] Already initialized, skipping duplicate init()" - ); - return; - } - - this._initialized = true; - await this.loadSettings(); - this.setupBrowserTTS(); - this.setupUserInteractionHandling(); - }, - - // Load settings from server - async loadSettings() { - try { - const response = await fetchApi("/settings_get", { method: "POST" }); - const data = await response.json(); - const settings = data?.settings || {}; - - if (settings) { - this.stt_model_size = settings.stt_model_size ?? this.stt_model_size; - this.stt_language = settings.stt_language ?? this.stt_language; - this.stt_silence_threshold = - settings.stt_silence_threshold ?? this.stt_silence_threshold; - this.stt_silence_duration = - settings.stt_silence_duration ?? this.stt_silence_duration; - this.stt_waiting_timeout = - settings.stt_waiting_timeout ?? this.stt_waiting_timeout; - this.tts_kokoro = settings.tts_kokoro ?? 
this.tts_kokoro; - } - } catch (error) { - window.toastFetchError("Failed to load speech settings", error); - console.error("Failed to load speech settings:", error); - } - }, - - // Setup browser TTS - setupBrowserTTS() { - this.synth = window.speechSynthesis; - this.browserUtterance = null; - }, - - // Setup user interaction handling for autoplay policy - setupUserInteractionHandling() { - const enableAudio = () => { - if (!this.userHasInteracted) { - this.userHasInteracted = true; - console.log("User interaction detected - audio playback enabled"); - - // Create a dummy audio context to "unlock" audio - try { - this.audioContext = new (window.AudioContext || - window.webkitAudioContext)(); - this.audioContext.resume(); - } catch (e) { - console.log("AudioContext not available"); - } - } - }; - - // Listen for any user interaction - const events = ["click", "touchstart", "keydown", "mousedown"]; - events.forEach((event) => { - document.addEventListener(event, enableAudio, { - once: true, - passive: true, - }); - }); - }, - - // main speak function, allows to speak a stream of text that is generated piece by piece - async speakStream(id, text, finished = false) { - // if already running the same stream, do nothing - if ( - this.ttsStream && - this.ttsStream.id === id && - this.ttsStream.text === text && - this.ttsStream.finished === finished - ) - return; - - // if user has not interacted (after reload), do not play audio - if (!this.userHasInteracted) return this.showAudioPermissionPrompt(); - - // new stream - if (!this.ttsStream || this.ttsStream.id !== id) { - // this.stop(); // stop potential previous stream - // create new stream data - this.ttsStream = { - id, - text, - finished, - running: false, - lastChunkIndex: -1, - stopped: false, - chunks: [], - }; - } else { - // update existing stream data - this.ttsStream.finished = finished; - this.ttsStream.text = text; - } - - // cleanup text - const cleanText = this.cleanText(text); - if (!cleanText.trim()) 
return; - - // chunk it for faster processing - this.ttsStream.chunks = this.chunkText(cleanText); - if (this.ttsStream.chunks.length == 0) return; - - // if stream was already running, just updating chunks is enough - // The running loop will pick up the new chunks automatically - if (this.ttsStream.running) return; - else this.ttsStream.running = true; // proceed to running phase - - // terminator function to kill the stream if new stream has started - const terminator = () => - this.ttsStream?.id !== id || this.ttsStream?.stopped; - - const spoken = []; - - // continuously loop until all chunks are spoken and stream is finished - while (true) { - // check if we should stop - if (terminator()) break; - - // get the next chunk index to speak - const nextIndex = this.ttsStream.lastChunkIndex + 1; - - // if no more chunks available, check if we should wait or exit - if (nextIndex >= this.ttsStream.chunks.length) { - // if stream is finished, we're done - if (this.ttsStream.finished) break; - // otherwise wait a bit for more chunks to arrive - await new Promise((resolve) => setTimeout(resolve, 50)); - continue; - } - - // do not speak the last chunk until finished (it is being generated) - if ( - nextIndex == this.ttsStream.chunks.length - 1 && - !this.ttsStream.finished - ) { - // wait a bit for more content or finish signal - await new Promise((resolve) => setTimeout(resolve, 50)); - continue; - } - - // set the index of last spoken chunk - this.ttsStream.lastChunkIndex = nextIndex; - - // speak the chunk - const chunk = this.ttsStream.chunks[nextIndex]; - spoken.push(chunk); - await this._speak(chunk, nextIndex > 0, () => terminator()); - } - - // at the end, finish stream data - this.ttsStream.running = false; - }, - - // simplified speak function, speak a single finished piece of text - async speak(text) { - const id = Math.random(); - return await this.speakStream(id, text, true); - }, - - // speak wrapper - async _speak(text, waitForPrevious, terminator) { - 
// default browser speech - if (!this.tts_kokoro) - return await this.speakWithBrowser(text, waitForPrevious, terminator); - - // kokoro tts - try { - await await this.speakWithKokoro(text, waitForPrevious, terminator); - } catch (error) { - console.error(error); - return await this.speakWithBrowser(text, waitForPrevious, terminator); - } - }, - - chunkText(text, { maxChunkLength = 135, lineSeparator = "..." } = {}) { - const INC_LIMIT = maxChunkLength * 2; - const MIN_CHUNK_LENGTH = 20; // minimum length for a chunk before merging - - // Only split by ,/word if needed (unchanged) - const splitDeep = (seg) => { - if (seg.length <= INC_LIMIT) return [seg]; - const byComma = seg.match(/[^,]+(?:,|$)/g); - if (byComma.length > 1) - return byComma.flatMap((p, i) => - splitDeep(i < byComma.length - 1 ? p : p.replace(/,$/, "")) - ); - const out = []; - let part = ""; - for (const word of seg.split(/\s+/)) { - const need = part ? part.length + 1 + word.length : word.length; - if (need <= maxChunkLength) { - part += (part ? " " : "") + word; - } else { - if (part) out.push(part); - if (word.length > maxChunkLength) { - for (let i = 0; i < word.length; i += maxChunkLength) - out.push(word.slice(i, i + maxChunkLength)); - part = ""; - } else { - part = word; - } - } - } - if (part) out.push(part); - return out; - }; - - // Only split on [.!?] followed by space - const sentenceTokens = (line) => { - const toks = []; - let start = 0; - for (let i = 0; i < line.length; i++) { - const c = line[i]; - if ( - (c === "." || c === "!" 
|| c === "?") && - /\s/.test(line[i + 1] || "") - ) { - toks.push(line.slice(start, i + 1)); - i += 1; - start = i + 1; - } - } - if (start < line.length) toks.push(line.slice(start)); - return toks; - }; - - // Step 1: Split all newlines into individual chunks first - let initialChunks = []; - const lines = text.split(/\n+/).filter((l) => l.trim()); - - for (const line of lines) { - if (!line.trim()) continue; - // Process each line into sentence tokens and add to chunks - const sentences = sentenceTokens(line.trim()); - initialChunks.push(...sentences); - } - - // Step 2: Merge short chunks until they meet minimum length criteria - const finalChunks = []; - let currentChunk = ""; - - for (let i = 0; i < initialChunks.length; i++) { - const chunk = initialChunks[i]; - - // If current chunk is empty, start with this chunk - if (!currentChunk) { - currentChunk = chunk; - // If this is the last chunk or it's already long enough, add it - if ( - i === initialChunks.length - 1 || - currentChunk.length >= MIN_CHUNK_LENGTH - ) { - finalChunks.push(currentChunk); - currentChunk = ""; - } - continue; - } - - // Current chunk exists, check if we should merge - if (currentChunk.length < MIN_CHUNK_LENGTH) { - // Try to merge with separator - const merged = currentChunk + " " + lineSeparator + " " + chunk; - - // Check if merged chunk fits within max length - if (merged.length <= maxChunkLength) { - currentChunk = merged; - } else { - // Doesn't fit, add current chunk and start new one - finalChunks.push(currentChunk); - currentChunk = chunk; - } - } else { - // Current chunk is already long enough, add it and start new one - finalChunks.push(currentChunk); - currentChunk = chunk; - } - - // If this is the last chunk, add whatever is in the buffer - if (i === initialChunks.length - 1 && currentChunk) { - finalChunks.push(currentChunk); - } - } - - return finalChunks.map((chunk) => chunk.trimEnd()); - }, - - // Show a prompt to user to enable audio - showAudioPermissionPrompt() 
{ - shortcuts.frontendNotification({ - type: "info", - message: "Click anywhere to enable audio playback", - displayTime: 5000, - frontendOnly: true, - }); - console.log("Please click anywhere on the page to enable audio playback"); - }, - - // Browser TTS - async speakWithBrowser(text, waitForPrevious = false, terminator = null) { - // wait for previous to finish if requested - while (waitForPrevious && this.isSpeaking) await sleep(25); - if (terminator && terminator()) return; - - // stop previous only if not waiting for it - if (!waitForPrevious) this.stopAudio(); - - this.browserUtterance = new SpeechSynthesisUtterance(text); - this.browserUtterance.onstart = () => { - this.isSpeaking = true; - }; - this.browserUtterance.onend = () => { - this.isSpeaking = false; - }; - - this.synth.speak(this.browserUtterance); - }, - - // Kokoro TTS - async speakWithKokoro(text, waitForPrevious = false, terminator = null) { - try { - // synthesize on the backend - const response = await sendJsonData("/synthesize", { text }); - - // wait for previous to finish if requested - while (waitForPrevious && this.isSpeaking) await sleep(25); - if (terminator && terminator()) return; - - // stop previous only if not waiting for it - if (!waitForPrevious) this.stopAudio(); - - if (response.success) { - if (response.audio_parts) { - // Multiple chunks - play sequentially - for (const audioPart of response.audio_parts) { - if (terminator && terminator()) return; - await this.playAudio(audioPart); - await sleep(100); // Brief pause - } - } else if (response.audio) { - // Single audio - this.playAudio(response.audio); - } - } else { - throw new Error("Kokoro TTS error:", response.error); - } - } catch (error) { - throw new Error("Kokoro TTS error:", error); - } - }, - - // Play base64 audio - async playAudio(base64Audio) { - return new Promise((resolve, reject) => { - const audio = this.audioEl ? 
this.audioEl : (this.audioEl = new Audio()); - - // Reset any previous playback state - audio.pause(); - audio.currentTime = 0; - - audio.onplay = () => { - this.isSpeaking = true; - }; - audio.onended = () => { - this.isSpeaking = false; - this.currentAudio = null; - resolve(); - }; - audio.onerror = (error) => { - this.isSpeaking = false; - this.currentAudio = null; - reject(error); - }; - - audio.src = `data:audio/wav;base64,${base64Audio}`; - this.currentAudio = audio; - - audio.play().catch((error) => { - this.isSpeaking = false; - this.currentAudio = null; - - if (error.name === "NotAllowedError") { - this.showAudioPermissionPrompt(); - this.userHasInteracted = false; - } - reject(error); - }); - }); - }, - - // Stop current speech chain - stop() { - this.stopAudio(); // stop current audio immediately - if (this.ttsStream) this.ttsStream.stopped = true; // set stop on current stream - }, - - // Stop current speech audio - stopAudio() { - if (this.synth?.speaking) { - this.synth.cancel(); - } - - if (this.audioEl) { - this.audioEl.pause(); - this.audioEl.currentTime = 0; - } - this.currentAudio = null; - this.isSpeaking = false; - }, - - // Clean text for TTS - cleanText(text) { - // Use SUB character (ASCII 26, 0x1A) for placeholders to avoid conflicts with actual text - const SUB = "\x1A"; // non-printable substitute character - const codePlaceholder = SUB + "code" + SUB; - const tablePlaceholder = SUB + "table" + SUB; - - // Handle code blocks BEFORE HTML parsing (markdown code blocks) - text = text.replace(/```(?:[a-zA-Z0-9]*\n)?[\s\S]*?```/g, codePlaceholder); // closed code blocks - text = text.replace(/```(?:[a-zA-Z0-9]*\n)?[\s\S]*$/g, codePlaceholder); // unclosed code blocks - - // Replace inline code ticks with content preserved - text = text.replace(/`([^`]*)`/g, "$1"); // remove backticks but keep content - - // Parse HTML using browser's DOMParser to properly extract text content - try { - const parser = new DOMParser(); - // Wrap in a div to 
handle fragments - const doc = parser.parseFromString(`
${text}
`, 'text/html'); - - // Replace
 and  tags with placeholder before extracting text
-      doc.querySelectorAll('pre, code').forEach(el => {
-        el.textContent = codePlaceholder;
-      });
-
-      // Extract text content (this strips all HTML tags properly)
-      text = doc.body.textContent || "";
-    } catch (e) {
-      // Fallback: simple tag stripping if DOMParser fails
-      console.warn("[Speech Store] DOMParser failed, using fallback:", e);
-      text = text.replace(/]*>[\s\S]*?<\/pre>/gi, codePlaceholder);
-      text = text.replace(/]*>[\s\S]*?<\/code>/gi, codePlaceholder);
-      text = text.replace(/<[^>]+>/g, ''); // strip remaining tags
-    }
-
-    // Remove markdown links: [label](url) → label
-    text = text.replace(/\[([^\]]+)\]\([^\)]+\)/g, "$1");
-
-    // Remove markdown formatting: *, _, #
-    text = text.replace(/[*_#]+/g, "");
-
-    // Handle tables - both complete and partial
-    // Check if text contains a table-like pattern
-    if (text.includes("|")) {
-      // Find consecutive lines with | characters (table rows)
-      const tableLines = text
-        .split("\n")
-        .filter((line) => line.includes("|") && line.trim().startsWith("|"));
-      if (tableLines.length > 0) {
-        // Replace each table line with a placeholder
-        for (const line of tableLines) {
-          text = text.replace(line, tablePlaceholder);
-        }
-      } else {
-        // Just handle individual table rows
-        text = text.replace(/\|[^\n]*\|/g, tablePlaceholder);
-      }
-    }
-
-    // Remove emojis and private unicode blocks
-    text = text.replace(
-      /([\u2700-\u27BF]|[\uE000-\uF8FF]|\uD83C[\uDC00-\uDFFF]|\uD83D[\uDC00-\uDFFF]|[\u2011-\u26FF]|\uD83E[\uDD10-\uDDFF])/g,
-      ""
-    );
-
-    // Replace URLs with just the domain name
-    text = text.replace(/https?:\/\/[^\s]+/g, (match) => {
-      try {
-        return new URL(match).hostname;
-      } catch {
-        return "";
-      }
-    });
-
-    // Remove email addresses
-    // text = text.replace(/\S+@\S+/g, "");
-
-    // Replace UUIDs with 'UUID'
-    text = text.replace(
-      /[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/g,
-      "UUID"
-    );
-
-    // Collapse multiple spaces/tabs to a single space, but preserve newlines
-    text = text.replace(/[ \t]+/g, " ");
-
-    // Function to merge consecutive placeholders of any type
-    function mergePlaceholders(txt, placeholder, replacement) {
-      // Create regex for consecutive placeholders (with possible whitespace between)
-      const regex = new RegExp(placeholder + "\\s*" + placeholder, "g");
-      // Merge consecutive placeholders until no more found
-      while (regex.test(txt)) {
-        txt = txt.replace(regex, placeholder);
-      }
-      // Replace all remaining placeholders with human-readable text
-      return txt.replace(new RegExp(placeholder, "g"), replacement);
-    }
-
-    // Apply placeholder merging for both types
-    text = mergePlaceholders(text, codePlaceholder, "See code attached ...");
-    text = mergePlaceholders(text, tablePlaceholder, "See table attached ...");
-
-    // Trim leading/trailing whitespace
-    text = text.trim();
-
-    return text;
-  },
-
-  // Initialize microphone input
-  async initMicrophone() {
-    if (this.microphoneInput) return this.microphoneInput;
-
-    this.microphoneInput = new MicrophoneInput(async (text, isFinal) => {
-      if (isFinal) {
-        this.sendMessage(text);
-      }
-    });
-
-    const initialized = await this.microphoneInput.initialize();
-    return initialized ? this.microphoneInput : null;
-  },
-
-  async sendMessage(text) {
-    text = "(voice) " + text;
-    updateChatInput(text);
-    if (!this.microphoneInput.messageSent) {
-      this.microphoneInput.messageSent = true;
-      await sendMessage();
-    }
-  },
-
-  // Request microphone permission - delegate to MicrophoneInput
-  async requestMicrophonePermission() {
-    return this.microphoneInput
-      ? this.microphoneInput.requestPermission()
-      : MicrophoneInput.prototype.requestPermission.call(null);
-  },
-};
-
-// Microphone Input Class (simplified for store integration)
-class MicrophoneInput {
-  constructor(updateCallback) {
-    this.mediaRecorder = null;
-    this.audioChunks = [];
-    this.lastChunk = [];
-    this.updateCallback = updateCallback;
-    this.messageSent = false;
-    this.audioContext = null;
-    this.mediaStreamSource = null;
-    this.analyserNode = null;
-    this._status = Status.INACTIVE;
-    this.lastAudioTime = null;
-    this.waitingTimer = null;
-    this.silenceStartTime = null;
-    this.hasStartedRecording = false;
-    this.analysisFrame = null;
-  }
-
-  get status() {
-    return this._status;
-  }
-
-  set status(newStatus) {
-    if (this._status === newStatus) return;
-
-    const oldStatus = this._status;
-    this._status = newStatus;
-    console.log(`Mic status changed from ${oldStatus} to ${newStatus}`);
-
-    this.handleStatusChange(oldStatus, newStatus);
-  }
-
-  async initialize() {
-    // Set status to activating at the start of initialization
-    this.status = Status.ACTIVATING;
-    try {
-      // get selected device from microphone settings
-      const selectedDevice = microphoneSettingStore.getSelectedDevice();
-
-      const stream = await navigator.mediaDevices.getUserMedia({
-        audio: {
-          deviceId:
-            selectedDevice && selectedDevice.deviceId
-              ? { exact: selectedDevice.deviceId }
-              : undefined,
-          echoCancellation: true,
-          noiseSuppression: true,
-          channelCount: 1,
-        },
-      });
-
-      this.mediaRecorder = new MediaRecorder(stream);
-      this.mediaRecorder.ondataavailable = (event) => {
-        if (
-          event.data.size > 0 &&
-          (this.status === Status.RECORDING || this.status === Status.WAITING)
-        ) {
-          if (this.lastChunk) {
-            this.audioChunks.push(this.lastChunk);
-            this.lastChunk = null;
-          }
-          this.audioChunks.push(event.data);
-        } else if (this.status === Status.LISTENING) {
-          this.lastChunk = event.data;
-        }
-      };
-
-      this.setupAudioAnalysis(stream);
-      return true;
-    } catch (error) {
-      console.error("Microphone initialization error:", error);
-      toast("Failed to access microphone. Please check permissions.", "error");
-      return false;
-    }
-  }
-
-  handleStatusChange(oldStatus, newStatus) {
-    if (newStatus != Status.RECORDING) {
-      this.lastChunk = null;
-    }
-
-    switch (newStatus) {
-      case Status.INACTIVE:
-        this.handleInactiveState();
-        break;
-      case Status.LISTENING:
-        this.handleListeningState();
-        break;
-      case Status.RECORDING:
-        this.handleRecordingState();
-        break;
-      case Status.WAITING:
-        this.handleWaitingState();
-        break;
-      case Status.PROCESSING:
-        this.handleProcessingState();
-        break;
-    }
-  }
-
-  handleInactiveState() {
-    this.stopRecording();
-    this.stopAudioAnalysis();
-    if (this.waitingTimer) {
-      clearTimeout(this.waitingTimer);
-      this.waitingTimer = null;
-    }
-  }
-
-  handleListeningState() {
-    this.stopRecording();
-    this.audioChunks = [];
-    this.hasStartedRecording = false;
-    this.silenceStartTime = null;
-    this.lastAudioTime = null;
-    this.messageSent = false;
-    this.startAudioAnalysis();
-  }
-
-  handleRecordingState() {
-    if (!this.hasStartedRecording && this.mediaRecorder.state !== "recording") {
-      this.hasStartedRecording = true;
-      this.mediaRecorder.start(1000);
-      console.log("Speech started");
-    }
-    if (this.waitingTimer) {
-      clearTimeout(this.waitingTimer);
-      this.waitingTimer = null;
-    }
-  }
-
-  handleWaitingState() {
-    this.waitingTimer = setTimeout(() => {
-      if (this.status === Status.WAITING) {
-        this.status = Status.PROCESSING;
-      }
-    }, store.stt_waiting_timeout);
-  }
-
-  handleProcessingState() {
-    this.stopRecording();
-    this.process();
-  }
-
-  setupAudioAnalysis(stream) {
-    this.audioContext = new (window.AudioContext ||
-      window.webkitAudioContext)();
-    this.mediaStreamSource = this.audioContext.createMediaStreamSource(stream);
-    this.analyserNode = this.audioContext.createAnalyser();
-    this.analyserNode.fftSize = 2048;
-    this.analyserNode.minDecibels = -90;
-    this.analyserNode.maxDecibels = -10;
-    this.analyserNode.smoothingTimeConstant = 0.85;
-    this.mediaStreamSource.connect(this.analyserNode);
-  }
-
-  startAudioAnalysis() {
-    const analyzeFrame = () => {
-      if (this.status === Status.INACTIVE) return;
-
-      const dataArray = new Uint8Array(this.analyserNode.fftSize);
-      this.analyserNode.getByteTimeDomainData(dataArray);
-
-      let sum = 0;
-      for (let i = 0; i < dataArray.length; i++) {
-        const amplitude = (dataArray[i] - 128) / 128;
-        sum += amplitude * amplitude;
-      }
-      const rms = Math.sqrt(sum / dataArray.length);
-      const now = Date.now();
-
-      // Update status based on audio level (ignore if TTS is speaking)
-      if (rms > this.densify(store.stt_silence_threshold)) {
-        this.lastAudioTime = now;
-        this.silenceStartTime = null;
-
-        if (
-          (this.status === Status.LISTENING ||
-            this.status === Status.WAITING) &&
-          !store.isSpeaking
-        ) {
-          this.status = Status.RECORDING;
-        }
-      } else if (this.status === Status.RECORDING) {
-        if (!this.silenceStartTime) {
-          this.silenceStartTime = now;
-        }
-
-        const silenceDuration = now - this.silenceStartTime;
-        if (silenceDuration >= store.stt_silence_duration) {
-          this.status = Status.WAITING;
-        }
-      }
-
-      this.analysisFrame = requestAnimationFrame(analyzeFrame);
-    };
-
-    this.analysisFrame = requestAnimationFrame(analyzeFrame);
-  }
-
-  stopAudioAnalysis() {
-    if (this.analysisFrame) {
-      cancelAnimationFrame(this.analysisFrame);
-      this.analysisFrame = null;
-    }
-  }
-
-  stopRecording() {
-    if (this.mediaRecorder?.state === "recording") {
-      this.mediaRecorder.stop();
-      this.hasStartedRecording = false;
-    }
-  }
-
-  densify(x) {
-    return Math.exp(-5 * (1 - x));
-  }
-
-  async process() {
-    if (this.audioChunks.length === 0) {
-      this.status = Status.LISTENING;
-      return;
-    }
-
-    const audioBlob = new Blob(this.audioChunks, { type: "audio/wav" });
-    const base64 = await this.convertBlobToBase64Wav(audioBlob);
-
-    try {
-      const result = await sendJsonData("/transcribe", { audio: base64 });
-      const text = this.filterResult(result.text || "");
-
-      if (text) {
-        console.log("Transcription:", result.text);
-        await this.updateCallback(result.text, true);
-      }
-    } catch (error) {
-      window.toastFetchError("Transcription error", error);
-      console.error("Transcription error:", error);
-    } finally {
-      this.audioChunks = [];
-      this.status = Status.LISTENING;
-    }
-  }
-
-  convertBlobToBase64Wav(audioBlob) {
-    return new Promise((resolve, reject) => {
-      const reader = new FileReader();
-      reader.onloadend = () => {
-        const base64Data = reader.result.split(",")[1];
-        resolve(base64Data);
-      };
-      reader.onerror = (error) => reject(error);
-      reader.readAsDataURL(audioBlob);
-    });
-  }
-
-  filterResult(text) {
-    text = text.trim();
-    let ok = false;
-    while (!ok) {
-      if (!text) break;
-      if (text[0] === "{" && text[text.length - 1] === "}") break;
-      if (text[0] === "(" && text[text.length - 1] === ")") break;
-      if (text[0] === "[" && text[text.length - 1] === "]") break;
-      ok = true;
-    }
-    if (ok) return text;
-    else console.log(`Discarding transcription: ${text}`);
-  }
-
-  // Toggle microphone between active and inactive states
-  async toggle() {
-    const hasPermission = await this.requestPermission();
-    if (!hasPermission) return;
-
-    // Toggle between listening and inactive
-    if (this.status === Status.INACTIVE || this.status === Status.ACTIVATING) {
-      this.status = Status.LISTENING;
-    } else {
-      this.status = Status.INACTIVE;
-    }
-  }
-
-  // Request microphone permission
-  async requestPermission() {
-    try {
-      await navigator.mediaDevices.getUserMedia({ audio: true });
-      return true;
-    } catch (err) {
-      console.error("Error accessing microphone:", err);
-      toast(
-        "Microphone access denied. Please enable microphone access in your browser settings.",
-        "error"
-      );
-      return false;
-    }
-  }
-}
-
-export const store = createStore("speech", model);
-
-// Initialize speech store
-// window.speechStore = speechStore;
-
-// Event listeners
-document.addEventListener("settings-updated", () => store.loadSettings());
-// document.addEventListener("DOMContentLoaded", () => speechStore.init());
diff --git a/webui/components/settings/agent/agent-settings.html b/webui/components/settings/agent/agent-settings.html
index e860d65a3c..b4bdd33faa 100644
--- a/webui/components/settings/agent/agent-settings.html
+++ b/webui/components/settings/agent/agent-settings.html
@@ -22,9 +22,9 @@
                 
               
               
  • - - Speech - Speech + + Voice + Voice
  • @@ -44,8 +44,8 @@
  • -
    - +
    +
    diff --git a/webui/components/settings/agent/speech.html b/webui/components/settings/agent/speech.html deleted file mode 100644 index c775e19114..0000000000 --- a/webui/components/settings/agent/speech.html +++ /dev/null @@ -1,98 +0,0 @@ - - - Speech - - - -
    - -
    - - diff --git a/webui/components/settings/agent/voice.html b/webui/components/settings/agent/voice.html new file mode 100644 index 0000000000..e49b4bb446 --- /dev/null +++ b/webui/components/settings/agent/voice.html @@ -0,0 +1,35 @@ + + + Voice + + + +
    +
    Voice
    +
    + Voice capabilities are provided by built-in plugins. Browser-native + speech remains available as the fallback output path when no TTS plugin + is active. Enable or disable providers from the Agent Plugins section + below. +
    + + +
    + +
    + +
    + + + + diff --git a/webui/components/settings/speech/microphone-setting-store.js b/webui/components/settings/speech/microphone-setting-store.js deleted file mode 100644 index 4d079d53cb..0000000000 --- a/webui/components/settings/speech/microphone-setting-store.js +++ /dev/null @@ -1,88 +0,0 @@ -import { createStore } from "/js/AlpineStore.js"; - -const model = { - - - devices: [], - selectedDevice: "", - - async init() { - // Load selected device from localStorage if present - const saved = localStorage.getItem('microphoneSelectedDevice'); - await this.loadDevices(); - if (saved && this.devices.some(d => d.deviceId === saved)) { - this.selectedDevice = saved; - } - }, - - async loadDevices() { - // Get media devices - const devices = await navigator.mediaDevices.enumerateDevices(); - // Filter for audio input (microphones) - this.devices = devices.filter(d => d.kind === "audioinput" && d.deviceId); - // Set selected device to first available, if any - this.selectedDevice = this.devices.length > 0 ? 
this.devices[0].deviceId : ""; - }, - - // track permission request state - requestingPermission: false, - permissionTimer: null, - permissionAttempts: 0, - - // request microphone permission and poll for devices - async requestPermission() { - // set flag first so UI can update immediately - clearTimeout(this.permissionTimer); - this.requestingPermission = true; - this.permissionAttempts = 0; - - // request permission in next tick to allow UI to update - setTimeout(async () => { - try { - await navigator.mediaDevices.getUserMedia({ audio: true }); - // start polling for devices - this.pollForDevices(); - } catch (err) { - console.error("Microphone permission denied"); - this.requestingPermission = false; - } - }, 0); - }, - - // poll for devices until found or timeout (60s) - async pollForDevices() { - await this.loadDevices(); - - // check if we found devices with valid IDs - if (this.devices.some(d => d.deviceId && d.deviceId !== "") || this.permissionAttempts >= 60) { - this.requestingPermission = false; - return; - } - - // continue polling - this.permissionAttempts++; - this.permissionTimer = setTimeout(() => this.pollForDevices(), 1000); - }, - - async selectDevice(deviceId) { - this.selectedDevice = deviceId; - this.onSelectDevice(); - }, - - async onSelectDevice() { - localStorage.setItem('microphoneSelectedDevice', this.selectedDevice); - }, - - getSelectedDevice() { - let device = this.devices.find(d => d.deviceId === this.selectedDevice); - if (!device && this.devices.length > 0) { - device = this.devices.find(d => d.deviceId === "default") || this.devices[0]; - } - return device; - } - -}; - -const store = createStore("microphoneSetting", model); - -export { store }; diff --git a/webui/components/settings/speech/microphone.html b/webui/components/settings/speech/microphone.html deleted file mode 100644 index 7f4154afd2..0000000000 --- a/webui/components/settings/speech/microphone.html +++ /dev/null @@ -1,47 +0,0 @@ - - - - Microhone settings - - - - - 
- - - -
    - -
    - - - - - - - \ No newline at end of file diff --git a/webui/components/sidebar/bottom/preferences/preferences-store.js b/webui/components/sidebar/bottom/preferences/preferences-store.js index 3970ef7998..d48618127d 100644 --- a/webui/components/sidebar/bottom/preferences/preferences-store.js +++ b/webui/components/sidebar/bottom/preferences/preferences-store.js @@ -1,6 +1,6 @@ import { createStore } from "/js/AlpineStore.js"; import * as css from "/js/css.js"; -import { store as speechStore } from "/components/chat/speech/speech-store.js"; +import { ttsService } from "/js/tts-service.js"; import { applyModeSteps } from "/components/messages/process-group/process-group-dom.js"; // Preferences store centralizes user preference toggles and side-effects @@ -153,7 +153,7 @@ const model = { _applySpeech(value) { localStorage.setItem("speech", value); - if (!value) speechStore.stopAudio(); + if (!value) ttsService.stop(); }, diff --git a/webui/components/sidebar/chats/chats-list.html b/webui/components/sidebar/chats/chats-list.html index f331d0b003..973d4e8376 100644 --- a/webui/components/sidebar/chats/chats-list.html +++ b/webui/components/sidebar/chats/chats-list.html @@ -26,6 +26,7 @@

    Chats

  • +
    @@ -35,6 +36,7 @@

    Chats

    +
  • diff --git a/webui/css/speech.css b/webui/css/speech.css deleted file mode 100644 index 68e78340a8..0000000000 --- a/webui/css/speech.css +++ /dev/null @@ -1,64 +0,0 @@ -/* MIC BUTTON */ - -/* Only apply hover effects on devices that support hover */ -@media (hover: hover) { - #microphone-button:hover { - background-color: #636363; - box-shadow: 0 0 0 1px rgba(255, 255, 255, 0.08), - 0 6px 14px rgba(0, 0, 0, 0.18); - } -} - -#microphone-button:active { - background-color: #444444; - box-shadow: inset 0 0 0 1px rgba(0, 0, 0, 0.12); -} - -#microphone-button.recording { - background-color: #ff4136; /* Red color for recording */ - transition: background-color 0.3s ease; -} - -@keyframes pulse { - 0% { - transform: scale(1); - } - 50% { - transform: scale(1.1); - } - 100% { - transform: scale(1); - } - } - -.mic-pulse { - animation: pulse 1.5s infinite; -} - - -.mic-inactive{ - background-color: grey; -} - -.mic-activating{ - background-color: silver; - animation: pulse 0.8s infinite; -} - -.mic-listening { - background-color: red; -} - -.mic-recording { - background-color: green; -} - -.mic-waiting { - background-color: teal; -} - -.mic-processing { - background-color: darkcyan; - animation: pulse 0.8s infinite; - transform-origin: center; -} \ No newline at end of file diff --git a/webui/index.html b/webui/index.html index abb457f125..c9a2a37938 100644 --- a/webui/index.html +++ b/webui/index.html @@ -13,7 +13,6 @@ - diff --git a/webui/index.js b/webui/index.js index 9aed250720..d1183825f9 100644 --- a/webui/index.js +++ b/webui/index.js @@ -3,8 +3,8 @@ import * as api from "/js/api.js"; import { callJsExtensions } from "/js/extensions.js"; import * as css from "/js/css.js"; import { sleep } from "/js/sleep.js"; +import { ttsService } from "/js/tts-service.js"; import { store as attachmentsStore } from "/components/chat/attachments/attachmentsStore.js"; -import { store as speechStore } from "/components/chat/speech/speech-store.js"; import { store as 
notificationStore } from "/components/notifications/notification-store.js"; import { store as preferencesStore } from "/components/sidebar/bottom/preferences/preferences-store.js"; import { store as inputStore } from "/components/chat/input/input-store.js"; @@ -458,7 +458,7 @@ function speakMessages(logs) { // finished response if (log.type == "response") { // lastSpokenNo = log.no; - speechStore.speakStream( + ttsService.speakStream( getChatBasedId(log.no), log.content, log.kvps?.finished @@ -474,7 +474,7 @@ function speakMessages(logs) { log.kvps.tool_name != "response" ) { // lastSpokenNo = log.no; - speechStore.speakStream(getChatBasedId(log.no), log.kvps.headline, true); + ttsService.speakStream(getChatBasedId(log.no), log.kvps.headline, true); return; } } @@ -550,7 +550,7 @@ export const setContext = function (id) { lastSpokenNo = 0; // Stop speech when switching chats - speechStore.stopAudio(); + ttsService.stop(); // Clear the chat history immediately to avoid showing stale content const chatHistoryEl = document.getElementById("chat-history"); diff --git a/webui/js/messages.js b/webui/js/messages.js index d949e69dce..e8fc62609c 100644 --- a/webui/js/messages.js +++ b/webui/js/messages.js @@ -3,7 +3,7 @@ import { store as imageViewerStore } from "../components/modals/image-viewer/ima import { marked } from "../vendor/marked/marked.esm.js"; import { store as _messageResizeStore } from "/components/messages/resize/message-resize-store.js"; // keep here, required in html import { store as attachmentsStore } from "/components/chat/attachments/attachmentsStore.js"; -import { store as speechStore } from "/components/chat/speech/speech-store.js"; +import { ttsService } from "/js/tts-service.js"; import { createActionButton, copyToClipboard, @@ -779,7 +779,7 @@ export function drawMessageDefault({ const contentText = String(content ?? ""); const actionButtons = contentText.trim() ? 
[ - createActionButton("speak", "", () => speechStore.speak(contentText)), + createActionButton("speak", "", () => ttsService.speak(contentText)), createActionButton("copy", "", () => copyToClipboard(contentText)), ].filter(Boolean) : []; @@ -832,7 +832,7 @@ export function drawMessageAgent({ if (thoughtsText.trim()) { actionButtons.push( - createActionButton("speak", "", () => speechStore.speak(thoughtsText)), + createActionButton("speak", "", () => ttsService.speak(thoughtsText)), ); actionButtons.push( createActionButton("copy", "", () => copyToClipboard(thoughtsText)), @@ -870,7 +870,7 @@ export function drawMessageResponse({ const contentText = String(content ?? ""); const actionButtons = contentText.trim() ? [ - createActionButton("speak", "", () => speechStore.speak(contentText)), + createActionButton("speak", "", () => ttsService.speak(contentText)), createActionButton("copy", "", () => copyToClipboard(contentText)), ].filter(Boolean) : []; @@ -937,7 +937,7 @@ export function drawMessageResponse({ const responseText = String(content ?? ""); const responseActionButtons = responseText.trim() ? [ - createActionButton("speak", "", () => speechStore.speak(responseText)), + createActionButton("speak", "", () => ttsService.speak(responseText)), createActionButton("copy", "", () => copyToClipboard(responseText)), ].filter(Boolean) : []; @@ -1080,7 +1080,7 @@ export function drawMessageUser({ const userText = String(content ?? ""); const userActionButtons = userText.trim() ? 
[ - createActionButton("speak", "", () => speechStore.speak(userText)), + createActionButton("speak", "", () => ttsService.speak(userText)), createActionButton("copy", "", () => copyToClipboard(userText)), ].filter(Boolean) : []; @@ -1170,7 +1170,7 @@ export function drawMessageToolSimple({ buildDetailPayload(arguments[0], { headerLabels }), ), ), - createActionButton("speak", "", () => speechStore.speak(contentText)), + createActionButton("speak", "", () => ttsService.speak(contentText)), createActionButton("copy", "", () => copyToClipboard(contentText)), ].filter(Boolean) : []; @@ -1215,7 +1215,7 @@ export function drawMessageMcp({ buildDetailPayload(arguments[0], { headerLabels }), ), ), - createActionButton("speak", "", () => speechStore.speak(contentText)), + createActionButton("speak", "", () => ttsService.speak(contentText)), createActionButton("copy", "", () => copyToClipboard(contentText)), ].filter(Boolean) : []; @@ -1260,7 +1260,7 @@ export function drawMessageSubagent({ buildDetailPayload(arguments[0], { headerLabels }), ), ), - createActionButton("speak", "", () => speechStore.speak(contentText)), + createActionButton("speak", "", () => ttsService.speak(contentText)), createActionButton("copy", "", () => copyToClipboard(contentText)), ].filter(Boolean) : []; @@ -1294,7 +1294,7 @@ export function drawMessageInfo({ const contentText = String(content ?? ""); const actionButtons = contentText.trim() ? [ - createActionButton("speak", "", () => speechStore.speak(contentText)), + createActionButton("speak", "", () => ttsService.speak(contentText)), createActionButton("copy", "", () => copyToClipboard(contentText)), ].filter(Boolean) : []; @@ -1330,7 +1330,7 @@ export function drawMessageUtil({ const contentText = String(content ?? ""); const actionButtons = contentText.trim() ? 
[ - createActionButton("speak", "", () => speechStore.speak(contentText)), + createActionButton("speak", "", () => ttsService.speak(contentText)), createActionButton("copy", "", () => copyToClipboard(contentText)), ].filter(Boolean) : []; @@ -1369,7 +1369,7 @@ export function drawMessageHint({ const contentText = String(content ?? ""); const actionButtons = contentText.trim() ? [ - createActionButton("speak", "", () => speechStore.speak(contentText)), + createActionButton("speak", "", () => ttsService.speak(contentText)), createActionButton("copy", "", () => copyToClipboard(contentText)), ].filter(Boolean) : []; @@ -1437,7 +1437,7 @@ export function drawMessageWarning({ const contentText = String(content ?? ""); const actionButtons = contentText.trim() ? [ - createActionButton("speak", "", () => speechStore.speak(contentText)), + createActionButton("speak", "", () => ttsService.speak(contentText)), createActionButton("copy", "", () => copyToClipboard(contentText)), ].filter(Boolean) : []; diff --git a/webui/js/speech_browser.js b/webui/js/speech_browser.js deleted file mode 100644 index e6d7ff2b04..0000000000 --- a/webui/js/speech_browser.js +++ /dev/null @@ -1,394 +0,0 @@ -import { pipeline, read_audio } from './transformers@3.0.2.js'; -import { updateChatInput, sendMessage } from '../index.js'; - -const microphoneButton = document.getElementById('microphone-button'); -let microphoneInput = null; -let isProcessingClick = false; - -const Status = { - INACTIVE: 'inactive', - ACTIVATING: 'activating', - LISTENING: 'listening', - RECORDING: 'recording', - WAITING: 'waiting', - PROCESSING: 'processing' -}; - -class MicrophoneInput { - constructor(updateCallback, options = {}) { - this.mediaRecorder = null; - this.audioChunks = []; - this.lastChunk = []; - this.updateCallback = updateCallback; - this.messageSent = false; - - // Audio analysis properties - this.audioContext = null; - this.mediaStreamSource = null; - this.analyserNode = null; - this._status = 
Status.INACTIVE; - - // Timing properties - this.lastAudioTime = null; - this.waitingTimer = null; - this.silenceStartTime = null; - this.hasStartedRecording = false; - this.analysisFrame = null; - - this.options = { - modelSize: 'tiny', - language: 'en', - silenceThreshold: 0.15, - silenceDuration: 1000, - waitingTimeout: 2000, - minSpeechDuration: 500, - ...options - }; - } - - get status() { - return this._status; - } - - set status(newStatus) { - if (this._status === newStatus) return; - - const oldStatus = this._status; - this._status = newStatus; - console.log(`Mic status changed from ${oldStatus} to ${newStatus}`); - - // Update UI - microphoneButton.classList.remove(`mic-${oldStatus.toLowerCase()}`); - microphoneButton.classList.add(`mic-${newStatus.toLowerCase()}`); - microphoneButton.setAttribute('data-status', newStatus); - - // Handle state-specific behaviors - this.handleStatusChange(oldStatus, newStatus); - } - - handleStatusChange(oldStatus, newStatus) { - - //last chunk kept only for transition to recording status - if (newStatus != Status.RECORDING) { this.lastChunk = null; } - - switch (newStatus) { - case Status.INACTIVE: - this.handleInactiveState(); - break; - case Status.LISTENING: - this.handleListeningState(); - break; - case Status.RECORDING: - this.handleRecordingState(); - break; - case Status.WAITING: - this.handleWaitingState(); - break; - case Status.PROCESSING: - this.handleProcessingState(); - break; - } - } - - handleInactiveState() { - this.stopRecording(); - this.stopAudioAnalysis(); - if (this.waitingTimer) { - clearTimeout(this.waitingTimer); - this.waitingTimer = null; - } - } - - handleListeningState() { - this.stopRecording(); - this.audioChunks = []; - this.hasStartedRecording = false; - this.silenceStartTime = null; - this.lastAudioTime = null; - this.messageSent = false; - this.startAudioAnalysis(); - } - - handleRecordingState() { - if (!this.hasStartedRecording && this.mediaRecorder.state !== 'recording') { - 
this.hasStartedRecording = true; - this.mediaRecorder.start(1000); - console.log('Speech started'); - } - if (this.waitingTimer) { - clearTimeout(this.waitingTimer); - this.waitingTimer = null; - } - } - - handleWaitingState() { - // Don't stop recording during waiting state - this.waitingTimer = setTimeout(() => { - if (this.status === Status.WAITING) { - this.status = Status.PROCESSING; - } - }, this.options.waitingTimeout); - } - - handleProcessingState() { - this.stopRecording(); - this.process(); - } - - stopRecording() { - if (this.mediaRecorder?.state === 'recording') { - this.mediaRecorder.stop(); - this.hasStartedRecording = false; - } - } - - async initialize() { - try { - this.transcriber = await pipeline( - 'automatic-speech-recognition', - `Xenova/whisper-${this.options.modelSize}.${this.options.language}` - ); - - const stream = await navigator.mediaDevices.getUserMedia({ - audio: { - echoCancellation: true, - noiseSuppression: true, - channelCount: 1 - } - }); - - this.mediaRecorder = new MediaRecorder(stream); - this.mediaRecorder.ondataavailable = (event) => { - if (event.data.size > 0 && - (this.status === Status.RECORDING || this.status === Status.WAITING)) { - if (this.lastChunk) { - this.audioChunks.push(this.lastChunk); - this.lastChunk = null; - } - this.audioChunks.push(event.data); - console.log('Audio chunk received, total chunks:', this.audioChunks.length); - } - else if (this.status === Status.LISTENING) { - this.lastChunk = event.data; - } - }; - - this.setupAudioAnalysis(stream); - return true; - } catch (error) { - - console.error('Microphone initialization error:', error); - window.toastFrontendError('Failed to access microphone. 
Please check permissions.', 'Microphone Error'); - return false; - } - } - - setupAudioAnalysis(stream) { - this.audioContext = new (window.AudioContext || window.webkitAudioContext)(); - this.mediaStreamSource = this.audioContext.createMediaStreamSource(stream); - this.analyserNode = this.audioContext.createAnalyser(); - this.analyserNode.fftSize = 2048; - this.analyserNode.minDecibels = -90; - this.analyserNode.maxDecibels = -10; - this.analyserNode.smoothingTimeConstant = 0.85; - this.mediaStreamSource.connect(this.analyserNode); - } - - - startAudioAnalysis() { - const analyzeFrame = () => { - if (this.status === Status.INACTIVE) return; - - const dataArray = new Uint8Array(this.analyserNode.fftSize); - this.analyserNode.getByteTimeDomainData(dataArray); - - // Calculate RMS volume - let sum = 0; - for (let i = 0; i < dataArray.length; i++) { - const amplitude = (dataArray[i] - 128) / 128; - sum += amplitude * amplitude; - } - const rms = Math.sqrt(sum / dataArray.length); - - const now = Date.now(); - - // Update status based on audio level - if (rms > this.options.silenceThreshold) { - this.lastAudioTime = now; - this.silenceStartTime = null; - - if (this.status === Status.LISTENING || this.status === Status.WAITING) { - if (!speech.isSpeaking()) // TODO? a better way to ignore agent's voice? 
- this.status = Status.RECORDING; - } - } else if (this.status === Status.RECORDING) { - if (!this.silenceStartTime) { - this.silenceStartTime = now; - } - - const silenceDuration = now - this.silenceStartTime; - if (silenceDuration >= this.options.silenceDuration) { - this.status = Status.WAITING; - } - } - - this.analysisFrame = requestAnimationFrame(analyzeFrame); - }; - - this.analysisFrame = requestAnimationFrame(analyzeFrame); - } - - stopAudioAnalysis() { - if (this.analysisFrame) { - cancelAnimationFrame(this.analysisFrame); - this.analysisFrame = null; - } - } - - async process() { - if (this.audioChunks.length === 0) { - this.status = Status.LISTENING; - return; - } - - const audioBlob = new Blob(this.audioChunks, { type: 'audio/wav' }); - const audioUrl = URL.createObjectURL(audioBlob); - - - - try { - const samplingRate = 16000; - const audioData = await read_audio(audioUrl, samplingRate); - const result = await this.transcriber(audioData); - const text = this.filterResult(result.text || "") - - if (text) { - console.log('Transcription:', result.text); - await this.updateCallback(result.text, true); - } - } catch (error) { - console.error('Transcription error:', error); - window.toastFrontendError('Transcription failed.', 'Speech Recognition Error'); - } finally { - URL.revokeObjectURL(audioUrl); - this.audioChunks = []; - this.status = Status.LISTENING; - } - } - - filterResult(text) { - text = text.trim() - let ok = false - while (!ok) { - if (!text) break - if (text[0] === '{' && text[text.length - 1] === '}') break - if (text[0] === '(' && text[text.length - 1] === ')') break - if (text[0] === '[' && text[text.length - 1] === ']') break - ok = true - } - if (ok) return text - else console.log(`Discarding transcription: ${text}`) - } -} - - - -// Initialize and handle click events -async function initializeMicrophoneInput() { - microphoneInput = new MicrophoneInput( - async (text, isFinal) => { - if (isFinal) { - updateChatInput(text); - if 
(!microphoneInput.messageSent) { - microphoneInput.messageSent = true; - await sendMessage(); - } - } - }, - { - modelSize: 'tiny', - language: 'en', - silenceThreshold: 0.07, - silenceDuration: 1000, - waitingTimeout: 1500 - } - ); - microphoneInput.status = Status.ACTIVATING; - - return await microphoneInput.initialize(); -} - -microphoneButton.addEventListener('click', async () => { - if (isProcessingClick) return; - isProcessingClick = true; - - const hasPermission = await requestMicrophonePermission(); - if (!hasPermission) return; - - try { - if (!microphoneInput && !await initializeMicrophoneInput()) { - return; - } - - // Simply toggle between INACTIVE and LISTENING states - microphoneInput.status = - (microphoneInput.status === Status.INACTIVE || microphoneInput.status === Status.ACTIVATING) ? Status.LISTENING : Status.INACTIVE; - } finally { - setTimeout(() => { - isProcessingClick = false; - }, 300); - } -}); - -// Some error handling for microphone input -async function requestMicrophonePermission() { - try { - await navigator.mediaDevices.getUserMedia({ audio: true }); - return true; - } catch (err) { - console.error('Error accessing microphone:', err); - window.toastFrontendError('Microphone access denied. 
Please enable microphone access in your browser settings.', 'Microphone Error'); - return false; - } -} - - -class Speech { - constructor() { - this.synth = window.speechSynthesis; - this.utterance = null; - } - - stripEmojis(str) { - return str - .replace(/([\u2700-\u27BF]|[\uE000-\uF8FF]|\uD83C[\uDC00-\uDFFF]|\uD83D[\uDC00-\uDFFF]|[\u2011-\u26FF]|\uD83E[\uDD10-\uDDFF])/g, '') - .replace(/\s+/g, ' ') - .trim(); - } - - speak(text) { - console.log('Speaking:', text); - // Stop any current utterance - this.stop(); - - // Remove emojis and create a new utterance - text = this.stripEmojis(text); - this.utterance = new SpeechSynthesisUtterance(text); - - // Speak the new utterance - this.synth.speak(this.utterance); - } - - stop() { - if (this.isSpeaking()) { - this.synth.cancel(); - } - } - - isSpeaking() { - return this.synth?.speaking || false; - } -} - -export const speech = new Speech(); -window.speech = speech diff --git a/webui/js/stt-service.js b/webui/js/stt-service.js new file mode 100644 index 0000000000..d4f0e3e0c1 --- /dev/null +++ b/webui/js/stt-service.js @@ -0,0 +1,87 @@ +class SttService extends EventTarget { + constructor() { + super(); + this.providers = new Map(); + } + + registerProvider(id, provider) { + if (!id || !provider) { + throw new Error("STT providers must define an id and provider object."); + } + + this.providers.set(id, provider); + this.emitProvidersChange(); + + return () => this.unregisterProvider(id); + } + + unregisterProvider(id) { + if (!this.providers.has(id)) return; + const activeProviderId = this.getActiveProviderId(); + if (activeProviderId === id) { + this.stop(); + this.emitStatusChange("inactive"); + } + this.providers.delete(id); + this.emitProvidersChange(); + } + + getActiveProviderId() { + const next = this.providers.keys().next(); + return next.done ? "" : String(next.value || ""); + } + + getActiveProvider() { + const providerId = this.getActiveProviderId(); + return providerId ? 
this.providers.get(providerId) || null : null; + } + + hasProvider() { + return !!this.getActiveProvider(); + } + + emitProvidersChange() { + this.dispatchEvent( + new CustomEvent("providerschange", { + detail: { + activeProviderId: this.getActiveProviderId(), + providerIds: Array.from(this.providers.keys()), + }, + }), + ); + } + + emitStatusChange(status) { + this.dispatchEvent( + new CustomEvent("statuschange", { + detail: { + activeProviderId: this.getActiveProviderId(), + status, + }, + }), + ); + } + + async handleMicrophoneClick() { + return await this.getActiveProvider()?.handleMicrophoneClick?.(); + } + + async requestMicrophonePermission() { + return await this.getActiveProvider()?.requestMicrophonePermission?.(); + } + + updateMicrophoneButtonUI() { + this.getActiveProvider()?.updateMicrophoneButtonUI?.(); + } + + stop() { + this.getActiveProvider()?.stop?.(); + } + + getStatus() { + return this.getActiveProvider()?.getStatus?.() || "inactive"; + } +} + +export const sttService = new SttService(); +globalThis.sttService = sttService; diff --git a/webui/js/tts-service.js b/webui/js/tts-service.js new file mode 100644 index 0000000000..1e36942386 --- /dev/null +++ b/webui/js/tts-service.js @@ -0,0 +1,532 @@ +import { sleep } from "/js/sleep.js"; +import * as shortcuts from "/js/shortcuts.js"; + +class TtsService extends EventTarget { + constructor() { + super(); + this.providers = new Map(); + this.synth = window.speechSynthesis; + this.browserUtterance = null; + this.audioEl = null; + this.currentAudio = null; + this.audioContext = null; + this.userHasInteracted = false; + this.ttsStream = null; + this._isSpeaking = false; + + this.setupUserInteractionHandling(); + } + + registerProvider(id, provider) { + if (!id || !provider || typeof provider.synthesize !== "function") { + throw new Error("TTS providers must define an id and synthesize(text)."); + } + + this.providers.set(id, provider); + this.emitProvidersChange(); + + return () => 
this.unregisterProvider(id); + } + + unregisterProvider(id) { + if (!this.providers.has(id)) return; + + const activeProviderId = this.getActiveProviderId(); + this.providers.delete(id); + + if (activeProviderId === id) { + this.stop(); + } + + this.emitProvidersChange(); + } + + getActiveProviderId() { + const next = this.providers.keys().next(); + return next.done ? "" : String(next.value || ""); + } + + getActiveProvider() { + const providerId = this.getActiveProviderId(); + return providerId ? this.providers.get(providerId) || null : null; + } + + hasProvider() { + return !!this.getActiveProvider(); + } + + isSpeaking() { + return this._isSpeaking; + } + + getState() { + return { + activeProviderId: this.getActiveProviderId(), + isSpeaking: this.isSpeaking(), + userHasInteracted: this.userHasInteracted, + }; + } + + emitProvidersChange() { + this.dispatchEvent( + new CustomEvent("providerschange", { + detail: { + activeProviderId: this.getActiveProviderId(), + providerIds: Array.from(this.providers.keys()), + }, + }), + ); + this.emitStateChange(); + } + + emitStateChange() { + this.dispatchEvent( + new CustomEvent("statechange", { + detail: this.getState(), + }), + ); + } + + setSpeaking(value) { + const next = !!value; + if (this._isSpeaking === next) return; + this._isSpeaking = next; + this.emitStateChange(); + } + + setupUserInteractionHandling() { + const enableAudio = () => { + if (this.userHasInteracted) return; + + this.userHasInteracted = true; + try { + this.audioContext = new (window.AudioContext || + window.webkitAudioContext)(); + this.audioContext.resume(); + } catch (_error) { + // AudioContext is unavailable in some browsers/modes. 
+ } + + this.emitStateChange(); + }; + + const events = ["click", "touchstart", "keydown", "mousedown"]; + events.forEach((eventName) => { + document.addEventListener(eventName, enableAudio, { + once: true, + passive: true, + }); + }); + } + + showAudioPermissionPrompt() { + shortcuts.frontendNotification({ + type: "info", + message: "Click anywhere to enable audio playback", + displayTime: 5000, + frontendOnly: true, + }); + } + + async speak(text) { + const id = Math.random(); + return await this.speakStream(id, text, true); + } + + async speakStream(id, text, finished = false) { + if ( + this.ttsStream && + this.ttsStream.id === id && + this.ttsStream.text === text && + this.ttsStream.finished === finished + ) { + return; + } + + if (!this.userHasInteracted) { + this.showAudioPermissionPrompt(); + return; + } + + if (!this.ttsStream || this.ttsStream.id !== id) { + this.ttsStream = { + id, + text, + finished, + running: false, + lastChunkIndex: -1, + stopped: false, + chunks: [], + }; + } else { + this.ttsStream.finished = finished; + this.ttsStream.text = text; + } + + const cleanText = this.cleanText(text); + if (!cleanText.trim()) return; + + this.ttsStream.chunks = this.chunkText(cleanText); + if (this.ttsStream.chunks.length === 0) return; + + if (this.ttsStream.running) return; + this.ttsStream.running = true; + + const terminator = () => + this.ttsStream?.id !== id || this.ttsStream?.stopped; + + while (true) { + if (terminator()) break; + + const nextIndex = this.ttsStream.lastChunkIndex + 1; + if (nextIndex >= this.ttsStream.chunks.length) { + if (this.ttsStream.finished) break; + await new Promise((resolve) => setTimeout(resolve, 50)); + continue; + } + + if ( + nextIndex === this.ttsStream.chunks.length - 1 && + !this.ttsStream.finished + ) { + await new Promise((resolve) => setTimeout(resolve, 50)); + continue; + } + + this.ttsStream.lastChunkIndex = nextIndex; + const chunk = this.ttsStream.chunks[nextIndex]; + await this.speakChunk(chunk, nextIndex 
> 0, terminator); + } + + this.ttsStream.running = false; + } + + async speakChunk(text, waitForPrevious = false, terminator = null) { + const provider = this.getActiveProvider(); + + if (provider) { + try { + return await this.speakWithProvider( + provider, + text, + waitForPrevious, + terminator, + ); + } catch (error) { + console.error("TTS provider failed, falling back to browser TTS", error); + } + } + + return await this.speakWithBrowser(text, waitForPrevious, terminator); + } + + async speakWithProvider(provider, text, waitForPrevious = false, terminator = null) { + const payload = await provider.synthesize(text, { + providerId: this.getActiveProviderId(), + }); + + while (waitForPrevious && this.isSpeaking()) { + await sleep(25); + } + if (terminator && terminator()) return; + + if (!waitForPrevious) { + this.stopAudio(); + } + + if (!payload) return; + + if (Array.isArray(payload.audioParts)) { + for (const part of payload.audioParts) { + if (terminator && terminator()) return; + await this.playAudioBase64(part, payload.mimeType); + await sleep(100); + } + return; + } + + const audioBase64 = payload.audioBase64 || payload.audio; + if (audioBase64) { + await this.playAudioBase64(audioBase64, payload.mimeType); + } + } + + async speakWithBrowser(text, waitForPrevious = false, terminator = null) { + while (waitForPrevious && this.isSpeaking()) { + await sleep(25); + } + if (terminator && terminator()) return; + + if (!waitForPrevious) { + this.stopAudio(); + } + + return await new Promise((resolve, reject) => { + const utterance = new SpeechSynthesisUtterance(text); + this.browserUtterance = utterance; + + utterance.onstart = () => { + this.setSpeaking(true); + }; + utterance.onend = () => { + if (this.browserUtterance === utterance) { + this.browserUtterance = null; + } + this.setSpeaking(false); + resolve(); + }; + utterance.onerror = (error) => { + if (this.browserUtterance === utterance) { + this.browserUtterance = null; + } + this.setSpeaking(false); + 
reject(error); + }; + + this.synth.speak(utterance); + }); + } + + async playAudioBase64(base64Audio, mimeType = "audio/wav") { + return await new Promise((resolve, reject) => { + const audio = this.audioEl ? this.audioEl : (this.audioEl = new Audio()); + + audio.pause(); + audio.currentTime = 0; + + audio.onplay = () => { + this.setSpeaking(true); + }; + audio.onended = () => { + this.setSpeaking(false); + this.currentAudio = null; + resolve(); + }; + audio.onerror = (error) => { + this.setSpeaking(false); + this.currentAudio = null; + reject(error); + }; + + audio.src = `data:${mimeType};base64,${base64Audio}`; + this.currentAudio = audio; + + audio.play().catch((error) => { + this.setSpeaking(false); + this.currentAudio = null; + if (error?.name === "NotAllowedError") { + this.showAudioPermissionPrompt(); + this.userHasInteracted = false; + this.emitStateChange(); + } + reject(error); + }); + }); + } + + stop() { + this.stopAudio(); + if (this.ttsStream) { + this.ttsStream.stopped = true; + } + + const provider = this.getActiveProvider(); + try { + provider?.stop?.(); + } catch (error) { + console.error("Failed to stop TTS provider cleanly", error); + } + } + + stopAudio() { + if (this.synth?.speaking) { + this.synth.cancel(); + } + + if (this.audioEl) { + this.audioEl.pause(); + this.audioEl.currentTime = 0; + } + + this.currentAudio = null; + this.setSpeaking(false); + } + + chunkText(text, { maxChunkLength = 135, lineSeparator = "..." } = {}) { + const INC_LIMIT = maxChunkLength * 2; + const MIN_CHUNK_LENGTH = 20; + + const splitDeep = (segment) => { + if (segment.length <= INC_LIMIT) return [segment]; + const byComma = segment.match(/[^,]+(?:,|$)/g); + if (byComma.length > 1) { + return byComma.flatMap((part, index) => + splitDeep( + index < byComma.length - 1 ? part : part.replace(/,$/, ""), + ), + ); + } + + const out = []; + let part = ""; + for (const word of segment.split(/\s+/)) { + const need = part ? 
part.length + 1 + word.length : word.length; + if (need <= maxChunkLength) { + part += (part ? " " : "") + word; + } else { + if (part) out.push(part); + if (word.length > maxChunkLength) { + for (let index = 0; index < word.length; index += maxChunkLength) { + out.push(word.slice(index, index + maxChunkLength)); + } + part = ""; + } else { + part = word; + } + } + } + if (part) out.push(part); + return out; + }; + + const sentenceTokens = (line) => { + const tokens = []; + let start = 0; + for (let index = 0; index < line.length; index++) { + const character = line[index]; + if ( + (character === "." || character === "!" || character === "?") && + /\s/.test(line[index + 1] || "") + ) { + tokens.push(line.slice(start, index + 1)); + index += 1; + start = index + 1; + } + } + if (start < line.length) { + tokens.push(line.slice(start)); + } + return tokens.flatMap((token) => splitDeep(token.trim())).filter(Boolean); + }; + + const initialChunks = []; + const lines = text.split(/\n+/).filter((line) => line.trim()); + for (const line of lines) { + initialChunks.push(...sentenceTokens(line.trim())); + } + + const finalChunks = []; + let currentChunk = ""; + + for (let index = 0; index < initialChunks.length; index++) { + const chunk = initialChunks[index]; + if (!currentChunk) { + currentChunk = chunk; + if ( + index === initialChunks.length - 1 || + currentChunk.length >= MIN_CHUNK_LENGTH + ) { + finalChunks.push(currentChunk); + currentChunk = ""; + } + continue; + } + + if (currentChunk.length < MIN_CHUNK_LENGTH) { + const merged = `${currentChunk} ${lineSeparator} ${chunk}`; + if (merged.length <= maxChunkLength) { + currentChunk = merged; + } else { + finalChunks.push(currentChunk); + currentChunk = chunk; + } + } else { + finalChunks.push(currentChunk); + currentChunk = chunk; + } + + if (index === initialChunks.length - 1 && currentChunk) { + finalChunks.push(currentChunk); + } + } + + return finalChunks.map((chunk) => chunk.trimEnd()); + } + + cleanText(text) { 
+ const SUB = "\x1A"; + const codePlaceholder = `${SUB}code${SUB}`; + const tablePlaceholder = `${SUB}table${SUB}`; + + text = text.replace( + /```(?:[a-zA-Z0-9]*\n)?[\s\S]*?```/g, + codePlaceholder, + ); + text = text.replace(/```(?:[a-zA-Z0-9]*\n)?[\s\S]*$/g, codePlaceholder); + text = text.replace(/`([^`]*)`/g, "$1"); + + try { + const parser = new DOMParser(); + const doc = parser.parseFromString(`
    ${text}
    `, "text/html"); + doc.querySelectorAll("pre, code").forEach((element) => { + element.textContent = codePlaceholder; + }); + text = doc.body.textContent || ""; + } catch (_error) { + text = text.replace(/]*>[\s\S]*?<\/pre>/gi, codePlaceholder); + text = text.replace(/]*>[\s\S]*?<\/code>/gi, codePlaceholder); + text = text.replace(/<[^>]+>/g, ""); + } + + text = text.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1"); + text = text.replace(/[*_#]+/g, ""); + + if (text.includes("|")) { + const tableLines = text + .split("\n") + .filter((line) => line.includes("|") && line.trim().startsWith("|")); + if (tableLines.length > 0) { + for (const line of tableLines) { + text = text.replace(line, tablePlaceholder); + } + } else { + text = text.replace(/\|[^\n]*\|/g, tablePlaceholder); + } + } + + text = text.replace( + /([\u2700-\u27BF]|[\uE000-\uF8FF]|\uD83C[\uDC00-\uDFFF]|\uD83D[\uDC00-\uDFFF]|[\u2011-\u26FF]|\uD83E[\uDD10-\uDDFF])/g, + "", + ); + + text = text.replace(/https?:\/\/[^\s]+/g, (match) => { + try { + return new URL(match).hostname; + } catch { + return ""; + } + }); + + text = text.replace( + /[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/g, + "UUID", + ); + text = text.replace(/[ \t]+/g, " "); + + const mergePlaceholders = (value, placeholder, replacement) => { + const pattern = new RegExp(`${placeholder}\\s*${placeholder}`, "g"); + while (pattern.test(value)) { + value = value.replace(pattern, placeholder); + } + return value.replace(new RegExp(placeholder, "g"), replacement); + }; + + text = mergePlaceholders(text, codePlaceholder, "See code attached ..."); + text = mergePlaceholders(text, tablePlaceholder, "See table attached ..."); + + return text.trim(); + } +} + +export const ttsService = new TtsService(); +globalThis.ttsService = ttsService;