From 897ce33eb0567e557f9c45494e6d9fb3fa5997ef Mon Sep 17 00:00:00 2001 From: humabot <71288277+humabot@users.noreply.github.com> Date: Mon, 30 Mar 2026 15:10:21 -0500 Subject: [PATCH] Add local Whisper setup and harden Electron startup Expand speech support to include a repo-local OpenAI Whisper workflow alongside Azure Speech. This updates setup, configuration, settings UI, and runtime speech handling so the app can bootstrap and use a local Whisper CLI with sane defaults and a smoke-test script. Also harden Electron startup and renderer behavior by clearing ELECTRON_RUN_AS_NODE during npm start/dev, enforcing a single-instance lock, simplifying preload quit behavior, and replacing fragile CDN-backed UI assets with local resources or guarded fallbacks to reduce renderer crashes and background SSL noise. Documentation and examples are updated to match the new speech setup flow, environment variables, and startup behavior. --- .gitignore | 5 +- README.md | 78 +- chat.html | 12 +- env.example | 102 +-- llm-response.html | 48 +- main.js | 49 +- package-lock.json | 20 + package.json | 8 +- preload.js | 6 +- scripts/test-speech.js | 25 + settings.html | 63 +- setup.sh | 381 +++++---- src/core/config.js | 8 +- src/managers/window.manager.js | 2 +- src/services/speech.service.js | 1354 +++++++++++++++++++------------- src/styles/common.css | 4 +- src/ui/settings-window.js | 45 ++ 17 files changed, 1364 insertions(+), 846 deletions(-) create mode 100644 scripts/test-speech.js diff --git a/.gitignore b/.gitignore index 65a6a54..e249383 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ node_modules/ .env +.venv-whisper/ +.whisper-models/ eng.traineddata dist/ -.DS_Store \ No newline at end of file +.DS_Store +*.log diff --git a/README.md b/README.md index e9de5af..7b7015e 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Platform Stealth AI - Speech + Speech

--- @@ -53,7 +53,7 @@ https://github.com/user-attachments/assets/896a7140-1e85-405d-bfbe-e05c9f3a816b ### 🚀 **AI-Powered Intelligence** - **Direct Image Analysis**: Screenshots are analyzed by Gemini (no Tesseract OCR) -- **Voice Commands**: Optional Azure Speech (Whisper questions, get instant answers) +- **Voice Commands**: Optional Azure Speech or local OpenAI Whisper - **Context Memory**: Remembers entire interview conversation - **Multi-Language Support**: C++, Python, Java, JavaScript, C - **Smart Response Window**: Draggable with close button @@ -68,7 +68,7 @@ https://github.com/user-attachments/assets/896a7140-1e85-405d-bfbe-e05c9f3a816b - **Floating Overlay Bar**: Compact command center with camera, mic, and skill selector - **Draggable Answer Window**: Move and resize AI response window anywhere - **Close Button**: Clean × button to close answer window when needed -- **Auto-Hide Mic**: Microphone button appears only when Azure Speech is configured +- **Auto-Hide Mic**: Microphone button appears only when a speech provider is available - **Interactive Chat**: Full conversation window with markdown support ### 🎨 **Visual Design** @@ -133,7 +133,7 @@ https://github.com/user-attachments/assets/896a7140-1e85-405d-bfbe-e05c9f3a816b - [x] **Global shortcuts** (capture, visibility, interaction, chat, settings) - [x] **Session memory** and chat UI - [x] **Language picker** and DSA skill prompt -- [x] **Optional Azure Speech** integration with auto‑hide mic +- [x] **Optional Azure Speech / local Whisper** integration with auto‑hide mic - [x] **Multi‑monitor** and area capture APIs - [x] **Window binding** and positioning system - [x] **Settings management** with app icon/stealth modes @@ -157,12 +157,22 @@ The setup script automatically handles configuration. 
You only need: # Required: Google Gemini API Key (setup script will ask for this) GEMINI_API_KEY=your_gemini_api_key_here -# Optional: Azure Speech Recognition (add later if you want voice features) +# Optional: Speech Recognition (pick one provider) +SPEECH_PROVIDER=whisper + +# Azure option AZURE_SPEECH_KEY=your_azure_speech_key AZURE_SPEECH_REGION=your_region + +# Local Whisper option +WHISPER_COMMAND=whisper +WHISPER_MODEL_DIR=.whisper-models +WHISPER_MODEL=base +WHISPER_LANGUAGE=en +WHISPER_SEGMENT_MS=4000 ``` -**Note**: Speech recognition is completely optional. If Azure credentials are not provided, the microphone button will be automatically hidden from all interfaces. +**Note**: Speech recognition is completely optional. If no configured provider is available, the microphone button will be automatically hidden from all interfaces. ## 🚀 Quick Start & Installation @@ -187,7 +197,9 @@ AZURE_SPEECH_REGION=your_region **That's it!** The setup script will: - Install all dependencies automatically -- Create and configure your `.env` file +- Create your `.env` file from `env.example` if needed +- Set up a local Whisper virtualenv in `.venv-whisper` +- Configure `.env` to use local Whisper by default - Build the app (if needed) - Launch OpenCluely ready to use (if not works use npm install & then npm start) @@ -196,6 +208,8 @@ AZURE_SPEECH_REGION=your_region - **Windows**: Use Git Bash (comes with Git for Windows), WSL, or any bash environment - **macOS/Linux**: Use your regular terminal - **All platforms**: No manual npm commands needed - the setup script handles everything +- **Windows Whisper path**: `setup.sh` now writes `WHISPER_COMMAND=.venv-whisper/Scripts/whisper.exe` +- **macOS/Linux Whisper path**: `setup.sh` writes `WHISPER_COMMAND=.venv-whisper/bin/whisper` ### 🎛️ Setup Script Options @@ -204,28 +218,50 @@ AZURE_SPEECH_REGION=your_region ./setup.sh --ci # Use npm ci instead of npm install ./setup.sh --no-run # Setup only, don't launch the app ./setup.sh 
--install-system-deps # Install sox for microphone (optional) +./setup.sh --skip-whisper # Skip the local Whisper bootstrap ``` -### 🔧 **Optional: Azure Speech Setup** (For Voice Features) +### 🔧 **Optional: Speech Setup** (For Voice Features) + +Voice recognition is optional. You can use either Azure Speech or local OpenAI Whisper. -Voice recognition is completely optional. The setup script will create a `.env` file with just the required Gemini key. To add voice features: +For the local Whisper path, `./setup.sh` now handles the full repo-local setup: -1. Get Azure Speech credentials: +1. Creates `.venv-whisper` +2. Installs `openai-whisper` +3. Points `.env` at `.venv-whisper/bin/whisper` (`.venv-whisper/Scripts/whisper.exe` on Windows) +4. Creates `.whisper-models` +5. Runs `npm run test-speech` + +1. For Azure Speech: - Visit [Azure Portal](https://portal.azure.com/) - Create a Speech Service - Copy your key and region -2. Add to your `.env` file: +2. For local Whisper: + - Run `./setup.sh --install-system-deps` to install `sox` (it does not install `ffmpeg`) + - Install `ffmpeg` yourself, or install both `ffmpeg` and `sox` manually + - On Windows, install audio tooling separately and prefer Git Bash or WSL for `setup.sh` + +3. Add one provider to your `.env` file: ```env - # Already configured by setup script GEMINI_API_KEY=your_gemini_api_key_here - - # Add these for voice features (optional) + SPEECH_PROVIDER=azure AZURE_SPEECH_KEY=your_azure_speech_key AZURE_SPEECH_REGION=your_region ``` -3. Restart the app - microphone buttons will now appear automatically + ```env + GEMINI_API_KEY=your_gemini_api_key_here + SPEECH_PROVIDER=whisper + WHISPER_COMMAND=whisper + WHISPER_MODEL_DIR=.whisper-models + WHISPER_MODEL=base + WHISPER_LANGUAGE=en + WHISPER_SEGMENT_MS=4000 + ``` + +4. Restart the app - microphone buttons will now appear automatically ## 🎮 How to Use @@ -265,10 +301,11 @@ Voice recognition is completely optional. 
The setup script will create a `.env` - **Image Understanding**: DSA prompt is applied only for new image-based queries; chat messages don’t include the full prompt - **Multi-monitor & Area Capture**: Programmatic APIs allow targeting a display and optional rectangular crop for focused analysis -#### 🔊 **Optional Voice Features** (Azure Speech) -- **Real-time Transcription**: Speak questions naturally +#### 🔊 **Optional Voice Features** (Azure Speech / Local Whisper) +- **Chunked Local Transcription**: Local Whisper transcribes short recorded segments on your machine +- **Real-time Transcription**: Azure Speech supports live interim recognition - **Listening Animation**: Visual feedback during recording -- **Interim Results**: See transcription as you speak +- **Interim Results**: Available with Azure Speech - **Auto-processing**: Instant AI responses to voice input ] --- @@ -305,7 +342,8 @@ Voice recognition is completely optional. The setup script will create a `.env` - **Microphone/voice not working** - Voice is optional - ignore related warnings if you don't need it - - To enable: install `sox` (Linux/macOS) and add Azure keys to `.env` + - Azure mode: add valid Azure keys to `.env` + - Whisper mode: install `openai-whisper`, `ffmpeg`, and `sox`, then set `SPEECH_PROVIDER=whisper` @@ -341,7 +379,7 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file ## 🙏 Acknowledgments - **Google Gemini**: Powering AI intelligence -- **Azure Speech**: Optional voice recognition +- **Azure Speech / Whisper**: Optional voice recognition - **Electron**: Cross-platform desktop framework - **Community**: Amazing contributors and feedback diff --git a/chat.html b/chat.html index d986477..baff03c 100644 --- a/chat.html +++ b/chat.html @@ -4,10 +4,8 @@ Chat - - - + @@ -336,6 +352,16 @@ Speech Recognition
+
+
+
Speech Provider
+
Choose Azure Speech or a local OpenAI Whisper CLI
+
+ +
Azure Speech Key
@@ -350,6 +376,39 @@
+
+
+
+
Whisper Command
+
CLI command for local Whisper, such as whisper or python3 -m whisper
+
+ +
+
+
+
+
Whisper Model
+
Local model name used by the Whisper CLI
+
+ +
+
+
+
Whisper Language
+
Language code for local transcription
+
+ +
+
+
+
Whisper Segment Length
+
Chunk size in milliseconds for local transcription
+
+ +
+
+ Local Whisper runs on this machine and needs a Whisper CLI installed. These settings apply immediately for the current app session; use .env for startup defaults. +
@@ -389,4 +448,4 @@ - \ No newline at end of file + diff --git a/setup.sh b/setup.sh index 4990721..d3cd481 100755 --- a/setup.sh +++ b/setup.sh @@ -1,16 +1,22 @@ #!/usr/bin/env bash set -euo pipefail -# OpenCluely one-shot setup: install deps, (optionally) build, and run -# Works on macOS, Linux, and Windows (Git Bash / MSYS2 / Cygwin) - -# Defaults DO_BUILD=0 DO_RUN=1 USE_CI=0 INSTALL_SYSTEM_DEPS=0 +SETUP_WHISPER=1 +WHISPER_MODEL="${WHISPER_MODEL:-base}" +WHISPER_LANGUAGE="${WHISPER_LANGUAGE:-en}" +WHISPER_SEGMENT_MS="${WHISPER_SEGMENT_MS:-4000}" +WHISPER_VENV_DIR=".venv-whisper" +WHISPER_MODEL_DIR=".whisper-models" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" OS_NAME="unknown" PLATFORM_BUILD_SCRIPT="build" +PYTHON_BIN="python3" +WHISPER_PIP_PATH="" +WHISPER_COMMAND_PATH="" print_header() { echo "========================================" @@ -23,25 +29,30 @@ usage() { Usage: ./setup.sh [options] This script will: -1. Install dependencies -2. Create .env file (if needed) -3. Guide you to add your Gemini API key -4. Optionally build the app -5. Start OpenCluely +1. Create .env from env.example when needed +2. Install Node dependencies +3. Optionally set up local Whisper in ${WHISPER_VENV_DIR} +4. Optionally install system audio dependencies +5. Optionally build the app +6. 
Optionally run OpenCluely Options: - --build Build a distributable for this OS (electron-builder) + --build Build a distributable for this OS --no-run Do not start the app after setup --run Start the app after setup (default) - --ci Use 'npm ci' instead of 'npm install' if lockfile exists - --install-system-deps Attempt to install required system dependencies (sox) where possible + --ci Use 'npm ci' instead of 'npm install' + --install-system-deps Attempt to install sox where possible + --skip-whisper Skip local Whisper environment setup -h, --help Show this help Environment variables: - GEMINI_API_KEY If provided, will be written into .env (skips manual setup) + GEMINI_API_KEY If provided, writes into .env + WHISPER_MODEL Whisper model to configure (default: base) + WHISPER_LANGUAGE Whisper language to configure (default: en) + WHISPER_SEGMENT_MS Segment size in ms (default: 4000) -Example with API key: - GEMINI_API_KEY=your_key_here ./setup.sh +Example: + GEMINI_API_KEY=your_key_here ./setup.sh --install-system-deps EOF } @@ -52,178 +63,210 @@ for arg in "$@"; do --run) DO_RUN=1 ;; --ci) USE_CI=1 ;; --install-system-deps) INSTALL_SYSTEM_DEPS=1 ;; + --skip-whisper) SETUP_WHISPER=0 ;; -h|--help) usage; exit 0 ;; *) echo "Unknown option: $arg"; usage; exit 1 ;; esac - shift || true done print_header +cd "$SCRIPT_DIR" -# Detect OS -UNAME_OUT=$(uname -s || echo "unknown") -case "$UNAME_OUT" in - Linux*) OS_NAME="linux" ;; - Darwin*) OS_NAME="macos" ;; - CYGWIN*|MINGW*|MSYS*) OS_NAME="windows" ;; - *) OS_NAME="unknown" ;; - esac +detect_os() { + local uname_out + uname_out=$(uname -s || echo "unknown") + case "$uname_out" in + Linux*) OS_NAME="linux" ;; + Darwin*) OS_NAME="macos" ;; + CYGWIN*|MINGW*|MSYS*) OS_NAME="windows" ;; + *) OS_NAME="unknown" ;; + esac -echo "Detected OS: $OS_NAME" + case "$OS_NAME" in + macos) PLATFORM_BUILD_SCRIPT="build:mac" ;; + windows) PLATFORM_BUILD_SCRIPT="build:win" ;; + linux) PLATFORM_BUILD_SCRIPT="build:linux" ;; + *) 
PLATFORM_BUILD_SCRIPT="build" ;; + esac -# Map build script per platform (optional) -case "$OS_NAME" in - macos) PLATFORM_BUILD_SCRIPT="build:mac" ;; - windows) PLATFORM_BUILD_SCRIPT="build:win" ;; - linux) PLATFORM_BUILD_SCRIPT="build:linux" ;; - *) PLATFORM_BUILD_SCRIPT="build" ;; - esac - -# Check Node & npm -if ! command -v node >/dev/null 2>&1; then - echo "Error: Node.js is not installed or not in PATH. Please install Node 18+ and retry." - exit 1 -fi -if ! command -v npm >/dev/null 2>&1; then - echo "Error: npm is not installed or not in PATH." - exit 1 -fi + case "$OS_NAME" in + windows) + PYTHON_BIN="python" + WHISPER_PIP_PATH="${WHISPER_VENV_DIR}/Scripts/pip.exe" + WHISPER_COMMAND_PATH="${WHISPER_VENV_DIR}/Scripts/whisper.exe" + ;; + *) + PYTHON_BIN="python3" + WHISPER_PIP_PATH="${WHISPER_VENV_DIR}/bin/pip" + WHISPER_COMMAND_PATH="${WHISPER_VENV_DIR}/bin/whisper" + ;; + esac +} -echo "Node: $(node -v)" -echo "npm: $(npm -v)" +require_command() { + local cmd="$1" + local message="$2" + if ! command -v "$cmd" >/dev/null 2>&1; then + echo "Error: ${message}" + exit 1 + fi +} -# Install system dependencies (optional best-effort) -if [[ "$INSTALL_SYSTEM_DEPS" -eq 1 ]]; then - echo "Attempting to install system dependencies (best effort)" - if ! command -v sox >/dev/null 2>&1; then - case "$OS_NAME" in - macos) - if command -v brew >/dev/null 2>&1; then - echo "Installing sox via Homebrew..." - brew install sox || echo "Could not install sox via brew. You can install it manually: brew install sox" - else - echo "Homebrew not found. Install sox manually: https://formulae.brew.sh/formula/sox" - fi - ;; - linux) - if command -v apt-get >/dev/null 2>&1; then - echo "Installing sox via apt-get (sudo may prompt)..." - sudo apt-get update -y && sudo apt-get install -y sox || echo "Could not install sox via apt-get." - elif command -v dnf >/dev/null 2>&1; then - echo "Installing sox via dnf (sudo may prompt)..." 
- sudo dnf install -y sox || echo "Could not install sox via dnf." - elif command -v pacman >/dev/null 2>&1; then - echo "Installing sox via pacman (sudo may prompt)..." - sudo pacman -S --noconfirm sox || echo "Could not install sox via pacman." - else - echo "Unknown package manager. Please install 'sox' manually." - fi - ;; - windows) - echo "On Windows, install sox via Chocolatey (Admin PowerShell): choco install sox" - ;; - *) - echo "Unknown OS; please install 'sox' manually if you need microphone capture." - ;; - esac - else - echo "sox already installed." +ensure_env_file() { + if [[ ! -f .env ]]; then + if [[ -f env.example ]]; then + echo "Creating .env from env.example" + cp env.example .env + else + echo "Error: env.example is missing" + exit 1 + fi fi -fi +} -# Project root -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "$SCRIPT_DIR" +upsert_env() { + local key="$1" + local value="$2" -# Ensure .env exists and has API key -ENV_NEEDS_CONFIG=0 -if [[ ! -f .env ]]; then - if [[ -f env.example ]]; then - echo "Creating .env from env.example" - cp env.example .env - ENV_NEEDS_CONFIG=1 + if grep -q "^${key}=" .env 2>/dev/null; then + perl -0pi -e "s/^${key}=.*\$/${key}=${value}/m" .env else - echo "Creating new .env file" - cat > .env << 'EOF' -# Google Gemini API Configuration -# Get your API key from: https://aistudio.google.com/ -GEMINI_API_KEY=your_gemini_api_key_here - -# Optional: Azure Speech Services Configuration -# AZURE_SPEECH_KEY=your_azure_speech_key_here -# AZURE_SPEECH_REGION=your_azure_region_here -EOF - ENV_NEEDS_CONFIG=1 + printf "%s=%s\n" "$key" "$value" >> .env fi -fi - -# If GEMINI_API_KEY is provided via env and .env lacks it, append it -if [[ -n "${GEMINI_API_KEY:-}" ]]; then - if ! 
grep -q '^GEMINI_API_KEY=' .env 2>/dev/null; then - echo "GEMINI_API_KEY is set in the environment; writing to .env" - printf "GEMINI_API_KEY=%s\n" "$GEMINI_API_KEY" >> .env - ENV_NEEDS_CONFIG=0 +} + +ensure_gemini_key() { + if [[ -n "${GEMINI_API_KEY:-}" ]]; then + upsert_env "GEMINI_API_KEY" "$GEMINI_API_KEY" fi -fi - -# Check if API key is configured -if [[ "$ENV_NEEDS_CONFIG" -eq 1 ]] || grep -q "your_gemini_api_key_here" .env 2>/dev/null; then - echo "" - echo "==========================================" - echo " ⚠️ API KEY REQUIRED" - echo "==========================================" - echo "" - echo "OpenCluely needs a Google Gemini API key to work." - echo "" - echo "Steps to get your API key:" - echo "1. Visit: https://aistudio.google.com/" - echo "2. Click 'Create API Key'" - echo "3. Copy the generated key" - echo "" - echo "Then edit your .env file and replace 'your_gemini_api_key_here' with your actual key:" - echo "" - echo "GEMINI_API_KEY=your_actual_api_key_here" - echo "" - echo "You can edit .env with any text editor:" - echo " nano .env (Linux/macOS)" - echo " notepad .env (Windows)" - echo " code .env (VS Code)" - echo "" - read -p "Press Enter after you've added your API key to continue..." - echo "" -fi - -# Install node dependencies -if [[ -f package-lock.json && "$USE_CI" -eq 1 ]]; then - echo "Installing dependencies with npm ci" - npm ci -else - echo "Installing dependencies with npm install" - npm install -fi - -# Build (optional) -if [[ "$DO_BUILD" -eq 1 ]]; then - echo "Building app for $OS_NAME via npm run $PLATFORM_BUILD_SCRIPT" - npm run "$PLATFORM_BUILD_SCRIPT" -fi - -# Run (default) -if [[ "$DO_RUN" -eq 1 ]]; then - # Final validation before starting - if grep -q "your_gemini_api_key_here" .env 2>/dev/null; then + + if ! grep -q '^GEMINI_API_KEY=' .env 2>/dev/null || grep -q 'your_gemini_api_key_here' .env 2>/dev/null; then echo "" - echo "❌ Error: API key not configured!" 
- echo "Please edit .env and replace 'your_gemini_api_key_here' with your actual Gemini API key." - echo "Get your key from: https://aistudio.google.com/" + echo "==========================================" + echo " API KEY REQUIRED" + echo "==========================================" echo "" - echo "Then run the setup script again:" - echo "./setup.sh" + echo "Add your Gemini API key to .env and rerun this script if needed." + echo "Get a key from: https://aistudio.google.com/" + echo "" + read -r -p "Press Enter after you've updated .env..." + fi + + if grep -q 'your_gemini_api_key_here' .env 2>/dev/null; then + echo "Error: GEMINI_API_KEY is still not configured in .env" exit 1 fi - - echo "Starting app (npm start)" - npm start -else - echo "Setup complete. Skipping run." -fi +} + +install_system_deps() { + if [[ "$INSTALL_SYSTEM_DEPS" -ne 1 ]]; then + return + fi + + echo "Attempting to install system audio dependencies" + + if command -v sox >/dev/null 2>&1; then + echo "sox already installed" + return + fi + + case "$OS_NAME" in + macos) + if command -v brew >/dev/null 2>&1; then + brew install sox || echo "Could not install sox automatically. Install it manually with: brew install sox" + else + echo "Homebrew not found. Install sox manually." + fi + ;; + linux) + if command -v apt-get >/dev/null 2>&1; then + sudo apt-get update -y && sudo apt-get install -y sox || echo "Could not install sox via apt-get" + elif command -v dnf >/dev/null 2>&1; then + sudo dnf install -y sox || echo "Could not install sox via dnf" + elif command -v pacman >/dev/null 2>&1; then + sudo pacman -S --noconfirm sox || echo "Could not install sox via pacman" + else + echo "Unknown package manager. Install sox manually." + fi + ;; + windows) + echo "Install sox manually on Windows, for example via Chocolatey: choco install sox" + ;; + *) + echo "Unknown OS. Install sox manually if you want microphone capture." 
+ ;; + esac +} + +install_node_deps() { + if [[ -f package-lock.json && "$USE_CI" -eq 1 ]]; then + echo "Installing Node dependencies with npm ci" + npm ci + else + echo "Installing Node dependencies with npm install" + npm install + fi +} + +setup_whisper_env() { + if [[ "$SETUP_WHISPER" -ne 1 ]]; then + echo "Skipping local Whisper setup" + return + fi + + require_command "$PYTHON_BIN" "Python 3 is required for local Whisper setup." + + if [[ ! -d "$WHISPER_VENV_DIR" ]]; then + echo "Creating Whisper virtual environment at $WHISPER_VENV_DIR" + "$PYTHON_BIN" -m venv "$WHISPER_VENV_DIR" + fi + + echo "Installing local Whisper into $WHISPER_VENV_DIR" + "$WHISPER_PIP_PATH" install --upgrade pip + "$WHISPER_PIP_PATH" install openai-whisper + + mkdir -p "$WHISPER_MODEL_DIR" + + upsert_env "SPEECH_PROVIDER" "whisper" + upsert_env "AZURE_SPEECH_KEY" "" + upsert_env "AZURE_SPEECH_REGION" "" + upsert_env "WHISPER_COMMAND" "${WHISPER_COMMAND_PATH}" + upsert_env "WHISPER_MODEL_DIR" "${WHISPER_MODEL_DIR}" + upsert_env "WHISPER_MODEL" "${WHISPER_MODEL}" + upsert_env "WHISPER_LANGUAGE" "${WHISPER_LANGUAGE}" + upsert_env "WHISPER_SEGMENT_MS" "${WHISPER_SEGMENT_MS}" + + echo "Running Whisper smoke test" + npm run test-speech +} + +build_app() { + if [[ "$DO_BUILD" -eq 1 ]]; then + echo "Building app for $OS_NAME with npm run $PLATFORM_BUILD_SCRIPT" + npm run "$PLATFORM_BUILD_SCRIPT" + fi +} + +run_app() { + if [[ "$DO_RUN" -eq 1 ]]; then + echo "Starting app" + npm start + else + echo "Setup complete. Skipping run." + fi +} + +detect_os +echo "Detected OS: $OS_NAME" +require_command node "Node.js 18+ is required." +require_command npm "npm is required." 
+echo "Node: $(node -v)" +echo "npm: $(npm -v)" + +ensure_env_file +ensure_gemini_key +install_system_deps +install_node_deps +setup_whisper_env +build_app +run_app diff --git a/src/core/config.js b/src/core/config.js index 12dfff1..35f7ca5 100644 --- a/src/core/config.js +++ b/src/core/config.js @@ -55,11 +55,17 @@ class ConfigManager { }, speech: { + provider: 'azure', azure: { language: 'en-US', enableDictation: true, enableAudioLogging: false, outputFormat: 'detailed' + }, + whisper: { + model: 'base', + language: 'en', + segmentMs: 4000 } }, @@ -98,4 +104,4 @@ class ConfigManager { } } -module.exports = new ConfigManager(); \ No newline at end of file +module.exports = new ConfigManager(); diff --git a/src/managers/window.manager.js b/src/managers/window.manager.js index 6e0b78b..8c37825 100644 --- a/src/managers/window.manager.js +++ b/src/managers/window.manager.js @@ -1642,4 +1642,4 @@ class WindowManager { } } -module.exports = new WindowManager(); \ No newline at end of file +module.exports = new WindowManager(); diff --git a/src/services/speech.service.js b/src/services/speech.service.js index f371603..45875f5 100644 --- a/src/services/speech.service.js +++ b/src/services/speech.service.js @@ -1,7 +1,7 @@ // Enhanced polyfills for Azure Speech SDK in Node.js environment if (typeof window === 'undefined') { global.window = { - navigator: { + navigator: { userAgent: 'Node.js', platform: 'node', mediaDevices: { @@ -30,7 +30,7 @@ if (typeof window === 'undefined') { ]) } }, - document: { + document: { createElement: (tagName) => { const element = { addEventListener: () => {}, @@ -51,8 +51,7 @@ if (typeof window === 'undefined') { focus: () => {}, blur: () => {} }; - - // Special handling for audio elements + if (tagName.toLowerCase() === 'audio') { Object.assign(element, { play: () => Promise.resolve(), @@ -78,7 +77,7 @@ if (typeof window === 'undefined') { currentSrc: '' }); } - + return element; }, getElementById: () => null, @@ -99,7 +98,7 @@ if (typeof 
window === 'undefined') { style: {} } }, - location: { + location: { href: 'file:///', protocol: 'file:', host: '', @@ -118,7 +117,6 @@ if (typeof window === 'undefined') { clearInterval: global.clearInterval, requestAnimationFrame: (callback) => global.setTimeout(callback, 16), cancelAnimationFrame: global.clearTimeout, - // Add console methods if not available console: global.console || { log: () => {}, error: () => {}, @@ -127,50 +125,50 @@ if (typeof window === 'undefined') { debug: () => {} }, AudioContext: class AudioContext { - constructor() { - this.state = 'running'; + constructor() { + this.state = 'running'; this.sampleRate = 16000; this.currentTime = 0; this.listener = { setPosition: () => {}, setOrientation: () => {} }; - this.destination = { - connect: () => {}, + this.destination = { + connect: () => {}, disconnect: () => {}, channelCount: 2, channelCountMode: 'explicit', channelInterpretation: 'speakers' }; } - createMediaStreamSource(stream) { - return { - connect: () => {}, + createMediaStreamSource(stream) { + return { + connect: () => {}, disconnect: () => {}, mediaStream: stream - }; + }; } - createGain() { - return { - connect: () => {}, - disconnect: () => {}, - gain: { + createGain() { + return { + connect: () => {}, + disconnect: () => {}, + gain: { value: 1, setValueAtTime: () => {}, linearRampToValueAtTime: () => {}, exponentialRampToValueAtTime: () => {} } - }; + }; } - createScriptProcessor(bufferSize = 4096, inputChannels = 1, outputChannels = 1) { - return { - connect: () => {}, - disconnect: () => {}, + createScriptProcessor(bufferSize = 4096, inputChannels = 1, outputChannels = 1) { + return { + connect: () => {}, + disconnect: () => {}, onaudioprocess: null, bufferSize, numberOfInputs: inputChannels, numberOfOutputs: outputChannels - }; + }; } createAnalyser() { return { @@ -187,7 +185,7 @@ if (typeof window === 'undefined') { getFloatTimeDomainData: () => {} }; } - decodeAudioData(audioData) { + decodeAudioData() { return 
Promise.resolve({ length: 44100, sampleRate: 44100, @@ -196,64 +194,64 @@ if (typeof window === 'undefined') { getChannelData: () => new Float32Array(44100) }); } - suspend() { + suspend() { this.state = 'suspended'; - return Promise.resolve(); + return Promise.resolve(); } - resume() { + resume() { this.state = 'running'; - return Promise.resolve(); + return Promise.resolve(); } - close() { + close() { this.state = 'closed'; - return Promise.resolve(); + return Promise.resolve(); } }, webkitAudioContext: class webkitAudioContext { - constructor() { - this.state = 'running'; + constructor() { + this.state = 'running'; this.sampleRate = 16000; this.currentTime = 0; this.listener = { setPosition: () => {}, setOrientation: () => {} }; - this.destination = { - connect: () => {}, + this.destination = { + connect: () => {}, disconnect: () => {}, channelCount: 2, channelCountMode: 'explicit', channelInterpretation: 'speakers' }; } - createMediaStreamSource(stream) { - return { - connect: () => {}, + createMediaStreamSource(stream) { + return { + connect: () => {}, disconnect: () => {}, mediaStream: stream - }; + }; } - createGain() { - return { - connect: () => {}, - disconnect: () => {}, - gain: { + createGain() { + return { + connect: () => {}, + disconnect: () => {}, + gain: { value: 1, setValueAtTime: () => {}, linearRampToValueAtTime: () => {}, exponentialRampToValueAtTime: () => {} } - }; + }; } - createScriptProcessor(bufferSize = 4096, inputChannels = 1, outputChannels = 1) { - return { - connect: () => {}, - disconnect: () => {}, + createScriptProcessor(bufferSize = 4096, inputChannels = 1, outputChannels = 1) { + return { + connect: () => {}, + disconnect: () => {}, onaudioprocess: null, bufferSize, numberOfInputs: inputChannels, numberOfOutputs: outputChannels - }; + }; } createAnalyser() { return { @@ -270,7 +268,7 @@ if (typeof window === 'undefined') { getFloatTimeDomainData: () => {} }; } - decodeAudioData(audioData) { + decodeAudioData() { return 
Promise.resolve({ length: 44100, sampleRate: 44100, @@ -279,22 +277,21 @@ if (typeof window === 'undefined') { getChannelData: () => new Float32Array(44100) }); } - suspend() { + suspend() { this.state = 'suspended'; - return Promise.resolve(); + return Promise.resolve(); } - resume() { + resume() { this.state = 'running'; - return Promise.resolve(); + return Promise.resolve(); } - close() { + close() { this.state = 'closed'; - return Promise.resolve(); + return Promise.resolve(); } }, - // Add additional globals that might be needed URL: class URL { - constructor(url, base) { + constructor(url) { this.href = url; this.protocol = 'https:'; this.host = 'localhost'; @@ -305,7 +302,9 @@ if (typeof window === 'undefined') { this.hash = ''; this.origin = 'https://localhost'; } - toString() { return this.href; } + toString() { + return this.href; + } }, Blob: class Blob { constructor(parts = [], options = {}) { @@ -313,10 +312,18 @@ if (typeof window === 'undefined') { this.type = options.type || ''; this.parts = parts; } - slice() { return new Blob(); } - stream() { return new ReadableStream(); } - text() { return Promise.resolve(''); } - arrayBuffer() { return Promise.resolve(new ArrayBuffer(0)); } + slice() { + return new Blob(); + } + stream() { + return new ReadableStream(); + } + text() { + return Promise.resolve(''); + } + arrayBuffer() { + return Promise.resolve(new ArrayBuffer(0)); + } }, File: class File { constructor(parts, name, options = {}) { @@ -326,10 +333,18 @@ if (typeof window === 'undefined') { this.lastModified = Date.now(); this.parts = parts; } - slice() { return new File([], this.name); } - stream() { return new ReadableStream(); } - text() { return Promise.resolve(''); } - arrayBuffer() { return Promise.resolve(new ArrayBuffer(0)); } + slice() { + return new File([], this.name); + } + stream() { + return new ReadableStream(); + } + text() { + return Promise.resolve(''); + } + arrayBuffer() { + return Promise.resolve(new ArrayBuffer(0)); + } } }; 
global.document = global.window.document; @@ -339,8 +354,7 @@ if (typeof window === 'undefined') { global.URL = global.window.URL; global.Blob = global.window.Blob; global.File = global.window.File; - - // Additional polyfills that might be needed + if (!global.performance) { global.performance = { now: () => Date.now(), @@ -352,7 +366,7 @@ if (typeof window === 'undefined') { getEntriesByType: () => [] }; } - + if (!global.crypto) { global.crypto = { getRandomValues: (arr) => { @@ -365,12 +379,28 @@ if (typeof window === 'undefined') { } } -const sdk = require('microsoft-cognitiveservices-speech-sdk'); -const recorder = require('node-record-lpcm16'); +const fs = require('fs'); +const os = require('os'); +const path = require('path'); +const { spawn, spawnSync } = require('child_process'); const { EventEmitter } = require('events'); const logger = require('../core/logger').createServiceLogger('SPEECH'); const config = require('../core/config'); +let sdk = null; +try { + sdk = require('microsoft-cognitiveservices-speech-sdk'); +} catch (error) { + logger.warn('Azure Speech SDK unavailable', { error: error.message }); +} + +let recorder = null; +try { + recorder = require('node-record-lpcm16'); +} catch (error) { + logger.warn('Local audio recorder dependency unavailable', { error: error.message }); +} + class SpeechService extends EventEmitter { constructor() { super(); @@ -383,72 +413,133 @@ class SpeechService extends EventEmitter { this.maxRetries = 3; this.pushStream = null; this.recording = null; - this.available = false; // track availability - + this.available = false; + this.provider = 'disabled'; + this.runtimeSettings = {}; + this.segmentBuffers = []; + this.segmentBytes = 0; + this.segmentTimer = null; + this.transcriptionInFlight = false; + this.pendingFlush = false; + this.audioProgram = null; + this.whisperCommand = null; + this.initializeClient(); } initializeClient() { + this._cleanup(); + this.provider = 'disabled'; + this.available = false; + 
this.speechConfig = null; + this.whisperCommand = null; + + const provider = this._getConfiguredProvider(); + this.provider = provider; + + if (provider === 'azure') { + this._initializeAzureClient(); + return; + } + + if (provider === 'whisper') { + this._initializeWhisperClient(); + return; + } + + const reason = 'Speech recognition disabled. Configure Azure or local Whisper.'; + logger.warn(reason); + this.emit('status', reason); + } + + _initializeAzureClient() { try { - // Get Azure Speech credentials from environment variables - const subscriptionKey = process.env.AZURE_SPEECH_KEY; - const region = process.env.AZURE_SPEECH_REGION; - + if (!sdk) { + throw new Error('Azure Speech SDK dependency is not installed'); + } + + if (!recorder || typeof recorder.record !== 'function') { + throw new Error('Local microphone recorder dependency is not installed'); + } + + const subscriptionKey = this._getSetting('azureKey') || process.env.AZURE_SPEECH_KEY; + const region = this._getSetting('azureRegion') || process.env.AZURE_SPEECH_REGION; + if (!subscriptionKey || !region) { const reason = 'Azure Speech credentials not found. 
Speech recognition disabled.'; - logger.warn('Speech service disabled (missing credentials)'); - this.available = false; + logger.warn('Speech service disabled (missing Azure credentials)'); this.emit('status', reason); return; } - // Validate region format - const validRegions = ['eastus', 'westus', 'westus2', 'eastus2', 'centralus', 'northcentralus', 'southcentralus', 'westcentralus', 'canadacentral', 'canadaeast', 'brazilsouth', 'northeurope', 'westeurope', 'uksouth', 'ukwest', 'francecentral', 'germanywestcentral', 'norwayeast', 'switzerlandnorth', 'switzerlandwest', 'swedencentral', 'uaenorth', 'southafricanorth', 'centralindia', 'southindia', 'westindia', 'eastasia', 'southeastasia', 'japaneast', 'japanwest', 'koreacentral', 'koreasouth', 'australiaeast', 'australiasoutheast']; - - if (!validRegions.includes(region.toLowerCase())) { - logger.warn('Potentially invalid Azure region specified', { region }); - } - - // Initialize Azure Speech configuration this.speechConfig = sdk.SpeechConfig.fromSubscription(subscriptionKey, region); - - // Configure speech recognition settings with better defaults + const azureConfig = config.get('speech.azure') || {}; this.speechConfig.speechRecognitionLanguage = azureConfig.language || 'en-US'; this.speechConfig.outputFormat = sdk.OutputFormat.Detailed; - - // Set additional properties for better recognition - this.speechConfig.setProperty(sdk.PropertyId.SpeechServiceConnection_InitialSilenceTimeoutMs, "5000"); - this.speechConfig.setProperty(sdk.PropertyId.SpeechServiceConnection_EndSilenceTimeoutMs, "2000"); - this.speechConfig.setProperty(sdk.PropertyId.Speech_SegmentationSilenceTimeoutMs, "2000"); - + this.speechConfig.setProperty(sdk.PropertyId.SpeechServiceConnection_InitialSilenceTimeoutMs, '5000'); + this.speechConfig.setProperty(sdk.PropertyId.SpeechServiceConnection_EndSilenceTimeoutMs, '2000'); + this.speechConfig.setProperty(sdk.PropertyId.Speech_SegmentationSilenceTimeoutMs, '2000'); + if 
(azureConfig.enableDictation) { this.speechConfig.enableDictation(); } - + if (azureConfig.enableAudioLogging) { this.speechConfig.enableAudioLogging(); } - + + this.available = true; logger.info('Azure Speech service initialized successfully', { region, language: azureConfig.language || 'en-US' }); - - this.available = true; this.emit('status', 'Azure Speech Services ready'); - } catch (error) { - logger.error('Failed to initialize Azure Speech client', { error: error.message, stack: error.stack }); + logger.error('Failed to initialize Azure Speech client', { + error: error.message, + stack: error.stack + }); + this.available = false; + this.emit('status', 'Azure speech unavailable'); + } + } + + _initializeWhisperClient() { + try { + if (!recorder || typeof recorder.record !== 'function') { + throw new Error('Local microphone recorder dependency is not installed'); + } + + this.whisperCommand = this._resolveWhisperCommand(); + if (!this.whisperCommand) { + const reason = 'Local Whisper unavailable. 
Install the Whisper CLI or set WHISPER_COMMAND.'; + logger.warn(reason); + this.emit('status', reason); + return; + } + + this.available = true; + logger.info('Local Whisper service initialized successfully', { + command: [this.whisperCommand.command, ...this.whisperCommand.baseArgs].join(' '), + model: this._getWhisperModel(), + language: this._getWhisperLanguage() + }); + this.emit('status', 'Local Whisper ready'); + } catch (error) { + logger.error('Failed to initialize local Whisper client', { + error: error.message, + stack: error.stack + }); this.available = false; - this.emit('status', 'Speech recognition unavailable'); + this.emit('status', 'Local Whisper unavailable'); } } startRecording() { try { - if (!this.speechConfig) { - const errorMsg = 'Azure Speech client not initialized'; + if (!this.available) { + const errorMsg = `Speech provider "${this.provider}" is not available`; logger.error(errorMsg); this.emit('error', errorMsg); return; @@ -462,7 +553,17 @@ class SpeechService extends EventEmitter { this.sessionStartTime = Date.now(); this.retryCount = 0; - this._attemptRecording(); + if (this.provider === 'azure') { + this._startAzureRecording(); + return; + } + + if (this.provider === 'whisper') { + this._startWhisperRecording(); + return; + } + + throw new Error(`Unsupported speech provider: ${this.provider}`); } catch (error) { logger.error('Critical error in startRecording', { error: error.message, stack: error.stack }); this.emit('error', `Speech recognition failed to start: ${error.message}`); @@ -470,182 +571,125 @@ class SpeechService extends EventEmitter { } } - _attemptRecording() { + _startAzureRecording() { + if (!this.speechConfig) { + throw new Error('Azure Speech client not initialized'); + } + + this.isRecording = true; + this.emit('recording-started'); + this.emit('status', 'Azure recording started'); + this._cleanup(); + try { - this.isRecording = true; - this.emit('recording-started'); - - // Clean up any existing resources - 
this._cleanup(); - - // Use push stream with Node.js audio capture (more reliable for Electron main process) - try { - this.pushStream = sdk.AudioInputStream.createPushStream(); - this.audioConfig = sdk.AudioConfig.fromStreamInput(this.pushStream); - - // Start capturing real microphone audio - this._startMicrophoneCapture(); - - } catch (audioError) { - logger.error('Failed to create audio config', { error: audioError.message }); - this.emit('error', 'Audio configuration failed. Please check microphone permissions.'); - this.isRecording = false; - return; - } - - // Create speech recognizer - try { - this.recognizer = new sdk.SpeechRecognizer(this.speechConfig, this.audioConfig); - } catch (recognizerError) { - throw recognizerError; - } - - // Set up event handlers with better error handling - this.recognizer.recognizing = (s, e) => { - try { - if (e.result.reason === sdk.ResultReason.RecognizingSpeech) { - logger.debug('Interim transcription received', { - text: e.result.text, - offset: e.result.offset, - duration: e.result.duration - }); - this.emit('interim-transcription', e.result.text); - } - } catch (error) { - logger.error('Error in recognizing handler', { error: error.message }); - } - }; - - this.recognizer.recognized = (s, e) => { - try { - if (e.result.reason === sdk.ResultReason.RecognizedSpeech) { - const sessionDuration = Date.now() - this.sessionStartTime; - - // Only emit transcription if there's actual text content - if (e.result.text && e.result.text.trim().length > 0) { - logger.info('Final transcription received', { - text: e.result.text, - sessionDuration: `${sessionDuration}ms`, - textLength: e.result.text.length, - confidence: e.result.properties?.getProperty(sdk.PropertyId.SpeechServiceResponse_JsonResult) - }); - - this.emit('transcription', e.result.text); - } else { - logger.debug('Empty transcription result ignored', { - sessionDuration: `${sessionDuration}ms`, - confidence: 
e.result.properties?.getProperty(sdk.PropertyId.SpeechServiceResponse_JsonResult) - }); - } - } else if (e.result.reason === sdk.ResultReason.NoMatch) { - logger.debug('No speech pattern detected in audio'); - - // Check if there's detailed no-match information - const noMatchDetails = e.result.properties?.getProperty(sdk.PropertyId.SpeechServiceResponse_JsonResult); - if (noMatchDetails) { - logger.debug('No match details', { details: noMatchDetails }); - } - } - } catch (error) { - logger.error('Error in recognized handler', { error: error.message }); - } - }; - - this.recognizer.canceled = (s, e) => { - logger.warn('Recognition session canceled', { - reason: e.reason, - errorCode: e.errorCode, - errorDetails: e.errorDetails - }); - - if (e.reason === sdk.CancellationReason.Error) { - const errorMsg = `Recognition error: ${e.errorDetails}`; - - // Check for specific error types and provide better messages - if (e.errorDetails.includes('1006')) { - this.emit('error', 'Network connection failed. Please check your internet connection.'); - } else if (e.errorDetails.includes('InvalidServiceCredentials')) { - this.emit('error', 'Invalid Azure Speech credentials. Please check AZURE_SPEECH_KEY and AZURE_SPEECH_REGION.'); - } else if (e.errorDetails.includes('Forbidden')) { - this.emit('error', 'Access denied. Please check your Azure Speech service subscription and region.'); - } else if (e.errorDetails.includes('AudioInputMicrophone_InitializationFailure')) { - this.emit('error', 'Microphone initialization failed. 
Please check microphone permissions and availability.'); - } else { - this.emit('error', errorMsg); - } - - // Attempt retry for transient errors - if (this.retryCount < this.maxRetries && ( - e.errorDetails.includes('1006') || - e.errorDetails.includes('timeout') || - e.errorDetails.includes('network') - )) { - this.retryCount++; - logger.info(`Retrying recognition (attempt ${this.retryCount}/${this.maxRetries})`); - setTimeout(() => { - if (!this.isRecording) { - this._attemptRecording(); - } - }, 1000 * this.retryCount); - return; - } + this.pushStream = sdk.AudioInputStream.createPushStream(); + this.audioConfig = sdk.AudioConfig.fromStreamInput(this.pushStream); + this._startMicrophoneCapture(); + this.recognizer = new sdk.SpeechRecognizer(this.speechConfig, this.audioConfig); + } catch (error) { + logger.error('Failed to start Azure recording session', { error: error.message }); + this.emit('error', `Audio configuration failed: ${error.message}`); + this.isRecording = false; + return; + } + + this.recognizer.recognizing = (s, e) => { + try { + if (e.result.reason === sdk.ResultReason.RecognizingSpeech) { + this.emit('interim-transcription', e.result.text); } - this.stopRecording(); - }; + } catch (error) { + logger.error('Error in recognizing handler', { error: error.message }); + } + }; - this.recognizer.sessionStarted = (s, e) => { - logger.info('Recognition session started', { sessionId: e.sessionId }); - }; + this.recognizer.recognized = (s, e) => { + try { + if (e.result.reason === sdk.ResultReason.RecognizedSpeech && e.result.text && e.result.text.trim()) { + this.emit('transcription', e.result.text); + } + } catch (error) { + logger.error('Error in recognized handler', { error: error.message }); + } + }; - this.recognizer.sessionStopped = (s, e) => { - logger.info('Recognition session ended', { sessionId: e.sessionId }); - this.stopRecording(); - }; + this.recognizer.canceled = (s, e) => { + logger.warn('Recognition session canceled', { + reason: 
e.reason, + errorCode: e.errorCode, + errorDetails: e.errorDetails + }); - // Start continuous recognition with timeout - const startTimeout = setTimeout(() => { - logger.error('Recognition start timeout'); - this.emit('error', 'Speech recognition start timeout. Please try again.'); - this.stopRecording(); - }, 10000); // 10 second timeout - - this.recognizer.startContinuousRecognitionAsync( - () => { - clearTimeout(startTimeout); - logger.info('Continuous speech recognition started successfully'); - if (global.windowManager) { - global.windowManager.handleRecordingStarted(); - } - }, - (error) => { - clearTimeout(startTimeout); - logger.error('Failed to start continuous recognition', { - error: error.toString(), - retryCount: this.retryCount - }); - - // Attempt retry for initialization failures - if (this.retryCount < this.maxRetries) { - this.retryCount++; - logger.info(`Retrying recognition start (attempt ${this.retryCount}/${this.maxRetries})`); - this.isRecording = false; - setTimeout(() => { - this._attemptRecording(); - }, 2000 * this.retryCount); - } else { - this.emit('error', `Recognition startup failed after ${this.maxRetries} attempts: ${error}`); - this.isRecording = false; - } - } - ); + if (e.reason === sdk.CancellationReason.Error) { + const details = e.errorDetails || ''; + if (details.includes('1006')) { + this.emit('error', 'Network connection failed. Please check your internet connection.'); + } else if (details.includes('InvalidServiceCredentials')) { + this.emit('error', 'Invalid Azure Speech credentials. Please check AZURE_SPEECH_KEY and AZURE_SPEECH_REGION.'); + } else if (details.includes('Forbidden')) { + this.emit('error', 'Access denied. Please check your Azure Speech service subscription and region.'); + } else if (details.includes('AudioInputMicrophone_InitializationFailure')) { + this.emit('error', 'Microphone initialization failed. 
Please check microphone permissions and availability.'); + } else { + this.emit('error', `Recognition error: ${details}`); + } + } - } catch (error) { - logger.error('Failed to start recording session', { - error: error.message, - stack: error.stack + this.stopRecording(); + }; + + this.recognizer.sessionStarted = (s, e) => { + logger.info('Recognition session started', { sessionId: e.sessionId }); + }; + + this.recognizer.sessionStopped = () => { + this.stopRecording(); + }; + + const startTimeout = setTimeout(() => { + logger.error('Recognition start timeout'); + this.emit('error', 'Speech recognition start timeout. Please try again.'); + this.stopRecording(); + }, 10000); + + this.recognizer.startContinuousRecognitionAsync( + () => { + clearTimeout(startTimeout); + logger.info('Continuous Azure speech recognition started successfully'); + if (global.windowManager) { + global.windowManager.handleRecordingStarted(); + } + }, + (error) => { + clearTimeout(startTimeout); + logger.error('Failed to start continuous recognition', { error: error.toString() }); + this.emit('error', `Recognition startup failed: ${error}`); + this.isRecording = false; + this._cleanup(); + } + ); + } + + _startWhisperRecording() { + this._cleanup(); + this.isRecording = true; + this.segmentBuffers = []; + this.segmentBytes = 0; + this.transcriptionInFlight = false; + this.pendingFlush = false; + this.emit('recording-started'); + this.emit('status', 'Local Whisper recording started'); + this._startMicrophoneCapture(); + + const segmentMs = this._getWhisperSegmentMs(); + this.segmentTimer = setInterval(() => { + this._flushWhisperSegment({ final: false }).catch((error) => { + logger.error('Whisper segment transcription failed', { error: error.message }); }); - this.emit('error', `Recording startup failed: ${error.message}`); - this.isRecording = false; + }, segmentMs); + + if (global.windowManager) { + global.windowManager.handleRecordingStarted(); } } @@ -656,40 +700,77 @@ class 
SpeechService extends EventEmitter { this.isRecording = false; const sessionDuration = this.sessionStartTime ? Date.now() - this.sessionStartTime : 0; - - logger.info('Stopping speech recognition session', { - sessionDuration: `${sessionDuration}ms` + logger.info('Stopping speech recognition session', { + provider: this.provider, + sessionDuration: `${sessionDuration}ms` }); - // Stop continuous recognition - if (this.recognizer) { + if (this.provider === 'azure' && this.recognizer) { try { this.recognizer.stopContinuousRecognitionAsync( () => { - logger.info('Speech recognition stopped successfully'); - this.emit('recording-stopped'); - this.emit('status', 'Recording stopped'); - if (global.windowManager) { - global.windowManager.handleRecordingStopped(); - } - this._cleanup(); + this._finalizeStop('Recording stopped'); }, (error) => { logger.error('Error during recognition stop', { error: error.toString() }); - this._cleanup(); + this._finalizeStop('Recording stopped'); } ); } catch (error) { logger.error('Error stopping recognizer', { error: error.message }); - this._cleanup(); + this._finalizeStop('Recording stopped'); + } + return; + } + + if (this.provider === 'whisper') { + this._finalizeWhisperStop(); + return; + } + + this._finalizeStop('Recording stopped'); + } + + async _finalizeWhisperStop() { + if (this.segmentTimer) { + clearInterval(this.segmentTimer); + this.segmentTimer = null; + } + + if (this.recording) { + try { + this.recording.stop(); + } catch (error) { + logger.error('Error stopping audio recording', { error: error.message }); } - } else { - this._cleanup(); + this.recording = null; + } + + try { + await this._flushWhisperSegment({ final: true }); + } catch (error) { + logger.error('Final Whisper transcription failed', { error: error.message }); + this.emit('error', `Whisper transcription failed: ${error.message}`); + } finally { + this._finalizeStop('Recording stopped'); + } + } + + _finalizeStop(statusMessage) { + this._cleanup(); + 
this.emit('recording-stopped'); + this.emit('status', statusMessage); + if (global.windowManager) { + global.windowManager.handleRecordingStopped(); } } _cleanup() { - // Clean up recognizer + if (this.segmentTimer) { + clearInterval(this.segmentTimer); + this.segmentTimer = null; + } + if (this.recognizer) { try { this.recognizer.close(); @@ -699,73 +780,50 @@ class SpeechService extends EventEmitter { this.recognizer = null; } - // Clean up audio config - if (this.audioConfig) { - try { - // Check if close method exists and call it appropriately - if (typeof this.audioConfig.close === 'function') { - try { - const closeResult = this.audioConfig.close(); - // If it returns a promise, handle it, otherwise just continue - if (closeResult && typeof closeResult.then === 'function') { - // It's a promise, but we don't need to wait for it in cleanup - closeResult.catch((error) => { - logger.error('Error closing audio config', { error: error.message }); - }); - } - } catch (closeError) { - logger.error('Error closing audio config', { error: closeError.message }); - } - } - } catch (error) { - logger.error('Error closing audio config', { error: error.message }); - } - this.audioConfig = null; - } - - // Stop audio recording - if (this.recording) { - try { - this.recording.stop(); - this.recording = null; - } catch (error) { - logger.error('Error stopping audio recording', { error: error.message }); - } - } - - // Clean up push stream - if (this.pushStream) { - try { - // Check if close method exists and call it appropriately - if (typeof this.pushStream.close === 'function') { - const closeResult = this.pushStream.close(); - // If it returns a promise, we can await it, otherwise just continue - if (closeResult && typeof closeResult.then === 'function') { - // It's a promise, but we don't need to wait for it in cleanup - closeResult.catch((error) => { - }); - } - } - } catch (error) { - logger.error('Error closing push stream', { error: error.message }); - } - 
this.pushStream = null; - } - - // Reset audio data logging flag - this._audioDataLogged = false; + if (this.audioConfig) { + try { + if (typeof this.audioConfig.close === 'function') { + this.audioConfig.close(); + } + } catch (error) { + logger.error('Error closing audio config', { error: error.message }); + } + this.audioConfig = null; + } + + if (this.recording) { + try { + this.recording.stop(); + } catch (error) { + logger.error('Error stopping audio recording', { error: error.message }); + } + this.recording = null; + } + + if (this.pushStream) { + try { + if (typeof this.pushStream.close === 'function') { + this.pushStream.close(); + } + } catch (error) { + logger.error('Error closing push stream', { error: error.message }); + } + this.pushStream = null; + } + + this.segmentBuffers = []; + this.segmentBytes = 0; + this.transcriptionInFlight = false; + this.pendingFlush = false; + this._audioDataLogged = false; } async recognizeFromFile(audioFilePath) { - if (!this.speechConfig) { - throw new Error('Speech service not initialized'); - } + if (this.provider === 'azure') { + if (!this.speechConfig) { + throw new Error('Speech service not initialized'); + } - const startTime = Date.now(); - - try { - // Validate file exists and is readable - const fs = require('fs'); if (!fs.existsSync(audioFilePath)) { throw new Error(`Audio file not found: ${audioFilePath}`); } @@ -773,206 +831,438 @@ class SpeechService extends EventEmitter { const audioConfig = sdk.AudioConfig.fromWavFileInput(audioFilePath); const recognizer = new sdk.SpeechRecognizer(this.speechConfig, audioConfig); - const result = await new Promise((resolve, reject) => { - const timeout = setTimeout(() => { - reject(new Error('File recognition timeout')); - recognizer.close(); - }, 30000); // 30 second timeout - + return await new Promise((resolve, reject) => { recognizer.recognizeOnceAsync( (result) => { - clearTimeout(timeout); - if (result.reason === sdk.ResultReason.RecognizedSpeech) { - 
resolve(result.text); - } else if (result.reason === sdk.ResultReason.NoMatch) { - resolve(''); // No speech detected in file - } else { - reject(new Error(`File recognition failed: ${result.reason}`)); - } + resolve(result.reason === sdk.ResultReason.RecognizedSpeech ? result.text : ''); recognizer.close(); audioConfig.close(); }, (error) => { - clearTimeout(timeout); reject(new Error(`File recognition error: ${error}`)); recognizer.close(); audioConfig.close(); } ); }); + } - logger.logPerformance('File speech recognition', startTime, { - filePath: audioFilePath, - textLength: result.length - }); + if (this.provider === 'whisper') { + return this._transcribeWhisperFile(audioFilePath); + } - return result; - } catch (error) { - logger.error('File recognition failed', { - filePath: audioFilePath, - error: error.message - }); - throw error; + throw new Error('Speech service not initialized'); + } + + async testConnection() { + if (this.provider === 'azure') { + if (!this.speechConfig) { + throw new Error('Speech service not initialized'); + } + + try { + const audioConfig = sdk.AudioConfig.fromDefaultMicrophoneInput(); + const recognizer = new sdk.SpeechRecognizer(this.speechConfig, audioConfig); + recognizer.close(); + audioConfig.close(); + return { success: true, message: 'Azure connection test successful' }; + } catch (error) { + return { success: false, message: error.message }; + } } + + if (this.provider === 'whisper') { + return { + success: !!this.whisperCommand, + message: this.whisperCommand ? 'Local Whisper CLI detected' : 'Local Whisper CLI not found' + }; + } + + return { success: false, message: 'Speech service not initialized' }; } getStatus() { return { + provider: this.provider, isRecording: this.isRecording, - isInitialized: !!this.speechConfig, + isInitialized: this.provider === 'azure' ? !!this.speechConfig : !!this.whisperCommand, sessionDuration: this.sessionStartTime ? 
Date.now() - this.sessionStartTime : 0, retryCount: this.retryCount, - config: config.get('speech.azure') || {} + effectiveSettings: { + speechProvider: this.provider, + azureKey: this._getSetting('azureKey') || '', + azureRegion: this._getSetting('azureRegion') || process.env.AZURE_SPEECH_REGION || '', + whisperCommand: this._getSetting('whisperCommand') || process.env.WHISPER_COMMAND || '', + whisperModelDir: this._getWhisperModelDir(), + whisperModel: this._getWhisperModel(), + whisperLanguage: this._getWhisperLanguage(), + whisperSegmentMs: String(this._getWhisperSegmentMs()) + }, + config: { + azure: config.get('speech.azure') || {}, + whisper: config.get('speech.whisper') || {}, + selectedProvider: this.provider + } }; } - // Test connection method - async testConnection() { - if (!this.speechConfig) { - throw new Error('Speech service not initialized'); - } - - try { - // Create a simple test recognizer - const audioConfig = sdk.AudioConfig.fromDefaultMicrophoneInput(); - const recognizer = new sdk.SpeechRecognizer(this.speechConfig, audioConfig); - - // Test by attempting to create the recognizer (this validates credentials) - recognizer.close(); - audioConfig.close(); - - return { success: true, message: 'Connection test successful' }; - } catch (error) { - return { success: false, message: error.message }; - } - } - - // Start capturing real microphone audio using node-record-lpcm16 - _startMicrophoneCapture() { - if (!this.pushStream) return; - - try { - // Check if recorder is available - if (!recorder || typeof recorder.record !== 'function') { - throw new Error('node-record-lpcm16 not available or not properly installed'); - } - - // Configure audio recording with error handling - this.recording = recorder.record({ - sampleRateHertz: 16000, // Azure Speech SDK prefers 16kHz - threshold: 0, // No silence threshold - verbose: false, // Quiet logging - recordProgram: 'sox', // Try 'sox' first (most common on macOS) - silence: '10.0s' // Longer silence 
threshold - }); - - if (!this.recording) { - throw new Error('Failed to create audio recording instance'); - } - - // Add error handler for the recording stream before using it - this.recording.stream().on('error', (error) => { - logger.error('Audio recording stream error', { error: error.message }); - - // Don't emit error immediately, try to recover - this._handleAudioError(error); - }); - - // Pipe audio data to Azure Speech SDK - this.recording.stream().on('data', (chunk) => { - if (this.pushStream && this.isRecording) { - try { - this.pushStream.write(chunk); - // Console log only first few chunks to avoid spam - if (!this._audioDataLogged) { - this._audioDataLogged = true; - } - } catch (error) { - } - } - }); - - } catch (error) { - logger.error('Failed to start microphone capture', { error: error.message, stack: error.stack }); - - // Fall back to no audio capture (Azure SDK will still work without audio) - this.emit('error', `Microphone capture failed: ${error.message}. Speech recognition may not work properly.`); - } - } - - // Handle audio recording errors with recovery attempts - _handleAudioError(error) { - - // Try to restart recording with different program - if (this.recording) { - try { - this.recording.stop(); - } catch (stopError) { - } - this.recording = null; - } - - // Try with different recording program - setTimeout(() => { - if (this.isRecording) { - this._startMicrophoneCaptureWithFallback(); - } - }, 1000); - } - - // Try microphone capture with different programs as fallback - _startMicrophoneCaptureWithFallback() { - const programs = ['sox', 'rec', 'arecord']; - let currentProgramIndex = 0; - - const tryNextProgram = () => { - if (currentProgramIndex >= programs.length) { - this.emit('error', 'Could not start microphone capture with any audio program'); - return; - } - - const program = programs[currentProgramIndex]; - - try { - this.recording = recorder.record({ - sampleRateHertz: 16000, - threshold: 0, - verbose: false, - 
recordProgram: program, - silence: '10.0s' - }); - - this.recording.stream().on('error', (error) => { - currentProgramIndex++; - tryNextProgram(); - }); - - this.recording.stream().on('data', (chunk) => { - if (this.pushStream && this.isRecording) { - try { - this.pushStream.write(chunk); - if (!this._audioDataLogged) { - this._audioDataLogged = true; - } - } catch (error) { - logger.error('Error writing audio data', { error: error.message }); - } - } - }); - } catch (error) { - logger.error(`${program} configuration failed`, { error: error.message }); - currentProgramIndex++; - tryNextProgram(); - } - }; - - tryNextProgram(); - } - - // Expose availability to UI isAvailable() { - return !!this.speechConfig && !!this.available; + if (this.provider === 'azure') { + return !!this.speechConfig && !!this.available; + } + + if (this.provider === 'whisper') { + return !!this.whisperCommand && !!this.available; + } + + return false; + } + + updateSettings(settings = {}) { + const speechKeys = ['speechProvider', 'azureKey', 'azureRegion', 'whisperCommand', 'whisperModelDir', 'whisperModel', 'whisperLanguage', 'whisperSegmentMs']; + let changed = false; + + for (const key of speechKeys) { + if (Object.prototype.hasOwnProperty.call(settings, key)) { + this.runtimeSettings[key] = settings[key]; + changed = true; + } + } + + if (changed) { + this.initializeClient(); + } + + return this.getStatus(); + } + + _getConfiguredProvider() { + const provider = String(this._getSetting('speechProvider') || process.env.SPEECH_PROVIDER || '').trim().toLowerCase(); + + if (provider === 'azure' || provider === 'whisper') { + return provider; + } + + const hasAzure = !!((this._getSetting('azureKey') || process.env.AZURE_SPEECH_KEY) && + (this._getSetting('azureRegion') || process.env.AZURE_SPEECH_REGION)); + + if (hasAzure) { + return 'azure'; + } + + return 'whisper'; + } + + _getWhisperModel() { + return this._getSetting('whisperModel') || process.env.WHISPER_MODEL || 
config.get('speech.whisper.model') || 'base'; + } + + _getWhisperModelDir() { + return this._getSetting('whisperModelDir') || process.env.WHISPER_MODEL_DIR || ''; + } + + _getWhisperLanguage() { + return this._getSetting('whisperLanguage') || process.env.WHISPER_LANGUAGE || config.get('speech.whisper.language') || 'en'; + } + + _getWhisperSegmentMs() { + const rawValue = this._getSetting('whisperSegmentMs') || process.env.WHISPER_SEGMENT_MS || config.get('speech.whisper.segmentMs') || 4000; + const parsed = Number(rawValue); + return Number.isFinite(parsed) ? Math.max(2000, parsed) : 4000; + } + + _getSetting(key) { + const value = this.runtimeSettings[key]; + return value === '' ? null : value; + } + + _resolveWhisperCommand() { + const configured = this._getSetting('whisperCommand') || process.env.WHISPER_COMMAND; + const candidates = []; + + if (configured) { + candidates.push(...this._expandConfiguredWhisperCandidates(configured)); + } + + candidates.push({ command: 'whisper', baseArgs: [] }); + candidates.push({ command: 'whisper.exe', baseArgs: [] }); + candidates.push({ command: 'py', baseArgs: ['-3', '-m', 'whisper'] }); + candidates.push({ command: 'python3', baseArgs: ['-m', 'whisper'] }); + candidates.push({ command: 'python', baseArgs: ['-m', 'whisper'] }); + + for (const candidate of candidates) { + if (!candidate || !candidate.command) { + continue; + } + + const probe = spawnSync(candidate.command, [...candidate.baseArgs, '--help'], { + encoding: 'utf8', + timeout: 5000 + }); + + const output = `${probe.stdout || ''}\n${probe.stderr || ''}`; + if (!probe.error && probe.status === 0 && !output.includes('No module named whisper')) { + return candidate; + } + } + + return null; + } + + _expandConfiguredWhisperCandidates(rawCommand) { + const parsed = this._parseCommand(rawCommand); + if (!parsed) { + return []; + } + + const candidates = [parsed]; + const resolvedPath = path.resolve(parsed.command); + + if (resolvedPath !== parsed.command) { + 
candidates.push({ command: resolvedPath, baseArgs: parsed.baseArgs }); + } + + if (process.platform === 'win32') { + if (!/\.(exe|cmd|bat)$/i.test(parsed.command)) { + candidates.push({ command: `${parsed.command}.exe`, baseArgs: parsed.baseArgs }); + candidates.push({ command: `${parsed.command}.cmd`, baseArgs: parsed.baseArgs }); + candidates.push({ command: `${resolvedPath}.exe`, baseArgs: parsed.baseArgs }); + candidates.push({ command: `${resolvedPath}.cmd`, baseArgs: parsed.baseArgs }); + } + } + + return candidates; + } + + _parseCommand(rawCommand) { + const parts = String(rawCommand || '').trim().split(/\s+/).filter(Boolean); + if (parts.length === 0) { + return null; + } + + return { + command: parts[0], + baseArgs: parts.slice(1) + }; + } + + _startMicrophoneCapture() { + if (!recorder || typeof recorder.record !== 'function') { + this.emit('error', 'Local microphone capture dependency is missing. Run npm install to restore speech recording support.'); + return; + } + + this._startMicrophoneCaptureWithFallback(['sox', 'rec', 'arecord']); + } + + _startMicrophoneCaptureWithFallback(programs) { + const queue = [...programs]; + + const tryNextProgram = () => { + const program = queue.shift(); + if (!program) { + this.emit('error', 'Could not start microphone capture with any audio program'); + return; + } + + try { + this.recording = recorder.record({ + sampleRateHertz: 16000, + channels: 1, + threshold: 0, + verbose: false, + recordProgram: program, + silence: '10.0s' + }); + + const stream = this.recording.stream(); + this.audioProgram = program; + + stream.on('error', (error) => { + logger.error('Audio recording stream error', { error: error.message, program }); + if (this.recording) { + try { + this.recording.stop(); + } catch (stopError) { + logger.error('Error stopping failed recording program', { error: stopError.message }); + } + this.recording = null; + } + + if (this.isRecording) { + tryNextProgram(); + } + }); + + stream.on('data', (chunk) => { + 
this._handleAudioChunk(chunk); + }); + } catch (error) { + logger.error('Failed to start microphone capture program', { program, error: error.message }); + tryNextProgram(); + } + }; + + tryNextProgram(); + } + + _handleAudioChunk(chunk) { + if (!chunk || !chunk.length || !this.isRecording) { + return; + } + + if (this.provider === 'azure' && this.pushStream) { + try { + this.pushStream.write(chunk); + } catch (error) { + logger.error('Error writing audio data to Azure push stream', { error: error.message }); + } + return; + } + + if (this.provider === 'whisper') { + this.segmentBuffers.push(Buffer.from(chunk)); + this.segmentBytes += chunk.length; + } + } + + async _flushWhisperSegment({ final }) { + if (this.transcriptionInFlight) { + this.pendingFlush = this.pendingFlush || final; + return; + } + + if (!this.segmentBytes) { + return; + } + + const audioBuffer = Buffer.concat(this.segmentBuffers, this.segmentBytes); + this.segmentBuffers = []; + this.segmentBytes = 0; + + this.transcriptionInFlight = true; + + try { + const transcript = await this._transcribeWhisperBuffer(audioBuffer); + if (transcript && transcript.trim()) { + this.emit('transcription', transcript.trim()); + } + } finally { + this.transcriptionInFlight = false; + + if (this.pendingFlush) { + const shouldRunFinal = this.pendingFlush; + this.pendingFlush = false; + await this._flushWhisperSegment({ final: shouldRunFinal }); + } + } + } + + async _transcribeWhisperBuffer(audioBuffer) { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'opencluely-whisper-')); + const audioFilePath = path.join(tempDir, 'segment.wav'); + + try { + fs.writeFileSync(audioFilePath, this._createWavBuffer(audioBuffer)); + return await this._transcribeWhisperFile(audioFilePath); + } finally { + this._removeTempDir(tempDir); + } + } + + async _transcribeWhisperFile(audioFilePath) { + if (!this.whisperCommand) { + throw new Error('Local Whisper CLI not configured'); + } + + const outputDir = 
fs.mkdtempSync(path.join(os.tmpdir(), 'opencluely-whisper-out-')); + const args = [ + ...this.whisperCommand.baseArgs, + audioFilePath, + '--model', this._getWhisperModel(), + '--language', this._getWhisperLanguage(), + '--task', 'transcribe', + '--output_format', 'txt', + '--output_dir', outputDir, + '--verbose', 'False', + '--fp16', 'False' + ]; + + if (this._getWhisperModelDir()) { + args.push('--model_dir', this._getWhisperModelDir()); + } + + try { + await new Promise((resolve, reject) => { + const child = spawn(this.whisperCommand.command, args, { + stdio: ['ignore', 'pipe', 'pipe'] + }); + + let stderr = ''; + child.stderr.on('data', (chunk) => { + stderr += chunk.toString(); + }); + + child.on('error', (error) => { + reject(error); + }); + + child.on('close', (code) => { + if (code === 0) { + resolve(); + return; + } + + reject(new Error(stderr.trim() || `Whisper exited with code ${code}`)); + }); + }); + + const transcriptPath = path.join(outputDir, `${path.parse(audioFilePath).name}.txt`); + if (!fs.existsSync(transcriptPath)) { + return ''; + } + + return fs.readFileSync(transcriptPath, 'utf8').trim(); + } finally { + this._removeTempDir(outputDir); + } + } + + _createWavBuffer(rawPcmBuffer) { + const header = Buffer.alloc(44); + const sampleRate = 16000; + const channels = 1; + const bitsPerSample = 16; + const byteRate = sampleRate * channels * (bitsPerSample / 8); + const blockAlign = channels * (bitsPerSample / 8); + + header.write('RIFF', 0); + header.writeUInt32LE(36 + rawPcmBuffer.length, 4); + header.write('WAVE', 8); + header.write('fmt ', 12); + header.writeUInt32LE(16, 16); + header.writeUInt16LE(1, 20); + header.writeUInt16LE(channels, 22); + header.writeUInt32LE(sampleRate, 24); + header.writeUInt32LE(byteRate, 28); + header.writeUInt16LE(blockAlign, 32); + header.writeUInt16LE(bitsPerSample, 34); + header.write('data', 36); + header.writeUInt32LE(rawPcmBuffer.length, 40); + + return Buffer.concat([header, rawPcmBuffer]); + } + + 
_removeTempDir(tempDir) { + try { + fs.rmSync(tempDir, { recursive: true, force: true }); + } catch (error) { + logger.error('Failed to remove Whisper temp directory', { + tempDir, + error: error.message + }); + } } } -module.exports = new SpeechService(); \ No newline at end of file +module.exports = new SpeechService(); diff --git a/src/styles/common.css b/src/styles/common.css index 478f715..2db6f85 100644 --- a/src/styles/common.css +++ b/src/styles/common.css @@ -1,7 +1,7 @@ /* Common Styles for OpenCluely UI Components */ /* Font imports */ -@import url('https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css'); +@import url('../../node_modules/@fortawesome/fontawesome-free/css/all.min.css'); /* Base styles */ body { @@ -576,4 +576,4 @@ body { .hide-scrollbar { scrollbar-width: none; -ms-overflow-style: none; -} \ No newline at end of file +} diff --git a/src/ui/settings-window.js b/src/ui/settings-window.js index 1062c3c..2a7047e 100644 --- a/src/ui/settings-window.js +++ b/src/ui/settings-window.js @@ -6,8 +6,13 @@ document.addEventListener('DOMContentLoaded', () => { // Get DOM elements const closeButton = document.getElementById('closeButton'); const quitButton = document.getElementById('quitButton'); + const speechProviderSelect = document.getElementById('speechProvider'); const azureKeyInput = document.getElementById('azureKey'); const azureRegionInput = document.getElementById('azureRegion'); + const whisperCommandInput = document.getElementById('whisperCommand'); + const whisperModelInput = document.getElementById('whisperModel'); + const whisperLanguageInput = document.getElementById('whisperLanguage'); + const whisperSegmentMsInput = document.getElementById('whisperSegmentMs'); const geminiKeyInput = document.getElementById('geminiKey'); const windowGapInput = document.getElementById('windowGap'); const codingLanguageSelect = document.getElementById('codingLanguage'); @@ -66,8 +71,13 @@ document.addEventListener('DOMContentLoaded', 
() => { // Function to load settings into UI const loadSettingsIntoUI = (settings) => { + if (settings.speechProvider && speechProviderSelect) speechProviderSelect.value = settings.speechProvider; if (settings.azureKey && azureKeyInput) azureKeyInput.value = settings.azureKey; if (settings.azureRegion && azureRegionInput) azureRegionInput.value = settings.azureRegion; + if (settings.whisperCommand && whisperCommandInput) whisperCommandInput.value = settings.whisperCommand; + if (settings.whisperModel && whisperModelInput) whisperModelInput.value = settings.whisperModel; + if (settings.whisperLanguage && whisperLanguageInput) whisperLanguageInput.value = settings.whisperLanguage; + if (settings.whisperSegmentMs && whisperSegmentMsInput) whisperSegmentMsInput.value = settings.whisperSegmentMs; if (settings.geminiKey && geminiKeyInput) geminiKeyInput.value = settings.geminiKey; if (settings.windowGap && windowGapInput) windowGapInput.value = settings.windowGap; @@ -90,6 +100,8 @@ document.addEventListener('DOMContentLoaded', () => { } }); } + + updateSpeechFieldStates(); }; // Load settings when window opens @@ -115,8 +127,13 @@ document.addEventListener('DOMContentLoaded', () => { // Save settings helper function const saveSettings = () => { const settings = {}; + if (speechProviderSelect) settings.speechProvider = speechProviderSelect.value; if (azureKeyInput) settings.azureKey = azureKeyInput.value; if (azureRegionInput) settings.azureRegion = azureRegionInput.value; + if (whisperCommandInput) settings.whisperCommand = whisperCommandInput.value; + if (whisperModelInput) settings.whisperModel = whisperModelInput.value; + if (whisperLanguageInput) settings.whisperLanguage = whisperLanguageInput.value; + if (whisperSegmentMsInput) settings.whisperSegmentMs = whisperSegmentMsInput.value; if (geminiKeyInput) settings.geminiKey = geminiKeyInput.value; if (windowGapInput) settings.windowGap = windowGapInput.value; if (codingLanguageSelect) settings.codingLanguage = 
codingLanguageSelect.value; @@ -125,10 +142,29 @@ document.addEventListener('DOMContentLoaded', () => { window.api.send('save-settings', settings); }; + const updateSpeechFieldStates = () => { + const provider = speechProviderSelect ? speechProviderSelect.value : 'azure'; + const azureDisabled = provider !== 'azure'; + const whisperDisabled = provider !== 'whisper'; + + [azureKeyInput, azureRegionInput].forEach(input => { + if (input) input.disabled = azureDisabled; + }); + + [whisperCommandInput, whisperModelInput, whisperLanguageInput, whisperSegmentMsInput].forEach(input => { + if (input) input.disabled = whisperDisabled; + }); + }; + // Add event listeners for all inputs const inputs = [ + speechProviderSelect, azureKeyInput, azureRegionInput, + whisperCommandInput, + whisperModelInput, + whisperLanguageInput, + whisperSegmentMsInput, geminiKeyInput, windowGapInput ]; @@ -140,6 +176,13 @@ document.addEventListener('DOMContentLoaded', () => { } }); + if (speechProviderSelect) { + speechProviderSelect.addEventListener('change', () => { + updateSpeechFieldStates(); + saveSettings(); + }); + } + // Language selection handler if (codingLanguageSelect) { codingLanguageSelect.addEventListener('change', (e) => { @@ -163,6 +206,8 @@ document.addEventListener('DOMContentLoaded', () => { }); } + updateSpeechFieldStates(); + // Initialize icon grid with correct paths const initializeIconGrid = () => { if (!iconGrid) return;