diff --git a/.gitignore b/.gitignore index 65a6a54..e249383 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ node_modules/ .env +.venv-whisper/ +.whisper-models/ eng.traineddata dist/ -.DS_Store \ No newline at end of file +.DS_Store +*.log diff --git a/README.md b/README.md index e9de5af..7b7015e 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Platform Stealth AI - Speech + Speech

--- @@ -53,7 +53,7 @@ https://github.com/user-attachments/assets/896a7140-1e85-405d-bfbe-e05c9f3a816b ### 🚀 **AI-Powered Intelligence** - **Direct Image Analysis**: Screenshots are analyzed by Gemini (no Tesseract OCR) -- **Voice Commands**: Optional Azure Speech (Whisper questions, get instant answers) +- **Voice Commands**: Optional Azure Speech or local OpenAI Whisper - **Context Memory**: Remembers entire interview conversation - **Multi-Language Support**: C++, Python, Java, JavaScript, C - **Smart Response Window**: Draggable with close button @@ -68,7 +68,7 @@ https://github.com/user-attachments/assets/896a7140-1e85-405d-bfbe-e05c9f3a816b - **Floating Overlay Bar**: Compact command center with camera, mic, and skill selector - **Draggable Answer Window**: Move and resize AI response window anywhere - **Close Button**: Clean × button to close answer window when needed -- **Auto-Hide Mic**: Microphone button appears only when Azure Speech is configured +- **Auto-Hide Mic**: Microphone button appears only when a speech provider is available - **Interactive Chat**: Full conversation window with markdown support ### 🎨 **Visual Design** @@ -133,7 +133,7 @@ https://github.com/user-attachments/assets/896a7140-1e85-405d-bfbe-e05c9f3a816b - [x] **Global shortcuts** (capture, visibility, interaction, chat, settings) - [x] **Session memory** and chat UI - [x] **Language picker** and DSA skill prompt -- [x] **Optional Azure Speech** integration with auto‑hide mic +- [x] **Optional Azure Speech / local Whisper** integration with auto‑hide mic - [x] **Multi‑monitor** and area capture APIs - [x] **Window binding** and positioning system - [x] **Settings management** with app icon/stealth modes @@ -157,12 +157,22 @@ The setup script automatically handles configuration. 
You only need: # Required: Google Gemini API Key (setup script will ask for this) GEMINI_API_KEY=your_gemini_api_key_here -# Optional: Azure Speech Recognition (add later if you want voice features) +# Optional: Speech Recognition (pick one provider) +SPEECH_PROVIDER=whisper + +# Azure option AZURE_SPEECH_KEY=your_azure_speech_key AZURE_SPEECH_REGION=your_region + +# Local Whisper option +WHISPER_COMMAND=whisper +WHISPER_MODEL_DIR=.whisper-models +WHISPER_MODEL=base +WHISPER_LANGUAGE=en +WHISPER_SEGMENT_MS=4000 ``` -**Note**: Speech recognition is completely optional. If Azure credentials are not provided, the microphone button will be automatically hidden from all interfaces. +**Note**: Speech recognition is completely optional. If no configured provider is available, the microphone button will be automatically hidden from all interfaces. ## 🚀 Quick Start & Installation @@ -187,7 +197,9 @@ AZURE_SPEECH_REGION=your_region **That's it!** The setup script will: - Install all dependencies automatically -- Create and configure your `.env` file +- Create your `.env` file from `env.example` if needed +- Set up a local Whisper virtualenv in `.venv-whisper` +- Configure `.env` to use local Whisper by default - Build the app (if needed) - Launch OpenCluely ready to use (if not works use npm install & then npm start) @@ -196,6 +208,8 @@ AZURE_SPEECH_REGION=your_region - **Windows**: Use Git Bash (comes with Git for Windows), WSL, or any bash environment - **macOS/Linux**: Use your regular terminal - **All platforms**: No manual npm commands needed - the setup script handles everything +- **Windows Whisper path**: `setup.sh` now writes `WHISPER_COMMAND=.venv-whisper/Scripts/whisper.exe` +- **macOS/Linux Whisper path**: `setup.sh` writes `WHISPER_COMMAND=.venv-whisper/bin/whisper` ### 🎛️ Setup Script Options @@ -204,28 +218,50 @@ AZURE_SPEECH_REGION=your_region ./setup.sh --ci # Use npm ci instead of npm install ./setup.sh --no-run # Setup only, don't launch the app ./setup.sh 
--install-system-deps # Install sox for microphone (optional) +./setup.sh --skip-whisper # Skip the local Whisper bootstrap ``` -### 🔧 **Optional: Azure Speech Setup** (For Voice Features) +### 🔧 **Optional: Speech Setup** (For Voice Features) + +Voice recognition is optional. You can use either Azure Speech or local OpenAI Whisper. -Voice recognition is completely optional. The setup script will create a `.env` file with just the required Gemini key. To add voice features: +For the local Whisper path, `./setup.sh` now handles the full repo-local setup: -1. Get Azure Speech credentials: +1. Creates `.venv-whisper` +2. Installs `openai-whisper` +3. Points `.env` at `.venv-whisper/bin/whisper` +4. Creates `.whisper-models` +5. Runs `npm run test-speech` + +1. For Azure Speech: - Visit [Azure Portal](https://portal.azure.com/) - Create a Speech Service - Copy your key and region -2. Add to your `.env` file: +2. For local Whisper: + - Run `./setup.sh --install-system-deps` + - Or install required audio tools such as `ffmpeg` and `sox` yourself + - On Windows, install audio tooling separately and prefer Git Bash or WSL for `setup.sh` + +3. Add one provider to your `.env` file: ```env - # Already configured by setup script GEMINI_API_KEY=your_gemini_api_key_here - - # Add these for voice features (optional) + SPEECH_PROVIDER=azure AZURE_SPEECH_KEY=your_azure_speech_key AZURE_SPEECH_REGION=your_region ``` -3. Restart the app - microphone buttons will now appear automatically + ```env + GEMINI_API_KEY=your_gemini_api_key_here + SPEECH_PROVIDER=whisper + WHISPER_COMMAND=whisper + WHISPER_MODEL_DIR=.whisper-models + WHISPER_MODEL=base + WHISPER_LANGUAGE=en + WHISPER_SEGMENT_MS=4000 + ``` + +4. Restart the app - microphone buttons will now appear automatically ## 🎮 How to Use @@ -265,10 +301,11 @@ Voice recognition is completely optional. 
The setup script will create a `.env` - **Image Understanding**: DSA prompt is applied only for new image-based queries; chat messages don’t include the full prompt - **Multi-monitor & Area Capture**: Programmatic APIs allow targeting a display and optional rectangular crop for focused analysis -#### 🔊 **Optional Voice Features** (Azure Speech) -- **Real-time Transcription**: Speak questions naturally +#### 🔊 **Optional Voice Features** (Azure Speech / Local Whisper) +- **Chunked Local Transcription**: Local Whisper transcribes short recorded segments on your machine +- **Real-time Transcription**: Azure Speech supports live interim recognition - **Listening Animation**: Visual feedback during recording -- **Interim Results**: See transcription as you speak +- **Interim Results**: Available with Azure Speech - **Auto-processing**: Instant AI responses to voice input ] --- @@ -305,7 +342,8 @@ Voice recognition is completely optional. The setup script will create a `.env` - **Microphone/voice not working** - Voice is optional - ignore related warnings if you don't need it - - To enable: install `sox` (Linux/macOS) and add Azure keys to `.env` + - Azure mode: add valid Azure keys to `.env` + - Whisper mode: install `openai-whisper`, `ffmpeg`, and `sox`, then set `SPEECH_PROVIDER=whisper` @@ -341,7 +379,7 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file ## 🙏 Acknowledgments - **Google Gemini**: Powering AI intelligence -- **Azure Speech**: Optional voice recognition +- **Azure Speech / Whisper**: Optional voice recognition - **Electron**: Cross-platform desktop framework - **Community**: Amazing contributors and feedback diff --git a/chat.html b/chat.html index d986477..baff03c 100644 --- a/chat.html +++ b/chat.html @@ -4,10 +4,8 @@ Chat - - - + @@ -336,6 +352,16 @@ Speech Recognition
+
+
+
Speech Provider
+
Choose Azure Speech or a local OpenAI Whisper CLI
+
+ +
Azure Speech Key
@@ -350,6 +376,39 @@
+
+
+
+
Whisper Command
+
CLI command for local Whisper, such as whisper or python3 -m whisper
+
+ +
+
+
+
+
Whisper Model
+
Local model name used by the Whisper CLI
+
+ +
+
+
+
Whisper Language
+
Language code for local transcription
+
+ +
+
+
+
Whisper Segment Length
+
Chunk size in milliseconds for local transcription
+
+ +
+
+ Local Whisper runs on this machine and needs a Whisper CLI installed. These settings apply immediately for the current app session; use .env for startup defaults. +
@@ -389,4 +448,4 @@ - \ No newline at end of file + diff --git a/setup.sh b/setup.sh index 4990721..d3cd481 100755 --- a/setup.sh +++ b/setup.sh @@ -1,16 +1,22 @@ #!/usr/bin/env bash set -euo pipefail -# OpenCluely one-shot setup: install deps, (optionally) build, and run -# Works on macOS, Linux, and Windows (Git Bash / MSYS2 / Cygwin) - -# Defaults DO_BUILD=0 DO_RUN=1 USE_CI=0 INSTALL_SYSTEM_DEPS=0 +SETUP_WHISPER=1 +WHISPER_MODEL="${WHISPER_MODEL:-base}" +WHISPER_LANGUAGE="${WHISPER_LANGUAGE:-en}" +WHISPER_SEGMENT_MS="${WHISPER_SEGMENT_MS:-4000}" +WHISPER_VENV_DIR=".venv-whisper" +WHISPER_MODEL_DIR=".whisper-models" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" OS_NAME="unknown" PLATFORM_BUILD_SCRIPT="build" +PYTHON_BIN="python3" +WHISPER_PIP_PATH="" +WHISPER_COMMAND_PATH="" print_header() { echo "========================================" @@ -23,25 +29,30 @@ usage() { Usage: ./setup.sh [options] This script will: -1. Install dependencies -2. Create .env file (if needed) -3. Guide you to add your Gemini API key -4. Optionally build the app -5. Start OpenCluely +1. Create .env from env.example when needed +2. Install Node dependencies +3. Optionally set up local Whisper in ${WHISPER_VENV_DIR} +4. Optionally install system audio dependencies +5. Optionally build the app +6. 
Optionally run OpenCluely Options: - --build Build a distributable for this OS (electron-builder) + --build Build a distributable for this OS --no-run Do not start the app after setup --run Start the app after setup (default) - --ci Use 'npm ci' instead of 'npm install' if lockfile exists - --install-system-deps Attempt to install required system dependencies (sox) where possible + --ci Use 'npm ci' instead of 'npm install' + --install-system-deps Attempt to install sox where possible + --skip-whisper Skip local Whisper environment setup -h, --help Show this help Environment variables: - GEMINI_API_KEY If provided, will be written into .env (skips manual setup) + GEMINI_API_KEY If provided, writes into .env + WHISPER_MODEL Whisper model to configure (default: base) + WHISPER_LANGUAGE Whisper language to configure (default: en) + WHISPER_SEGMENT_MS Segment size in ms (default: 4000) -Example with API key: - GEMINI_API_KEY=your_key_here ./setup.sh +Example: + GEMINI_API_KEY=your_key_here ./setup.sh --install-system-deps EOF } @@ -52,178 +63,210 @@ for arg in "$@"; do --run) DO_RUN=1 ;; --ci) USE_CI=1 ;; --install-system-deps) INSTALL_SYSTEM_DEPS=1 ;; + --skip-whisper) SETUP_WHISPER=0 ;; -h|--help) usage; exit 0 ;; *) echo "Unknown option: $arg"; usage; exit 1 ;; esac - shift || true done print_header +cd "$SCRIPT_DIR" -# Detect OS -UNAME_OUT=$(uname -s || echo "unknown") -case "$UNAME_OUT" in - Linux*) OS_NAME="linux" ;; - Darwin*) OS_NAME="macos" ;; - CYGWIN*|MINGW*|MSYS*) OS_NAME="windows" ;; - *) OS_NAME="unknown" ;; - esac +detect_os() { + local uname_out + uname_out=$(uname -s || echo "unknown") + case "$uname_out" in + Linux*) OS_NAME="linux" ;; + Darwin*) OS_NAME="macos" ;; + CYGWIN*|MINGW*|MSYS*) OS_NAME="windows" ;; + *) OS_NAME="unknown" ;; + esac -echo "Detected OS: $OS_NAME" + case "$OS_NAME" in + macos) PLATFORM_BUILD_SCRIPT="build:mac" ;; + windows) PLATFORM_BUILD_SCRIPT="build:win" ;; + linux) PLATFORM_BUILD_SCRIPT="build:linux" ;; + *) 
PLATFORM_BUILD_SCRIPT="build" ;; + esac -# Map build script per platform (optional) -case "$OS_NAME" in - macos) PLATFORM_BUILD_SCRIPT="build:mac" ;; - windows) PLATFORM_BUILD_SCRIPT="build:win" ;; - linux) PLATFORM_BUILD_SCRIPT="build:linux" ;; - *) PLATFORM_BUILD_SCRIPT="build" ;; - esac - -# Check Node & npm -if ! command -v node >/dev/null 2>&1; then - echo "Error: Node.js is not installed or not in PATH. Please install Node 18+ and retry." - exit 1 -fi -if ! command -v npm >/dev/null 2>&1; then - echo "Error: npm is not installed or not in PATH." - exit 1 -fi + case "$OS_NAME" in + windows) + PYTHON_BIN="python" + WHISPER_PIP_PATH="${WHISPER_VENV_DIR}/Scripts/pip.exe" + WHISPER_COMMAND_PATH="${WHISPER_VENV_DIR}/Scripts/whisper.exe" + ;; + *) + PYTHON_BIN="python3" + WHISPER_PIP_PATH="${WHISPER_VENV_DIR}/bin/pip" + WHISPER_COMMAND_PATH="${WHISPER_VENV_DIR}/bin/whisper" + ;; + esac +} -echo "Node: $(node -v)" -echo "npm: $(npm -v)" +require_command() { + local cmd="$1" + local message="$2" + if ! command -v "$cmd" >/dev/null 2>&1; then + echo "Error: ${message}" + exit 1 + fi +} -# Install system dependencies (optional best-effort) -if [[ "$INSTALL_SYSTEM_DEPS" -eq 1 ]]; then - echo "Attempting to install system dependencies (best effort)" - if ! command -v sox >/dev/null 2>&1; then - case "$OS_NAME" in - macos) - if command -v brew >/dev/null 2>&1; then - echo "Installing sox via Homebrew..." - brew install sox || echo "Could not install sox via brew. You can install it manually: brew install sox" - else - echo "Homebrew not found. Install sox manually: https://formulae.brew.sh/formula/sox" - fi - ;; - linux) - if command -v apt-get >/dev/null 2>&1; then - echo "Installing sox via apt-get (sudo may prompt)..." - sudo apt-get update -y && sudo apt-get install -y sox || echo "Could not install sox via apt-get." - elif command -v dnf >/dev/null 2>&1; then - echo "Installing sox via dnf (sudo may prompt)..." 
- sudo dnf install -y sox || echo "Could not install sox via dnf." - elif command -v pacman >/dev/null 2>&1; then - echo "Installing sox via pacman (sudo may prompt)..." - sudo pacman -S --noconfirm sox || echo "Could not install sox via pacman." - else - echo "Unknown package manager. Please install 'sox' manually." - fi - ;; - windows) - echo "On Windows, install sox via Chocolatey (Admin PowerShell): choco install sox" - ;; - *) - echo "Unknown OS; please install 'sox' manually if you need microphone capture." - ;; - esac - else - echo "sox already installed." +ensure_env_file() { + if [[ ! -f .env ]]; then + if [[ -f env.example ]]; then + echo "Creating .env from env.example" + cp env.example .env + else + echo "Error: env.example is missing" + exit 1 + fi fi -fi +} -# Project root -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "$SCRIPT_DIR" +upsert_env() { + local key="$1" + local value="$2" -# Ensure .env exists and has API key -ENV_NEEDS_CONFIG=0 -if [[ ! -f .env ]]; then - if [[ -f env.example ]]; then - echo "Creating .env from env.example" - cp env.example .env - ENV_NEEDS_CONFIG=1 + if grep -q "^${key}=" .env 2>/dev/null; then + KEY="$key" VALUE="$value" perl -0pi -e 's/^\Q$ENV{KEY}\E=.*$/$ENV{KEY}=$ENV{VALUE}/m' .env else - echo "Creating new .env file" - cat > .env << 'EOF' -# Google Gemini API Configuration -# Get your API key from: https://aistudio.google.com/ -GEMINI_API_KEY=your_gemini_api_key_here - -# Optional: Azure Speech Services Configuration -# AZURE_SPEECH_KEY=your_azure_speech_key_here -# AZURE_SPEECH_REGION=your_azure_region_here -EOF - ENV_NEEDS_CONFIG=1 + printf "%s=%s\n" "$key" "$value" >> .env fi -fi - -# If GEMINI_API_KEY is provided via env and .env lacks it, append it -if [[ -n "${GEMINI_API_KEY:-}" ]]; then - if !
grep -q '^GEMINI_API_KEY=' .env 2>/dev/null; then - echo "GEMINI_API_KEY is set in the environment; writing to .env" - printf "GEMINI_API_KEY=%s\n" "$GEMINI_API_KEY" >> .env - ENV_NEEDS_CONFIG=0 +} + +ensure_gemini_key() { + if [[ -n "${GEMINI_API_KEY:-}" ]]; then + upsert_env "GEMINI_API_KEY" "$GEMINI_API_KEY" fi -fi - -# Check if API key is configured -if [[ "$ENV_NEEDS_CONFIG" -eq 1 ]] || grep -q "your_gemini_api_key_here" .env 2>/dev/null; then - echo "" - echo "==========================================" - echo " ⚠️ API KEY REQUIRED" - echo "==========================================" - echo "" - echo "OpenCluely needs a Google Gemini API key to work." - echo "" - echo "Steps to get your API key:" - echo "1. Visit: https://aistudio.google.com/" - echo "2. Click 'Create API Key'" - echo "3. Copy the generated key" - echo "" - echo "Then edit your .env file and replace 'your_gemini_api_key_here' with your actual key:" - echo "" - echo "GEMINI_API_KEY=your_actual_api_key_here" - echo "" - echo "You can edit .env with any text editor:" - echo " nano .env (Linux/macOS)" - echo " notepad .env (Windows)" - echo " code .env (VS Code)" - echo "" - read -p "Press Enter after you've added your API key to continue..." - echo "" -fi - -# Install node dependencies -if [[ -f package-lock.json && "$USE_CI" -eq 1 ]]; then - echo "Installing dependencies with npm ci" - npm ci -else - echo "Installing dependencies with npm install" - npm install -fi - -# Build (optional) -if [[ "$DO_BUILD" -eq 1 ]]; then - echo "Building app for $OS_NAME via npm run $PLATFORM_BUILD_SCRIPT" - npm run "$PLATFORM_BUILD_SCRIPT" -fi - -# Run (default) -if [[ "$DO_RUN" -eq 1 ]]; then - # Final validation before starting - if grep -q "your_gemini_api_key_here" .env 2>/dev/null; then + + if ! grep -q '^GEMINI_API_KEY=' .env 2>/dev/null || grep -q 'your_gemini_api_key_here' .env 2>/dev/null; then echo "" - echo "❌ Error: API key not configured!" 
- echo "Please edit .env and replace 'your_gemini_api_key_here' with your actual Gemini API key." - echo "Get your key from: https://aistudio.google.com/" + echo "==========================================" + echo " API KEY REQUIRED" + echo "==========================================" echo "" - echo "Then run the setup script again:" - echo "./setup.sh" + echo "Add your Gemini API key to .env and rerun this script if needed." + echo "Get a key from: https://aistudio.google.com/" + echo "" + read -r -p "Press Enter after you've updated .env..." + fi + + if grep -q 'your_gemini_api_key_here' .env 2>/dev/null; then + echo "Error: GEMINI_API_KEY is still not configured in .env" exit 1 fi - - echo "Starting app (npm start)" - npm start -else - echo "Setup complete. Skipping run." -fi +} + +install_system_deps() { + if [[ "$INSTALL_SYSTEM_DEPS" -ne 1 ]]; then + return + fi + + echo "Attempting to install system audio dependencies" + + if command -v sox >/dev/null 2>&1; then + echo "sox already installed" + return + fi + + case "$OS_NAME" in + macos) + if command -v brew >/dev/null 2>&1; then + brew install sox || echo "Could not install sox automatically. Install it manually with: brew install sox" + else + echo "Homebrew not found. Install sox manually." + fi + ;; + linux) + if command -v apt-get >/dev/null 2>&1; then + sudo apt-get update -y && sudo apt-get install -y sox || echo "Could not install sox via apt-get" + elif command -v dnf >/dev/null 2>&1; then + sudo dnf install -y sox || echo "Could not install sox via dnf" + elif command -v pacman >/dev/null 2>&1; then + sudo pacman -S --noconfirm sox || echo "Could not install sox via pacman" + else + echo "Unknown package manager. Install sox manually." + fi + ;; + windows) + echo "Install sox manually on Windows, for example via Chocolatey: choco install sox" + ;; + *) + echo "Unknown OS. Install sox manually if you want microphone capture." 
+ ;; + esac +} + +install_node_deps() { + if [[ -f package-lock.json && "$USE_CI" -eq 1 ]]; then + echo "Installing Node dependencies with npm ci" + npm ci + else + echo "Installing Node dependencies with npm install" + npm install + fi +} + +setup_whisper_env() { + if [[ "$SETUP_WHISPER" -ne 1 ]]; then + echo "Skipping local Whisper setup" + return + fi + + require_command "$PYTHON_BIN" "Python 3 is required for local Whisper setup." + + if [[ ! -d "$WHISPER_VENV_DIR" ]]; then + echo "Creating Whisper virtual environment at $WHISPER_VENV_DIR" + "$PYTHON_BIN" -m venv "$WHISPER_VENV_DIR" + fi + + echo "Installing local Whisper into $WHISPER_VENV_DIR" + "$WHISPER_PIP_PATH" install --upgrade pip + "$WHISPER_PIP_PATH" install openai-whisper + + mkdir -p "$WHISPER_MODEL_DIR" + + upsert_env "SPEECH_PROVIDER" "whisper" + upsert_env "AZURE_SPEECH_KEY" "" + upsert_env "AZURE_SPEECH_REGION" "" + upsert_env "WHISPER_COMMAND" "${WHISPER_COMMAND_PATH}" + upsert_env "WHISPER_MODEL_DIR" "${WHISPER_MODEL_DIR}" + upsert_env "WHISPER_MODEL" "${WHISPER_MODEL}" + upsert_env "WHISPER_LANGUAGE" "${WHISPER_LANGUAGE}" + upsert_env "WHISPER_SEGMENT_MS" "${WHISPER_SEGMENT_MS}" + + echo "Running Whisper smoke test" + npm run test-speech +} + +build_app() { + if [[ "$DO_BUILD" -eq 1 ]]; then + echo "Building app for $OS_NAME with npm run $PLATFORM_BUILD_SCRIPT" + npm run "$PLATFORM_BUILD_SCRIPT" + fi +} + +run_app() { + if [[ "$DO_RUN" -eq 1 ]]; then + echo "Starting app" + npm start + else + echo "Setup complete. Skipping run." + fi +} + +detect_os +echo "Detected OS: $OS_NAME" +require_command node "Node.js 18+ is required." +require_command npm "npm is required." 
+echo "Node: $(node -v)" +echo "npm: $(npm -v)" + +ensure_env_file +ensure_gemini_key +install_system_deps +install_node_deps +setup_whisper_env +build_app +run_app diff --git a/src/core/config.js b/src/core/config.js index 12dfff1..35f7ca5 100644 --- a/src/core/config.js +++ b/src/core/config.js @@ -55,11 +55,17 @@ class ConfigManager { }, speech: { + provider: 'azure', azure: { language: 'en-US', enableDictation: true, enableAudioLogging: false, outputFormat: 'detailed' + }, + whisper: { + model: 'base', + language: 'en', + segmentMs: 4000 } }, @@ -98,4 +104,4 @@ class ConfigManager { } } -module.exports = new ConfigManager(); \ No newline at end of file +module.exports = new ConfigManager(); diff --git a/src/managers/window.manager.js b/src/managers/window.manager.js index 6e0b78b..8c37825 100644 --- a/src/managers/window.manager.js +++ b/src/managers/window.manager.js @@ -1642,4 +1642,4 @@ class WindowManager { } } -module.exports = new WindowManager(); \ No newline at end of file +module.exports = new WindowManager(); diff --git a/src/services/speech.service.js b/src/services/speech.service.js index f371603..45875f5 100644 --- a/src/services/speech.service.js +++ b/src/services/speech.service.js @@ -1,7 +1,7 @@ // Enhanced polyfills for Azure Speech SDK in Node.js environment if (typeof window === 'undefined') { global.window = { - navigator: { + navigator: { userAgent: 'Node.js', platform: 'node', mediaDevices: { @@ -30,7 +30,7 @@ if (typeof window === 'undefined') { ]) } }, - document: { + document: { createElement: (tagName) => { const element = { addEventListener: () => {}, @@ -51,8 +51,7 @@ if (typeof window === 'undefined') { focus: () => {}, blur: () => {} }; - - // Special handling for audio elements + if (tagName.toLowerCase() === 'audio') { Object.assign(element, { play: () => Promise.resolve(), @@ -78,7 +77,7 @@ if (typeof window === 'undefined') { currentSrc: '' }); } - + return element; }, getElementById: () => null, @@ -99,7 +98,7 @@ if (typeof 
window === 'undefined') { style: {} } }, - location: { + location: { href: 'file:///', protocol: 'file:', host: '', @@ -118,7 +117,6 @@ if (typeof window === 'undefined') { clearInterval: global.clearInterval, requestAnimationFrame: (callback) => global.setTimeout(callback, 16), cancelAnimationFrame: global.clearTimeout, - // Add console methods if not available console: global.console || { log: () => {}, error: () => {}, @@ -127,50 +125,50 @@ if (typeof window === 'undefined') { debug: () => {} }, AudioContext: class AudioContext { - constructor() { - this.state = 'running'; + constructor() { + this.state = 'running'; this.sampleRate = 16000; this.currentTime = 0; this.listener = { setPosition: () => {}, setOrientation: () => {} }; - this.destination = { - connect: () => {}, + this.destination = { + connect: () => {}, disconnect: () => {}, channelCount: 2, channelCountMode: 'explicit', channelInterpretation: 'speakers' }; } - createMediaStreamSource(stream) { - return { - connect: () => {}, + createMediaStreamSource(stream) { + return { + connect: () => {}, disconnect: () => {}, mediaStream: stream - }; + }; } - createGain() { - return { - connect: () => {}, - disconnect: () => {}, - gain: { + createGain() { + return { + connect: () => {}, + disconnect: () => {}, + gain: { value: 1, setValueAtTime: () => {}, linearRampToValueAtTime: () => {}, exponentialRampToValueAtTime: () => {} } - }; + }; } - createScriptProcessor(bufferSize = 4096, inputChannels = 1, outputChannels = 1) { - return { - connect: () => {}, - disconnect: () => {}, + createScriptProcessor(bufferSize = 4096, inputChannels = 1, outputChannels = 1) { + return { + connect: () => {}, + disconnect: () => {}, onaudioprocess: null, bufferSize, numberOfInputs: inputChannels, numberOfOutputs: outputChannels - }; + }; } createAnalyser() { return { @@ -187,7 +185,7 @@ if (typeof window === 'undefined') { getFloatTimeDomainData: () => {} }; } - decodeAudioData(audioData) { + decodeAudioData() { return 
Promise.resolve({ length: 44100, sampleRate: 44100, @@ -196,64 +194,64 @@ if (typeof window === 'undefined') { getChannelData: () => new Float32Array(44100) }); } - suspend() { + suspend() { this.state = 'suspended'; - return Promise.resolve(); + return Promise.resolve(); } - resume() { + resume() { this.state = 'running'; - return Promise.resolve(); + return Promise.resolve(); } - close() { + close() { this.state = 'closed'; - return Promise.resolve(); + return Promise.resolve(); } }, webkitAudioContext: class webkitAudioContext { - constructor() { - this.state = 'running'; + constructor() { + this.state = 'running'; this.sampleRate = 16000; this.currentTime = 0; this.listener = { setPosition: () => {}, setOrientation: () => {} }; - this.destination = { - connect: () => {}, + this.destination = { + connect: () => {}, disconnect: () => {}, channelCount: 2, channelCountMode: 'explicit', channelInterpretation: 'speakers' }; } - createMediaStreamSource(stream) { - return { - connect: () => {}, + createMediaStreamSource(stream) { + return { + connect: () => {}, disconnect: () => {}, mediaStream: stream - }; + }; } - createGain() { - return { - connect: () => {}, - disconnect: () => {}, - gain: { + createGain() { + return { + connect: () => {}, + disconnect: () => {}, + gain: { value: 1, setValueAtTime: () => {}, linearRampToValueAtTime: () => {}, exponentialRampToValueAtTime: () => {} } - }; + }; } - createScriptProcessor(bufferSize = 4096, inputChannels = 1, outputChannels = 1) { - return { - connect: () => {}, - disconnect: () => {}, + createScriptProcessor(bufferSize = 4096, inputChannels = 1, outputChannels = 1) { + return { + connect: () => {}, + disconnect: () => {}, onaudioprocess: null, bufferSize, numberOfInputs: inputChannels, numberOfOutputs: outputChannels - }; + }; } createAnalyser() { return { @@ -270,7 +268,7 @@ if (typeof window === 'undefined') { getFloatTimeDomainData: () => {} }; } - decodeAudioData(audioData) { + decodeAudioData() { return 
Promise.resolve({ length: 44100, sampleRate: 44100, @@ -279,22 +277,21 @@ if (typeof window === 'undefined') { getChannelData: () => new Float32Array(44100) }); } - suspend() { + suspend() { this.state = 'suspended'; - return Promise.resolve(); + return Promise.resolve(); } - resume() { + resume() { this.state = 'running'; - return Promise.resolve(); + return Promise.resolve(); } - close() { + close() { this.state = 'closed'; - return Promise.resolve(); + return Promise.resolve(); } }, - // Add additional globals that might be needed URL: class URL { - constructor(url, base) { + constructor(url) { this.href = url; this.protocol = 'https:'; this.host = 'localhost'; @@ -305,7 +302,9 @@ if (typeof window === 'undefined') { this.hash = ''; this.origin = 'https://localhost'; } - toString() { return this.href; } + toString() { + return this.href; + } }, Blob: class Blob { constructor(parts = [], options = {}) { @@ -313,10 +312,18 @@ if (typeof window === 'undefined') { this.type = options.type || ''; this.parts = parts; } - slice() { return new Blob(); } - stream() { return new ReadableStream(); } - text() { return Promise.resolve(''); } - arrayBuffer() { return Promise.resolve(new ArrayBuffer(0)); } + slice() { + return new Blob(); + } + stream() { + return new ReadableStream(); + } + text() { + return Promise.resolve(''); + } + arrayBuffer() { + return Promise.resolve(new ArrayBuffer(0)); + } }, File: class File { constructor(parts, name, options = {}) { @@ -326,10 +333,18 @@ if (typeof window === 'undefined') { this.lastModified = Date.now(); this.parts = parts; } - slice() { return new File([], this.name); } - stream() { return new ReadableStream(); } - text() { return Promise.resolve(''); } - arrayBuffer() { return Promise.resolve(new ArrayBuffer(0)); } + slice() { + return new File([], this.name); + } + stream() { + return new ReadableStream(); + } + text() { + return Promise.resolve(''); + } + arrayBuffer() { + return Promise.resolve(new ArrayBuffer(0)); + } } }; 
global.document = global.window.document; @@ -339,8 +354,7 @@ if (typeof window === 'undefined') { global.URL = global.window.URL; global.Blob = global.window.Blob; global.File = global.window.File; - - // Additional polyfills that might be needed + if (!global.performance) { global.performance = { now: () => Date.now(), @@ -352,7 +366,7 @@ if (typeof window === 'undefined') { getEntriesByType: () => [] }; } - + if (!global.crypto) { global.crypto = { getRandomValues: (arr) => { @@ -365,12 +379,28 @@ if (typeof window === 'undefined') { } } -const sdk = require('microsoft-cognitiveservices-speech-sdk'); -const recorder = require('node-record-lpcm16'); +const fs = require('fs'); +const os = require('os'); +const path = require('path'); +const { spawn, spawnSync } = require('child_process'); const { EventEmitter } = require('events'); const logger = require('../core/logger').createServiceLogger('SPEECH'); const config = require('../core/config'); +let sdk = null; +try { + sdk = require('microsoft-cognitiveservices-speech-sdk'); +} catch (error) { + logger.warn('Azure Speech SDK unavailable', { error: error.message }); +} + +let recorder = null; +try { + recorder = require('node-record-lpcm16'); +} catch (error) { + logger.warn('Local audio recorder dependency unavailable', { error: error.message }); +} + class SpeechService extends EventEmitter { constructor() { super(); @@ -383,72 +413,133 @@ class SpeechService extends EventEmitter { this.maxRetries = 3; this.pushStream = null; this.recording = null; - this.available = false; // track availability - + this.available = false; + this.provider = 'disabled'; + this.runtimeSettings = {}; + this.segmentBuffers = []; + this.segmentBytes = 0; + this.segmentTimer = null; + this.transcriptionInFlight = false; + this.pendingFlush = false; + this.audioProgram = null; + this.whisperCommand = null; + this.initializeClient(); } initializeClient() { + this._cleanup(); + this.provider = 'disabled'; + this.available = false; + 
this.speechConfig = null; + this.whisperCommand = null; + + const provider = this._getConfiguredProvider(); + this.provider = provider; + + if (provider === 'azure') { + this._initializeAzureClient(); + return; + } + + if (provider === 'whisper') { + this._initializeWhisperClient(); + return; + } + + const reason = 'Speech recognition disabled. Configure Azure or local Whisper.'; + logger.warn(reason); + this.emit('status', reason); + } + + _initializeAzureClient() { try { - // Get Azure Speech credentials from environment variables - const subscriptionKey = process.env.AZURE_SPEECH_KEY; - const region = process.env.AZURE_SPEECH_REGION; - + if (!sdk) { + throw new Error('Azure Speech SDK dependency is not installed'); + } + + if (!recorder || typeof recorder.record !== 'function') { + throw new Error('Local microphone recorder dependency is not installed'); + } + + const subscriptionKey = this._getSetting('azureKey') || process.env.AZURE_SPEECH_KEY; + const region = this._getSetting('azureRegion') || process.env.AZURE_SPEECH_REGION; + if (!subscriptionKey || !region) { const reason = 'Azure Speech credentials not found. 
Speech recognition disabled.'; - logger.warn('Speech service disabled (missing credentials)'); - this.available = false; + logger.warn('Speech service disabled (missing Azure credentials)'); this.emit('status', reason); return; } - // Validate region format - const validRegions = ['eastus', 'westus', 'westus2', 'eastus2', 'centralus', 'northcentralus', 'southcentralus', 'westcentralus', 'canadacentral', 'canadaeast', 'brazilsouth', 'northeurope', 'westeurope', 'uksouth', 'ukwest', 'francecentral', 'germanywestcentral', 'norwayeast', 'switzerlandnorth', 'switzerlandwest', 'swedencentral', 'uaenorth', 'southafricanorth', 'centralindia', 'southindia', 'westindia', 'eastasia', 'southeastasia', 'japaneast', 'japanwest', 'koreacentral', 'koreasouth', 'australiaeast', 'australiasoutheast']; - - if (!validRegions.includes(region.toLowerCase())) { - logger.warn('Potentially invalid Azure region specified', { region }); - } - - // Initialize Azure Speech configuration this.speechConfig = sdk.SpeechConfig.fromSubscription(subscriptionKey, region); - - // Configure speech recognition settings with better defaults + const azureConfig = config.get('speech.azure') || {}; this.speechConfig.speechRecognitionLanguage = azureConfig.language || 'en-US'; this.speechConfig.outputFormat = sdk.OutputFormat.Detailed; - - // Set additional properties for better recognition - this.speechConfig.setProperty(sdk.PropertyId.SpeechServiceConnection_InitialSilenceTimeoutMs, "5000"); - this.speechConfig.setProperty(sdk.PropertyId.SpeechServiceConnection_EndSilenceTimeoutMs, "2000"); - this.speechConfig.setProperty(sdk.PropertyId.Speech_SegmentationSilenceTimeoutMs, "2000"); - + this.speechConfig.setProperty(sdk.PropertyId.SpeechServiceConnection_InitialSilenceTimeoutMs, '5000'); + this.speechConfig.setProperty(sdk.PropertyId.SpeechServiceConnection_EndSilenceTimeoutMs, '2000'); + this.speechConfig.setProperty(sdk.PropertyId.Speech_SegmentationSilenceTimeoutMs, '2000'); + if 
(azureConfig.enableDictation) { this.speechConfig.enableDictation(); } - + if (azureConfig.enableAudioLogging) { this.speechConfig.enableAudioLogging(); } - + + this.available = true; logger.info('Azure Speech service initialized successfully', { region, language: azureConfig.language || 'en-US' }); - - this.available = true; this.emit('status', 'Azure Speech Services ready'); - } catch (error) { - logger.error('Failed to initialize Azure Speech client', { error: error.message, stack: error.stack }); + logger.error('Failed to initialize Azure Speech client', { + error: error.message, + stack: error.stack + }); + this.available = false; + this.emit('status', 'Azure speech unavailable'); + } + } + + _initializeWhisperClient() { + try { + if (!recorder || typeof recorder.record !== 'function') { + throw new Error('Local microphone recorder dependency is not installed'); + } + + this.whisperCommand = this._resolveWhisperCommand(); + if (!this.whisperCommand) { + const reason = 'Local Whisper unavailable. 
Install the Whisper CLI or set WHISPER_COMMAND.'; + logger.warn(reason); + this.emit('status', reason); + return; + } + + this.available = true; + logger.info('Local Whisper service initialized successfully', { + command: [this.whisperCommand.command, ...this.whisperCommand.baseArgs].join(' '), + model: this._getWhisperModel(), + language: this._getWhisperLanguage() + }); + this.emit('status', 'Local Whisper ready'); + } catch (error) { + logger.error('Failed to initialize local Whisper client', { + error: error.message, + stack: error.stack + }); this.available = false; - this.emit('status', 'Speech recognition unavailable'); + this.emit('status', 'Local Whisper unavailable'); } } startRecording() { try { - if (!this.speechConfig) { - const errorMsg = 'Azure Speech client not initialized'; + if (!this.available) { + const errorMsg = `Speech provider "${this.provider}" is not available`; logger.error(errorMsg); this.emit('error', errorMsg); return; @@ -462,7 +553,17 @@ class SpeechService extends EventEmitter { this.sessionStartTime = Date.now(); this.retryCount = 0; - this._attemptRecording(); + if (this.provider === 'azure') { + this._startAzureRecording(); + return; + } + + if (this.provider === 'whisper') { + this._startWhisperRecording(); + return; + } + + throw new Error(`Unsupported speech provider: ${this.provider}`); } catch (error) { logger.error('Critical error in startRecording', { error: error.message, stack: error.stack }); this.emit('error', `Speech recognition failed to start: ${error.message}`); @@ -470,182 +571,125 @@ class SpeechService extends EventEmitter { } } - _attemptRecording() { + _startAzureRecording() { + if (!this.speechConfig) { + throw new Error('Azure Speech client not initialized'); + } + + this.isRecording = true; + this.emit('recording-started'); + this.emit('status', 'Azure recording started'); + this._cleanup(); + try { - this.isRecording = true; - this.emit('recording-started'); - - // Clean up any existing resources - 
this._cleanup(); - - // Use push stream with Node.js audio capture (more reliable for Electron main process) - try { - this.pushStream = sdk.AudioInputStream.createPushStream(); - this.audioConfig = sdk.AudioConfig.fromStreamInput(this.pushStream); - - // Start capturing real microphone audio - this._startMicrophoneCapture(); - - } catch (audioError) { - logger.error('Failed to create audio config', { error: audioError.message }); - this.emit('error', 'Audio configuration failed. Please check microphone permissions.'); - this.isRecording = false; - return; - } - - // Create speech recognizer - try { - this.recognizer = new sdk.SpeechRecognizer(this.speechConfig, this.audioConfig); - } catch (recognizerError) { - throw recognizerError; - } - - // Set up event handlers with better error handling - this.recognizer.recognizing = (s, e) => { - try { - if (e.result.reason === sdk.ResultReason.RecognizingSpeech) { - logger.debug('Interim transcription received', { - text: e.result.text, - offset: e.result.offset, - duration: e.result.duration - }); - this.emit('interim-transcription', e.result.text); - } - } catch (error) { - logger.error('Error in recognizing handler', { error: error.message }); - } - }; - - this.recognizer.recognized = (s, e) => { - try { - if (e.result.reason === sdk.ResultReason.RecognizedSpeech) { - const sessionDuration = Date.now() - this.sessionStartTime; - - // Only emit transcription if there's actual text content - if (e.result.text && e.result.text.trim().length > 0) { - logger.info('Final transcription received', { - text: e.result.text, - sessionDuration: `${sessionDuration}ms`, - textLength: e.result.text.length, - confidence: e.result.properties?.getProperty(sdk.PropertyId.SpeechServiceResponse_JsonResult) - }); - - this.emit('transcription', e.result.text); - } else { - logger.debug('Empty transcription result ignored', { - sessionDuration: `${sessionDuration}ms`, - confidence: 
e.result.properties?.getProperty(sdk.PropertyId.SpeechServiceResponse_JsonResult) - }); - } - } else if (e.result.reason === sdk.ResultReason.NoMatch) { - logger.debug('No speech pattern detected in audio'); - - // Check if there's detailed no-match information - const noMatchDetails = e.result.properties?.getProperty(sdk.PropertyId.SpeechServiceResponse_JsonResult); - if (noMatchDetails) { - logger.debug('No match details', { details: noMatchDetails }); - } - } - } catch (error) { - logger.error('Error in recognized handler', { error: error.message }); - } - }; - - this.recognizer.canceled = (s, e) => { - logger.warn('Recognition session canceled', { - reason: e.reason, - errorCode: e.errorCode, - errorDetails: e.errorDetails - }); - - if (e.reason === sdk.CancellationReason.Error) { - const errorMsg = `Recognition error: ${e.errorDetails}`; - - // Check for specific error types and provide better messages - if (e.errorDetails.includes('1006')) { - this.emit('error', 'Network connection failed. Please check your internet connection.'); - } else if (e.errorDetails.includes('InvalidServiceCredentials')) { - this.emit('error', 'Invalid Azure Speech credentials. Please check AZURE_SPEECH_KEY and AZURE_SPEECH_REGION.'); - } else if (e.errorDetails.includes('Forbidden')) { - this.emit('error', 'Access denied. Please check your Azure Speech service subscription and region.'); - } else if (e.errorDetails.includes('AudioInputMicrophone_InitializationFailure')) { - this.emit('error', 'Microphone initialization failed. 
Please check microphone permissions and availability.'); - } else { - this.emit('error', errorMsg); - } - - // Attempt retry for transient errors - if (this.retryCount < this.maxRetries && ( - e.errorDetails.includes('1006') || - e.errorDetails.includes('timeout') || - e.errorDetails.includes('network') - )) { - this.retryCount++; - logger.info(`Retrying recognition (attempt ${this.retryCount}/${this.maxRetries})`); - setTimeout(() => { - if (!this.isRecording) { - this._attemptRecording(); - } - }, 1000 * this.retryCount); - return; - } + this.pushStream = sdk.AudioInputStream.createPushStream(); + this.audioConfig = sdk.AudioConfig.fromStreamInput(this.pushStream); + this._startMicrophoneCapture(); + this.recognizer = new sdk.SpeechRecognizer(this.speechConfig, this.audioConfig); + } catch (error) { + logger.error('Failed to start Azure recording session', { error: error.message }); + this.emit('error', `Audio configuration failed: ${error.message}`); + this.isRecording = false; + return; + } + + this.recognizer.recognizing = (s, e) => { + try { + if (e.result.reason === sdk.ResultReason.RecognizingSpeech) { + this.emit('interim-transcription', e.result.text); } - this.stopRecording(); - }; + } catch (error) { + logger.error('Error in recognizing handler', { error: error.message }); + } + }; - this.recognizer.sessionStarted = (s, e) => { - logger.info('Recognition session started', { sessionId: e.sessionId }); - }; + this.recognizer.recognized = (s, e) => { + try { + if (e.result.reason === sdk.ResultReason.RecognizedSpeech && e.result.text && e.result.text.trim()) { + this.emit('transcription', e.result.text); + } + } catch (error) { + logger.error('Error in recognized handler', { error: error.message }); + } + }; - this.recognizer.sessionStopped = (s, e) => { - logger.info('Recognition session ended', { sessionId: e.sessionId }); - this.stopRecording(); - }; + this.recognizer.canceled = (s, e) => { + logger.warn('Recognition session canceled', { + reason: 
e.reason, + errorCode: e.errorCode, + errorDetails: e.errorDetails + }); - // Start continuous recognition with timeout - const startTimeout = setTimeout(() => { - logger.error('Recognition start timeout'); - this.emit('error', 'Speech recognition start timeout. Please try again.'); - this.stopRecording(); - }, 10000); // 10 second timeout - - this.recognizer.startContinuousRecognitionAsync( - () => { - clearTimeout(startTimeout); - logger.info('Continuous speech recognition started successfully'); - if (global.windowManager) { - global.windowManager.handleRecordingStarted(); - } - }, - (error) => { - clearTimeout(startTimeout); - logger.error('Failed to start continuous recognition', { - error: error.toString(), - retryCount: this.retryCount - }); - - // Attempt retry for initialization failures - if (this.retryCount < this.maxRetries) { - this.retryCount++; - logger.info(`Retrying recognition start (attempt ${this.retryCount}/${this.maxRetries})`); - this.isRecording = false; - setTimeout(() => { - this._attemptRecording(); - }, 2000 * this.retryCount); - } else { - this.emit('error', `Recognition startup failed after ${this.maxRetries} attempts: ${error}`); - this.isRecording = false; - } - } - ); + if (e.reason === sdk.CancellationReason.Error) { + const details = e.errorDetails || ''; + if (details.includes('1006')) { + this.emit('error', 'Network connection failed. Please check your internet connection.'); + } else if (details.includes('InvalidServiceCredentials')) { + this.emit('error', 'Invalid Azure Speech credentials. Please check AZURE_SPEECH_KEY and AZURE_SPEECH_REGION.'); + } else if (details.includes('Forbidden')) { + this.emit('error', 'Access denied. Please check your Azure Speech service subscription and region.'); + } else if (details.includes('AudioInputMicrophone_InitializationFailure')) { + this.emit('error', 'Microphone initialization failed. 
Please check microphone permissions and availability.'); + } else { + this.emit('error', `Recognition error: ${details}`); + } + } - } catch (error) { - logger.error('Failed to start recording session', { - error: error.message, - stack: error.stack + this.stopRecording(); + }; + + this.recognizer.sessionStarted = (s, e) => { + logger.info('Recognition session started', { sessionId: e.sessionId }); + }; + + this.recognizer.sessionStopped = () => { + this.stopRecording(); + }; + + const startTimeout = setTimeout(() => { + logger.error('Recognition start timeout'); + this.emit('error', 'Speech recognition start timeout. Please try again.'); + this.stopRecording(); + }, 10000); + + this.recognizer.startContinuousRecognitionAsync( + () => { + clearTimeout(startTimeout); + logger.info('Continuous Azure speech recognition started successfully'); + if (global.windowManager) { + global.windowManager.handleRecordingStarted(); + } + }, + (error) => { + clearTimeout(startTimeout); + logger.error('Failed to start continuous recognition', { error: error.toString() }); + this.emit('error', `Recognition startup failed: ${error}`); + this.isRecording = false; + this._cleanup(); + } + ); + } + + _startWhisperRecording() { + this._cleanup(); + this.isRecording = true; + this.segmentBuffers = []; + this.segmentBytes = 0; + this.transcriptionInFlight = false; + this.pendingFlush = false; + this.emit('recording-started'); + this.emit('status', 'Local Whisper recording started'); + this._startMicrophoneCapture(); + + const segmentMs = this._getWhisperSegmentMs(); + this.segmentTimer = setInterval(() => { + this._flushWhisperSegment({ final: false }).catch((error) => { + logger.error('Whisper segment transcription failed', { error: error.message }); }); - this.emit('error', `Recording startup failed: ${error.message}`); - this.isRecording = false; + }, segmentMs); + + if (global.windowManager) { + global.windowManager.handleRecordingStarted(); } } @@ -656,40 +700,77 @@ class 
SpeechService extends EventEmitter { this.isRecording = false; const sessionDuration = this.sessionStartTime ? Date.now() - this.sessionStartTime : 0; - - logger.info('Stopping speech recognition session', { - sessionDuration: `${sessionDuration}ms` + logger.info('Stopping speech recognition session', { + provider: this.provider, + sessionDuration: `${sessionDuration}ms` }); - // Stop continuous recognition - if (this.recognizer) { + if (this.provider === 'azure' && this.recognizer) { try { this.recognizer.stopContinuousRecognitionAsync( () => { - logger.info('Speech recognition stopped successfully'); - this.emit('recording-stopped'); - this.emit('status', 'Recording stopped'); - if (global.windowManager) { - global.windowManager.handleRecordingStopped(); - } - this._cleanup(); + this._finalizeStop('Recording stopped'); }, (error) => { logger.error('Error during recognition stop', { error: error.toString() }); - this._cleanup(); + this._finalizeStop('Recording stopped'); } ); } catch (error) { logger.error('Error stopping recognizer', { error: error.message }); - this._cleanup(); + this._finalizeStop('Recording stopped'); + } + return; + } + + if (this.provider === 'whisper') { + this._finalizeWhisperStop(); + return; + } + + this._finalizeStop('Recording stopped'); + } + + async _finalizeWhisperStop() { + if (this.segmentTimer) { + clearInterval(this.segmentTimer); + this.segmentTimer = null; + } + + if (this.recording) { + try { + this.recording.stop(); + } catch (error) { + logger.error('Error stopping audio recording', { error: error.message }); } - } else { - this._cleanup(); + this.recording = null; + } + + try { + await this._flushWhisperSegment({ final: true }); + } catch (error) { + logger.error('Final Whisper transcription failed', { error: error.message }); + this.emit('error', `Whisper transcription failed: ${error.message}`); + } finally { + this._finalizeStop('Recording stopped'); + } + } + + _finalizeStop(statusMessage) { + this._cleanup(); + 
this.emit('recording-stopped'); + this.emit('status', statusMessage); + if (global.windowManager) { + global.windowManager.handleRecordingStopped(); } } _cleanup() { - // Clean up recognizer + if (this.segmentTimer) { + clearInterval(this.segmentTimer); + this.segmentTimer = null; + } + if (this.recognizer) { try { this.recognizer.close(); @@ -699,73 +780,50 @@ class SpeechService extends EventEmitter { this.recognizer = null; } - // Clean up audio config - if (this.audioConfig) { - try { - // Check if close method exists and call it appropriately - if (typeof this.audioConfig.close === 'function') { - try { - const closeResult = this.audioConfig.close(); - // If it returns a promise, handle it, otherwise just continue - if (closeResult && typeof closeResult.then === 'function') { - // It's a promise, but we don't need to wait for it in cleanup - closeResult.catch((error) => { - logger.error('Error closing audio config', { error: error.message }); - }); - } - } catch (closeError) { - logger.error('Error closing audio config', { error: closeError.message }); - } - } - } catch (error) { - logger.error('Error closing audio config', { error: error.message }); - } - this.audioConfig = null; - } - - // Stop audio recording - if (this.recording) { - try { - this.recording.stop(); - this.recording = null; - } catch (error) { - logger.error('Error stopping audio recording', { error: error.message }); - } - } - - // Clean up push stream - if (this.pushStream) { - try { - // Check if close method exists and call it appropriately - if (typeof this.pushStream.close === 'function') { - const closeResult = this.pushStream.close(); - // If it returns a promise, we can await it, otherwise just continue - if (closeResult && typeof closeResult.then === 'function') { - // It's a promise, but we don't need to wait for it in cleanup - closeResult.catch((error) => { - }); - } - } - } catch (error) { - logger.error('Error closing push stream', { error: error.message }); - } - 
this.pushStream = null; - } - - // Reset audio data logging flag - this._audioDataLogged = false; + if (this.audioConfig) { + try { + if (typeof this.audioConfig.close === 'function') { + this.audioConfig.close(); + } + } catch (error) { + logger.error('Error closing audio config', { error: error.message }); + } + this.audioConfig = null; + } + + if (this.recording) { + try { + this.recording.stop(); + } catch (error) { + logger.error('Error stopping audio recording', { error: error.message }); + } + this.recording = null; + } + + if (this.pushStream) { + try { + if (typeof this.pushStream.close === 'function') { + this.pushStream.close(); + } + } catch (error) { + logger.error('Error closing push stream', { error: error.message }); + } + this.pushStream = null; + } + + this.segmentBuffers = []; + this.segmentBytes = 0; + this.transcriptionInFlight = false; + this.pendingFlush = false; + this._audioDataLogged = false; } async recognizeFromFile(audioFilePath) { - if (!this.speechConfig) { - throw new Error('Speech service not initialized'); - } + if (this.provider === 'azure') { + if (!this.speechConfig) { + throw new Error('Speech service not initialized'); + } - const startTime = Date.now(); - - try { - // Validate file exists and is readable - const fs = require('fs'); if (!fs.existsSync(audioFilePath)) { throw new Error(`Audio file not found: ${audioFilePath}`); } @@ -773,206 +831,438 @@ class SpeechService extends EventEmitter { const audioConfig = sdk.AudioConfig.fromWavFileInput(audioFilePath); const recognizer = new sdk.SpeechRecognizer(this.speechConfig, audioConfig); - const result = await new Promise((resolve, reject) => { - const timeout = setTimeout(() => { - reject(new Error('File recognition timeout')); - recognizer.close(); - }, 30000); // 30 second timeout - + return await new Promise((resolve, reject) => { recognizer.recognizeOnceAsync( (result) => { - clearTimeout(timeout); - if (result.reason === sdk.ResultReason.RecognizedSpeech) { - 
resolve(result.text); - } else if (result.reason === sdk.ResultReason.NoMatch) { - resolve(''); // No speech detected in file - } else { - reject(new Error(`File recognition failed: ${result.reason}`)); - } + resolve(result.reason === sdk.ResultReason.RecognizedSpeech ? result.text : ''); recognizer.close(); audioConfig.close(); }, (error) => { - clearTimeout(timeout); reject(new Error(`File recognition error: ${error}`)); recognizer.close(); audioConfig.close(); } ); }); + } - logger.logPerformance('File speech recognition', startTime, { - filePath: audioFilePath, - textLength: result.length - }); + if (this.provider === 'whisper') { + return this._transcribeWhisperFile(audioFilePath); + } - return result; - } catch (error) { - logger.error('File recognition failed', { - filePath: audioFilePath, - error: error.message - }); - throw error; + throw new Error('Speech service not initialized'); + } + + async testConnection() { + if (this.provider === 'azure') { + if (!this.speechConfig) { + throw new Error('Speech service not initialized'); + } + + try { + const audioConfig = sdk.AudioConfig.fromDefaultMicrophoneInput(); + const recognizer = new sdk.SpeechRecognizer(this.speechConfig, audioConfig); + recognizer.close(); + audioConfig.close(); + return { success: true, message: 'Azure connection test successful' }; + } catch (error) { + return { success: false, message: error.message }; + } } + + if (this.provider === 'whisper') { + return { + success: !!this.whisperCommand, + message: this.whisperCommand ? 'Local Whisper CLI detected' : 'Local Whisper CLI not found' + }; + } + + return { success: false, message: 'Speech service not initialized' }; } getStatus() { return { + provider: this.provider, isRecording: this.isRecording, - isInitialized: !!this.speechConfig, + isInitialized: this.provider === 'azure' ? !!this.speechConfig : !!this.whisperCommand, sessionDuration: this.sessionStartTime ? 
Date.now() - this.sessionStartTime : 0, retryCount: this.retryCount, - config: config.get('speech.azure') || {} + effectiveSettings: { + speechProvider: this.provider, + azureKey: this._getSetting('azureKey') || '', + azureRegion: this._getSetting('azureRegion') || process.env.AZURE_SPEECH_REGION || '', + whisperCommand: this._getSetting('whisperCommand') || process.env.WHISPER_COMMAND || '', + whisperModelDir: this._getWhisperModelDir(), + whisperModel: this._getWhisperModel(), + whisperLanguage: this._getWhisperLanguage(), + whisperSegmentMs: String(this._getWhisperSegmentMs()) + }, + config: { + azure: config.get('speech.azure') || {}, + whisper: config.get('speech.whisper') || {}, + selectedProvider: this.provider + } }; } - // Test connection method - async testConnection() { - if (!this.speechConfig) { - throw new Error('Speech service not initialized'); - } - - try { - // Create a simple test recognizer - const audioConfig = sdk.AudioConfig.fromDefaultMicrophoneInput(); - const recognizer = new sdk.SpeechRecognizer(this.speechConfig, audioConfig); - - // Test by attempting to create the recognizer (this validates credentials) - recognizer.close(); - audioConfig.close(); - - return { success: true, message: 'Connection test successful' }; - } catch (error) { - return { success: false, message: error.message }; - } - } - - // Start capturing real microphone audio using node-record-lpcm16 - _startMicrophoneCapture() { - if (!this.pushStream) return; - - try { - // Check if recorder is available - if (!recorder || typeof recorder.record !== 'function') { - throw new Error('node-record-lpcm16 not available or not properly installed'); - } - - // Configure audio recording with error handling - this.recording = recorder.record({ - sampleRateHertz: 16000, // Azure Speech SDK prefers 16kHz - threshold: 0, // No silence threshold - verbose: false, // Quiet logging - recordProgram: 'sox', // Try 'sox' first (most common on macOS) - silence: '10.0s' // Longer silence 
threshold - }); - - if (!this.recording) { - throw new Error('Failed to create audio recording instance'); - } - - // Add error handler for the recording stream before using it - this.recording.stream().on('error', (error) => { - logger.error('Audio recording stream error', { error: error.message }); - - // Don't emit error immediately, try to recover - this._handleAudioError(error); - }); - - // Pipe audio data to Azure Speech SDK - this.recording.stream().on('data', (chunk) => { - if (this.pushStream && this.isRecording) { - try { - this.pushStream.write(chunk); - // Console log only first few chunks to avoid spam - if (!this._audioDataLogged) { - this._audioDataLogged = true; - } - } catch (error) { - } - } - }); - - } catch (error) { - logger.error('Failed to start microphone capture', { error: error.message, stack: error.stack }); - - // Fall back to no audio capture (Azure SDK will still work without audio) - this.emit('error', `Microphone capture failed: ${error.message}. Speech recognition may not work properly.`); - } - } - - // Handle audio recording errors with recovery attempts - _handleAudioError(error) { - - // Try to restart recording with different program - if (this.recording) { - try { - this.recording.stop(); - } catch (stopError) { - } - this.recording = null; - } - - // Try with different recording program - setTimeout(() => { - if (this.isRecording) { - this._startMicrophoneCaptureWithFallback(); - } - }, 1000); - } - - // Try microphone capture with different programs as fallback - _startMicrophoneCaptureWithFallback() { - const programs = ['sox', 'rec', 'arecord']; - let currentProgramIndex = 0; - - const tryNextProgram = () => { - if (currentProgramIndex >= programs.length) { - this.emit('error', 'Could not start microphone capture with any audio program'); - return; - } - - const program = programs[currentProgramIndex]; - - try { - this.recording = recorder.record({ - sampleRateHertz: 16000, - threshold: 0, - verbose: false, - 
recordProgram: program, - silence: '10.0s' - }); - - this.recording.stream().on('error', (error) => { - currentProgramIndex++; - tryNextProgram(); - }); - - this.recording.stream().on('data', (chunk) => { - if (this.pushStream && this.isRecording) { - try { - this.pushStream.write(chunk); - if (!this._audioDataLogged) { - this._audioDataLogged = true; - } - } catch (error) { - logger.error('Error writing audio data', { error: error.message }); - } - } - }); - } catch (error) { - logger.error(`${program} configuration failed`, { error: error.message }); - currentProgramIndex++; - tryNextProgram(); - } - }; - - tryNextProgram(); - } - - // Expose availability to UI isAvailable() { - return !!this.speechConfig && !!this.available; + if (this.provider === 'azure') { + return !!this.speechConfig && !!this.available; + } + + if (this.provider === 'whisper') { + return !!this.whisperCommand && !!this.available; + } + + return false; + } + + updateSettings(settings = {}) { + const speechKeys = ['speechProvider', 'azureKey', 'azureRegion', 'whisperCommand', 'whisperModelDir', 'whisperModel', 'whisperLanguage', 'whisperSegmentMs']; + let changed = false; + + for (const key of speechKeys) { + if (Object.prototype.hasOwnProperty.call(settings, key)) { + this.runtimeSettings[key] = settings[key]; + changed = true; + } + } + + if (changed) { + this.initializeClient(); + } + + return this.getStatus(); + } + + _getConfiguredProvider() { + const provider = String(this._getSetting('speechProvider') || process.env.SPEECH_PROVIDER || '').trim().toLowerCase(); + + if (provider === 'azure' || provider === 'whisper') { + return provider; + } + + const hasAzure = !!((this._getSetting('azureKey') || process.env.AZURE_SPEECH_KEY) && + (this._getSetting('azureRegion') || process.env.AZURE_SPEECH_REGION)); + + if (hasAzure) { + return 'azure'; + } + + return 'whisper'; + } + + _getWhisperModel() { + return this._getSetting('whisperModel') || process.env.WHISPER_MODEL || 
config.get('speech.whisper.model') || 'base'; + } + + _getWhisperModelDir() { + return this._getSetting('whisperModelDir') || process.env.WHISPER_MODEL_DIR || ''; + } + + _getWhisperLanguage() { + return this._getSetting('whisperLanguage') || process.env.WHISPER_LANGUAGE || config.get('speech.whisper.language') || 'en'; + } + + _getWhisperSegmentMs() { + const rawValue = this._getSetting('whisperSegmentMs') || process.env.WHISPER_SEGMENT_MS || config.get('speech.whisper.segmentMs') || 4000; + const parsed = Number(rawValue); + return Number.isFinite(parsed) ? Math.max(2000, parsed) : 4000; + } + + _getSetting(key) { + const value = this.runtimeSettings[key]; + return value === '' ? null : value; + } + + _resolveWhisperCommand() { + const configured = this._getSetting('whisperCommand') || process.env.WHISPER_COMMAND; + const candidates = []; + + if (configured) { + candidates.push(...this._expandConfiguredWhisperCandidates(configured)); + } + + candidates.push({ command: 'whisper', baseArgs: [] }); + candidates.push({ command: 'whisper.exe', baseArgs: [] }); + candidates.push({ command: 'py', baseArgs: ['-3', '-m', 'whisper'] }); + candidates.push({ command: 'python3', baseArgs: ['-m', 'whisper'] }); + candidates.push({ command: 'python', baseArgs: ['-m', 'whisper'] }); + + for (const candidate of candidates) { + if (!candidate || !candidate.command) { + continue; + } + + const probe = spawnSync(candidate.command, [...candidate.baseArgs, '--help'], { + encoding: 'utf8', + timeout: 5000 + }); + + const output = `${probe.stdout || ''}\n${probe.stderr || ''}`; + if (!probe.error && probe.status === 0 && !output.includes('No module named whisper')) { + return candidate; + } + } + + return null; + } + + _expandConfiguredWhisperCandidates(rawCommand) { + const parsed = this._parseCommand(rawCommand); + if (!parsed) { + return []; + } + + const candidates = [parsed]; + const resolvedPath = path.resolve(parsed.command); + + if (resolvedPath !== parsed.command) { + 
candidates.push({ command: resolvedPath, baseArgs: parsed.baseArgs }); + } + + if (process.platform === 'win32') { + if (!/\.(exe|cmd|bat)$/i.test(parsed.command)) { + candidates.push({ command: `${parsed.command}.exe`, baseArgs: parsed.baseArgs }); + candidates.push({ command: `${parsed.command}.cmd`, baseArgs: parsed.baseArgs }); + candidates.push({ command: `${resolvedPath}.exe`, baseArgs: parsed.baseArgs }); + candidates.push({ command: `${resolvedPath}.cmd`, baseArgs: parsed.baseArgs }); + } + } + + return candidates; + } + + _parseCommand(rawCommand) { + const parts = String(rawCommand || '').trim().split(/\s+/).filter(Boolean); + if (parts.length === 0) { + return null; + } + + return { + command: parts[0], + baseArgs: parts.slice(1) + }; + } + + _startMicrophoneCapture() { + if (!recorder || typeof recorder.record !== 'function') { + this.emit('error', 'Local microphone capture dependency is missing. Run npm install to restore speech recording support.'); + return; + } + + this._startMicrophoneCaptureWithFallback(['sox', 'rec', 'arecord']); + } + + _startMicrophoneCaptureWithFallback(programs) { + const queue = [...programs]; + + const tryNextProgram = () => { + const program = queue.shift(); + if (!program) { + this.emit('error', 'Could not start microphone capture with any audio program'); + return; + } + + try { + this.recording = recorder.record({ + sampleRateHertz: 16000, + channels: 1, + threshold: 0, + verbose: false, + recordProgram: program, + silence: '10.0s' + }); + + const stream = this.recording.stream(); + this.audioProgram = program; + + stream.on('error', (error) => { + logger.error('Audio recording stream error', { error: error.message, program }); + if (this.recording) { + try { + this.recording.stop(); + } catch (stopError) { + logger.error('Error stopping failed recording program', { error: stopError.message }); + } + this.recording = null; + } + + if (this.isRecording) { + tryNextProgram(); + } + }); + + stream.on('data', (chunk) => { + 
this._handleAudioChunk(chunk); + }); + } catch (error) { + logger.error('Failed to start microphone capture program', { program, error: error.message }); + tryNextProgram(); + } + }; + + tryNextProgram(); + } + + _handleAudioChunk(chunk) { + if (!chunk || !chunk.length || !this.isRecording) { + return; + } + + if (this.provider === 'azure' && this.pushStream) { + try { + this.pushStream.write(chunk); + } catch (error) { + logger.error('Error writing audio data to Azure push stream', { error: error.message }); + } + return; + } + + if (this.provider === 'whisper') { + this.segmentBuffers.push(Buffer.from(chunk)); + this.segmentBytes += chunk.length; + } + } + + async _flushWhisperSegment({ final }) { + if (this.transcriptionInFlight) { + this.pendingFlush = this.pendingFlush || final; + return; + } + + if (!this.segmentBytes) { + return; + } + + const audioBuffer = Buffer.concat(this.segmentBuffers, this.segmentBytes); + this.segmentBuffers = []; + this.segmentBytes = 0; + + this.transcriptionInFlight = true; + + try { + const transcript = await this._transcribeWhisperBuffer(audioBuffer); + if (transcript && transcript.trim()) { + this.emit('transcription', transcript.trim()); + } + } finally { + this.transcriptionInFlight = false; + + if (this.pendingFlush) { + const shouldRunFinal = this.pendingFlush; + this.pendingFlush = false; + await this._flushWhisperSegment({ final: shouldRunFinal }); + } + } + } + + async _transcribeWhisperBuffer(audioBuffer) { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'opencluely-whisper-')); + const audioFilePath = path.join(tempDir, 'segment.wav'); + + try { + fs.writeFileSync(audioFilePath, this._createWavBuffer(audioBuffer)); + return await this._transcribeWhisperFile(audioFilePath); + } finally { + this._removeTempDir(tempDir); + } + } + + async _transcribeWhisperFile(audioFilePath) { + if (!this.whisperCommand) { + throw new Error('Local Whisper CLI not configured'); + } + + const outputDir = 
fs.mkdtempSync(path.join(os.tmpdir(), 'opencluely-whisper-out-')); + const args = [ + ...this.whisperCommand.baseArgs, + audioFilePath, + '--model', this._getWhisperModel(), + '--language', this._getWhisperLanguage(), + '--task', 'transcribe', + '--output_format', 'txt', + '--output_dir', outputDir, + '--verbose', 'False', + '--fp16', 'False' + ]; + + if (this._getWhisperModelDir()) { + args.push('--model_dir', this._getWhisperModelDir()); + } + + try { + await new Promise((resolve, reject) => { + const child = spawn(this.whisperCommand.command, args, { + stdio: ['ignore', 'pipe', 'pipe'] + }); + + let stderr = ''; + child.stderr.on('data', (chunk) => { + stderr += chunk.toString(); + }); + + child.on('error', (error) => { + reject(error); + }); + + child.on('close', (code) => { + if (code === 0) { + resolve(); + return; + } + + reject(new Error(stderr.trim() || `Whisper exited with code ${code}`)); + }); + }); + + const transcriptPath = path.join(outputDir, `${path.parse(audioFilePath).name}.txt`); + if (!fs.existsSync(transcriptPath)) { + return ''; + } + + return fs.readFileSync(transcriptPath, 'utf8').trim(); + } finally { + this._removeTempDir(outputDir); + } + } + + _createWavBuffer(rawPcmBuffer) { + const header = Buffer.alloc(44); + const sampleRate = 16000; + const channels = 1; + const bitsPerSample = 16; + const byteRate = sampleRate * channels * (bitsPerSample / 8); + const blockAlign = channels * (bitsPerSample / 8); + + header.write('RIFF', 0); + header.writeUInt32LE(36 + rawPcmBuffer.length, 4); + header.write('WAVE', 8); + header.write('fmt ', 12); + header.writeUInt32LE(16, 16); + header.writeUInt16LE(1, 20); + header.writeUInt16LE(channels, 22); + header.writeUInt32LE(sampleRate, 24); + header.writeUInt32LE(byteRate, 28); + header.writeUInt16LE(blockAlign, 32); + header.writeUInt16LE(bitsPerSample, 34); + header.write('data', 36); + header.writeUInt32LE(rawPcmBuffer.length, 40); + + return Buffer.concat([header, rawPcmBuffer]); + } + + 
_removeTempDir(tempDir) { + try { + fs.rmSync(tempDir, { recursive: true, force: true }); + } catch (error) { + logger.error('Failed to remove Whisper temp directory', { + tempDir, + error: error.message + }); + } } } -module.exports = new SpeechService(); \ No newline at end of file +module.exports = new SpeechService(); diff --git a/src/styles/common.css b/src/styles/common.css index 478f715..2db6f85 100644 --- a/src/styles/common.css +++ b/src/styles/common.css @@ -1,7 +1,7 @@ /* Common Styles for OpenCluely UI Components */ /* Font imports */ -@import url('https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css'); +@import url('../../node_modules/@fortawesome/fontawesome-free/css/all.min.css'); /* Base styles */ body { @@ -576,4 +576,4 @@ body { .hide-scrollbar { scrollbar-width: none; -ms-overflow-style: none; -} \ No newline at end of file +} diff --git a/src/ui/settings-window.js b/src/ui/settings-window.js index 1062c3c..2a7047e 100644 --- a/src/ui/settings-window.js +++ b/src/ui/settings-window.js @@ -6,8 +6,13 @@ document.addEventListener('DOMContentLoaded', () => { // Get DOM elements const closeButton = document.getElementById('closeButton'); const quitButton = document.getElementById('quitButton'); + const speechProviderSelect = document.getElementById('speechProvider'); const azureKeyInput = document.getElementById('azureKey'); const azureRegionInput = document.getElementById('azureRegion'); + const whisperCommandInput = document.getElementById('whisperCommand'); + const whisperModelInput = document.getElementById('whisperModel'); + const whisperLanguageInput = document.getElementById('whisperLanguage'); + const whisperSegmentMsInput = document.getElementById('whisperSegmentMs'); const geminiKeyInput = document.getElementById('geminiKey'); const windowGapInput = document.getElementById('windowGap'); const codingLanguageSelect = document.getElementById('codingLanguage'); @@ -66,8 +71,13 @@ document.addEventListener('DOMContentLoaded', 
() => { // Function to load settings into UI const loadSettingsIntoUI = (settings) => { + if (settings.speechProvider && speechProviderSelect) speechProviderSelect.value = settings.speechProvider; if (settings.azureKey && azureKeyInput) azureKeyInput.value = settings.azureKey; if (settings.azureRegion && azureRegionInput) azureRegionInput.value = settings.azureRegion; + if (settings.whisperCommand && whisperCommandInput) whisperCommandInput.value = settings.whisperCommand; + if (settings.whisperModel && whisperModelInput) whisperModelInput.value = settings.whisperModel; + if (settings.whisperLanguage && whisperLanguageInput) whisperLanguageInput.value = settings.whisperLanguage; + if (settings.whisperSegmentMs && whisperSegmentMsInput) whisperSegmentMsInput.value = settings.whisperSegmentMs; if (settings.geminiKey && geminiKeyInput) geminiKeyInput.value = settings.geminiKey; if (settings.windowGap && windowGapInput) windowGapInput.value = settings.windowGap; @@ -90,6 +100,8 @@ document.addEventListener('DOMContentLoaded', () => { } }); } + + updateSpeechFieldStates(); }; // Load settings when window opens @@ -115,8 +127,13 @@ document.addEventListener('DOMContentLoaded', () => { // Save settings helper function const saveSettings = () => { const settings = {}; + if (speechProviderSelect) settings.speechProvider = speechProviderSelect.value; if (azureKeyInput) settings.azureKey = azureKeyInput.value; if (azureRegionInput) settings.azureRegion = azureRegionInput.value; + if (whisperCommandInput) settings.whisperCommand = whisperCommandInput.value; + if (whisperModelInput) settings.whisperModel = whisperModelInput.value; + if (whisperLanguageInput) settings.whisperLanguage = whisperLanguageInput.value; + if (whisperSegmentMsInput) settings.whisperSegmentMs = whisperSegmentMsInput.value; if (geminiKeyInput) settings.geminiKey = geminiKeyInput.value; if (windowGapInput) settings.windowGap = windowGapInput.value; if (codingLanguageSelect) settings.codingLanguage = 
codingLanguageSelect.value; @@ -125,10 +142,29 @@ document.addEventListener('DOMContentLoaded', () => { window.api.send('save-settings', settings); }; + const updateSpeechFieldStates = () => { + const provider = speechProviderSelect ? speechProviderSelect.value : 'azure'; + const azureDisabled = provider !== 'azure'; + const whisperDisabled = provider !== 'whisper'; + + [azureKeyInput, azureRegionInput].forEach(input => { + if (input) input.disabled = azureDisabled; + }); + + [whisperCommandInput, whisperModelInput, whisperLanguageInput, whisperSegmentMsInput].forEach(input => { + if (input) input.disabled = whisperDisabled; + }); + }; + // Add event listeners for all inputs const inputs = [ + speechProviderSelect, azureKeyInput, azureRegionInput, + whisperCommandInput, + whisperModelInput, + whisperLanguageInput, + whisperSegmentMsInput, geminiKeyInput, windowGapInput ]; @@ -140,6 +176,13 @@ document.addEventListener('DOMContentLoaded', () => { } }); + if (speechProviderSelect) { + speechProviderSelect.addEventListener('change', () => { + updateSpeechFieldStates(); + saveSettings(); + }); + } + // Language selection handler if (codingLanguageSelect) { codingLanguageSelect.addEventListener('change', (e) => { @@ -163,6 +206,8 @@ document.addEventListener('DOMContentLoaded', () => { }); } + updateSpeechFieldStates(); + // Initialize icon grid with correct paths const initializeIconGrid = () => { if (!iconGrid) return;