diff --git a/.gitignore b/.gitignore index 3cd7f95..64fadbf 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,11 @@ Icon? build/ *.dmg +# sherpa-onnx build artifacts +vendor/sherpa-onnx-src/ +vendor/sherpa-onnx-build/ +vendor/sherpa-onnx.xcframework/ + # Misc *.swp *.swo diff --git a/README.md b/README.md index ee765dd..36ef1d6 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,20 @@ open TransFlow/TransFlow.xcodeproj 在 Xcode 中选择 TransFlow target,点击运行即可。 +### 本地 STT 开发环境(sherpa-onnx) + +如果你要开发本地离线 STT(Parakeet/Nemotron)能力,请先构建 sherpa-onnx XCFramework: + +```bash +./scripts/build-sherpa-onnx.sh +``` + +如果需要完整清理后重建(源码 + 构建产物): + +```bash +./scripts/build-sherpa-onnx.sh --clean --reclone +``` + ## 🚀 快速开始 1. 启动 TransFlow,授予麦克风权限 diff --git a/README_EN.md b/README_EN.md index feac692..57f8633 100644 --- a/README_EN.md +++ b/README_EN.md @@ -71,6 +71,20 @@ open TransFlow/TransFlow.xcodeproj Select the TransFlow target in Xcode and click Run. +### Local STT Developer Setup (sherpa-onnx) + +If you are developing the local on-device STT path (Parakeet/Nemotron), build the sherpa-onnx XCFramework first: + +```bash +./scripts/build-sherpa-onnx.sh +``` + +For a full clean rebuild of source + artifacts: + +```bash +./scripts/build-sherpa-onnx.sh --clean --reclone +``` + ## 🚀 Quick Start 1. 
Launch TransFlow and grant microphone permission diff --git a/TransFlow/TransFlow.xcodeproj/project.pbxproj b/TransFlow/TransFlow.xcodeproj/project.pbxproj index 2dd5480..f90c2cd 100644 --- a/TransFlow/TransFlow.xcodeproj/project.pbxproj +++ b/TransFlow/TransFlow.xcodeproj/project.pbxproj @@ -6,6 +6,10 @@ objectVersion = 77; objects = { +/* Begin PBXBuildFile section */ + AA00000200000002 /* sherpa-onnx.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = AA00000100000001 /* sherpa-onnx.xcframework */; }; +/* End PBXBuildFile section */ + /* Begin PBXContainerItemProxy section */ F4C5EE022F3638BE006D8A00 /* PBXContainerItemProxy */ = { isa = PBXContainerItemProxy; @@ -24,6 +28,7 @@ /* End PBXContainerItemProxy section */ /* Begin PBXFileReference section */ + AA00000100000001 /* sherpa-onnx.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = "sherpa-onnx.xcframework"; path = "../vendor/sherpa-onnx.xcframework"; sourceTree = ""; }; F4C5EDF42F3638BD006D8A00 /* TransFlow.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = TransFlow.app; sourceTree = BUILT_PRODUCTS_DIR; }; F4C5EE012F3638BE006D8A00 /* TransFlowTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = TransFlowTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; F4C5EE0B2F3638BE006D8A00 /* TransFlowUITests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = TransFlowUITests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; @@ -65,6 +70,7 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( + AA00000200000002 /* sherpa-onnx.xcframework in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -88,6 +94,7 @@ F4C5EDEB2F3638BD006D8A00 = { isa = PBXGroup; children = ( + AA00000100000001 /* sherpa-onnx.xcframework */, F4C5EDF62F3638BD006D8A00 /* TransFlow */, F4C5EE042F3638BE006D8A00 /* 
TransFlowTests */, F4C5EE0E2F3638BE006D8A00 /* TransFlowUITests */, @@ -421,7 +428,8 @@ "$(inherited)", "@executable_path/../Frameworks", ); - MARKETING_VERSION = 1.2.0; + MARKETING_VERSION = 1.2.0; + OTHER_LDFLAGS = "-lc++"; PRODUCT_BUNDLE_IDENTIFIER = com.cyron.TransFlow; PRODUCT_NAME = "$(TARGET_NAME)"; REGISTER_APP_GROUPS = YES; @@ -429,6 +437,7 @@ SWIFT_APPROACHABLE_CONCURRENCY = YES; SWIFT_DEFAULT_ACTOR_ISOLATION = MainActor; SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_OBJC_BRIDGING_HEADER = "TransFlow/TransFlow-Bridging-Header.h"; SWIFT_STRICT_CONCURRENCY = complete; SWIFT_UPCOMING_FEATURE_MEMBER_IMPORT_VISIBILITY = YES; SWIFT_VERSION = 6.0; @@ -455,7 +464,8 @@ "$(inherited)", "@executable_path/../Frameworks", ); - MARKETING_VERSION = 1.2.0; + MARKETING_VERSION = 1.2.0; + OTHER_LDFLAGS = "-lc++"; PRODUCT_BUNDLE_IDENTIFIER = com.cyron.TransFlow; PRODUCT_NAME = "$(TARGET_NAME)"; REGISTER_APP_GROUPS = YES; @@ -463,6 +473,7 @@ SWIFT_APPROACHABLE_CONCURRENCY = YES; SWIFT_DEFAULT_ACTOR_ISOLATION = MainActor; SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_OBJC_BRIDGING_HEADER = "TransFlow/TransFlow-Bridging-Header.h"; SWIFT_STRICT_CONCURRENCY = complete; SWIFT_UPCOMING_FEATURE_MEMBER_IMPORT_VISIBILITY = YES; SWIFT_VERSION = 6.0; diff --git a/TransFlow/TransFlow/Localizable.xcstrings b/TransFlow/TransFlow/Localizable.xcstrings index 9e2a72b..cc6fdc5 100644 --- a/TransFlow/TransFlow/Localizable.xcstrings +++ b/TransFlow/TransFlow/Localizable.xcstrings @@ -381,6 +381,57 @@ } } }, + "history.clear_all" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Clear All" + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "清除全部" + } + } + } + }, + "history.clear_all_confirm_message" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Are you sure you want to delete all transcription history? 
This cannot be undone." + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "确定要删除所有转录历史吗?此操作无法撤销。" + } + } + } + }, + "history.clear_all_confirm_title" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Clear All History" + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "清除所有历史" + } + } + } + }, "history.delete" : { "extractionState" : "manual", "localizations" : { @@ -568,155 +619,189 @@ } } }, - "history.clear_all" : { + "history.sessions" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Clear All" + "value" : "Sessions" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "清除全部" + "value" : "会话列表" } } } }, - "history.clear_all_confirm_message" : { + "history.transcriptions" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Are you sure you want to delete all transcription history? This cannot be undone." 
+ "value" : "Transcriptions" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "确定要删除所有转录历史吗?此操作无法撤销。" + "value" : "转录历史" } } } }, - "history.clear_all_confirm_title" : { + "language.en" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Clear All History" + "value" : "English" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "清除所有历史" + "value" : "English" } } } }, - "history.sessions" : { + "language.system" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Sessions" + "value" : "System" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "会话列表" + "value" : "跟随系统" } } } }, - "history.transcriptions" : { + "language.zh-Hans" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Transcriptions" + "value" : "简体中文" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "转录历史" + "value" : "简体中文" } } } }, - "model_alert.go_to_settings" : { + "menu.clear_history" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Go to Settings" + "value" : "Clear History" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "前往设置" + "value" : "清除历史" } } } }, - "model_alert.message" : { + "menu.export_srt" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "The speech recognition model for the selected language has not been downloaded yet. Please go to Settings to download it first." 
+ "value" : "Export SRT…" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "所选语言的语音识别模型尚未下载,请先前往设置页面下载。" + "value" : "导出 SRT…" } } } }, - "model_alert.title" : { + "model_action.download" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Model Not Ready" + "value" : "Download" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "模型未就绪" + "value" : "下载" } } } }, - "model_action.download" : { + "model_alert.go_to_settings" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Download" + "value" : "Go to Settings" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "下载" + "value" : "前往设置" + } + } + } + }, + "model_alert.message" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "The speech recognition model for the selected language has not been downloaded yet. Please go to Settings to download it first." 
+ } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "所选语言的语音识别模型尚未下载,请先前往设置页面下载。" + } + } + } + }, + "model_alert.title" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Model Not Ready" + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "模型未就绪" } } } @@ -891,342 +976,719 @@ } } }, - "language.en" : { + "session.cancel" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "English" + "value" : "Cancel" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "English" + "value" : "取消" } } } }, - "language.system" : { + "session.create" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "System" + "value" : "Create" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "跟随系统" + "value" : "创建" } } } }, - "language.zh-Hans" : { + "session.filename_placeholder" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "简体中文" + "value" : "Session filename" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "简体中文" + "value" : "会话文件名" } } } }, - "menu.clear_history" : { + "session.new_session" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Clear History" + "value" : "New Session" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "清除历史" + "value" : "新建会话" } } } }, - "menu.export_srt" : { + "settings.about" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Export SRT…" + "value" : "About" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "导出 SRT…" + "value" : "关于" } } } }, - "session.cancel" : { + "settings.appearance" : { 
"extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Cancel" + "value" : "Appearance" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "取消" + "value" : "外观" } } } }, - "session.create" : { + "settings.engine" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Create" + "value" : "Speech Recognition Engine" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "创建" + "value" : "语音识别引擎" } } } }, - "session.filename_placeholder" : { + "settings.engine.apple" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Session filename" + "value" : "Apple Speech" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "会话文件名" + "value" : "Apple 语音" } } } }, - "session.new_session" : { + "settings.engine.local" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "New Session" + "value" : "Local (On-Device)" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "新建会话" + "value" : "本地(设备端)" } } } }, - "settings.models_loading" : { + "settings.engine.parakeet" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Loading model information…" + "value" : "Parakeet (Local)" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "正在加载模型信息…" + "value" : "Parakeet(本地)" } } } }, - "settings.open_logs" : { + "settings.feedback" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Error Logs" + "value" : "Feedback" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "错误日志" + "value" : "反馈" } } } }, - "settings.open_logs_description" : { + "settings.feedback_description" : { "extractionState" : "manual", 
"localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Open logs folder for troubleshooting" + "value" : "Report issues or suggest features" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "打开日志文件夹以排查问题" + "value" : "报告问题或建议功能" } } } }, - "settings.speech_models" : { + "settings.general" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Speech Models" + "value" : "General" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "语音模型" + "value" : "通用" } } } }, - "settings.about" : { + "settings.language" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "About" + "value" : "Language" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "关于" + "value" : "语言" } } } }, - "settings.appearance" : { + "settings.local_model" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Appearance" + "value" : "Local Model" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "外观" + "value" : "本地模型" } } } }, - "settings.feedback" : { + "settings.local_model.nemotron" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Feedback" + "value" : "Nemotron Streaming 0.6B (int8)" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "反馈" + "value" : "Nemotron Streaming 0.6B(int8)" } } } }, - "settings.feedback_description" : { + "settings.local_model.parakeet" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Report issues or suggest features" + "value" : "Parakeet TDT 0.6B v2 (int8)" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "报告问题或建议功能" + "value" : "Parakeet TDT 0.6B v2(int8)" } } } }, - "settings.general" : { + 
"settings.model.action.cancel" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "General" + "value" : "Cancel" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "通用" + "value" : "取消" } } } }, - "settings.language" : { + "settings.model.action.resume" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Language" + "value" : "Resume" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "语言" + "value" : "继续" } } } }, - "settings.send_feedback" : { + "settings.model.delete" : { "extractionState" : "manual", "localizations" : { "en" : { "stringUnit" : { "state" : "translated", - "value" : "Send Feedback" + "value" : "Delete" } }, "zh-Hans" : { "stringUnit" : { "state" : "translated", - "value" : "发送反馈" + "value" : "删除" + } + } + } + }, + "settings.model.download" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Download" + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "下载" + } + } + } + }, + "settings.model.error.retry_failed %lld" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Download failed after %lld retries." + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "重试 %lld 次后下载失败。" + } + } + } + }, + "settings.model.error.validation_failed" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Model validation failed. Please try downloading again." 
+ } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "模型验证失败,请重新下载。" + } + } + } + }, + "settings.model.license_notice" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Parakeet TDT model by NVIDIA, licensed under CC-BY-4.0. Powered by sherpa-onnx." + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "Parakeet TDT 模型由 NVIDIA 提供,基于 CC-BY-4.0 许可证。由 sherpa-onnx 驱动。" + } + } + } + }, + "settings.model.license_notice.nemotron" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Nemotron Speech model by NVIDIA. Please follow the upstream model license terms." + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "Nemotron Speech 模型由 NVIDIA 提供,请遵循其上游模型许可条款。" + } + } + } + }, + "settings.model.license_notice.parakeet" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Parakeet TDT model by NVIDIA, licensed under CC-BY-4.0. Powered by sherpa-onnx." 
+ } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "Parakeet TDT 模型由 NVIDIA 提供,基于 CC-BY-4.0 许可证。由 sherpa-onnx 驱动。" + } + } + } + }, + "settings.model.manage" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Model Storage" + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "模型存储" + } + } + } + }, + "settings.model.parakeet_name" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Parakeet TDT 0.6B v2 (int8)" + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "Parakeet TDT 0.6B v2(int8)" + } + } + } + }, + "settings.model.progress.eta %@" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "ETA %@" + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "预计剩余 %@" + } + } + } + }, + "settings.model.progress.speed %@" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Speed %@" + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "速度 %@" + } + } + } + }, + "settings.model.progress.speed %@/s" : { + + }, + "settings.model.status.downloading %lld" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Downloading… %lld%%" + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "下载中… %lld%%" + } + } + } + }, + "settings.model.status.failed %@" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Failed: %@" + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "失败:%@" + } + } + } + }, + "settings.model.status.not_downloaded" : { + 
"extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Not downloaded (~631 MB)" + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "未下载(约 631 MB)" + } + } + } + }, + "settings.model.status.not_downloaded.nemotron" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Not downloaded (~663 MB)" + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "未下载(约 663 MB)" + } + } + } + }, + "settings.model.status.not_downloaded.parakeet" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Not downloaded (~631 MB)" + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "未下载(约 631 MB)" + } + } + } + }, + "settings.model.status.ready" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Ready" + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "就绪" + } + } + } + }, + "settings.model.status.resuming %lld" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Resuming… %lld%%" + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "继续下载… %lld%%" + } + } + } + }, + "settings.models_loading" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Loading model information…" + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "正在加载模型信息…" + } + } + } + }, + "settings.open_logs" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Error Logs" + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + 
"value" : "错误日志" + } + } + } + }, + "settings.open_logs_description" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Open logs folder for troubleshooting" + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "打开日志文件夹以排查问题" + } + } + } + }, + "settings.send_feedback" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Send Feedback" + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "发送反馈" + } + } + } + }, + "settings.speech_models" : { + "extractionState" : "manual", + "localizations" : { + "en" : { + "stringUnit" : { + "state" : "translated", + "value" : "Speech Models" + } + }, + "zh-Hans" : { + "stringUnit" : { + "state" : "translated", + "value" : "语音模型" } } } diff --git a/TransFlow/TransFlow/Models/AppSettings.swift b/TransFlow/TransFlow/Models/AppSettings.swift index 24b1350..ea3d941 100644 --- a/TransFlow/TransFlow/Models/AppSettings.swift +++ b/TransFlow/TransFlow/Models/AppSettings.swift @@ -73,6 +73,20 @@ final class AppSettings { } } + /// The selected speech recognition engine. + var selectedEngine: TranscriptionEngineKind { + didSet { + UserDefaults.standard.set(selectedEngine.rawValue, forKey: "selectedEngine") + } + } + + /// Selected local ASR model when `selectedEngine == .local`. + var selectedLocalModel: LocalTranscriptionModelKind { + didSet { + UserDefaults.standard.set(selectedLocalModel.rawValue, forKey: "selectedLocalModel") + } + } + /// The resolved locale used for SwiftUI environment. var locale: Locale @@ -84,6 +98,19 @@ final class AppSettings { let storedAppearance = UserDefaults.standard.string(forKey: "appAppearance") ?? "system" self.appAppearance = AppAppearance(rawValue: storedAppearance) ?? .system + let storedEngine = UserDefaults.standard.string(forKey: "selectedEngine") ?? 
"apple" + if storedEngine == "parakeetLocal" { + // Backward compatibility for previous engine key. + self.selectedEngine = .local + } else { + self.selectedEngine = TranscriptionEngineKind(rawValue: storedEngine) ?? .apple + } + + let storedLocalModel = UserDefaults.standard.string(forKey: "selectedLocalModel") + ?? LocalTranscriptionModelKind.parakeetOfflineInt8.rawValue + self.selectedLocalModel = LocalTranscriptionModelKind(rawValue: storedLocalModel) + ?? .parakeetOfflineInt8 + if let identifier = language.localeIdentifier { self.locale = Locale(identifier: identifier) } else { diff --git a/TransFlow/TransFlow/Models/TranscriptionModels.swift b/TransFlow/TransFlow/Models/TranscriptionModels.swift index e91c496..5fed409 100644 --- a/TransFlow/TransFlow/Models/TranscriptionModels.swift +++ b/TransFlow/TransFlow/Models/TranscriptionModels.swift @@ -1,4 +1,79 @@ -import Foundation +import SwiftUI + +/// Which speech-to-text backend to use. +enum TranscriptionEngineKind: String, CaseIterable, Identifiable, Sendable { + case apple = "apple" + case local = "local" + + var id: String { rawValue } + + var displayName: LocalizedStringKey { + switch self { + case .apple: "settings.engine.apple" + case .local: "settings.engine.local" + } + } +} + +/// Which local ASR model to use when the local engine is selected. 
+enum LocalTranscriptionModelKind: String, CaseIterable, Identifiable, Sendable { + case parakeetOfflineInt8 = "parakeetOfflineInt8" + case nemotronStreamingInt8 = "nemotronStreamingInt8" + + var id: String { rawValue } + + var displayName: LocalizedStringKey { + switch self { + case .parakeetOfflineInt8: "settings.local_model.parakeet" + case .nemotronStreamingInt8: "settings.local_model.nemotron" + } + } + + var licenseNoticeKey: LocalizedStringKey { + switch self { + case .parakeetOfflineInt8: + "settings.model.license_notice.parakeet" + case .nemotronStreamingInt8: + "settings.model.license_notice.nemotron" + } + } +} + +/// Status of the locally-downloaded Parakeet model. +enum LocalModelStatus: Equatable, Sendable { + /// Model files have not been downloaded yet. + case notDownloaded + /// Download is in progress. + case downloading(progress: Double) + /// Model is validated and ready to use. + case ready + /// Download or validation failed. + case failed(message: String) + + var isReady: Bool { + if case .ready = self { return true } + return false + } + + var isDownloading: Bool { + if case .downloading = self { return true } + return false + } +} + +/// Runtime details for an in-progress model download. +struct LocalModelDownloadDetail: Equatable, Sendable { + /// Bytes persisted so far. + let downloadedBytes: Int64 + /// Total expected bytes if known. + let totalBytes: Int64? + /// Instantaneous transfer speed in bytes/second if available. + let bytesPerSecond: Double? + /// Estimated remaining time in seconds if available. + let etaSeconds: Double? + /// Whether the current task resumed from previous partial data. + let isResuming: Bool +} /// A completed transcription sentence with timestamp and optional translation. 
struct TranscriptionSentence: Identifiable, Sendable { diff --git a/TransFlow/TransFlow/Services/LocalModelManager.swift b/TransFlow/TransFlow/Services/LocalModelManager.swift new file mode 100644 index 0000000..2114324 --- /dev/null +++ b/TransFlow/TransFlow/Services/LocalModelManager.swift @@ -0,0 +1,686 @@ +import Foundation + +/// Manages on-demand download, validation, and deletion of local ASR models. +@Observable +@MainActor +final class LocalModelManager { + static let shared = LocalModelManager() + + struct SupplementalDownload: Sendable { + let fileName: String + let url: URL + let minSize: Int64 + } + + struct LocalModelSpec: Sendable { + let kind: LocalTranscriptionModelKind + let directoryPath: String + let legacyDirectoryPaths: [String] + let archiveURL: URL + let requiredFiles: [String: Int64] + let supplementalDownloads: [SupplementalDownload] + let estimatedSizeBytes: Int64 + } + + // MARK: - Observable State + + private(set) var statuses: [LocalTranscriptionModelKind: LocalModelStatus] + private(set) var diskSizeBytesByModel: [LocalTranscriptionModelKind: Int64] + private(set) var downloadDetailsByModel: [LocalTranscriptionModelKind: LocalModelDownloadDetail] + + // MARK: - Private + + private var downloadTasks: [LocalTranscriptionModelKind: Task] = [:] + private var lastProgressSampleTimeByRole: [String: Date] = [:] + private var lastProgressSampleBytesByRole: [String: Int64] = [:] + + // MARK: - Constants + + /// Base directory for all app local models. + private static let modelsRoot: URL = { + let appSupport = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask).first! 
+ return appSupport.appending(path: "TransFlow/Models", directoryHint: .isDirectory) + }() + private static let resumeRootRelativePath = ".resume" + private static let stagingRootRelativePath = ".staging" + private static let maxDownloadRetries = 3 + + private static let specs: [LocalTranscriptionModelKind: LocalModelSpec] = [ + .parakeetOfflineInt8: LocalModelSpec( + kind: .parakeetOfflineInt8, + directoryPath: "Local/parakeet-tdt-0.6b-v2-int8", + // Backward compatibility with the previous path. + legacyDirectoryPaths: ["ParakeetTDT0.6Bv2/int8"], + archiveURL: URL(string: "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2")!, + requiredFiles: [ + "encoder.int8.onnx": 100_000_000, + "decoder.int8.onnx": 1_000_000, + "joiner.int8.onnx": 500_000, + "tokens.txt": 1_000, + "silero_vad.onnx": 500_000, + ], + supplementalDownloads: [ + SupplementalDownload( + fileName: "silero_vad.onnx", + url: URL(string: "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx")!, + minSize: 500_000 + ), + ], + estimatedSizeBytes: 631_000_000 + ), + .nemotronStreamingInt8: LocalModelSpec( + kind: .nemotronStreamingInt8, + directoryPath: "Local/nemotron-speech-streaming-en-0.6b-int8-2026-01-14", + legacyDirectoryPaths: [], + archiveURL: URL(string: "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemotron-speech-streaming-en-0.6b-int8-2026-01-14.tar.bz2")!, + requiredFiles: [ + "encoder.int8.onnx": 100_000_000, + "decoder.int8.onnx": 1_000_000, + "joiner.int8.onnx": 500_000, + "tokens.txt": 1_000, + ], + supplementalDownloads: [], + estimatedSizeBytes: 663_000_000 + ), + ] + + private init() { + statuses = Dictionary( + uniqueKeysWithValues: LocalTranscriptionModelKind.allCases.map { ($0, .notDownloaded) } + ) + diskSizeBytesByModel = Dictionary( + uniqueKeysWithValues: LocalTranscriptionModelKind.allCases.map { ($0, Int64(0)) } + ) + downloadDetailsByModel = 
[:] + checkAllStatuses() + } + + // MARK: - Public API + + /// Backward-compatible convenience for currently selected local model. + var status: LocalModelStatus { status(for: AppSettings.shared.selectedLocalModel) } + var diskSizeBytes: Int64 { diskSizeBytes(for: AppSettings.shared.selectedLocalModel) } + var modelDirectory: URL { modelDirectory(for: AppSettings.shared.selectedLocalModel) } + func checkStatus() { checkStatus(for: AppSettings.shared.selectedLocalModel) } + func download() { download(for: AppSettings.shared.selectedLocalModel) } + func cancelDownload() { cancelDownload(for: AppSettings.shared.selectedLocalModel) } + func delete() { delete(for: AppSettings.shared.selectedLocalModel) } + + func status(for kind: LocalTranscriptionModelKind) -> LocalModelStatus { + statuses[kind] ?? .notDownloaded + } + + func diskSizeBytes(for kind: LocalTranscriptionModelKind) -> Int64 { + diskSizeBytesByModel[kind] ?? 0 + } + + func downloadDetail(for kind: LocalTranscriptionModelKind) -> LocalModelDownloadDetail? { + downloadDetailsByModel[kind] + } + + func hasResumeData(for kind: LocalTranscriptionModelKind) -> Bool { + guard let spec = Self.specs[kind] else { return false } + if FileManager.default.fileExists(atPath: resumeDataURL(for: kind, role: "archive").path(percentEncoded: false)) { + return true + } + for item in spec.supplementalDownloads { + if FileManager.default.fileExists( + atPath: resumeDataURL(for: kind, role: "supplemental-\(item.fileName)").path(percentEncoded: false) + ) { + return true + } + } + return false + } + + /// Directory to use for loading the specified model. + /// If legacy assets are present and valid, they are preferred. 
+ func modelDirectory(for kind: LocalTranscriptionModelKind) -> URL { + if let readyDir = resolvedReadyDirectory(for: kind) { + return readyDir + } + return primaryDirectory(for: kind) + } + + func checkAllStatuses() { + for kind in LocalTranscriptionModelKind.allCases { + checkStatus(for: kind) + } + } + + /// Check whether all required model files are present and valid. + func checkStatus(for kind: LocalTranscriptionModelKind) { + if let readyDir = resolvedReadyDirectory(for: kind) { + statuses[kind] = .ready + diskSizeBytesByModel[kind] = computeDiskSize(at: readyDir) + downloadDetailsByModel[kind] = nil + return + } + + // Keep active download state if currently downloading. + if case .downloading = statuses[kind] { + return + } + statuses[kind] = .notDownloaded + diskSizeBytesByModel[kind] = 0 + downloadDetailsByModel[kind] = nil + } + + /// Download model archive (+ supplemental files if configured). No-op if already downloading. + func download(for kind: LocalTranscriptionModelKind) { + guard let spec = Self.specs[kind] else { return } + guard downloadTasks[kind] == nil else { return } + statuses[kind] = .downloading(progress: 0) + downloadDetailsByModel[kind] = nil + + downloadTasks[kind] = Task { + defer { downloadTasks[kind] = nil } + do { + try await performDownloadWithRetries(spec: spec, kind: kind) + checkStatus(for: kind) + if !status(for: kind).isReady { + statuses[kind] = .failed(message: String(localized: "settings.model.error.validation_failed")) + ErrorLogger.shared.log("Model validation failed after download: \(kind.rawValue)", source: "LocalModel") + } + } catch is CancellationError { + statuses[kind] = .notDownloaded + } catch { + let message: String + if isTransientDownloadError(error) { + message = String(localized: "settings.model.error.retry_failed \(Self.maxDownloadRetries)") + } else { + message = error.localizedDescription + } + statuses[kind] = .failed(message: message) + ErrorLogger.shared.log("Model download failed 
(\(kind.rawValue)): \(message)", source: "LocalModel") + } + if !(status(for: kind).isReady) { + downloadDetailsByModel[kind] = nil + } + cleanupProgressTracking(for: kind) + } + } + + /// Delete downloaded files for the specified model (primary + legacy paths). + func delete(for kind: LocalTranscriptionModelKind) { + downloadTasks[kind]?.cancel() + downloadTasks[kind] = nil + + let fm = FileManager.default + for dir in candidateDirectories(for: kind) { + try? fm.removeItem(at: dir) + } + clearAllResumeData(for: kind) + clearStagingDirectories(for: kind) + statuses[kind] = .notDownloaded + diskSizeBytesByModel[kind] = 0 + downloadDetailsByModel[kind] = nil + cleanupProgressTracking(for: kind) + } + + func cancelDownload(for kind: LocalTranscriptionModelKind) { + guard let task = downloadTasks[kind] else { return } + task.cancel() + statuses[kind] = .notDownloaded + downloadDetailsByModel[kind] = nil + cleanupProgressTracking(for: kind) + } + + // MARK: - Directory Helpers + + private func primaryDirectory(for kind: LocalTranscriptionModelKind) -> URL { + guard let spec = Self.specs[kind] else { return Self.modelsRoot } + return Self.modelsRoot.appending(path: spec.directoryPath, directoryHint: .isDirectory) + } + + private func candidateDirectories(for kind: LocalTranscriptionModelKind) -> [URL] { + guard let spec = Self.specs[kind] else { return [] } + var dirs: [URL] = [primaryDirectory(for: kind)] + dirs.append(contentsOf: spec.legacyDirectoryPaths.map { relativePath in + Self.modelsRoot.appending(path: relativePath, directoryHint: .isDirectory) + }) + return dirs + } + + private func resolvedReadyDirectory(for kind: LocalTranscriptionModelKind) -> URL? 
{ + guard let spec = Self.specs[kind] else { return nil } + for dir in candidateDirectories(for: kind) { + if isModelReady(at: dir, spec: spec) { + return dir + } + } + return nil + } + + private func isModelReady(at directory: URL, spec: LocalModelSpec) -> Bool { + let fm = FileManager.default + for (file, minSize) in spec.requiredFiles { + let url = directory.appending(path: file) + let filePath = url.path(percentEncoded: false) + guard fm.fileExists(atPath: filePath), + let attrs = try? fm.attributesOfItem(atPath: filePath), + let size = attrs[.size] as? Int64, + size >= minSize + else { + return false + } + } + return true + } + + // MARK: - Download Helpers + + private struct DownloadHTTPError: LocalizedError { + let statusCode: Int + var errorDescription: String? { + "HTTP status \(statusCode)" + } + } + + private func performDownloadWithRetries(spec: LocalModelSpec, kind: LocalTranscriptionModelKind) async throws { + var attempt = 1 + while true { + do { + try await performSingleDownloadAttempt(spec: spec, kind: kind) + clearAllResumeData(for: kind) + return + } catch is CancellationError { + throw CancellationError() + } catch { + let retryable = isTransientDownloadError(error) && attempt < Self.maxDownloadRetries + guard retryable else { throw error } + let delaySeconds = pow(2.0, Double(attempt - 1)) + ErrorLogger.shared.log( + "Retrying model download (\(kind.rawValue)) attempt \(attempt + 1)/\(Self.maxDownloadRetries) after \(delaySeconds)s", + source: "LocalModel" + ) + try await Task.sleep(nanoseconds: UInt64(delaySeconds * 1_000_000_000)) + attempt += 1 + } + } + } + + private func performSingleDownloadAttempt(spec: LocalModelSpec, kind: LocalTranscriptionModelKind) async throws { + let fm = FileManager.default + try fm.createDirectory(at: Self.modelsRoot, withIntermediateDirectories: true) + clearStagingDirectories(for: kind) + + let stagingDir = try makeStagingDirectory(for: kind) + var shouldCleanupStaging = true + defer { + if 
shouldCleanupStaging { + try? fm.removeItem(at: stagingDir) + } + } + + let archiveUpperBound = spec.supplementalDownloads.isEmpty ? 1.0 : 0.9 + let archiveTempURL = try await downloadFile( + from: spec.archiveURL, + modelKind: kind, + role: "archive", + progressRange: 0.0 ..< archiveUpperBound, + estimatedModelBytes: spec.estimatedSizeBytes + ) + defer { try? fm.removeItem(at: archiveTempURL) } + try await extractTarball(archiveTempURL, to: stagingDir) + + if !spec.supplementalDownloads.isEmpty { + for (index, item) in spec.supplementalDownloads.enumerated() { + let start = archiveUpperBound + + (Double(index) / Double(spec.supplementalDownloads.count)) * (1.0 - archiveUpperBound) + let end = archiveUpperBound + + (Double(index + 1) / Double(spec.supplementalDownloads.count)) * (1.0 - archiveUpperBound) + + let tempURL = try await downloadFile( + from: item.url, + modelKind: kind, + role: "supplemental-\(item.fileName)", + progressRange: start ..< end, + estimatedModelBytes: spec.estimatedSizeBytes + ) + defer { try? fm.removeItem(at: tempURL) } + + let dest = stagingDir.appending(path: item.fileName) + if fm.fileExists(atPath: dest.path(percentEncoded: false)) { + try fm.removeItem(at: dest) + } + try fm.moveItem(at: tempURL, to: dest) + } + } + + guard isModelReady(at: stagingDir, spec: spec) else { + throw NSError( + domain: "LocalModelManager", + code: -1, + userInfo: [NSLocalizedDescriptionKey: String(localized: "settings.model.error.validation_failed")] + ) + } + + let destination = primaryDirectory(for: kind) + try installAtomically(stagingDir: stagingDir, to: destination) + shouldCleanupStaging = false + statuses[kind] = .downloading(progress: 1.0) + } + + /// Download a single file with resume + progress reporting. + /// Returns a temporary local file URL owned by this process. 
+ private func downloadFile( + from url: URL, + modelKind: LocalTranscriptionModelKind, + role: String, + progressRange: Range, + estimatedModelBytes: Int64 + ) async throws -> URL { + let config = URLSessionConfiguration.default + config.waitsForConnectivity = true + config.timeoutIntervalForRequest = 60 + config.timeoutIntervalForResource = 4 * 60 * 60 + + let resumeData = loadResumeData(for: modelKind, role: role) + let isResuming = resumeData != nil + let resumeDataFileURL = resumeDataURL(for: modelKind, role: role) + let progressRoleKey = "\(modelKind.rawValue)::\(role)" + + let delegate = ModelDownloadSessionDelegate( + onProgress: { [weak self] _, totalBytesWritten, totalBytesExpected in + Task { @MainActor [weak self] in + self?.updateDownloadProgress( + modelKind: modelKind, + roleKey: progressRoleKey, + progressRange: progressRange, + written: totalBytesWritten, + expected: totalBytesExpected, + estimatedModelBytes: estimatedModelBytes, + isResuming: isResuming + ) + } + }, + onResumeData: { data in + Self.writeResumeData(data, to: resumeDataFileURL) + } + ) + + let queue = OperationQueue() + queue.maxConcurrentOperationCount = 1 + let session = URLSession(configuration: config, delegate: delegate, delegateQueue: queue) + defer { session.finishTasksAndInvalidate() } + + let task: URLSessionDownloadTask + if let resumeData { + task = session.downloadTask(withResumeData: resumeData) + } else { + task = session.downloadTask(with: url) + } + task.priority = URLSessionTask.highPriority + task.resume() + + do { + let completion = try await withTaskCancellationHandler { + try await delegate.waitForCompletion() + } onCancel: { + task.cancel(byProducingResumeData: { data in + guard let data else { return } + Self.writeResumeData(data, to: resumeDataFileURL) + }) + } + clearResumeData(for: modelKind, role: role) + + if let httpResponse = completion.response as? 
HTTPURLResponse, + !(200 ..< 300).contains(httpResponse.statusCode) { + throw DownloadHTTPError(statusCode: httpResponse.statusCode) + } + + return completion.tempFileURL + } catch is CancellationError { + throw CancellationError() + } catch { + throw error + } + } + + /// Extract a `.tar.bz2` archive, moving inner files into the destination directory. + private func extractTarball(_ tarURL: URL, to destination: URL) async throws { + // Extract to a temporary directory first + let tempDir = FileManager.default.temporaryDirectory + .appending(path: UUID().uuidString, directoryHint: .isDirectory) + try FileManager.default.createDirectory(at: tempDir, withIntermediateDirectories: true) + defer { try? FileManager.default.removeItem(at: tempDir) } + + let process = Process() + process.executableURL = URL(fileURLWithPath: "/usr/bin/tar") + process.arguments = ["xjf", tarURL.path(percentEncoded: false), "-C", tempDir.path(percentEncoded: false)] + try process.run() + process.waitUntilExit() + + guard process.terminationStatus == 0 else { + throw NSError( + domain: "LocalModelManager", + code: Int(process.terminationStatus), + userInfo: [NSLocalizedDescriptionKey: "tar extraction failed with status \(process.terminationStatus)"] + ) + } + + // The tarball extracts into a subdirectory; move the contained files into destination. + let fm = FileManager.default + let contents = try fm.contentsOfDirectory(at: tempDir, includingPropertiesForKeys: nil) + + let extractedDir = contents.first { url in + var isDir: ObjCBool = false + return fm.fileExists(atPath: url.path(percentEncoded: false), isDirectory: &isDir) && isDir.boolValue + } ?? 
tempDir + + let files = try fm.contentsOfDirectory(at: extractedDir, includingPropertiesForKeys: nil) + for file in files { + let destFile = destination.appending(path: file.lastPathComponent) + if fm.fileExists(atPath: destFile.path(percentEncoded: false)) { + try fm.removeItem(at: destFile) + } + try fm.moveItem(at: file, to: destFile) + } + } + + private func installAtomically(stagingDir: URL, to destination: URL) throws { + let fm = FileManager.default + try fm.createDirectory(at: destination.deletingLastPathComponent(), withIntermediateDirectories: true) + if fm.fileExists(atPath: destination.path(percentEncoded: false)) { + do { + _ = try fm.replaceItemAt(destination, withItemAt: stagingDir, backupItemName: nil, options: []) + return + } catch { + try fm.removeItem(at: destination) + try fm.moveItem(at: stagingDir, to: destination) + return + } + } + try fm.moveItem(at: stagingDir, to: destination) + } + + private func updateDownloadProgress( + modelKind: LocalTranscriptionModelKind, + roleKey: String, + progressRange: Range, + written: Int64, + expected: Int64, + estimatedModelBytes: Int64, + isResuming: Bool + ) { + let now = Date() + let previousTime = lastProgressSampleTimeByRole[roleKey] + let previousBytes = lastProgressSampleBytesByRole[roleKey] + lastProgressSampleTimeByRole[roleKey] = now + lastProgressSampleBytesByRole[roleKey] = written + + let bytesPerSecond: Double? + if let previousTime, let previousBytes { + let elapsed = now.timeIntervalSince(previousTime) + let deltaBytes = written - previousBytes + if elapsed >= 0.25, deltaBytes > 0 { + bytesPerSecond = Double(deltaBytes) / elapsed + } else { + bytesPerSecond = nil + } + } else { + bytesPerSecond = nil + } + + let clampedExpected = expected > 0 ? 
expected : nil + let fileProgress: Double + if let clampedExpected { + fileProgress = min(max(Double(written) / Double(clampedExpected), 0), 1) + } else { + fileProgress = 0 + } + + let overallProgress = min( + progressRange.upperBound, + max( + progressRange.lowerBound, + progressRange.lowerBound + fileProgress * (progressRange.upperBound - progressRange.lowerBound) + ) + ) + statuses[modelKind] = .downloading(progress: overallProgress) + + guard estimatedModelBytes > 0 else { + downloadDetailsByModel[modelKind] = LocalModelDownloadDetail( + downloadedBytes: written, + totalBytes: clampedExpected, + bytesPerSecond: bytesPerSecond, + etaSeconds: nil, + isResuming: isResuming + ) + return + } + + let overallDownloadedBytes = Int64(Double(estimatedModelBytes) * overallProgress) + let speed = bytesPerSecond + let eta: Double? + if let speed, speed > 0 { + eta = max(0, Double(estimatedModelBytes - overallDownloadedBytes) / speed) + } else { + eta = nil + } + downloadDetailsByModel[modelKind] = LocalModelDownloadDetail( + downloadedBytes: overallDownloadedBytes, + totalBytes: estimatedModelBytes, + bytesPerSecond: speed, + etaSeconds: eta, + isResuming: isResuming + ) + } + + private func isTransientDownloadError(_ error: Error) -> Bool { + if let error = error as? DownloadHTTPError { + switch error.statusCode { + case 408, 425, 429, 500 ... 
599: + return true + default: + return false + } + } + + let nsError = error as NSError + guard nsError.domain == NSURLErrorDomain else { + return false + } + let code = URLError.Code(rawValue: nsError.code) + switch code { + case .timedOut, + .networkConnectionLost, + .notConnectedToInternet, + .cannotConnectToHost, + .cannotFindHost, + .dnsLookupFailed, + .resourceUnavailable, + .internationalRoamingOff, + .callIsActive, + .dataNotAllowed, + .secureConnectionFailed: + return true + default: + return false + } + } + + // MARK: - Resume Data / Staging + + private func resumeRootDirectory() -> URL { + Self.modelsRoot.appending(path: Self.resumeRootRelativePath, directoryHint: .isDirectory) + } + + private func stagingRootDirectory() -> URL { + Self.modelsRoot.appending(path: Self.stagingRootRelativePath, directoryHint: .isDirectory) + } + + private func resumeDataURL(for kind: LocalTranscriptionModelKind, role: String) -> URL { + let safeRole = role.replacingOccurrences(of: "/", with: "_") + return resumeRootDirectory() + .appending(path: kind.rawValue, directoryHint: .isDirectory) + .appending(path: "\(safeRole).resume") + } + + private func loadResumeData(for kind: LocalTranscriptionModelKind, role: String) -> Data? { + let url = resumeDataURL(for: kind, role: role) + return try? Data(contentsOf: url) + } + + private func saveResumeData(_ data: Data, for kind: LocalTranscriptionModelKind, role: String) { + let url = resumeDataURL(for: kind, role: role) + Self.writeResumeData(data, to: url) + } + + private func clearResumeData(for kind: LocalTranscriptionModelKind, role: String) { + let url = resumeDataURL(for: kind, role: role) + try? FileManager.default.removeItem(at: url) + } + + private func clearAllResumeData(for kind: LocalTranscriptionModelKind) { + let dir = resumeRootDirectory().appending(path: kind.rawValue, directoryHint: .isDirectory) + try? 
FileManager.default.removeItem(at: dir) + } + + private func makeStagingDirectory(for kind: LocalTranscriptionModelKind) throws -> URL { + let dir = stagingRootDirectory() + .appending(path: "\(kind.rawValue)-\(UUID().uuidString)", directoryHint: .isDirectory) + try FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true) + return dir + } + + private func clearStagingDirectories(for kind: LocalTranscriptionModelKind) { + let root = stagingRootDirectory() + let fm = FileManager.default + guard let contents = try? fm.contentsOfDirectory(at: root, includingPropertiesForKeys: nil) else { return } + for item in contents where item.lastPathComponent.hasPrefix("\(kind.rawValue)-") { + try? fm.removeItem(at: item) + } + } + + nonisolated private static func writeResumeData(_ data: Data, to url: URL) { + try? FileManager.default.createDirectory(at: url.deletingLastPathComponent(), withIntermediateDirectories: true) + try? data.write(to: url, options: .atomic) + } + + private func cleanupProgressTracking(for kind: LocalTranscriptionModelKind) { + let prefix = "\(kind.rawValue)::" + lastProgressSampleTimeByRole = lastProgressSampleTimeByRole.filter { !$0.key.hasPrefix(prefix) } + lastProgressSampleBytesByRole = lastProgressSampleBytesByRole.filter { !$0.key.hasPrefix(prefix) } + } + + // MARK: - Disk Size + + private func computeDiskSize(at directory: URL) -> Int64 { + let fm = FileManager.default + guard let enumerator = fm.enumerator( + at: directory, + includingPropertiesForKeys: [.fileSizeKey], + options: [.skipsHiddenFiles] + ) else { return 0 } + + var total: Int64 = 0 + for case let url as URL in enumerator { + if let size = try? 
url.resourceValues(forKeys: [.fileSizeKey]).fileSize { + total += Int64(size) + } + } + return total + } +} diff --git a/TransFlow/TransFlow/Services/ModelDownloadSessionDelegate.swift b/TransFlow/TransFlow/Services/ModelDownloadSessionDelegate.swift new file mode 100644 index 0000000..643f93a --- /dev/null +++ b/TransFlow/TransFlow/Services/ModelDownloadSessionDelegate.swift @@ -0,0 +1,119 @@ +import Foundation + +/// Delegate bridge for URLSessionDownloadTask that exposes async completion +/// while reporting progress and resume data callbacks. +final class ModelDownloadSessionDelegate: NSObject, URLSessionDownloadDelegate, URLSessionTaskDelegate { + struct Completion: Sendable { + let tempFileURL: URL + let response: URLResponse + } + + private let onProgress: @Sendable (_ bytesWritten: Int64, _ totalBytesWritten: Int64, _ totalBytesExpected: Int64) -> Void + private let onResumeData: @Sendable (Data) -> Void + + private let lock = NSLock() + private var continuation: CheckedContinuation? + private var pendingResult: Result? + private var finishedLocation: URL? + private var finishedResponse: URLResponse? + private var finishedError: Error? 
+ + init( + onProgress: @escaping @Sendable (_ bytesWritten: Int64, _ totalBytesWritten: Int64, _ totalBytesExpected: Int64) -> Void, + onResumeData: @escaping @Sendable (Data) -> Void + ) { + self.onProgress = onProgress + self.onResumeData = onResumeData + } + + func waitForCompletion() async throws -> Completion { + try await withCheckedThrowingContinuation { [weak self] (continuation: CheckedContinuation) in + guard let self else { + continuation.resume(throwing: URLError(.cancelled)) + return + } + lock.lock() + if let pendingResult { + lock.unlock() + continuation.resume(with: pendingResult) + return + } + self.continuation = continuation + lock.unlock() + } + } + + private func complete(with result: Result) { + lock.lock() + if let continuation { + self.continuation = nil + lock.unlock() + continuation.resume(with: result) + return + } + pendingResult = result + lock.unlock() + } + + // MARK: - URLSessionDownloadDelegate + + func urlSession( + _ session: URLSession, + downloadTask: URLSessionDownloadTask, + didWriteData bytesWritten: Int64, + totalBytesWritten: Int64, + totalBytesExpectedToWrite: Int64 + ) { + onProgress(bytesWritten, totalBytesWritten, totalBytesExpectedToWrite) + } + + func urlSession(_ session: URLSession, downloadTask: URLSessionDownloadTask, didFinishDownloadingTo location: URL) { + let fm = FileManager.default + let ownedTempURL = fm.temporaryDirectory + .appending(path: "transflow-model-\(UUID().uuidString)") + do { + if fm.fileExists(atPath: ownedTempURL.path(percentEncoded: false)) { + try fm.removeItem(at: ownedTempURL) + } + // The system-owned download location is only guaranteed during this callback. 
+ try fm.moveItem(at: location, to: ownedTempURL) + lock.lock() + finishedLocation = ownedTempURL + finishedResponse = downloadTask.response + lock.unlock() + } catch { + lock.lock() + finishedError = error + lock.unlock() + } + } + + // MARK: - URLSessionTaskDelegate + + func urlSession(_ session: URLSession, task: URLSessionTask, didCompleteWithError error: Error?) { + if let error { + if let resumeData = (error as NSError).userInfo[NSURLSessionDownloadTaskResumeData] as? Data { + onResumeData(resumeData) + } + complete(with: .failure(error)) + return + } + + lock.lock() + let location = finishedLocation + let response = finishedResponse ?? task.response + let storedError = finishedError + lock.unlock() + + if let storedError { + complete(with: .failure(storedError)) + return + } + + guard let location, let response else { + complete(with: .failure(URLError(.unknown))) + return + } + complete(with: .success(Completion(tempFileURL: location, response: response))) + } +} diff --git a/TransFlow/TransFlow/Services/NemotronStreamingSpeechEngine.swift b/TransFlow/TransFlow/Services/NemotronStreamingSpeechEngine.swift new file mode 100644 index 0000000..1ac8acc --- /dev/null +++ b/TransFlow/TransFlow/Services/NemotronStreamingSpeechEngine.swift @@ -0,0 +1,74 @@ +import Foundation + +/// Local streaming speech-to-text engine using sherpa-onnx + Nemotron Speech Streaming EN 0.6B int8. +/// Emits partial updates while speaking and finalized sentences on endpoint detection. 
+nonisolated final class NemotronStreamingSpeechEngine: TranscriptionEngine, Sendable { + private let modelDirectory: URL + + init(modelDirectory: URL) { + self.modelDirectory = modelDirectory + } + + func processStream(_ audioStream: AsyncStream) -> AsyncStream { + let (events, continuation) = AsyncStream.makeStream( + bufferingPolicy: .bufferingNewest(128) + ) + let modelDir = self.modelDirectory + + Task.detached(priority: .userInitiated) { + do { + let recognizer = try SherpaOnnxOnlineRecognizerBridge( + encoderPath: modelDir.appending(path: "encoder.int8.onnx").path(percentEncoded: false), + decoderPath: modelDir.appending(path: "decoder.int8.onnx").path(percentEncoded: false), + joinerPath: modelDir.appending(path: "joiner.int8.onnx").path(percentEncoded: false), + tokensPath: modelDir.appending(path: "tokens.txt").path(percentEncoded: false), + numThreads: 2, + modelType: "nemo_transducer", + modelingUnit: "bpe" + ) + + var lastPartial = "" + + func emitPartialIfChanged(_ text: String) { + guard text != lastPartial else { return } + lastPartial = text + continuation.yield(.partial(text)) + } + + func emitFinalIfNeeded(_ text: String) { + let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) + guard !trimmed.isEmpty else { return } + continuation.yield(.sentenceComplete(TranscriptionSentence(timestamp: Date(), text: trimmed))) + lastPartial = "" + continuation.yield(.partial("")) + } + + for await chunk in audioStream { + recognizer.acceptWaveform(samples: chunk.samples) + recognizer.decodeWhileReady() + + let text = recognizer.currentText() + if recognizer.isEndpoint() { + emitFinalIfNeeded(text) + recognizer.reset() + } else { + emitPartialIfChanged(text) + } + } + + recognizer.inputFinished() + recognizer.decodeWhileReady() + emitFinalIfNeeded(recognizer.currentText()) + + } catch { + let message = "Nemotron engine error: \(error.localizedDescription)" + await MainActor.run { ErrorLogger.shared.log(message, source: "NemotronEngine") } + 
continuation.yield(.error(message)) + } + + continuation.finish() + } + + return events + } +} diff --git a/TransFlow/TransFlow/Services/ParakeetSpeechEngine.swift b/TransFlow/TransFlow/Services/ParakeetSpeechEngine.swift new file mode 100644 index 0000000..6456db1 --- /dev/null +++ b/TransFlow/TransFlow/Services/ParakeetSpeechEngine.swift @@ -0,0 +1,103 @@ +import Foundation + +/// Local speech-to-text engine using sherpa-onnx + Parakeet TDT 0.6B v2 (int8). +/// Uses Silero VAD to segment audio, then runs the offline recognizer on each speech segment. +/// Emits `.sentenceComplete` events when a speech segment is fully decoded. +nonisolated final class ParakeetSpeechEngine: TranscriptionEngine, Sendable { + private let modelDirectory: URL + + init(modelDirectory: URL) { + self.modelDirectory = modelDirectory + } + + func processStream(_ audioStream: AsyncStream) -> AsyncStream { + let (events, continuation) = AsyncStream.makeStream( + bufferingPolicy: .bufferingNewest(128) + ) + let modelDir = self.modelDirectory + + Task.detached(priority: .userInitiated) { + do { + // 1. Initialize offline recognizer + let recognizer = try SherpaOnnxOfflineRecognizerBridge( + encoderPath: modelDir.appending(path: "encoder.int8.onnx").path(percentEncoded: false), + decoderPath: modelDir.appending(path: "decoder.int8.onnx").path(percentEncoded: false), + joinerPath: modelDir.appending(path: "joiner.int8.onnx").path(percentEncoded: false), + tokensPath: modelDir.appending(path: "tokens.txt").path(percentEncoded: false), + numThreads: 2 + ) + + // 2. Initialize VAD + let vad = try SherpaOnnxVADBridge( + modelPath: modelDir.appending(path: "silero_vad.onnx").path(percentEncoded: false), + threshold: 0.5, + minSilenceDuration: 0.3, + minSpeechDuration: 0.25, + maxSpeechDuration: 30.0, + windowSize: 512, + bufferSizeInSeconds: 120.0 + ) + + // 3. 
Feed audio into VAD and decode completed speech segments + let windowSize = 512 // VAD expects 512-sample windows at 16kHz + var sampleBuffer: [Float] = [] + var readIndex = 0 + + func emitDetectedSegments() { + while vad.hasSegment { + let segmentSamples = vad.popFrontSamples() + guard !segmentSamples.isEmpty else { continue } + + let text = recognizer.decode(samples: segmentSamples) + if !text.isEmpty { + continuation.yield(.sentenceComplete( + TranscriptionSentence(timestamp: Date(), text: text) + )) + } + } + } + + for await chunk in audioStream { + sampleBuffer.append(contentsOf: chunk.samples) + + // Feed fixed-size windows without repeated front-removal copies. + while sampleBuffer.count - readIndex >= windowSize { + let endIndex = readIndex + windowSize + let window = Array(sampleBuffer[readIndex ..< endIndex]) + readIndex = endIndex + vad.acceptWaveform(samples: window) + emitDetectedSegments() + } + + // Compact occasionally to keep memory usage bounded. + if readIndex >= windowSize * 64 { + sampleBuffer.removeFirst(readIndex) + readIndex = 0 + } + } + + // Feed final partial frame (if any) padded with zeros before flush. + let remainingCount = sampleBuffer.count - readIndex + if remainingCount > 0 { + var tail = Array(sampleBuffer[readIndex...]) + tail.append(contentsOf: Array(repeating: Float(0), count: windowSize - remainingCount)) + vad.acceptWaveform(samples: tail) + emitDetectedSegments() + } + + // 4. Flush remaining audio on stream end. 
+ vad.flush() + emitDetectedSegments() + + } catch { + let message = "Parakeet engine error: \(error.localizedDescription)" + await MainActor.run { ErrorLogger.shared.log(message, source: "ParakeetEngine") } + continuation.yield(.error(message)) + } + + continuation.finish() + } + + return events + } +} diff --git a/TransFlow/TransFlow/Services/SherpaOnnxBridge.swift b/TransFlow/TransFlow/Services/SherpaOnnxBridge.swift new file mode 100644 index 0000000..543eca2 --- /dev/null +++ b/TransFlow/TransFlow/Services/SherpaOnnxBridge.swift @@ -0,0 +1,390 @@ +/// SherpaOnnxBridge.swift +/// Minimal Swift wrapper around the sherpa-onnx C API. +/// Adapted from https://github.com/k2-fsa/sherpa-onnx/blob/master/swift-api-examples/SherpaOnnx.swift +/// Only includes the APIs needed for offline recognition + VAD. + +import Foundation + +// MARK: - C String Helper + +nonisolated private func toCPointer(_ s: String) -> UnsafePointer<CChar>! { + (s as NSString).utf8String.map { UnsafePointer($0) } +} + +// MARK: - Errors + +enum SherpaOnnxBridgeError: LocalizedError { + case recognizerCreationFailed + case onlineRecognizerCreationFailed + case onlineStreamCreationFailed + case vadCreationFailed + + var errorDescription: String? { + switch self { + case .recognizerCreationFailed: + "Failed to create SherpaOnnxOfflineRecognizer" + case .onlineRecognizerCreationFailed: + "Failed to create SherpaOnnxOnlineRecognizer" + case .onlineStreamCreationFailed: + "Failed to create SherpaOnnxOnlineStream" + case .vadCreationFailed: + "Failed to create SherpaOnnxVoiceActivityDetector" + } + } +} + +// MARK: - Offline Recognizer + +/// Swift wrapper for `SherpaOnnxOfflineRecognizer` (non-streaming ASR). +nonisolated final class SherpaOnnxOfflineRecognizerBridge: @unchecked Sendable { + private let recognizer: OpaquePointer + + /// Create an offline recognizer for the NeMo Parakeet TDT transducer model. 
+ /// + /// - Parameters: + /// - encoderPath: Path to encoder.int8.onnx + /// - decoderPath: Path to decoder.int8.onnx + /// - joinerPath: Path to joiner.int8.onnx + /// - tokensPath: Path to tokens.txt + /// - numThreads: Number of CPU threads (default: 2) + init( + encoderPath: String, + decoderPath: String, + joinerPath: String, + tokensPath: String, + numThreads: Int = 2 + ) throws { + let transducer = SherpaOnnxOfflineTransducerModelConfig( + encoder: toCPointer(encoderPath), + decoder: toCPointer(decoderPath), + joiner: toCPointer(joinerPath) + ) + + let modelConfig = SherpaOnnxOfflineModelConfig( + transducer: transducer, + paraformer: SherpaOnnxOfflineParaformerModelConfig(model: toCPointer("")), + nemo_ctc: SherpaOnnxOfflineNemoEncDecCtcModelConfig(model: toCPointer("")), + whisper: SherpaOnnxOfflineWhisperModelConfig( + encoder: toCPointer(""), decoder: toCPointer(""), + language: toCPointer(""), task: toCPointer("transcribe"), + tail_paddings: -1 + ), + tdnn: SherpaOnnxOfflineTdnnModelConfig(model: toCPointer("")), + tokens: toCPointer(tokensPath), + num_threads: Int32(numThreads), + debug: 0, + provider: toCPointer("cpu"), + model_type: toCPointer("nemo_transducer"), + modeling_unit: toCPointer("cjkchar"), + bpe_vocab: toCPointer(""), + telespeech_ctc: toCPointer(""), + sense_voice: SherpaOnnxOfflineSenseVoiceModelConfig( + model: toCPointer(""), language: toCPointer(""), use_itn: 0 + ), + moonshine: SherpaOnnxOfflineMoonshineModelConfig( + preprocessor: toCPointer(""), encoder: toCPointer(""), + uncached_decoder: toCPointer(""), cached_decoder: toCPointer("") + ), + fire_red_asr: SherpaOnnxOfflineFireRedAsrModelConfig( + encoder: toCPointer(""), decoder: toCPointer("") + ), + dolphin: SherpaOnnxOfflineDolphinModelConfig(model: toCPointer("")), + zipformer_ctc: SherpaOnnxOfflineZipformerCtcModelConfig(model: toCPointer("")), + canary: SherpaOnnxOfflineCanaryModelConfig( + encoder: toCPointer(""), decoder: toCPointer(""), + src_lang: toCPointer("en"), 
tgt_lang: toCPointer("en"), use_pnc: 1 + ), + wenet_ctc: SherpaOnnxOfflineWenetCtcModelConfig(model: toCPointer("")), + omnilingual: SherpaOnnxOfflineOmnilingualAsrCtcModelConfig(model: toCPointer("")), + medasr: SherpaOnnxOfflineMedAsrCtcModelConfig(model: toCPointer("")), + funasr_nano: SherpaOnnxOfflineFunASRNanoModelConfig( + encoder_adaptor: toCPointer(""), llm: toCPointer(""), + embedding: toCPointer(""), tokenizer: toCPointer(""), + system_prompt: toCPointer(""), user_prompt: toCPointer(""), + max_new_tokens: 512, temperature: 1e-6, top_p: 0.8, seed: 42 + ) + ) + + let featConfig = SherpaOnnxFeatureConfig(sample_rate: 16000, feature_dim: 80) + let lmConfig = SherpaOnnxOfflineLMConfig(model: toCPointer(""), scale: 0.5) + let hr = SherpaOnnxHomophoneReplacerConfig( + dict_dir: toCPointer(""), lexicon: toCPointer(""), rule_fsts: toCPointer("") + ) + + var config = SherpaOnnxOfflineRecognizerConfig( + feat_config: featConfig, + model_config: modelConfig, + lm_config: lmConfig, + decoding_method: toCPointer("greedy_search"), + max_active_paths: 4, + hotwords_file: toCPointer(""), + hotwords_score: 1.5, + rule_fsts: toCPointer(""), + rule_fars: toCPointer(""), + blank_penalty: 0, + hr: hr + ) + + guard let ptr = SherpaOnnxCreateOfflineRecognizer(&config) else { + throw SherpaOnnxBridgeError.recognizerCreationFailed + } + self.recognizer = ptr + } + + deinit { + SherpaOnnxDestroyOfflineRecognizer(recognizer) + } + + /// Decode a batch of audio samples and return the transcription text. + /// + /// - Parameters: + /// - samples: Audio samples normalized to [-1, 1], 16 kHz mono. + /// - Returns: The recognized text (trimmed). 
+ func decode(samples: [Float]) -> String { + guard let stream = SherpaOnnxCreateOfflineStream(recognizer) else { + return "" + } + defer { SherpaOnnxDestroyOfflineStream(stream) } + + SherpaOnnxAcceptWaveformOffline(stream, 16000, samples, Int32(samples.count)) + SherpaOnnxDecodeOfflineStream(recognizer, stream) + + guard let resultPtr = SherpaOnnxGetOfflineStreamResult(stream) else { + return "" + } + defer { SherpaOnnxDestroyOfflineRecognizerResult(resultPtr) } + + guard let cstr = resultPtr.pointee.text else { return "" } + return String(cString: cstr).trimmingCharacters(in: .whitespacesAndNewlines) + } +} + +// MARK: - Online Recognizer + +/// Swift wrapper for `SherpaOnnxOnlineRecognizer` (streaming ASR). +nonisolated final class SherpaOnnxOnlineRecognizerBridge: @unchecked Sendable { + private let recognizer: OpaquePointer + private let stream: OpaquePointer + + init( + encoderPath: String, + decoderPath: String, + joinerPath: String, + tokensPath: String, + numThreads: Int = 2, + modelType: String = "nemo_transducer", + modelingUnit: String = "bpe" + ) throws { + let transducer = SherpaOnnxOnlineTransducerModelConfig( + encoder: toCPointer(encoderPath), + decoder: toCPointer(decoderPath), + joiner: toCPointer(joinerPath) + ) + let usesBPE = modelingUnit.caseInsensitiveCompare("bpe") == .orderedSame + // For bpe modeling units, sherpa-onnx expects bpe_vocab to be a valid file path. + // Nemotron int8 packages provide tokens.txt, which is the correct vocab source. + let bpeVocabPath = usesBPE ? 
tokensPath : "" + + let modelConfig = SherpaOnnxOnlineModelConfig( + transducer: transducer, + paraformer: SherpaOnnxOnlineParaformerModelConfig( + encoder: toCPointer(""), + decoder: toCPointer("") + ), + zipformer2_ctc: SherpaOnnxOnlineZipformer2CtcModelConfig(model: toCPointer("")), + tokens: toCPointer(tokensPath), + num_threads: Int32(numThreads), + provider: toCPointer("cpu"), + debug: 0, + model_type: toCPointer(modelType), + modeling_unit: toCPointer(modelingUnit), + bpe_vocab: toCPointer(bpeVocabPath), + tokens_buf: nil, + tokens_buf_size: 0, + nemo_ctc: SherpaOnnxOnlineNemoCtcModelConfig(model: toCPointer("")), + t_one_ctc: SherpaOnnxOnlineToneCtcModelConfig(model: toCPointer("")) + ) + + let featConfig = SherpaOnnxFeatureConfig(sample_rate: 16000, feature_dim: 80) + let ctcFstConfig = SherpaOnnxOnlineCtcFstDecoderConfig( + graph: toCPointer(""), + max_active: 3000 + ) + let hr = SherpaOnnxHomophoneReplacerConfig( + dict_dir: toCPointer(""), + lexicon: toCPointer(""), + rule_fsts: toCPointer("") + ) + + var config = SherpaOnnxOnlineRecognizerConfig( + feat_config: featConfig, + model_config: modelConfig, + decoding_method: toCPointer("greedy_search"), + max_active_paths: 4, + enable_endpoint: 1, + rule1_min_trailing_silence: 2.4, + rule2_min_trailing_silence: 1.2, + rule3_min_utterance_length: 20, + hotwords_file: toCPointer(""), + hotwords_score: 1.5, + ctc_fst_decoder_config: ctcFstConfig, + rule_fsts: toCPointer(""), + rule_fars: toCPointer(""), + blank_penalty: 0, + hotwords_buf: nil, + hotwords_buf_size: 0, + hr: hr + ) + + guard let recognizer = SherpaOnnxCreateOnlineRecognizer(&config) else { + throw SherpaOnnxBridgeError.onlineRecognizerCreationFailed + } + guard let stream = SherpaOnnxCreateOnlineStream(recognizer) else { + SherpaOnnxDestroyOnlineRecognizer(recognizer) + throw SherpaOnnxBridgeError.onlineStreamCreationFailed + } + + self.recognizer = recognizer + self.stream = stream + } + + deinit { + SherpaOnnxDestroyOnlineStream(stream) + 
SherpaOnnxDestroyOnlineRecognizer(recognizer) + } + + func acceptWaveform(samples: [Float]) { + SherpaOnnxOnlineStreamAcceptWaveform(stream, 16000, samples, Int32(samples.count)) + } + + func decodeWhileReady() { + while SherpaOnnxIsOnlineStreamReady(recognizer, stream) == 1 { + SherpaOnnxDecodeOnlineStream(recognizer, stream) + } + } + + func currentText() -> String { + guard let result = SherpaOnnxGetOnlineStreamResult(recognizer, stream) else { + return "" + } + defer { SherpaOnnxDestroyOnlineRecognizerResult(result) } + + guard let cstr = result.pointee.text else { return "" } + return String(cString: cstr).trimmingCharacters(in: .whitespacesAndNewlines) + } + + func inputFinished() { + SherpaOnnxOnlineStreamInputFinished(stream) + } + + func isEndpoint() -> Bool { + SherpaOnnxOnlineStreamIsEndpoint(recognizer, stream) != 0 + } + + func reset() { + SherpaOnnxOnlineStreamReset(recognizer, stream) + } +} + +// MARK: - Voice Activity Detector + +/// Swift wrapper for `SherpaOnnxVoiceActivityDetector` (Silero VAD). +nonisolated final class SherpaOnnxVADBridge: @unchecked Sendable { + private let vad: OpaquePointer + + /// Create a VAD using the Silero model. 
+ /// + /// - Parameters: + /// - modelPath: Path to silero_vad.onnx + /// - threshold: Speech detection threshold (default: 0.5) + /// - minSilenceDuration: Minimum silence to end a speech segment (seconds) + /// - minSpeechDuration: Minimum speech segment duration (seconds) + /// - maxSpeechDuration: Maximum speech segment before forced split (seconds) + /// - windowSize: Window size in samples (default: 512 for 16kHz) + /// - bufferSizeInSeconds: Circular buffer size (seconds) + init( + modelPath: String, + threshold: Float = 0.5, + minSilenceDuration: Float = 0.25, + minSpeechDuration: Float = 0.25, + maxSpeechDuration: Float = 30.0, + windowSize: Int = 512, + bufferSizeInSeconds: Float = 60.0 + ) throws { + let sileroConfig = SherpaOnnxSileroVadModelConfig( + model: toCPointer(modelPath), + threshold: threshold, + min_silence_duration: minSilenceDuration, + min_speech_duration: minSpeechDuration, + window_size: Int32(windowSize), + max_speech_duration: maxSpeechDuration + ) + + let tenVadConfig = SherpaOnnxTenVadModelConfig( + model: toCPointer(""), + threshold: 0.5, + min_silence_duration: 0.25, + min_speech_duration: 0.5, + window_size: 256, + max_speech_duration: 5.0 + ) + + var vadConfig = SherpaOnnxVadModelConfig( + silero_vad: sileroConfig, + sample_rate: 16000, + num_threads: 1, + provider: toCPointer("cpu"), + debug: 0, + ten_vad: tenVadConfig + ) + + guard let ptr = SherpaOnnxCreateVoiceActivityDetector(&vadConfig, bufferSizeInSeconds) else { + throw SherpaOnnxBridgeError.vadCreationFailed + } + self.vad = ptr + } + + deinit { + SherpaOnnxDestroyVoiceActivityDetector(vad) + } + + /// Feed audio samples into the VAD. + func acceptWaveform(samples: [Float]) { + SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, samples, Int32(samples.count)) + } + + /// Whether there are detected speech segments available. + var hasSegment: Bool { + SherpaOnnxVoiceActivityDetectorEmpty(vad) == 0 + } + + /// Whether speech is currently being detected. 
+ var isSpeechDetected: Bool { + SherpaOnnxVoiceActivityDetectorDetected(vad) != 0 + } + + /// Pop the front speech segment. Returns the audio samples of that segment. + func popFrontSamples() -> [Float] { + guard let p = SherpaOnnxVoiceActivityDetectorFront(vad) else { + return [] + } + defer { SherpaOnnxDestroySpeechSegment(p) } + + let n = Int(p.pointee.n) + guard n > 0, let samplesPtr = p.pointee.samples else { return [] } + let samples = Array(UnsafeBufferPointer(start: samplesPtr, count: n)) + + SherpaOnnxVoiceActivityDetectorPop(vad) + return samples + } + + /// Flush remaining audio through the VAD (call at end of stream). + func flush() { + SherpaOnnxVoiceActivityDetectorFlush(vad) + } + + /// Reset the VAD state. + func reset() { + SherpaOnnxVoiceActivityDetectorReset(vad) + } +} diff --git a/TransFlow/TransFlow/Services/SpeechEngine.swift b/TransFlow/TransFlow/Services/SpeechEngine.swift index 180ea90..5892ec1 100644 --- a/TransFlow/TransFlow/Services/SpeechEngine.swift +++ b/TransFlow/TransFlow/Services/SpeechEngine.swift @@ -1,9 +1,15 @@ import Speech @preconcurrency import AVFoundation -/// Uses macOS 26.0 SpeechAnalyzer + SpeechTranscriber for real-time transcription. +/// Common interface for all speech-to-text backends. +/// Each engine accepts 16kHz mono Float32 audio and emits transcription events. +protocol TranscriptionEngine: Sendable { + func processStream(_ audioStream: AsyncStream<AudioChunk>) -> AsyncStream<TranscriptionEvent> +} + +/// Apple Speech backend using macOS 26.0 SpeechAnalyzer + SpeechTranscriber. /// Accepts an AudioChunk stream (16kHz mono Float32), outputs TranscriptionEvent stream. 
-final class SpeechEngine: Sendable { +final class AppleSpeechEngine: TranscriptionEngine, Sendable { private let locale: Locale init(locale: Locale) { diff --git a/TransFlow/TransFlow/TransFlow-Bridging-Header.h b/TransFlow/TransFlow/TransFlow-Bridging-Header.h new file mode 100644 index 0000000..0e9edcc --- /dev/null +++ b/TransFlow/TransFlow/TransFlow-Bridging-Header.h @@ -0,0 +1,13 @@ +// +// TransFlow-Bridging-Header.h +// TransFlow +// +// Bridging header to expose sherpa-onnx C API to Swift. +// + +#ifndef TransFlow_Bridging_Header_h +#define TransFlow_Bridging_Header_h + +#include "sherpa-onnx/c-api/c-api.h" + +#endif /* TransFlow_Bridging_Header_h */ diff --git a/TransFlow/TransFlow/ViewModels/TransFlowViewModel.swift b/TransFlow/TransFlow/ViewModels/TransFlowViewModel.swift index 2b7b097..3665b1b 100644 --- a/TransFlow/TransFlow/ViewModels/TransFlowViewModel.swift +++ b/TransFlow/TransFlow/ViewModels/TransFlowViewModel.swift @@ -35,16 +35,19 @@ final class TransFlowViewModel { /// Translation service (observed separately for SwiftUI binding) let translationService = TranslationService() - /// Speech model manager for asset checking and downloading. + /// Apple Speech model manager for asset checking and downloading. let modelManager = SpeechModelManager.shared + /// Local model manager for download/validation. + let localModelManager = LocalModelManager.shared + /// JSONL persistence store for the current session. let jsonlStore = JSONLStore() // MARK: - Private private let audioCaptureService = AudioCaptureService() - private var speechEngine: SpeechEngine? + private var speechEngine: (any TranscriptionEngine)? private var stopAudioCapture: (@Sendable () -> Void)? private var listeningTask: Task? private var audioLevelTask: Task? 
@@ -73,30 +76,44 @@ final class TransFlowViewModel { // Load available apps await refreshAvailableApps() - // Check model status for the default transcription language - await modelManager.checkCurrentStatus(for: selectedLanguage) - - // Auto-download model if not installed - if !modelManager.currentModelStatus.isReady { - await modelManager.ensureModelReady(for: selectedLanguage) + // Engine-specific initialization + let engine = AppSettings.shared.selectedEngine + if engine == .apple { + // Check model status for the default transcription language + await modelManager.checkCurrentStatus(for: selectedLanguage) + if !modelManager.currentModelStatus.isReady { + await modelManager.ensureModelReady(for: selectedLanguage) + } + } else { + // Local engine: check selected local model status + localModelManager.checkStatus(for: AppSettings.shared.selectedLocalModel) } } // MARK: - Language func loadSupportedLanguages() async { - let locales = await SpeechTranscriber.supportedLocales - availableLanguages = locales.map { Locale(identifier: $0.language.minimalIdentifier) } - .sorted { $0.identifier < $1.identifier } + if AppSettings.shared.selectedEngine == .apple { + let locales = await SpeechTranscriber.supportedLocales + availableLanguages = locales.map { Locale(identifier: $0.language.minimalIdentifier) } + .sorted { $0.identifier < $1.identifier } + } else { + // Parakeet TDT 0.6B v2 supports English only + availableLanguages = [Locale(identifier: "en-US")] + selectedLanguage = Locale(identifier: "en-US") + } } func switchLanguage(to locale: Locale) { + // Language switching only applies to Apple engine + guard AppSettings.shared.selectedEngine == .apple else { return } + let wasListening = listeningState == .active if wasListening { stopListening() } selectedLanguage = locale - speechEngine = SpeechEngine(locale: locale) + speechEngine = AppleSpeechEngine(locale: locale) // Sync transcription language to translation source language 
translationService.updateSourceLanguage(from: locale) @@ -128,15 +145,41 @@ final class TransFlowViewModel { listeningTask = Task { do { - // Ensure model is ready before starting - let modelReady = await modelManager.ensureModelReady(for: selectedLanguage) - guard modelReady else { - showModelNotReadyAlert = true - listeningState = .idle - return + // Create the appropriate engine based on settings + let selectedEngineKind = AppSettings.shared.selectedEngine + let engine: any TranscriptionEngine + + switch selectedEngineKind { + case .apple: + // Ensure Apple Speech model is ready + let modelReady = await modelManager.ensureModelReady(for: selectedLanguage) + guard modelReady else { + showModelNotReadyAlert = true + listeningState = .idle + return + } + engine = AppleSpeechEngine(locale: selectedLanguage) + + case .local: + let selectedLocalModel = AppSettings.shared.selectedLocalModel + + // Ensure selected local model is downloaded and ready. + localModelManager.checkStatus(for: selectedLocalModel) + guard localModelManager.status(for: selectedLocalModel).isReady else { + showModelNotReadyAlert = true + listeningState = .idle + return + } + + let localModelDirectory = localModelManager.modelDirectory(for: selectedLocalModel) + switch selectedLocalModel { + case .parakeetOfflineInt8: + engine = ParakeetSpeechEngine(modelDirectory: localModelDirectory) + case .nemotronStreamingInt8: + engine = NemotronStreamingSpeechEngine(modelDirectory: localModelDirectory) + } } - let engine = SpeechEngine(locale: selectedLanguage) self.speechEngine = engine // Start audio capture based on source diff --git a/TransFlow/TransFlow/Views/ControlBarView.swift b/TransFlow/TransFlow/Views/ControlBarView.swift index e59918b..1830b98 100644 --- a/TransFlow/TransFlow/Views/ControlBarView.swift +++ b/TransFlow/TransFlow/Views/ControlBarView.swift @@ -88,7 +88,8 @@ struct ControlBarView: View { // Loading state overlay if viewModel.listeningState == .starting || viewModel.listeningState 
== .stopping { ProgressView() - .scaleEffect(0.6) + .controlSize(.small) + .frame(width: 14, height: 14) .tint(.secondary) } } diff --git a/TransFlow/TransFlow/Views/MainView.swift b/TransFlow/TransFlow/Views/MainView.swift index 13d497a..62fa76d 100644 --- a/TransFlow/TransFlow/Views/MainView.swift +++ b/TransFlow/TransFlow/Views/MainView.swift @@ -11,6 +11,7 @@ struct MainView: View { @State private var selectedDestination: SidebarDestination = .transcription @State private var columnVisibility: NavigationSplitViewVisibility = .detailOnly @State private var viewModel = TransFlowViewModel() + @State private var settings = AppSettings.shared var body: some View { NavigationSplitView(columnVisibility: $columnVisibility) { @@ -22,6 +23,11 @@ struct MainView: View { .onReceive(NotificationCenter.default.publisher(for: .navigateToSettings)) { _ in selectedDestination = .settings } + .onChange(of: settings.selectedEngine) { _, _ in + Task { + await viewModel.loadSupportedLanguages() + } + } } @ViewBuilder diff --git a/TransFlow/TransFlow/Views/SettingsView.swift b/TransFlow/TransFlow/Views/SettingsView.swift index ab7ef14..225f528 100644 --- a/TransFlow/TransFlow/Views/SettingsView.swift +++ b/TransFlow/TransFlow/Views/SettingsView.swift @@ -7,6 +7,7 @@ struct SettingsView: View { @State private var settings = AppSettings.shared @State private var updateChecker = UpdateChecker.shared @State private var modelManager = SpeechModelManager.shared + @State private var localModelManager = LocalModelManager.shared @State private var hasLoadedModels = false var body: some View { @@ -23,13 +24,28 @@ struct SettingsView: View { appearanceRow } - // ── Speech Models Section ── + // ── Speech Recognition Engine Section ── settingsSection( - header: "settings.speech_models", - icon: "waveform.badge.mic", - iconColor: .indigo + header: "settings.engine", + icon: "brain", + iconColor: .green ) { - speechModelsContent + enginePickerRow + if settings.selectedEngine == .local { + 
Divider().padding(.leading, 46) + localModelContent + } + } + + // ── Speech Models Section (Apple engine only) ── + if settings.selectedEngine == .apple { + settingsSection( + header: "settings.speech_models", + icon: "waveform.badge.mic", + iconColor: .indigo + ) { + speechModelsContent + } } // ── Feedback Section ── @@ -61,6 +77,20 @@ struct SettingsView: View { guard !hasLoadedModels else { return } hasLoadedModels = true await modelManager.refreshAllStatuses() + localModelManager.checkAllStatuses() + } + .onChange(of: settings.selectedEngine) { _, newEngine in + switch newEngine { + case .apple: + Task { + await modelManager.refreshAllStatuses() + } + case .local: + localModelManager.checkStatus(for: settings.selectedLocalModel) + } + } + .onChange(of: settings.selectedLocalModel) { _, newModel in + localModelManager.checkStatus(for: newModel) } .onAppear { updateChecker.checkOnceOnLaunch() @@ -494,6 +524,325 @@ struct SettingsView: View { } } + // MARK: - Engine Picker + + private var enginePickerRow: some View { + HStack { + Label { + Text("settings.engine") + .font(.system(size: 13, weight: .regular)) + } icon: { + Image(systemName: "waveform") + .font(.system(size: 14, weight: .medium)) + .foregroundStyle(.green) + .frame(width: 24) + } + + Spacer() + + Picker("", selection: $settings.selectedEngine) { + ForEach(TranscriptionEngineKind.allCases) { engine in + Text(engine.displayName) + .tag(engine) + } + } + .pickerStyle(.menu) + .fixedSize() + .tint(.secondary) + } + .padding(.horizontal, 14) + .padding(.vertical, 10) + } + + // MARK: - Local Model Content + + private var localModelContent: some View { + VStack(spacing: 0) { + localModelPickerRow + Divider().padding(.leading, 46) + // Model status row + localModelStatusRow + Divider().padding(.leading, 46) + // Action row (download / delete) + localModelActionRow + Divider().padding(.leading, 46) + // License notice + localModelLicenseRow + } + } + + private var localModelPickerRow: some View { + HStack 
{ + Label { + Text("settings.local_model") + .font(.system(size: 13, weight: .regular)) + } icon: { + Image(systemName: "square.stack.3d.up") + .font(.system(size: 14, weight: .medium)) + .foregroundStyle(.mint) + .frame(width: 24) + } + + Spacer() + + Picker("", selection: $settings.selectedLocalModel) { + ForEach(LocalTranscriptionModelKind.allCases) { model in + Text(model.displayName) + .tag(model) + } + } + .pickerStyle(.menu) + .fixedSize() + .tint(.secondary) + } + .padding(.horizontal, 14) + .padding(.vertical, 8) + } + + private var localModelStatusRow: some View { + HStack(spacing: 8) { + Label { + VStack(alignment: .leading, spacing: 2) { + Text(settings.selectedLocalModel.displayName) + .font(.system(size: 13, weight: .regular)) + Text(localModelStatusText) + .font(.system(size: 11, weight: .regular)) + .foregroundStyle(localModelStatusColor) + if let progressText = localDownloadProgressText { + Text(progressText) + .font(.system(size: 10, weight: .regular, design: .monospaced)) + .foregroundStyle(.tertiary) + } + } + } icon: { + localModelStatusIcon + .frame(width: 24) + } + + Spacer() + + if localModelStatus.isReady { + Text(formattedDiskSize) + .font(.system(size: 11, weight: .medium, design: .monospaced)) + .foregroundStyle(.tertiary) + } + } + .padding(.horizontal, 14) + .padding(.vertical, 8) + } + + @ViewBuilder + private var localModelStatusIcon: some View { + switch localModelStatus { + case .ready: + Image(systemName: "checkmark.circle.fill") + .font(.system(size: 14, weight: .medium)) + .foregroundStyle(.green) + case .notDownloaded: + Image(systemName: "arrow.down.circle") + .font(.system(size: 14, weight: .medium)) + .foregroundStyle(.secondary) + case .downloading: + ProgressView() + .controlSize(.small) + .frame(width: 14, height: 14) + case .failed: + Image(systemName: "exclamationmark.triangle.fill") + .font(.system(size: 14, weight: .medium)) + .foregroundStyle(.orange) + } + } + + private var localModelStatusText: LocalizedStringKey { 
+ switch localModelStatus { + case .ready: + "settings.model.status.ready" + case .notDownloaded: + selectedLocalModelNotDownloadedKey + case .downloading(let progress): + if localModelDownloadDetail?.isResuming == true { + "settings.model.status.resuming \(Int(progress * 100))" + } else { + "settings.model.status.downloading \(Int(progress * 100))" + } + case .failed(let message): + "settings.model.status.failed \(message)" + } + } + + private var localModelStatusColor: Color { + switch localModelStatus { + case .ready: .green + case .notDownloaded: .secondary + case .downloading: .blue + case .failed: .orange + } + } + + private var localModelActionRow: some View { + HStack { + Label { + Text("settings.model.manage") + .font(.system(size: 13, weight: .regular)) + } icon: { + Image(systemName: "internaldrive") + .font(.system(size: 14, weight: .medium)) + .foregroundStyle(.secondary) + .frame(width: 24) + } + + Spacer() + + switch localModelStatus { + case .notDownloaded, .failed: + Button { + localModelManager.download(for: settings.selectedLocalModel) + } label: { + Text(localModelManager.hasResumeData(for: settings.selectedLocalModel) ? 
"settings.model.action.resume" : "settings.model.download") + .font(.system(size: 11, weight: .medium)) + .foregroundStyle(.white) + .padding(.horizontal, 10) + .padding(.vertical, 4) + .background( + RoundedRectangle(cornerRadius: 5, style: .continuous) + .fill(Color.accentColor) + ) + } + .buttonStyle(.plain) + + case .downloading(let progress): + HStack(spacing: 8) { + ProgressView(value: progress, total: 1.0) + .progressViewStyle(.linear) + .frame(width: 80) + .tint(.blue) + + Button { + localModelManager.cancelDownload(for: settings.selectedLocalModel) + } label: { + Text("settings.model.action.cancel") + .font(.system(size: 10, weight: .medium)) + .foregroundStyle(.white) + .padding(.horizontal, 8) + .padding(.vertical, 3) + .background( + RoundedRectangle(cornerRadius: 5, style: .continuous) + .fill(Color.orange) + ) + } + .buttonStyle(.plain) + } + + case .ready: + Button { + localModelManager.delete(for: settings.selectedLocalModel) + } label: { + Text("settings.model.delete") + .font(.system(size: 11, weight: .medium)) + .foregroundStyle(.white) + .padding(.horizontal, 10) + .padding(.vertical, 4) + .background( + RoundedRectangle(cornerRadius: 5, style: .continuous) + .fill(Color.red) + ) + } + .buttonStyle(.plain) + } + } + .padding(.horizontal, 14) + .padding(.vertical, 8) + } + + private var localModelLicenseRow: some View { + HStack { + Label { + VStack(alignment: .leading, spacing: 2) { + Text(settings.selectedLocalModel.licenseNoticeKey) + .font(.system(size: 11, weight: .regular)) + .foregroundStyle(.tertiary) + } + } icon: { + Image(systemName: "doc.text") + .font(.system(size: 14, weight: .medium)) + .foregroundStyle(.tertiary) + .frame(width: 24) + } + Spacer() + } + .padding(.horizontal, 14) + .padding(.vertical, 8) + } + + private var localModelStatus: LocalModelStatus { + localModelManager.status(for: settings.selectedLocalModel) + } + + private var localModelDownloadDetail: LocalModelDownloadDetail? 
{ + localModelManager.downloadDetail(for: settings.selectedLocalModel) + } + + private var selectedLocalModelNotDownloadedKey: LocalizedStringKey { + switch settings.selectedLocalModel { + case .parakeetOfflineInt8: + "settings.model.status.not_downloaded.parakeet" + case .nemotronStreamingInt8: + "settings.model.status.not_downloaded.nemotron" + } + } + + private var formattedDiskSize: String { + let bytes = localModelManager.diskSizeBytes(for: settings.selectedLocalModel) + if bytes < 1_000_000 { + return "\(bytes / 1_000) KB" + } else if bytes < 1_000_000_000 { + return String(format: "%.0f MB", Double(bytes) / 1_000_000) + } else { + return String(format: "%.1f GB", Double(bytes) / 1_000_000_000) + } + } + + private var localDownloadProgressText: String? { + guard case .downloading = localModelStatus, + let detail = localModelDownloadDetail + else { + return nil + } + + let bytesText: String + if let totalBytes = detail.totalBytes, totalBytes > 0 { + bytesText = "\(formatBytes(detail.downloadedBytes)) / \(formatBytes(totalBytes))" + } else { + bytesText = formatBytes(detail.downloadedBytes) + } + + var segments: [String] = [bytesText] + if let speed = detail.bytesPerSecond, speed > 0 { + segments.append(String(localized: "settings.model.progress.speed \(formatBytes(Int64(speed)))/s")) + } + if let eta = detail.etaSeconds, eta.isFinite, eta > 0 { + segments.append(String(localized: "settings.model.progress.eta \(formatDuration(eta))")) + } + return segments.joined(separator: " · ") + } + + private func formatBytes(_ bytes: Int64) -> String { + let formatter = ByteCountFormatter() + formatter.allowedUnits = [.useKB, .useMB, .useGB] + formatter.countStyle = .file + formatter.includesUnit = true + formatter.isAdaptive = true + return formatter.string(fromByteCount: bytes) + } + + private func formatDuration(_ seconds: Double) -> String { + let formatter = DateComponentsFormatter() + formatter.allowedUnits = seconds >= 3600 ? 
[.hour, .minute] : [.minute, .second] + formatter.unitsStyle = .abbreviated + formatter.maximumUnitCount = 2 + return formatter.string(from: seconds) ?? "--" + } + // MARK: - Helpers private var appVersionString: String { diff --git a/scripts/build-sherpa-onnx.sh b/scripts/build-sherpa-onnx.sh new file mode 100755 index 0000000..366cbcf --- /dev/null +++ b/scripts/build-sherpa-onnx.sh @@ -0,0 +1,238 @@ +#!/usr/bin/env bash +# Build sherpa-onnx xcframework for macOS (default: universal arm64+x86_64). +# Output: vendor/sherpa-onnx.xcframework +# +# Usage: +# ./scripts/build-sherpa-onnx.sh +# ./scripts/build-sherpa-onnx.sh --clean --reclone +# ./scripts/build-sherpa-onnx.sh --version v1.12.23 --archs "arm64;x86_64" + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +VENDOR_DIR="$PROJECT_ROOT/vendor" + +SHERPA_VERSION="v1.12.23" +ARCHS="arm64;x86_64" +DEPLOYMENT_TARGET="15.0" +JOBS="$(sysctl -n hw.ncpu)" +CLEAN_BUILD=0 +RECLONE_SOURCE=0 + +SHERPA_SRC="$VENDOR_DIR/sherpa-onnx-src" +BUILD_DIR="$VENDOR_DIR/sherpa-onnx-build" +OUTPUT="$VENDOR_DIR/sherpa-onnx.xcframework" + +START_TS="$(date +%s)" + +usage() { + cat <<'EOF' +Build sherpa-onnx xcframework for TransFlow. 
+ +Options: + --version sherpa-onnx git tag to build (default: v1.12.23) + --archs CMAKE_OSX_ARCHITECTURES (default: arm64;x86_64) + --deployment-target CMAKE_OSX_DEPLOYMENT_TARGET (default: 15.0) + --jobs Parallel build jobs (default: hw.ncpu) + --output Output xcframework path + --clean Remove build directory before building + --reclone Remove source directory and clone again + -h, --help Show this help +EOF +} + +log() { + printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$*" +} + +die() { + printf 'ERROR: %s\n' "$*" >&2 + exit 1 +} + +run() { + log "$*" + "$@" +} + +require_cmd() { + command -v "$1" >/dev/null 2>&1 || die "Required command not found: $1" +} + +assert_file() { + [ -f "$1" ] || die "Required file not found: $1" +} + +while [ $# -gt 0 ]; do + case "$1" in + --version) + [ $# -ge 2 ] || die "Missing value for --version" + SHERPA_VERSION="$2" + shift 2 + ;; + --archs) + [ $# -ge 2 ] || die "Missing value for --archs" + ARCHS="$2" + shift 2 + ;; + --deployment-target) + [ $# -ge 2 ] || die "Missing value for --deployment-target" + DEPLOYMENT_TARGET="$2" + shift 2 + ;; + --jobs) + [ $# -ge 2 ] || die "Missing value for --jobs" + JOBS="$2" + shift 2 + ;; + --output) + [ $# -ge 2 ] || die "Missing value for --output" + OUTPUT="$2" + shift 2 + ;; + --clean) + CLEAN_BUILD=1 + shift + ;; + --reclone) + RECLONE_SOURCE=1 + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + die "Unknown option: $1 (use --help)" + ;; + esac +done + +OUTPUT_TMP="${OUTPUT}.tmp" + +for cmd in git cmake xcodebuild libtool lipo make sysctl; do + require_cmd "$cmd" +done + +ACTIVE_DEV_DIR="$(xcode-select -p 2>/dev/null || true)" +if [ "$ACTIVE_DEV_DIR" = "/Library/Developer/CommandLineTools" ]; then + if [ -d "/Applications/Xcode.app/Contents/Developer" ]; then + export DEVELOPER_DIR="/Applications/Xcode.app/Contents/Developer" + log "Using DEVELOPER_DIR=$DEVELOPER_DIR" + else + die "xcodebuild is pointing to CommandLineTools. Install full Xcode or set DEVELOPER_DIR." 
+ fi +fi + +log "=== Building sherpa-onnx xcframework ===" +log "Version: $SHERPA_VERSION" +log "Archs: $ARCHS" +log "Deployment target: $DEPLOYMENT_TARGET" +log "Output: $OUTPUT" + +mkdir -p "$VENDOR_DIR" +mkdir -p "$(dirname "$OUTPUT")" + +if [ "$RECLONE_SOURCE" -eq 1 ] && [ -d "$SHERPA_SRC" ]; then + run rm -rf "$SHERPA_SRC" +fi + +if [ ! -d "$SHERPA_SRC/.git" ]; then + [ ! -e "$SHERPA_SRC" ] || die "Path exists but is not a git repo: $SHERPA_SRC (use --reclone)" + run git clone --depth 1 --branch "$SHERPA_VERSION" \ + https://github.com/k2-fsa/sherpa-onnx.git "$SHERPA_SRC" +else + CURRENT_TAG="$(git -C "$SHERPA_SRC" describe --tags --exact-match 2>/dev/null || true)" + if [ "$CURRENT_TAG" != "$SHERPA_VERSION" ]; then + if ! git -C "$SHERPA_SRC" diff --quiet || [ -n "$(git -C "$SHERPA_SRC" status --porcelain)" ]; then + die "Existing source has local changes and is not at $SHERPA_VERSION. Use --reclone." + fi + run git -C "$SHERPA_SRC" fetch --tags origin + run git -C "$SHERPA_SRC" checkout "$SHERPA_VERSION" + run git -C "$SHERPA_SRC" reset --hard "$SHERPA_VERSION" + else + log "Source already at $SHERPA_VERSION: $SHERPA_SRC" + fi +fi + +if [ "$CLEAN_BUILD" -eq 1 ] && [ -d "$BUILD_DIR" ]; then + run rm -rf "$BUILD_DIR" +fi + +run cmake -S "$SHERPA_SRC" -B "$BUILD_DIR" \ + -DSHERPA_ONNX_ENABLE_BINARY=OFF \ + -DSHERPA_ONNX_BUILD_C_API_EXAMPLES=OFF \ + -DCMAKE_OSX_ARCHITECTURES="$ARCHS" \ + -DCMAKE_OSX_DEPLOYMENT_TARGET="$DEPLOYMENT_TARGET" \ + -DCMAKE_INSTALL_PREFIX="$BUILD_DIR/install" \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_SHARED_LIBS=OFF \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + -DSHERPA_ONNX_ENABLE_JNI=OFF \ + -DSHERPA_ONNX_ENABLE_C_API=ON \ + -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF + +run cmake --build "$BUILD_DIR" --config Release --parallel "$JOBS" +run cmake --install "$BUILD_DIR" --config Release + +if [ -f "$BUILD_DIR/install/include/cargs.h" ]; then 
+ run rm -f "$BUILD_DIR/install/include/cargs.h" +fi + +MERGED_LIB="$BUILD_DIR/install/lib/libsherpa-onnx.a" +STATIC_LIBS=( + "$BUILD_DIR/install/lib/libsherpa-onnx-c-api.a" + "$BUILD_DIR/install/lib/libsherpa-onnx-core.a" + "$BUILD_DIR/install/lib/libkaldi-native-fbank-core.a" + "$BUILD_DIR/install/lib/libkissfft-float.a" + "$BUILD_DIR/install/lib/libsherpa-onnx-fstfar.a" + "$BUILD_DIR/install/lib/libsherpa-onnx-fst.a" + "$BUILD_DIR/install/lib/libsherpa-onnx-kaldifst-core.a" + "$BUILD_DIR/install/lib/libkaldi-decoder-core.a" + "$BUILD_DIR/install/lib/libucd.a" + "$BUILD_DIR/install/lib/libpiper_phonemize.a" + "$BUILD_DIR/install/lib/libespeak-ng.a" + "$BUILD_DIR/install/lib/libssentencepiece_core.a" + "$BUILD_DIR/install/lib/libonnxruntime.a" +) + +for lib in "${STATIC_LIBS[@]}"; do + assert_file "$lib" +done + +run libtool -static -o "$MERGED_LIB" "${STATIC_LIBS[@]}" +assert_file "$MERGED_LIB" + +ARCH_INFO="$(lipo -info "$MERGED_LIB" 2>/dev/null || true)" +[ -n "$ARCH_INFO" ] || die "Unable to inspect merged library architectures: $MERGED_LIB" +ARCH_LIST="${ARCHS//;/ }" +for arch in $ARCH_LIST; do + case "$ARCH_INFO" in + *"$arch"*) ;; + *) die "Merged library missing architecture: $arch ($ARCH_INFO)" ;; + esac +done +log "Merged library architectures: $ARCH_INFO" + +run rm -rf "$OUTPUT_TMP" +run xcodebuild -create-xcframework \ + -library "$MERGED_LIB" \ + -headers "$BUILD_DIR/install/include" \ + -output "$OUTPUT_TMP" + +run rm -rf "$OUTPUT" +run mv "$OUTPUT_TMP" "$OUTPUT" + +[ -d "$OUTPUT" ] || die "xcframework output missing: $OUTPUT" + +ELAPSED="$(( $(date +%s) - START_TS ))" +log "" +log "=== Done ===" +log "xcframework: $OUTPUT" +log "elapsed: ${ELAPSED}s" +log "" +log "Next step: open TransFlow.xcodeproj and build the app." 
diff --git a/specs/013-parakeet-onnx.md b/specs/013-parakeet-onnx.md new file mode 100644 index 0000000..05bd01b --- /dev/null +++ b/specs/013-parakeet-onnx.md @@ -0,0 +1,153 @@ +## Implementation Plan: Local Parakeet TDT via sherpa-onnx + +### Goals +- Add a selectable local STT backend based on sherpa-onnx + Parakeet TDT 0.6B v2. +- Keep the app bundle free of model assets; users download models on demand. +- Preserve current Apple Speech backend as the default option. +- Provide live captions with partial text and finalized sentences. + +### Non-Goals +- No cloud inference. +- No model fine-tuning or custom vocabulary training. +- No multilingual support beyond what the chosen model provides. + +### Dependencies and Constraints +- sherpa-onnx native core (C++ + ONNX Runtime) embedded in the app bundle. +- User-downloaded model assets stored in Application Support. +- Model files: encoder.onnx, decoder.onnx, joiner.onnx, tokens.txt. +- macOS app code signing and notarization must include native libraries. +- Model license: Parakeet TDT 0.6B v2 is CC-BY-4.0 (attribution required). +- Download source must be official sherpa-onnx pre-converted artifacts. + +### High-Level Architecture +- Add a new transcription backend that mirrors the existing `SpeechEngine` interface: + - `processStream(_:) -> AsyncStream` +- Route audio capture output (16kHz mono Float32) into the sherpa-onnx recognizer. +- Introduce a model manager for local model status and downloads. +- Add a settings UI section to choose engine and manage the model. +- Keep Apple Speech engine fully intact as the default and fallback. 
+ +### Data Model Additions +- `TranscriptionEngineKind`: `apple` | `parakeetLocal` +- `LocalModelVariant`: `int8` | `fp16` | `fp32` +- `LocalModelStatus`: `notDownloaded` | `downloading(progress)` | `ready` | `failed(message)` +- `LocalModelInfo`: `variant`, `status`, `path`, `sizeBytes` + +### Storage Layout +- Model root directory: + - `~/Library/Application Support/TransFlow/Models/ParakeetTDT0.6Bv2/` +- Variant subfolders (optional): + - `int8/`, `fp16/`, `fp32/` +- Each variant folder contains the 4 required files. + +### Download and Validation Flow +- Add a "Download Model" action in Settings: + - Choose variant (default: int8). + - Download official sherpa-onnx tarball and extract. + - Validate required files before marking ready. +- Add "Delete Model" to remove local files. +- On app start and engine switch: + - Validate model availability and update status. +- If a download is interrupted, resume if possible or clean up partial files. +- Store a local manifest (variant, version, file sizes, hash optional) for verification. + +### Transcription Pipeline (Parakeet) +- Instantiate a sherpa-onnx recognizer using: + - `model_type = "nemo_transducer"` + - Paths from the user-downloaded model directory. +- Feed audio chunks continuously. +- For live captions: + - Emit `.partial` text on periodic decode (e.g., every N ms). + - Emit `.sentenceComplete` when end-of-speech is detected (VAD) or on stream stop. +- On engine stop, flush any remaining partial text into a final sentence. +- Ensure thread-safe access to the recognizer and background decoding. +- Normalize audio to the expected range and sample rate (already 16kHz mono). 
+ +### UI/UX Changes +- Settings: add a "Speech Recognition Engine" section: + - Engine picker: Apple / Parakeet (Local) + - Model status label: Not downloaded / Downloading / Ready / Error + - Download + Delete buttons + - Model location display +- Transcription UI behavior: + - If Parakeet is selected but model is missing, show an inline prompt. + - Fall back to Apple only if explicitly selected by the user. +- Show disk usage and estimated download size. +- Provide a simple error explanation if model validation fails. + +### Localization +- Add string keys to `Localizable.xcstrings`: + - `settings.engine`, `settings.engine.apple`, `settings.engine.parakeet` + - `settings.model.download`, `settings.model.delete` + - `settings.model.status.*` for all states + - `settings.model.size`, `settings.model.location`, `settings.model.error` + - `settings.model.license_notice` + +### Implementation Steps +1. Add new engine selection state to app settings and view model. +2. Create a local model manager: + - Download, extract, validate, delete, and report status. +3. Integrate sherpa-onnx native library and define a Swift wrapper. +4. Implement `ParakeetSpeechEngine` using sherpa-onnx APIs. +5. Update settings UI to manage engine and model status. +6. Wire engine selection in `TransFlowViewModel.startListening()`. +7. Add user messaging for missing model or download failures. +8. Add tests and a manual verification checklist. +9. Add license attribution surface for CC-BY-4.0. + +### Milestones +1. **Backend selection + settings scaffolding** + - Add engine selection to settings and persisted app state. + - Basic model status UI (not downloaded / ready / error). +2. **Model manager + download flow** + - Download/extract tarball, validate files, and delete flow. + - Progress reporting and error handling. +3. **Native sherpa wrapper** + - Build and embed sherpa-onnx + ONNX Runtime. + - Swift API surface for init, feed audio, decode, reset, dispose. +4. 
**Parakeet engine integration** + - Implement `ParakeetSpeechEngine` and wire into view model. + - Partial text updates and final sentence emission. +5. **QA + performance tuning** + - Validate live captions on representative Macs. + - Tune chunk size, decode cadence, and VAD thresholds. +6. **Release hardening** + - Verify code signing, notarization, and first-run download behavior. + - Document the model download flow and attribution in README/Settings. + +### Proposed API Surface (Swift) +- `protocol TranscriptionEngine`: + - `func processStream(_ audioStream: AsyncStream) -> AsyncStream` + - `func stop()` +- `final class ParakeetSpeechEngine: TranscriptionEngine` + - `init(modelDirectory: URL, decodeIntervalMs: Int, vadEnabled: Bool)` + - Emits `.partial` for incremental text and `.sentenceComplete` on VAD end or stop. +- `final class LocalModelManager` + - `var status: LocalModelStatus` + - `var selectedVariant: LocalModelVariant` + - `func checkStatus()` + - `func download(variant: LocalModelVariant)` + - `func delete(variant: LocalModelVariant)` + - `func modelDirectory(variant: LocalModelVariant) -> URL` +- `final class ParakeetModelValidator` + - `func validate(directory: URL) -> LocalModelStatus` + - Checks presence, minimum sizes, and optional hash verification. + +### Testing Plan +- Unit tests: + - Model path validation and status transitions. + - Download error handling and cleanup. +- Manual tests: + - Download int8 model and run live captions. + - Switch engines while idle and while listening. + - Stop/start listening and ensure partial text flushes correctly. + - Delete model and confirm UI updates. + - Disconnect network mid-download and verify resume/cleanup. + - Run on Intel and Apple Silicon hardware. + +### Risks and Mitigations +- Large model size: offer int8 default and show disk usage. +- Latency on older Macs: allow tuning of decode interval. +- Native library integration issues: isolate with a wrapper and minimal Swift API surface. 
+- License compliance: show attribution in Settings/About. +- Model corruption: validate on startup and on selection.