Huanshere · doomsday616 · Jun 11, 2026 · Jun 11, 2026
diff --git a/.gitignore b/.gitignore
@@ -172,4 +172,7 @@ config.backup.yaml
 runtime/
 dev/
 installer_files/
-logs/
+
+# Streamlit runtime logs from OneKeyStart.bat
+logs/
+videolingo_*.log
diff --git a/.streamlit/config.toml b/.streamlit/config.toml
@@ -1,2 +1,6 @@
 [server]
-maxUploadSize = 4096
+maxUploadSize = 4096
+fileWatcherType = "none"
+
+[client]
+toolbarMode = "viewer"
diff --git a/OneKeyStart.bat b/OneKeyStart.bat
@@ -1,11 +1,85 @@
 @echo off
-chcp 65001 >nul 2>&1
-call conda activate videolingo 2>nul
-set PYTHONWARNINGS=ignore
-python "%~dp0launch.py"
-if %errorlevel% neq 0 (
-    echo.
-    echo  Pre-flight checks or Streamlit failed. See logs\ for details.
-    echo.
+setlocal EnableExtensions
+cd /D "%~dp0"
+
+for /F "tokens=1,2 delims=#" %%A in ('"prompt #$H#$E# & echo on & for %%B in (1) do rem"') do set "ESC=%%B"
+set "C_RESET=%ESC%[0m"
+set "C_GREEN=%ESC%[32m"
+set "C_YELLOW=%ESC%[33m"
+set "C_RED=%ESC%[31m"
+set "C_CYAN=%ESC%[36m"
+set "C_BOLD=%ESC%[1m"
+
+if not exist "logs" mkdir "logs"
+rem Timestamp comes from PowerShell (already required for Tee-Object below); wmic is deprecated and missing on recent Windows.
+for /f "delims=" %%I in ('powershell -NoProfile -Command "Get-Date -Format yyyyMMdd_HHmmss"') do set "dt=%%I"
+set "LOGFILE=logs\videolingo_%dt%.log"
+set "CHECK_ONLY="
+if /I "%~1"=="--check-only" set "CHECK_ONLY=1"
+echo [%date% %time%] VideoLingo starting... > "%LOGFILE%"
+echo %C_CYAN%Log file:%C_RESET% %LOGFILE%
+
+set "VENV_LABEL="
+set "VENV_PY="
+
+set "SHARED_VENV=%USERPROFILE%\.venvs\videolingo"
+if exist "%SHARED_VENV%\Scripts\python.exe" (
+    set "VENV_LABEL=shared venv"
+    set "VENV_PY=%SHARED_VENV%\Scripts\python.exe"
+    goto venv_found
 )
+
+if exist ".venv\Scripts\python.exe" (
+    set "VENV_LABEL=project .venv"
+    set "VENV_PY=.venv\Scripts\python.exe"
+    goto venv_found
+)
+
+where conda >nul 2>nul
+if %errorlevel%==0 (
+    echo %C_YELLOW%No uv venv found, falling back to Conda env "videolingo"...%C_RESET%
+    call conda activate videolingo
+    python installer.py --check --quiet
+if errorlevel 1 (
+    echo %C_YELLOW%Conda env is incomplete or outdated. Repairing...%C_RESET%
+    python installer.py --yes
+    if errorlevel 1 goto install_failed
+)
+    if defined CHECK_ONLY (
+        echo %C_GREEN%Environment check passed. --check-only set, not starting Streamlit.%C_RESET%
+        goto end
+    )
+    echo %C_GREEN%Starting VideoLingo with Conda...%C_RESET%
+    python -m streamlit run st.py 2>&1 | powershell -NoProfile -Command "$input | Tee-Object -FilePath '%LOGFILE%' -Append"
+    goto end
+)
+
+echo %C_RED%ERROR: No usable VideoLingo environment found.%C_RESET%
+echo Run one of these first:
+echo   python setup_env.py --shared
+echo   python setup_env.py
+goto end
+
+:venv_found
+echo %C_GREEN%Detected %VENV_LABEL%:%C_RESET% %VENV_PY%
+"%VENV_PY%" installer.py --check --quiet
+if errorlevel 1 (
+    echo %C_YELLOW%Environment is incomplete or outdated. Repairing with installer.py...%C_RESET%
+    "%VENV_PY%" installer.py --yes
+    if errorlevel 1 goto install_failed
+)
+
+if defined CHECK_ONLY (
+    echo %C_GREEN%Environment check passed. --check-only set, not starting Streamlit.%C_RESET%
+    goto end
+)
+
+echo %C_GREEN%Starting VideoLingo with %VENV_LABEL%...%C_RESET%
+"%VENV_PY%" -m streamlit run st.py 2>&1 | powershell -NoProfile -Command "$input | Tee-Object -FilePath '%LOGFILE%' -Append"
+goto end
+
+:install_failed
+echo %C_RED%Install/repair failed. Check the messages above and the log file.%C_RESET%
+
+:end
 pause
diff --git a/OneKeyStart_uv.bat b/OneKeyStart_uv.bat
diff --git a/config.yaml b/config.yaml
@@ -1,7 +1,7 @@
 # * Settings marked with * are advanced settings that won't appear in the Streamlit page and can only be modified manually in config.py
 # recommend to set in streamlit page
 # -------------------
-# version: "3.0.0"
+# version: "3.0.3"
 # author: "Huanshere"
 # -------------------
 
@@ -11,9 +11,9 @@ display_language: "zh-CN"
 
 # API settings
 api:
-  key: 'your-api-key'
+  key: 'YOUR_API_KEY'
   base_url: 'https://yunwu.ai'
-  model: ''
+  model: 'gpt-5.5'
   llm_support_json: false
 # *Number of LLM multi-threaded accesses, set to 1 if using local LLM
 max_workers: 4
@@ -22,7 +22,7 @@ max_workers: 4
 target_language: '简体中文'
 
 # Whether to use Demucs for vocal separation before transcription
-demucs: true
+demucs: false
 
 whisper:
   # ["large-v3", "large-v3-turbo"]. Note: for zh model will force to use Belle/large-v3
@@ -38,7 +38,7 @@ whisper:
   elevenlabs_api_key: 'your_elevenlabs_api_key'
 
 # Whether to burn subtitles into the video
-burn_subtitles: true
+burn_subtitles: false
 
 ## ======================== Advanced Settings ======================== ##
 # *🔬 h264_nvenc GPU acceleration for ffmpeg, make sure your GPU supports it

diff --git a/core/_10_gen_audio.py b/core/_10_gen_audio.py
@@ -85,6 +85,7 @@ def generate_tts_audio(tasks_df: pd.DataFrame) -> pd.DataFrame:
         warmup_size = min(WARMUP_SIZE, len(tasks_df))
         for _, row in tasks_df.head(warmup_size).iterrows():
             try:
+                check_cancel()
                 number, real_dur = process_row(row, tasks_df)
                 tasks_df.loc[tasks_df['number'] == number, 'real_dur'] = real_dur
                 progress.advance(task)
@@ -103,14 +104,20 @@ def generate_tts_audio(tasks_df: pd.DataFrame) -> pd.DataFrame:
                     for _, row in remaining_tasks.iterrows()
                 ]
 
-                for future in as_completed(futures):
-                    try:
-                        number, real_dur = future.result()
-                        tasks_df.loc[tasks_df['number'] == number, 'real_dur'] = real_dur
-                        progress.advance(task)
-                    except Exception as e:
-                        rprint(f"[red]❌ Error: {str(e)}[/red]")
-                        raise e
+                try:
+                    for future in as_completed(futures):
+                        check_cancel()
+                        try:
+                            number, real_dur = future.result()
+                            tasks_df.loc[tasks_df['number'] == number, 'real_dur'] = real_dur
+                            progress.advance(task)
+                        except Exception as e:
+                            rprint(f"[red]❌ Error: {str(e)}[/red]")
+                            raise e
+                except BaseException:
+                    for f in futures:
+                        f.cancel()
+                    raise
 
     rprint("[bold green]✨ TTS audio generation completed![/bold green]")
     return tasks_df
@@ -149,6 +156,7 @@ def merge_chunks(tasks_df: pd.DataFrame) -> pd.DataFrame:
 
     for index, row in tasks_df.iterrows():
         if row['cut_off'] == 1:
+            check_cancel()
             chunk_df = tasks_df.iloc[chunk_start:index+1].reset_index(drop=True)
             speed_factor, keep_gaps = process_chunk(chunk_df, accept, min_speed)
 

diff --git a/core/_11_merge_audio.py b/core/_11_merge_audio.py
@@ -58,6 +58,7 @@ def merge_audio_segments(audios, new_sub_times, sample_rate):
         merge_task = progress.add_task("🎵 Merging audio segments...", total=len(audios))
 
         for i, (audio_file, time_range) in enumerate(zip(audios, new_sub_times)):
+            check_cancel()
             if not os.path.exists(audio_file):
                 console.print(f"[bold yellow]⚠️  Warning: File {audio_file} does not exist, skipping...[/bold yellow]")
                 progress.advance(merge_task)

diff --git a/core/_12_dub_to_vid.py b/core/_12_dub_to_vid.py
@@ -30,6 +30,11 @@
 
 def merge_video_audio():
     """Merge video and audio, and reduce video volume"""
+    from core._1_ytdlp import is_audio_only_input
+    if is_audio_only_input():
+        rprint("[bold green]🎵 Audio-only input: skipping dubbing video merge. Dubbed audio is in the `output` directory.[/bold green]")
+        return
+
     VIDEO_FILE = find_video_files()
     background_file = _BACKGROUND_AUDIO_FILE
 

diff --git a/core/_1_ytdlp.py b/core/_1_ytdlp.py
@@ -1,9 +1,14 @@
 import os,sys
 import glob
+import json
 import re
 import subprocess
 from core.utils import *
 
+OUTPUT_DIR = "output"  # NOTE(review): not referenced in the visible hunks; the helpers below default save_path to the literal 'output'
+INPUT_MANIFEST = "input_manifest.json"  # file name, inside save_path, of the JSON record written by write_input_manifest()
+GENERATED_AUDIO_NAMES = {"dub.mp3", "normalized_dub.wav"}  # basenames find_audio_files() never treats as user input
+
 def sanitize_filename(filename):
     # Remove or replace illegal characters
     filename = re.sub(r'[<>:"/\\|?*]', '', filename)
@@ -51,6 +56,27 @@ def download_video_ytdlp(url, save_path='output', resolution='1080'):
             new_filename = sanitize_filename(filename)
             if new_filename != filename:
                 os.rename(os.path.join(save_path, file), os.path.join(save_path, new_filename + ext))
+    media_file = find_video_files(save_path)
+    write_input_manifest(media_file, "video", save_path)
+
+def write_input_manifest(media_file: str, media_type: str, save_path='output'):
+    """Persist the input's path and type as JSON at <save_path>/INPUT_MANIFEST, creating save_path if needed."""
+    os.makedirs(save_path, exist_ok=True)
+    recorded_path = media_file.replace("\\", "/") if sys.platform.startswith('win') else media_file
+    with open(os.path.join(save_path, INPUT_MANIFEST), "w", encoding="utf-8") as manifest:
+        json.dump({"path": recorded_path, "type": media_type}, manifest, ensure_ascii=False, indent=2)
+
+def _read_input_manifest(save_path='output'):
+    """Return (media_path, media_type) from the manifest, or None if it is missing, unreadable, malformed or stale."""
+    try:  # a broken manifest must not abort the pipeline: None lets find_media_file() fall back to scanning
+        with open(os.path.join(save_path, INPUT_MANIFEST), "r", encoding="utf-8") as f:
+            data = json.load(f)
+        media_file, media_type = data.get("path"), data.get("type")
+    except (OSError, ValueError, AttributeError):  # missing/unreadable file, invalid JSON or encoding, non-object JSON
+        return None
+    if media_type not in ("video", "audio") or not isinstance(media_file, str) or not os.path.exists(media_file):
+        return None
+    return (media_file.replace("\\", "/") if sys.platform.startswith('win') else media_file), media_type
 
 def find_video_files(save_path='output'):
     video_files = [file for file in glob.glob(save_path + "/*") if os.path.splitext(file)[1][1:].lower() in load_key("allowed_video_formats")]
@@ -62,6 +88,52 @@ def find_video_files(save_path='output'):
         raise ValueError(f"Number of videos found {len(video_files)} is not unique. Please check.")
     return video_files[0]
 
+def find_audio_files(save_path='output'):
+    """Return the single input audio file in save_path (forward slashes on Windows); raise ValueError unless exactly one exists."""
+    on_windows = sys.platform.startswith('win')
+    paths = [p.replace("\\", "/") if on_windows else p for p in glob.glob(save_path + "/*")]
+    inputs = [p for p in paths if os.path.splitext(p)[1][1:].lower() in load_key("allowed_audio_formats") and os.path.basename(p) not in GENERATED_AUDIO_NAMES]
+    if len(inputs) != 1:
+        raise ValueError(f"Number of audio files found {len(inputs)} is not unique. Please check.")
+    return inputs[0]
+
+def _safe_find_video_file(save_path='output'):  # like find_video_files(), but None when no video exists
+    try:
+        return find_video_files(save_path)
+    except ValueError as lookup_error:
+        if "found 0" not in str(lookup_error):  # keyed to the "found {n}" wording of the finder's message
+            raise
+    return None
+
+def _safe_find_audio_file(save_path='output'):  # like find_audio_files(), but None when no audio exists
+    try:
+        return find_audio_files(save_path)
+    except ValueError as lookup_error:
+        if "found 0" not in str(lookup_error):  # keyed to the "found {n}" wording of the finder's message
+            raise
+    return None
+
+def find_media_file(save_path='output'):
+    """Return (path, "video" | "audio") for the input media in save_path; raise ValueError when none is found."""
+    recorded = _read_input_manifest(save_path)
+    if recorded:
+        return recorded
+    # No usable manifest: scan the directory, preferring a video over an audio file.
+    for locate, media_type in ((_safe_find_video_file, "video"), (_safe_find_audio_file, "audio")):
+        located = locate(save_path)
+        if located:
+            return located, media_type
+    raise ValueError("No media file found. Please download or upload a media file first.")
+
+def is_audio_only_input(save_path='output'):
+    # True when find_media_file() classifies the input as "audio" (manifest first, then directory scan).
+    # Best-effort probe: any failure while locating media is reported as False instead of raising.
+    try:
+        _path, media_kind = find_media_file(save_path)
+    except Exception:
+        return False
+    return media_kind == "audio"
+
 if __name__ == '__main__':
     # Example usage
     url = input('Please enter the URL of the video you want to download: ')

diff --git a/core/_2_asr.py b/core/_2_asr.py
@@ -1,14 +1,17 @@
 from core.utils import *
 from core.asr_backend.demucs_vl import demucs_audio
-from core.asr_backend.audio_preprocess import process_transcription, convert_video_to_audio, split_audio, save_results, normalize_audio_volume
-from core._1_ytdlp import find_video_files
+from core.asr_backend.audio_preprocess import process_transcription, convert_video_to_audio, prepare_audio_for_asr, split_audio, save_results, normalize_audio_volume
+from core._1_ytdlp import find_media_file
 from core.utils.models import *
 
 @check_file_exists(_2_CLEANED_CHUNKS)
 def transcribe():
-    # 1. video to audio
-    video_file = find_video_files()
-    convert_video_to_audio(video_file)
+    # 1. prepare audio
+    media_file, media_type = find_media_file()
+    if media_type == "video":
+        convert_video_to_audio(media_file)
+    else:
+        prepare_audio_for_asr(media_file)
 
     # 2. Demucs vocal separation:
     if load_key("demucs"):
@@ -34,6 +37,7 @@ def transcribe():
         rprint("[cyan]🎤 Transcribing audio with ElevenLabs API...[/cyan]")
 
     for start, end in segments:
+        check_cancel()
         result = ts(_RAW_AUDIO_FILE, vocal_audio, start, end)
         all_results.append(result)
 
@@ -47,4 +51,4 @@ def transcribe():
     save_results(df)
 
 if __name__ == "__main__":
-    transcribe()
+    transcribe()
diff --git a/core/_4_2_translate.py b/core/_4_2_translate.py
@@ -67,9 +67,15 @@ def translate_all():
                 future = executor.submit(translate_chunk, chunk, chunks, theme_prompt, i)
                 futures.append(future)
             results = []
-            for future in concurrent.futures.as_completed(futures):
-                results.append(future.result())
-                progress.update(task, advance=1)
+            try:
+                for future in concurrent.futures.as_completed(futures):
+                    check_cancel()
+                    results.append(future.result())
+                    progress.update(task, advance=1)
+            except BaseException:
+                for f in futures:
+                    f.cancel()
+                raise
 
     results.sort(key=lambda x: x[0])  # Sort results based on original order