From 39aa9f50dbca20d729178e210885794fea94c1b6 Mon Sep 17 00:00:00 2001 From: Claude-Assistant Date: Sun, 15 Mar 2026 13:07:16 +0100 Subject: [PATCH 1/6] fix: restore word-level timestamps for unalignable characters (#1372) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Characters not in the alignment model's vocabulary (digits, symbols, foreign script) were excluded from CTC alignment and got no timestamps. Fix: extend the emission matrix with a wildcard column (max non-blank score per frame) so unknown chars participate in alignment via the existing get_trellis/backtrack — no changes to the CTC internals. Interpolation fallback fills any remaining gaps as a safety net. Unlike PR #986, this does not rewrite get_trellis or backtrack, so it avoids the regression that caused #1220. Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/test_word_timestamp_interpolation.py | 135 +++++++++++++++++++++ whisperx/alignment.py | 36 +++++- 2 files changed, 166 insertions(+), 5 deletions(-) create mode 100644 tests/test_word_timestamp_interpolation.py diff --git a/tests/test_word_timestamp_interpolation.py b/tests/test_word_timestamp_interpolation.py new file mode 100644 index 000000000..08be577fd --- /dev/null +++ b/tests/test_word_timestamp_interpolation.py @@ -0,0 +1,135 @@ +"""Test that align() produces word-level timestamps for unalignable characters.""" + +import torch +from unittest.mock import MagicMock + +from whisperx.alignment import align + + +def _make_mock_model(emission, dictionary): + """Create a mock torchaudio-style model that returns a fixed emission matrix. + + The emission should be pre-log-softmax logits of shape (num_frames, vocab_size). + align() will apply log_softmax itself. + """ + model = MagicMock() + # torchaudio interface: model(waveform, lengths=lengths) -> (emissions, _) + # emissions shape: (batch=1, num_frames, vocab_size) + model.return_value = (emission.unsqueeze(0), None) + return model + + +def _make_emission(num_frames, dictionary, transcript_chars, blank_id=0): + """Build a synthetic emission matrix where known chars peak at the right frames. + + Distributes characters evenly across frames. Known chars get high logits + at their assigned frames. Unknown chars have no specific token but will + get wildcard treatment in align(). + """ + vocab_size = max(dictionary.values()) + 1 + # Start with uniform low logits, blank slightly favored + emission = torch.full((num_frames, vocab_size), -5.0) + emission[:, blank_id] = -1.0 + + # Assign each transcript char a span of frames + chars_in_dict = [(i, c) for i, c in enumerate(transcript_chars) + if c.lower() in dictionary] + if chars_in_dict: + frames_per_char = num_frames // (len(transcript_chars) + 1) + for seq_idx, (char_idx, char) in enumerate(chars_in_dict): + center = (char_idx + 1) * frames_per_char + start = max(0, center - frames_per_char // 2) + end = min(num_frames, center + frames_per_char // 2) + token_id = dictionary[char.lower()] + for t in range(start, end): + emission[t, token_id] = 2.0 # high logit for this token + emission[t, blank_id] = -3.0 # suppress blank + + return emission + + +class TestAlignWithWildcards: + """Test align() end-to-end with unknown characters.""" + + DICTIONARY = { + "": 0, # blank + "a": 1, "b": 2, "c": 3, "d": 4, "e": 5, + "f": 6, "g": 7, "h": 8, "i": 9, "k": 10, + "l": 11, "m": 12, "n": 13, "o": 14, "p": 15, + "r": 16, "s": 17, "t": 18, "u": 19, "w": 20, + "|": 21, + } + METADATA = {"language": "en", "dictionary": DICTIONARY, "type": "torchaudio"} + + def _run_align(self, text, duration=5.0, num_frames=100): + """Run align() with a mock model on a single segment.""" + torch.manual_seed(0) + emission = _make_emission(num_frames, self.DICTIONARY, list(text), blank_id=0) + model = _make_mock_model(emission, self.DICTIONARY) + + sample_rate = 16000 + num_samples = int(duration * sample_rate) + audio = torch.randn(num_samples) + + transcript = [{"text": text, "start": 0.0, "end": duration}] + result = align( + transcript=transcript, + model=model, + align_model_metadata=self.METADATA, + audio=audio, + device="cpu", + ) + return result + + def test_known_chars_get_timestamps(self): + """Baseline: words with all known chars get timestamps.""" + result = self._run_align("the cat sat") + for word in result["word_segments"]: + assert "start" in word, f"'{word['word']}' missing start" + assert "end" in word, f"'{word['word']}' missing end" + assert "score" in word, f"'{word['word']}' missing score" + + def test_unknown_word_gets_timestamps(self): + """A word made of unknown chars (digits) gets timestamps via wildcard.""" + result = self._run_align("cost 43 dollars") + words = {w["word"]: w for w in result["word_segments"]} + assert "43" in words, f"'43' not in word_segments: {list(words.keys())}" + assert "start" in words["43"], "'43' missing start timestamp" + assert "end" in words["43"], "'43' missing end timestamp" + assert "score" in words["43"], "'43' missing score" + + def test_mixed_word_gets_timestamps(self): + """A word with mixed known/unknown chars gets timestamps.""" + result = self._run_align("has 43k users") + # "43k" has unknown '4','3' and known 'k' + words = {w["word"]: w for w in result["word_segments"]} + assert "43k" in words, f"'43k' not in word_segments: {list(words.keys())}" + assert "start" in words["43k"] + assert "end" in words["43k"] + + def test_unknown_word_does_not_corrupt_neighbors(self): + """Known words adjacent to unknown words still get valid timestamps.""" + result = self._run_align("cost 43 dollars") + words = {w["word"]: w for w in result["word_segments"]} + for known in ("cost", "dollars"): + assert known in words + assert "start" in words[known], f"'{known}' missing start" + assert "end" in words[known], f"'{known}' missing end" + assert "score" in words[known], f"'{known}' missing score" + + def test_all_unknown_segment_gets_timestamps(self): + """A segment with only unknown chars gets wildcard-aligned timestamps.""" + result = self._run_align("123 456") + assert len(result["word_segments"]) > 0, "expected word_segments for all-unknown text" + for word in result["word_segments"]: + assert "start" in word, f"'{word['word']}' missing start" + assert "end" in word, f"'{word['word']}' missing end" + + def test_timestamps_are_ordered(self): + """Word timestamps are monotonically non-decreasing.""" + result = self._run_align("the 99 cats") + starts = [w["start"] for w in result["word_segments"] if "start" in w] + for i in range(1, len(starts)): + assert starts[i] >= starts[i - 1], ( + f"Timestamps not ordered: {starts}" + ) diff --git a/whisperx/alignment.py b/whisperx/alignment.py index 0786d0eb9..35b1f8aac 100644 --- a/whisperx/alignment.py +++ b/whisperx/alignment.py @@ -178,11 +178,12 @@ def align( elif char_ in model_dictionary.keys(): clean_char.append(char_) clean_cdx.append(cdx) + elif char_ not in (" ", "|"): + # unknown char (digit, symbol, foreign script) — use wildcard + clean_char.append(char_) + clean_cdx.append(cdx) - clean_wdx = [] - for wdx, wrd in enumerate(per_word): - if any([c in model_dictionary.keys() for c in wrd.lower()]): - clean_wdx.append(wdx) + clean_wdx = list(range(len(per_word))) # Use language-specific Punkt model if available otherwise we fallback to English. punkt_lang = PUNKT_LANGUAGES.get(model_lang, 'english') @@ -236,7 +237,6 @@ def align( continue text_clean = "".join(segment_data[sdx]["clean_char"]) - tokens = [model_dictionary[c] for c in text_clean] f1 = int(t1 * SAMPLE_RATE) f2 = int(t2 * SAMPLE_RATE) @@ -268,6 +268,19 @@ def align( if char == '[pad]' or char == '': blank_id = code + # Build tokens, mapping unknown chars to a wildcard column + has_wildcard = any(c not in model_dictionary for c in text_clean) + if has_wildcard: + # Extend emission with a wildcard column: max non-blank score per frame + non_blank_mask = torch.ones(emission.size(1), dtype=torch.bool) + non_blank_mask[blank_id] = False + wildcard_col = emission[:, non_blank_mask].max(dim=1).values + emission = torch.cat([emission, wildcard_col.unsqueeze(1)], dim=1) + wildcard_id = emission.size(1) - 1 + tokens = [model_dictionary.get(c, wildcard_id) for c in text_clean] + else: + tokens = [model_dictionary[c] for c in text_clean] + trellis = get_trellis(emission, tokens, blank_id) path = backtrack(trellis, emission, tokens, blank_id) @@ -348,6 +361,19 @@ def align( sentence_words.append(word_segment) + # Interpolate timestamps for words with no alignable characters + if sentence_words: + _starts = pd.Series([w.get("start", np.nan) for w in sentence_words]) + _ends = pd.Series([w.get("end", np.nan) for w in sentence_words]) + if _starts.isna().any() and _starts.notna().any(): + _starts = interpolate_nans(_starts, method=interpolate_method) + _ends = interpolate_nans(_ends, method=interpolate_method) + for i, w in enumerate(sentence_words): + if "start" not in w and pd.notna(_starts.iloc[i]): + w["start"] = _starts.iloc[i] + if "end" not in w and pd.notna(_ends.iloc[i]): + w["end"] = _ends.iloc[i] + subsegment = { "text": sentence_text, "start": sentence_start, From da072d6bcb593720566ff6c773f8caf86f1591e7 Mon Sep 17 00:00:00 2001 From: Claude-Assistant Date: Sun, 15 Mar 2026 13:33:11 +0100 Subject: [PATCH 2/6] test: add regression test for #1372 (digits+comma get no timestamps) Reproduces the exact reported bug: '4,9' embedded in text gets no start/end/score because digits and commas are not in the alignment model dictionary. Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/test_word_timestamp_interpolation.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_word_timestamp_interpolation.py b/tests/test_word_timestamp_interpolation.py index 08be577fd..ed2602eff 100644 --- a/tests/test_word_timestamp_interpolation.py +++ b/tests/test_word_timestamp_interpolation.py @@ -133,3 +133,17 @@ def test_timestamps_are_ordered(self): assert starts[i] >= starts[i - 1], ( f"Timestamps not ordered: {starts}" ) + + def test_issue_1372_digits_comma_no_timestamps(self): + """Regression: '4,9' (digits+comma) must get timestamps. + + https://github.com/m-bain/whisperX/issues/1372#issuecomment-4051234966 + Reporter showed that align() returned {'word': '4,9'} with no + start/end/score for German text containing '4,9'. + """ + result = self._run_align("halt mit 4,9 nicht ins parlament", num_frames=200) + words = {w["word"]: w for w in result["word_segments"]} + assert "4,9" in words, f"'4,9' not in word_segments: {list(words.keys())}" + assert "start" in words["4,9"], "'4,9' missing start" + assert "end" in words["4,9"], "'4,9' missing end" + assert "score" in words["4,9"], "'4,9' missing score" From f9a3f8fadae2832ae5b39aee7f469926acccbe58 Mon Sep 17 00:00:00 2001 From: Claude-Assistant Date: Sun, 15 Mar 2026 13:41:50 +0100 Subject: [PATCH 3/6] ci: add pytest dev dependency and test workflow Add pytest to optional dev dependencies and create a dedicated tests.yml workflow to run pytest across Python 3.10-3.13. Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/tests.yml | 30 ++++++++++ pyproject.toml | 3 + uv.lock | 115 ++++++++++++++++++++++++++---------- 3 files changed, 118 insertions(+), 30 deletions(-) create mode 100644 .github/workflows/tests.yml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 000000000..dbfd87509 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,30 @@ +name: Tests + +on: + push: + branches: [main] + pull_request: + branches: [main] + workflow_dispatch: + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12", "3.13"] + + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + version: "0.5.14" + python-version: ${{ matrix.python-version }} + + - name: Install the project + run: uv sync --all-extras + + - name: Run tests + run: uv run pytest tests/ -v diff --git a/pyproject.toml b/pyproject.toml index 94626d605..b5d69f625 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,9 @@ dependencies = [ ] +[project.optional-dependencies] +dev = ["pytest"] + [project.scripts] whisperx = "whisperx.__main__:cli" diff --git a/uv.lock b/uv.lock index 22d61222e..2aace58a6 100644 --- a/uv.lock +++ b/uv.lock @@ -526,6 +526,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/87/62/9773de14fe6c45c23649e98b83231fffd7b9892b6cf863251dc2afa73643/einops-0.8.1-py3-none-any.whl", hash = "sha256:919387eb55330f5757c6bea9165c5ff5cfe63a642682ea788a6d472576d81737", size = 64359, upload-time = "2025-02-09T03:17:01.998Z" }, ] +[[package]] +name = "exceptiongroup" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371, upload-time = "2025-11-21T23:01:54.787Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/0e/97c33bf5009bdbac74fd2beace167cab3f978feb69cc36f1ef79360d6c4e/exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598", size = 16740, upload-time = "2025-11-21T23:01:53.443Z" }, +] + [[package]] name = "faster-whisper" version = "1.2.0" @@ -888,6 +900,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" }, ] +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, +] + [[package]] name = "jinja2" version = "3.1.6" @@ -1864,6 +1885,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/34/e7/ae39f538fd6844e982063c3a5e4598b8ced43b9633baa3a85ef33af8c05c/pillow-11.3.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c84d689db21a1c397d001aa08241044aa2069e7587b398c8cc63020390b1c1b8", size = 6984598, upload-time = "2025-07-01T09:16:27.732Z" }, ] +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, +] + [[package]] name = "primepy" version = "1.3" @@ -2109,6 +2139,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5a/dc/491b7661614ab97483abf2056be1deee4dc2490ecbf7bff9ab5cdbac86e1/pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6", size = 83178, upload-time = "2024-09-19T02:40:08.598Z" }, ] +[[package]] +name = "pytest" +version = "9.0.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -2849,16 +2897,16 @@ dependencies = [ { name = "torch", version = "2.8.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cpu/torchaudio-2.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c2f44cf279f673cfcdd8f576c349eee8bedf8caab351a5dd78b32970cc34a212" }, - { url = "https://download.pytorch.org/whl/cpu/torchaudio-2.8.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:d3c1b85b26a09832d139f6d6da6b66caeb51d2e16e08f8587665c44a9e1aa8f9" }, - { url = "https://download.pytorch.org/whl/cpu/torchaudio-2.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c9276857d241c6de257af765c0f51fc011af38cb725401495121b280913007cf" }, - { url = "https://download.pytorch.org/whl/cpu/torchaudio-2.8.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:4573c6042950c20278e3608a9a38050ba0bc72e0049e1bbfd249caf859a8029b" }, - { url = "https://download.pytorch.org/whl/cpu/torchaudio-2.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ddef94bf181e6447cbb05f38beaca8f6c5bb8d2b9ddced1aa3452025b9fc70d3" }, - { url = "https://download.pytorch.org/whl/cpu/torchaudio-2.8.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:862e2e40bf09d865e5df080a84c1a39bbcef40e43140f4b1737eb3a389d3b38f" }, - { url = "https://download.pytorch.org/whl/cpu/torchaudio-2.8.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f851d32e94ca05e470f0c60e25726ec1e0eb71cb2ca5a0206b7fd03272ccc3c8" }, - { url = "https://download.pytorch.org/whl/cpu/torchaudio-2.8.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:09535a9b727c0793cd07c1ace99f3f353626281bcc3e30c2f2314e3ebc9d3f96" }, - { url = "https://download.pytorch.org/whl/cpu/torchaudio-2.8.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:68df9c9068984edff8065c2b6656725e6114fe89281b0cf122c7505305fc98a4" }, - { url = "https://download.pytorch.org/whl/cpu/torchaudio-2.8.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:1951f10ed092f2dda57634f6a3950ef21c9d9352551aa84a9fccd51bbda18095" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchaudio-2.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c2f44cf279f673cfcdd8f576c349eee8bedf8caab351a5dd78b32970cc34a212" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchaudio-2.8.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:d3c1b85b26a09832d139f6d6da6b66caeb51d2e16e08f8587665c44a9e1aa8f9" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchaudio-2.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c9276857d241c6de257af765c0f51fc011af38cb725401495121b280913007cf" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchaudio-2.8.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:4573c6042950c20278e3608a9a38050ba0bc72e0049e1bbfd249caf859a8029b" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchaudio-2.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ddef94bf181e6447cbb05f38beaca8f6c5bb8d2b9ddced1aa3452025b9fc70d3" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchaudio-2.8.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:862e2e40bf09d865e5df080a84c1a39bbcef40e43140f4b1737eb3a389d3b38f" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchaudio-2.8.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f851d32e94ca05e470f0c60e25726ec1e0eb71cb2ca5a0206b7fd03272ccc3c8" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchaudio-2.8.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:09535a9b727c0793cd07c1ace99f3f353626281bcc3e30c2f2314e3ebc9d3f96" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchaudio-2.8.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:68df9c9068984edff8065c2b6656725e6114fe89281b0cf122c7505305fc98a4" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchaudio-2.8.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:1951f10ed092f2dda57634f6a3950ef21c9d9352551aa84a9fccd51bbda18095" }, ] [[package]] @@ -2875,16 +2923,16 @@ dependencies = [ { name = "torch", version = "2.8.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or (platform_machine == 'aarch64' and platform_python_implementation != 'CPython' and sys_platform == 'linux') or (platform_machine != 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:c955835e470ebbde03d7d54ca5d8ba5722138bbfd66cfb86845234b3a5b9f9fa" }, - { url = "https://download.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp310-cp310-win_amd64.whl", hash = "sha256:e9e68f16f1afe108f0cb1c7d636d0242fdc43cbbcaab222a72a373b9d2799134" }, - { url = "https://download.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e54bd7fc9472019308097d99102df9acee22aa2451ae808d27840bc874320292" }, - { url = "https://download.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp311-cp311-win_amd64.whl", hash = "sha256:db37df7eee906f8fe0a639fdc673f3541cb2e173169b16d4133447eb922d1938" }, - { url = "https://download.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:9377faee65a290578280ac7f4884c3586253dac2ca28c60f458ff6efe86a6b05" }, - { url = "https://download.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp312-cp312-win_amd64.whl", hash = "sha256:9b302192b570657c1cc787a4d487ae4bbb7f2aab1c01b1fcc46757e7f86f391e" }, - { url = "https://download.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:e1b1f530e8b71b1d079e23db45a0e621709061710ef8540aae8280aa039554ee" }, - { url = "https://download.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp313-cp313-win_amd64.whl", hash = "sha256:0c2d081e24204768e636cbf05e1377c8a6964b8ed6fa3aa5092ba9af9bbc19c5" }, - { url = "https://download.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:89c2d04fe1cb7c31eb042f7b36e1ce8e2afacf769ecd5f216527e184e4857099" }, - { url = "https://download.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp313-cp313t-win_amd64.whl", hash = "sha256:ab4653da31dc37f0a643f41f4da8bee647a8686bacf12d3929cac8aead186811" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:c955835e470ebbde03d7d54ca5d8ba5722138bbfd66cfb86845234b3a5b9f9fa" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp310-cp310-win_amd64.whl", hash = "sha256:e9e68f16f1afe108f0cb1c7d636d0242fdc43cbbcaab222a72a373b9d2799134" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e54bd7fc9472019308097d99102df9acee22aa2451ae808d27840bc874320292" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp311-cp311-win_amd64.whl", hash = "sha256:db37df7eee906f8fe0a639fdc673f3541cb2e173169b16d4133447eb922d1938" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:9377faee65a290578280ac7f4884c3586253dac2ca28c60f458ff6efe86a6b05" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp312-cp312-win_amd64.whl", hash = "sha256:9b302192b570657c1cc787a4d487ae4bbb7f2aab1c01b1fcc46757e7f86f391e" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:e1b1f530e8b71b1d079e23db45a0e621709061710ef8540aae8280aa039554ee" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp313-cp313-win_amd64.whl", hash = "sha256:0c2d081e24204768e636cbf05e1377c8a6964b8ed6fa3aa5092ba9af9bbc19c5" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:89c2d04fe1cb7c31eb042f7b36e1ce8e2afacf769ecd5f216527e184e4857099" }, + { url = "https://download-r2.pytorch.org/whl/cpu/torchaudio-2.8.0%2Bcpu-cp313-cp313t-win_amd64.whl", hash = "sha256:ab4653da31dc37f0a643f41f4da8bee647a8686bacf12d3929cac8aead186811" }, ] [[package]] @@ -2901,16 +2949,16 @@ dependencies = [ { name = "torch", version = "2.8.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.8.0%2Bcu128-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a0161e95285a0b716de210fee0392151d601e7da3cc86595008d826abff48a8c" }, - { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.8.0%2Bcu128-cp310-cp310-win_amd64.whl", hash = "sha256:5d7a9d913e2744573ed3b7ec2f781ed39833c81c9c41859973ec10ac174c2366" }, - { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.8.0%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:f4409df567d0723a7a3a89d32c7552a17e0ff6f137ea26a0d268c665259b2995" }, - { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.8.0%2Bcu128-cp311-cp311-win_amd64.whl", hash = "sha256:7a1eb6154e05b8056b34c7a41495e09d57f79eb0180eb4e7f3bb2a61845ca8ea" }, - { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.8.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:145b8a0c21cfcaa1705c67173c5d439087e0e120d5da9bc344746f937901d243" }, - { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.8.0%2Bcu128-cp312-cp312-win_amd64.whl", hash = "sha256:cce3a60cd9a97f7360c8f95504ac349311fb7d6b9b826135936764f4de5f782d" }, - { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.8.0%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:410bb8ea46225efe658e5d27a3802c181a2255913003621a5d25a51aca8018d9" }, - { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.8.0%2Bcu128-cp313-cp313-win_amd64.whl", hash = "sha256:3146bbd48992d215f6bb1aef9626d734c3180b377791ded2a4d4d2c0e63c0cc2" }, - { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.8.0%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:04b410f93337fc6c16576d0c88e2a31091aef9d1fd212ebb8cd26899dba175e0" }, - { url = "https://download.pytorch.org/whl/cu128/torchaudio-2.8.0%2Bcu128-cp313-cp313t-win_amd64.whl", hash = "sha256:1054e0a7613cac54ed9b3784a5fcbe023748a70004d9cca74c5f9ae00a1fdfd1" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchaudio-2.8.0%2Bcu128-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a0161e95285a0b716de210fee0392151d601e7da3cc86595008d826abff48a8c" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchaudio-2.8.0%2Bcu128-cp310-cp310-win_amd64.whl", hash = "sha256:5d7a9d913e2744573ed3b7ec2f781ed39833c81c9c41859973ec10ac174c2366" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchaudio-2.8.0%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:f4409df567d0723a7a3a89d32c7552a17e0ff6f137ea26a0d268c665259b2995" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchaudio-2.8.0%2Bcu128-cp311-cp311-win_amd64.whl", hash = "sha256:7a1eb6154e05b8056b34c7a41495e09d57f79eb0180eb4e7f3bb2a61845ca8ea" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchaudio-2.8.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:145b8a0c21cfcaa1705c67173c5d439087e0e120d5da9bc344746f937901d243" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchaudio-2.8.0%2Bcu128-cp312-cp312-win_amd64.whl", hash = "sha256:cce3a60cd9a97f7360c8f95504ac349311fb7d6b9b826135936764f4de5f782d" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchaudio-2.8.0%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:410bb8ea46225efe658e5d27a3802c181a2255913003621a5d25a51aca8018d9" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchaudio-2.8.0%2Bcu128-cp313-cp313-win_amd64.whl", hash = "sha256:3146bbd48992d215f6bb1aef9626d734c3180b377791ded2a4d4d2c0e63c0cc2" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchaudio-2.8.0%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:04b410f93337fc6c16576d0c88e2a31091aef9d1fd212ebb8cd26899dba175e0" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchaudio-2.8.0%2Bcu128-cp313-cp313t-win_amd64.whl", hash = "sha256:1054e0a7613cac54ed9b3784a5fcbe023748a70004d9cca74c5f9ae00a1fdfd1" }, ] [[package]] @@ -3047,6 +3095,11 @@ dependencies = [ { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, ] +[package.optional-dependencies] +dev = [ + { name = "pytest" }, +] + [package.metadata] requires-dist = [ { name = "ctranslate2", specifier = ">=4.5.0" }, @@ -3057,6 +3110,7 @@ requires-dist = [ { name = "omegaconf", specifier = ">=2.3.0" }, { name = "pandas", specifier = ">=2.2.3" }, { name = "pyannote-audio", specifier = ">=4.0.0" }, + { name = "pytest", marker = "extra == 'dev'" }, { name = "torch", marker = "sys_platform == 'darwin'", specifier = "~=2.8.0", index = "https://download.pytorch.org/whl/cpu" }, { name = "torch", marker = "platform_machine != 'x86_64' and sys_platform != 'darwin'", specifier = "~=2.8.0", index = "https://download.pytorch.org/whl/cpu" }, { name = "torch", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'", specifier = "~=2.8.0", index = "https://download.pytorch.org/whl/cu128" }, @@ -3066,6 +3120,7 @@ requires-dist = [ { name = "transformers", specifier = ">=4.48.0" }, { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'", specifier = ">=3.3.0", index = "https://download.pytorch.org/whl/cu128" }, ] +provides-extras = ["dev"] [[package]] name = "yarl" From 94f60aab58564fdb5bfa7f34c77643daf6d22e00 Mon Sep 17 00:00:00 2001 From: Barabazs <31799121+Barabazs@users.noreply.github.com> Date: Wed, 25 Mar 2026 08:36:27 +0100 Subject: [PATCH 4/6] chore: bump version to 3.8.3 --- pyproject.toml | 2 +- uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b5d69f625..0a9f9f9a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ urls = { repository = "https://github.com/m-bain/whisperx" } authors = [{ name = "Max Bain" }] name = "whisperx" -version = "3.8.2" +version = "3.8.3" description = "Time-Accurate Automatic Speech Recognition using Whisper." readme = "README.md" requires-python = ">=3.10, <3.14" diff --git a/uv.lock b/uv.lock index 2aace58a6..c7f0e3453 100644 --- a/uv.lock +++ b/uv.lock @@ -3074,7 +3074,7 @@ wheels = [ [[package]] name = "whisperx" -version = "3.8.2" +version = "3.8.3" source = { editable = "." } dependencies = [ { name = "ctranslate2" }, From 8efddaa2506c546fc66856405b5876033c1ca72a Mon Sep 17 00:00:00 2001 From: Claude-Assistant Date: Wed, 25 Mar 2026 08:54:26 +0100 Subject: [PATCH 5/6] fix: require faster-whisper>=1.2.0 for use_auth_token support (#1385) Co-Authored-By: Claude Opus 4.6 (1M context) --- pyproject.toml | 2 +- uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0a9f9f9a6..1b87bd4fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ license = { text = "BSD-2-Clause" } dependencies = [ "ctranslate2>=4.5.0", - "faster-whisper>=1.1.1", + "faster-whisper>=1.2.0", "nltk>=3.9.1", "numpy>=2.1.0", "omegaconf>=2.3.0", diff --git a/uv.lock b/uv.lock index c7f0e3453..ade706d0f 100644 --- a/uv.lock +++ b/uv.lock @@ -3103,7 +3103,7 @@ dev = [ [package.metadata] requires-dist = [ { name = "ctranslate2", specifier = ">=4.5.0" }, - { name = "faster-whisper", specifier = ">=1.1.1" }, + { name = "faster-whisper", specifier = ">=1.2.0" }, { name = "huggingface-hub", specifier = "<1.0.0" }, { name = "nltk", specifier = ">=3.9.1" }, { name = "numpy", specifier = ">=2.1.0" }, From 095b36b5573ae2b6b7d0711d36c0e0c09df088d1 Mon Sep 17 00:00:00 2001 From: Barabazs <31799121+Barabazs@users.noreply.github.com> Date: Wed, 25 Mar 2026 09:10:59 +0100 Subject: [PATCH 6/6] chore: bump version to 3.8.4 --- pyproject.toml | 2 +- uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1b87bd4fc..df0592eaf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ urls = { repository = "https://github.com/m-bain/whisperx" } authors = [{ name = "Max Bain" }] name = "whisperx" -version = "3.8.3" +version = "3.8.4" description = "Time-Accurate Automatic Speech Recognition using Whisper." readme = "README.md" requires-python = ">=3.10, <3.14" diff --git a/uv.lock b/uv.lock index ade706d0f..7b26699df 100644 --- a/uv.lock +++ b/uv.lock @@ -3074,7 +3074,7 @@ wheels = [ [[package]] name = "whisperx" -version = "3.8.3" +version = "3.8.4" source = { editable = "." } dependencies = [ { name = "ctranslate2" },