From d9941988e809d593b00887f22c5f0500b737fd99 Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sun, 14 Jun 2026 14:06:15 +0000 Subject: [PATCH 1/2] fix: make _get_offset_tokenizer immune to the global fastokens patch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit load_tokenizer toggles a process-global fastokens monkeypatch per pool-slot load, holding _FASTOKENS_PATCH_LOCK only around the patch/unpatch calls, not the load. Under the concurrent renderer pool, _get_offset_tokenizer's 'vanilla' reload could race an open patch window, get an offset-less fastokens-backed tokenizer, and cache it — permanently breaking offset attribution (renderers using attribute_text_segments then raise 'fastokens does not track character offsets'). Reload with the patch forced off under _FASTOKENS_PATCH_LOCK and re-probe before caching, so a poisoned (non-offset) tokenizer is never returned or cached. Co-Authored-By: Claude Opus 4.8 (1M context) --- renderers/base.py | 47 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/renderers/base.py b/renderers/base.py index 242adae..44c897b 100644 --- a/renderers/base.py +++ b/renderers/base.py @@ -1689,19 +1689,44 @@ def _get_offset_tokenizer(tokenizer): kwargs = {"trust_remote_code": True, "revision": revision} else: kwargs = {"trust_remote_code": False} - # Explicitly vanilla — we want HF's Rust tokenizer with offset - # tracking, not the fastokens shim. ``load_tokenizer`` would - # patch fastokens in by default; routing through - # ``_load_tokenizer_via_auto`` keeps the fastokens patch out - # of this code path while still applying the config-build - # fallback (RoPE-validation failures on nested - # ``rope_parameters``, etc.). + def _has_offsets(tok) -> bool: + if not getattr(tok, "is_fast", False): + return False + try: + tok("a", add_special_tokens=False, return_offsets_mapping=True) + return True + except (NotImplementedError, ValueError, TypeError): + return False + + # We want HF's Rust tokenizer with offset tracking, not the fastokens + # shim. The shim is installed by a *process-global* monkeypatch that + # ``load_tokenizer`` toggles per pool-slot load, so a plain reload here + # can race a concurrent slot's open patch window and silently pick up + # the offset-less shim (then get cached, poisoning the process). So: + # load, verify offsets, and if missing, reload with the patch forced + # off — serialized against pool patch/unpatch via ``_FASTOKENS_PATCH_LOCK`` + # so no concurrent window can swap the shim back in mid-load — then + # restore the prior patch state. Never cache a non-offset tokenizer. offset_tok = _load_tokenizer_via_auto(name_or_path, **kwargs) - if not getattr(offset_tok, "is_fast", False): + if not _has_offsets(offset_tok): + import fastokens + + with _FASTOKENS_PATCH_LOCK: + was_patched = bool(getattr(fastokens, "_patched", False)) + if was_patched: + with contextlib.redirect_stdout(io.StringIO()): + fastokens.unpatch_transformers() + try: + offset_tok = _load_tokenizer_via_auto(name_or_path, **kwargs) + finally: + if was_patched: + with contextlib.redirect_stdout(io.StringIO()): + fastokens.patch_transformers() + if not _has_offsets(offset_tok): raise RuntimeError( - f"Vanilla tokenizer for {name_or_path!r} is not a fast " - "tokenizer; offset_mapping is unavailable. Hand-coded " - "renderers require a fast tokenizer for body/scaffold " + f"Could not load an offset-capable tokenizer for {name_or_path!r}: " + "offset_mapping is unavailable even with the fastokens patch off. " + "Hand-coded renderers require a fast tokenizer for body/scaffold " "attribution." ) _offset_tokenizers[name_or_path] = offset_tok From 057f0087d805a428f70393be209134f23b1e9880 Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sun, 14 Jun 2026 21:27:40 +0000 Subject: [PATCH 2/2] style: ruff format (blank line before nested def) Co-Authored-By: Claude Opus 4.8 (1M context) --- renderers/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/renderers/base.py b/renderers/base.py index 44c897b..ff2f083 100644 --- a/renderers/base.py +++ b/renderers/base.py @@ -1689,6 +1689,7 @@ def _get_offset_tokenizer(tokenizer): kwargs = {"trust_remote_code": True, "revision": revision} else: kwargs = {"trust_remote_code": False} + def _has_offsets(tok) -> bool: if not getattr(tok, "is_fast", False): return False