From c2e245ef67a71113d482fb46d49432e498e17863 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?I=C3=B1aki=20Lakunza?= Date: Fri, 6 Mar 2026 12:25:29 +0100 Subject: [PATCH] fix(normalizations): guard against index out of range in LogProbTokenNorm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem When running evaluations with cached predictions, `normalize_log_probs` crashes with an `IndexError: list index out of range` inside the `LogProbTokenNorm` case. This happens because the cached `output_tokens` list can be shorter than `choices_logprob`, for example when a task's choices change between runs but the cache is not invalidated. Affected path: `lighteval/metrics/normalizations.py` → `normalize_log_probs` → `LogProbTokenNorm` branch ## Fix - Use `min(len(choices_logprob), len(choices_tokens))` to safely cap the iteration range instead of blindly iterating over `len(choices_logprob)`. - Emit a `logger.warning` when truncation occurs so users are alerted to potential cache corruption and can take action (e.g. clearing the cache). - Add a module-level `logger = logging.getLogger(__name__)` for consistent logging. ## Changes - `src/lighteval/metrics/normalizations.py` - Added `import logging` and module-level logger - Replaced bare list comprehension in `LogProbTokenNorm` with length-guarded version ## How to reproduce the original bug 1. Run an evaluation that uses `LogProbTokenNorm` (e.g. `belebele_mkd_Cyrl_cf`). 2. Allow results to be cached. 3. Modify the number of choices for the task, or use a stale cache from a previous run. 4. Re-run; the pipeline crashes at the metric computation stage with `IndexError: list index out of range`. 
--- src/lighteval/metrics/normalizations.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/lighteval/metrics/normalizations.py b/src/lighteval/metrics/normalizations.py index ef55681b1..15932dd87 100644 --- a/src/lighteval/metrics/normalizations.py +++ b/src/lighteval/metrics/normalizations.py @@ -20,6 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import logging import re import string import sys @@ -31,6 +32,8 @@ from lighteval.utils.imports import Extra, requires from lighteval.utils.language import Language +logger = logging.getLogger(__name__) + # From HELM def helm_normalizer(text: str) -> str: @@ -523,8 +526,14 @@ def normalize_log_probs( normalized_log_probs = [choices_logprob[ix] / len(choice) for ix, choice in enumerate(choices_text)] case LogProbTokenNorm(): assert choices_tokens is not None, "choices_tokens must be provided for token normalization" + n = min(len(choices_logprob), len(choices_tokens)) + if n < len(choices_logprob): + logger.warning( + f"choices_tokens length ({len(choices_tokens)}) is less than choices_logprob length " + f"({len(choices_logprob)}). This may indicate corrupted cache data. Truncating to {n} elements." + ) normalized_log_probs = [ - choices_logprob[ix] / len(choices_tokens[ix]) for ix in range(len(choices_logprob)) + choices_logprob[ix] / len(choices_tokens[ix]) for ix in range(n) ] case LogProbPMINorm(): assert unconditioned_logprob is not None, "unconditioned_logprob must be provided for PMI normalization"