From c2e245ef67a71113d482fb46d49432e498e17863 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?I=C3=B1aki=20Lakunza?= Date: Fri, 6 Mar 2026 12:25:29 +0100 Subject: [PATCH] fix(normalizations): guard against index out of range in LogProbTokenNorm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem When running evaluations with cached predictions, `normalize_log_probs` crashes with an `IndexError: list index out of range` inside the `LogProbTokenNorm` case. This happens because the cached `output_tokens` list can be shorter than `choices_logprob`, for example when a task's choices change between runs but the cache is not invalidated. Affected path: `lighteval/metrics/normalizations.py` → `normalize_log_probs` → `LogProbTokenNorm` branch ## Fix - Use `min(len(choices_logprob), len(choices_tokens))` to safely cap the iteration range instead of blindly iterating over `len(choices_logprob)`. - Emit a `logger.warning` when truncation occurs so users are alerted to potential cache corruption and can take action (e.g. clearing the cache). - Add a module-level `logger = logging.getLogger(__name__)` for consistent logging. ## Changes - `src/lighteval/metrics/normalizations.py` - Added `import logging` and module-level logger - Replaced bare list comprehension in `LogProbTokenNorm` with length-guarded version ## How to reproduce the original bug 1. Run an evaluation that uses `LogProbTokenNorm` (e.g. `belebele_mkd_Cyrl_cf`). 2. Allow results to be cached. 3. Modify the number of choices for the task, or use a stale cache from a previous run. 4. Re-run; the pipeline crashes at the metric computation stage with `IndexError: list index out of range`. 
--- src/lighteval/metrics/normalizations.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/lighteval/metrics/normalizations.py b/src/lighteval/metrics/normalizations.py index ef55681b1..15932dd87 100644 --- a/src/lighteval/metrics/normalizations.py +++ b/src/lighteval/metrics/normalizations.py @@ -20,6 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import logging import re import string import sys @@ -31,6 +32,8 @@ from lighteval.utils.imports import Extra, requires from lighteval.utils.language import Language +logger = logging.getLogger(__name__) + # From HELM def helm_normalizer(text: str) -> str: @@ -523,8 +526,14 @@ def normalize_log_probs( normalized_log_probs = [choices_logprob[ix] / len(choice) for ix, choice in enumerate(choices_text)] case LogProbTokenNorm(): assert choices_tokens is not None, "choices_tokens must be provided for token normalization" + n = min(len(choices_logprob), len(choices_tokens)) + if n < len(choices_logprob): + logger.warning( + f"choices_tokens length ({len(choices_tokens)}) is less than choices_logprob length " + f"({len(choices_logprob)}). This may indicate corrupted cache data. Truncating to {n} elements." + ) normalized_log_probs = [ - choices_logprob[ix] / len(choices_tokens[ix]) for ix in range(len(choices_logprob)) + choices_logprob[ix] / len(choices_tokens[ix]) for ix in range(n) ] case LogProbPMINorm(): assert unconditioned_logprob is not None, "unconditioned_logprob must be provided for PMI normalization"