From 27bdf1edba6bea182e63617b1db95d7282bcdfc9 Mon Sep 17 00:00:00 2001
From: bartzbeielstein <32470350+bartzbeielstein@users.noreply.github.com>
Date: Sun, 14 Jun 2026 16:42:54 +0200
Subject: [PATCH] feat: add forecast level/debias, prior blend, and approach
 scoring to processing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three pure additions to spotforecast2_safe.processing, motivated by the
2026-06-13 team_4 post-mortem (a flat all-day over-prediction on a low-load
Saturday the four-zone setup could not catch):

- shape_check.check_forecast_level / LevelCheckReport / apply_level_correction:
  detect and remove a systematic flat level offset; complements the existing
  profile/shape check.
- blend.blend_with_prior: convex post-hoc blend of a model forecast with an
  external prior — the correct lever for down-weighting a near-oracle prior,
  since tree models are invariant to monotonic feature scaling.
- forecast_scoring.score_forecasts: tidy approach-by-metric table ranking
  competing forecasts (e.g. 4-zone bottom-up sum vs single combined model)
  against a shared actual.

Pure pandas/numpy, no forbidden deps; registered in the quartodoc API
reference; 67 new unit tests; full suite green (2755 passed).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 _quarto.yml                                   |  15 +
 docs/reference/index.qmd                      |   5 +
 .../processing.blend.blend_with_prior.qmd     |  56 ++++
 ...ssing.forecast_scoring.score_forecasts.qmd |  63 +++++
 ...rocessing.shape_check.LevelCheckReport.qmd |  48 ++++
 ...ing.shape_check.apply_level_correction.qmd |  62 +++++
 ...ssing.shape_check.check_forecast_level.qmd |  67 +++++
 src/spotforecast2_safe/processing/__init__.py |  16 +-
 src/spotforecast2_safe/processing/blend.py    |  87 ++++++
 .../processing/forecast_scoring.py            | 132 +++++++++
 .../processing/shape_check.py                 | 256 ++++++++++++++++++
 tests/processing/test_blend.py                |  63 +++++
 tests/processing/test_forecast_scoring.py     |  82 ++++++
 tests/processing/test_level_check.py          | 166 ++++++++++++
 14 files changed, 1117 insertions(+), 1 deletion(-)
 create mode 100644 docs/reference/processing.blend.blend_with_prior.qmd
 create mode 100644 docs/reference/processing.forecast_scoring.score_forecasts.qmd
 create mode 100644 docs/reference/processing.shape_check.LevelCheckReport.qmd
 create mode 100644 docs/reference/processing.shape_check.apply_level_correction.qmd
 create mode 100644 docs/reference/processing.shape_check.check_forecast_level.qmd
 create mode 100644 src/spotforecast2_safe/processing/blend.py
 create mode 100644 src/spotforecast2_safe/processing/forecast_scoring.py
 create mode 100644 tests/processing/test_blend.py
 create mode 100644 tests/processing/test_forecast_scoring.py
 create mode 100644 tests/processing/test_level_check.py

diff --git a/_quarto.yml b/_quarto.yml
index aca6ea983..a4d021e88 100644
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -230,6 +230,16 @@ website:
                 file: docs/reference/processing.shape_check.ShapeCheckReport.qmd
               - text: "check_forecast_shape"
                 file: docs/reference/processing.shape_check.check_forecast_shape.qmd
+              - text: "LevelCheckReport"
+                file: docs/reference/processing.shape_check.LevelCheckReport.qmd
+              - text: "check_forecast_level"
+                file: docs/reference/processing.shape_check.check_forecast_level.qmd
+              - text: "apply_level_correction"
+                file: docs/reference/processing.shape_check.apply_level_correction.qmd
+              - text: "blend_with_prior"
+                file: docs/reference/processing.blend.blend_with_prior.qmd
+              - text: "score_forecasts"
+                file: docs/reference/processing.forecast_scoring.score_forecasts.qmd
 
           - section: "Forecaster"
             contents:
@@ -672,6 +682,11 @@ quartodoc:
         - processing.n2n_predict_with_covariates.n2n_predict_with_covariates
         - processing.shape_check.ShapeCheckReport
         - processing.shape_check.check_forecast_shape
+        - processing.shape_check.LevelCheckReport
+        - processing.shape_check.check_forecast_level
+        - processing.shape_check.apply_level_correction
+        - processing.blend.blend_with_prior
+        - processing.forecast_scoring.score_forecasts
 
     # ── Forecaster ────────────────────────────────────────────────────────────
     - title: "Forecaster"
diff --git a/docs/reference/index.qmd b/docs/reference/index.qmd
index 01f56df75..201a201e1 100644
--- a/docs/reference/index.qmd
+++ b/docs/reference/index.qmd
@@ -88,6 +88,11 @@ Utilities for aggregated and n-to-n predictions.
 | [processing.n2n_predict_with_covariates.n2n_predict_with_covariates](processing.n2n_predict_with_covariates.n2n_predict_with_covariates.qmd#spotforecast2_safe.processing.n2n_predict_with_covariates.n2n_predict_with_covariates) | End-to-end recursive forecasting with exogenous covariates. |
 | [processing.shape_check.ShapeCheckReport](processing.shape_check.ShapeCheckReport.qmd#spotforecast2_safe.processing.shape_check.ShapeCheckReport) | Immutable result of a forecast shape plausibility check. |
 | [processing.shape_check.check_forecast_shape](processing.shape_check.check_forecast_shape.qmd#spotforecast2_safe.processing.shape_check.check_forecast_shape) | Measure correlation and daily-range ratio between a forecast and its reference. |
+| [processing.shape_check.LevelCheckReport](processing.shape_check.LevelCheckReport.qmd#spotforecast2_safe.processing.shape_check.LevelCheckReport) | Immutable result of a forecast *level* (systematic-bias) check. |
+| [processing.shape_check.check_forecast_level](processing.shape_check.check_forecast_level.qmd#spotforecast2_safe.processing.shape_check.check_forecast_level) | Measure the systematic level offset between a forecast and its reference. |
+| [processing.shape_check.apply_level_correction](processing.shape_check.apply_level_correction.qmd#spotforecast2_safe.processing.shape_check.apply_level_correction) | Shift a forecast so its central level matches a reference (debias). |
+| [processing.blend.blend_with_prior](processing.blend.blend_with_prior.qmd#spotforecast2_safe.processing.blend.blend_with_prior) | Convex-blend a model forecast with an external prior. |
+| [processing.forecast_scoring.score_forecasts](processing.forecast_scoring.score_forecasts.qmd#spotforecast2_safe.processing.forecast_scoring.score_forecasts) | Score several forecasts against a shared actual and rank them. |
 
 ## Forecaster
 
diff --git a/docs/reference/processing.blend.blend_with_prior.qmd b/docs/reference/processing.blend.blend_with_prior.qmd
new file mode 100644
index 000000000..1cc7f0b50
--- /dev/null
+++ b/docs/reference/processing.blend.blend_with_prior.qmd
@@ -0,0 +1,56 @@
+# processing.blend.blend_with_prior { #spotforecast2_safe.processing.blend.blend_with_prior }
+
+```python
+processing.blend.blend_with_prior(model_forecast, prior, *, weight)
+```
+
+Convex-blend a model forecast with an external prior.
+
+Returns ``(1 - weight) * model_forecast + weight * prior`` on the index
+intersection of the two series.  ``weight`` is the trust placed in the
+prior: ``0.0`` returns the model forecast unchanged (prior ignored),
+``1.0`` returns the prior, and intermediate values interpolate.  This is the
+correct lever for down-weighting a near-oracle prior whose influence on a
+tree model cannot be tuned through feature scaling.
+
+The function is **pure**: it does not mutate its inputs and emits no
+warnings.  The result carries ``model_forecast``'s name.
+
+## Parameters {.doc-section .doc-section-parameters}
+
+| Name           | Type                                     | Description                                                                             | Default    |
+|----------------|------------------------------------------|-----------------------------------------------------------------------------------------|------------|
+| model_forecast | [pd](`pandas`).[Series](`pandas.Series`) | The trained model's forecast.                                                           | _required_ |
+| prior          | [pd](`pandas`).[Series](`pandas.Series`) | The external prior to blend in (e.g. the ENTSO-E day-ahead forecast), aligned by index. | _required_ |
+| weight         | [float](`float`)                         | Blend weight in ``[0.0, 1.0]`` — the trust placed in ``prior``.                         | _required_ |
+
+## Returns {.doc-section .doc-section-returns}
+
+| Name   | Type                                     | Description                                                 |
+|--------|------------------------------------------|-------------------------------------------------------------|
+|        | [pd](`pandas`).[Series](`pandas.Series`) | A new ``pd.Series`` over the index intersection, named like |
+|        | [pd](`pandas`).[Series](`pandas.Series`) | ``model_forecast``.                                         |
+
+## Raises {.doc-section .doc-section-raises}
+
+| Name   | Type                       | Description                                                                           |
+|--------|----------------------------|---------------------------------------------------------------------------------------|
+|        | [TypeError](`TypeError`)   | When ``model_forecast`` or ``prior`` is not a ``pd.Series``.                          |
+|        | [ValueError](`ValueError`) | When ``weight`` is outside ``[0.0, 1.0]`` or the two series share no index positions. |
+
+## Examples {.doc-section .doc-section-examples}
+
+```{python}
+import pandas as pd
+from spotforecast2_safe.processing.blend import blend_with_prior
+
+idx = pd.date_range("2026-06-13 00:00", periods=4, freq="h", tz="UTC")
+model = pd.Series([100.0, 110.0, 120.0, 130.0], index=idx, name="y0")
+prior = pd.Series([140.0, 140.0, 140.0, 140.0], index=idx)
+
+# weight=0 -> model unchanged; weight=1 -> prior; 0.25 -> 75/25 mix.
+print(blend_with_prior(model, prior, weight=0.0).tolist())
+print(blend_with_prior(model, prior, weight=1.0).tolist())
+print(blend_with_prior(model, prior, weight=0.25).tolist())
+assert blend_with_prior(model, prior, weight=0.0).equals(model)
+```
\ No newline at end of file
diff --git a/docs/reference/processing.forecast_scoring.score_forecasts.qmd b/docs/reference/processing.forecast_scoring.score_forecasts.qmd
new file mode 100644
index 000000000..302f86e91
--- /dev/null
+++ b/docs/reference/processing.forecast_scoring.score_forecasts.qmd
@@ -0,0 +1,63 @@
+# processing.forecast_scoring.score_forecasts { #spotforecast2_safe.processing.forecast_scoring.score_forecasts }
+
+```python
+processing.forecast_scoring.score_forecasts(
+    forecasts,
+    actual,
+    *,
+    metrics=SUPPORTED_METRICS,
+)
+```
+
+Score several forecasts against a shared actual and rank them.
+
+Each forecast is aligned to ``actual`` on the index intersection and scored
+on the requested ``metrics``.  The result is a tidy table indexed by
+approach name, with one column per metric plus an ``n`` column (overlap
+length), sorted ascending by the first requested metric so the best
+approach is the top row.
+
+This is **pure**: no logging, no plotting, no mutation.  Use it to compare,
+for example, a four-zone bottom-up sum against a single combined model
+(compute each approach's forecast first, e.g. via ``backtesting_forecaster``).
+
+## Parameters {.doc-section .doc-section-parameters}
+
+| Name      | Type                                                                                           | Description                                                                                                                                                                                   | Default             |
+|-----------|------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------|
+| forecasts | [Mapping](`collections.abc.Mapping`)\[[str](`str`), [pd](`pandas`).[Series](`pandas.Series`)\] | Mapping of approach name to its forecast series.                                                                                                                                              | _required_          |
+| actual    | [pd](`pandas`).[Series](`pandas.Series`)                                                       | The ground-truth series every forecast is scored against.                                                                                                                                     | _required_          |
+| metrics   | [tuple](`tuple`)\[[str](`str`), ...\]                                                          | Subset of `SUPPORTED_METRICS` to compute, in output order. ``"mae"``, ``"rmse"``, and ``"bias"`` are in the units of the series; ``"mape"`` is a percentage. The ranking uses ``metrics[0]``. | `SUPPORTED_METRICS` |
+
+## Returns {.doc-section .doc-section-returns}
+
+| Name   | Type                                           | Description                                              |
+|--------|------------------------------------------------|----------------------------------------------------------|
+|        | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | A ``pd.DataFrame`` indexed by approach name with columns |
+|        | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | ``[*metrics, "n"]``, sorted ascending by ``metrics[0]``. |
+
+## Raises {.doc-section .doc-section-raises}
+
+| Name   | Type                       | Description                                                                                               |
+|--------|----------------------------|-----------------------------------------------------------------------------------------------------------|
+|        | [TypeError](`TypeError`)   | When ``actual`` is not a ``pd.Series`` or a forecast value is not a ``pd.Series``.                        |
+|        | [ValueError](`ValueError`) | When ``actual`` is empty, ``forecasts`` is empty, or ``metrics`` contains an unsupported name / is empty. |
+
+## Examples {.doc-section .doc-section-examples}
+
+```{python}
+import pandas as pd
+from spotforecast2_safe.processing.forecast_scoring import score_forecasts
+
+idx = pd.date_range("2026-06-13 00:00", periods=24, freq="h", tz="UTC")
+actual = pd.Series([43_858.0] * 24, index=idx)
+
+forecasts = {
+    "combined": actual + 300.0,        # small mixed-ish offset
+    "four_zone_sum": actual + 1_780.0,  # flat over-prediction
+}
+table = score_forecasts(forecasts, actual, metrics=("mae", "bias"))
+print(table.round(2).to_string())
+# combined ranks first (lower MAE).
+assert table.index[0] == "combined"
+```
\ No newline at end of file
diff --git a/docs/reference/processing.shape_check.LevelCheckReport.qmd b/docs/reference/processing.shape_check.LevelCheckReport.qmd
new file mode 100644
index 000000000..d65266101
--- /dev/null
+++ b/docs/reference/processing.shape_check.LevelCheckReport.qmd
@@ -0,0 +1,48 @@
+# processing.shape_check.LevelCheckReport { #spotforecast2_safe.processing.shape_check.LevelCheckReport }
+
+```python
+processing.shape_check.LevelCheckReport(
+    n_overlap,
+    statistic,
+    forecast_level,
+    reference_level,
+    offset,
+    rel_offset,
+    tol,
+)
+```
+
+Immutable result of a forecast *level* (systematic-bias) check.
+
+Where `ShapeCheckReport` answers "does the forecast track the daily
+*profile*", this answers "does the forecast sit at the right *level*".  It
+captures a near-constant offset of the whole forecast against a reference —
+the failure mode behind the 2026-06-13 team_4 miss, where the forecast
+over-predicted every hour by a flat ~1.8 GW (``bias == MAE``).
+
+## Attributes {.doc-section .doc-section-attributes}
+
+| Name            | Type             | Description                                                                                                                                                  |
+|-----------------|------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| n_overlap       | [int](`int`)     | Number of aligned (overlapping) index positions used. When below the evaluable minimum, ``skipped`` is ``True`` and the numeric fields are ``float('nan')``. |
+| statistic       | [str](`str`)     | Central-tendency statistic used for both levels — either ``"median"`` (robust, the default) or ``"mean"``.                                                   |
+| forecast_level  | [float](`float`) | ``statistic`` of the forecast over the overlap.                                                                                                              |
+| reference_level | [float](`float`) | ``statistic`` of the reference over the overlap.                                                                                                             |
+| offset          | [float](`float`) | ``forecast_level - reference_level`` (signed; positive means the forecast sits high, i.e. systematic over-prediction).                                       |
+| rel_offset      | [float](`float`) | ``offset / abs(reference_level)`` (signed). ``float('nan')`` when the reference level is zero.                                                               |
+| tol             | [float](`float`) | Relative-offset tolerance for ``biased`` (passed through from `check_forecast_level`).                                                                       |
+
+## Examples {.doc-section .doc-section-examples}
+
+```{python}
+from spotforecast2_safe.processing.shape_check import LevelCheckReport
+
+# Forecast sits 4 % high vs the reference -> biased at tol=0.02.
+r = LevelCheckReport(
+    n_overlap=24, statistic="median",
+    forecast_level=45_600.0, reference_level=43_858.0,
+    offset=1_742.0, rel_offset=0.0397, tol=0.02,
+)
+print("biased:", r.biased, "rel_offset:", round(r.rel_offset, 4))
+assert r.biased and not r.skipped
+```
\ No newline at end of file
diff --git a/docs/reference/processing.shape_check.apply_level_correction.qmd b/docs/reference/processing.shape_check.apply_level_correction.qmd
new file mode 100644
index 000000000..fa2436e07
--- /dev/null
+++ b/docs/reference/processing.shape_check.apply_level_correction.qmd
@@ -0,0 +1,62 @@
+# processing.shape_check.apply_level_correction { #spotforecast2_safe.processing.shape_check.apply_level_correction }
+
+```python
+processing.shape_check.apply_level_correction(
+    y,
+    reference,
+    *,
+    statistic='median',
+    min_overlap=12,
+)
+```
+
+Shift a forecast so its central level matches a reference (debias).
+
+Estimates the constant offset ``statistic(y) - statistic(reference)`` over
+the index overlap and subtracts it from **every** value of ``y``, removing a
+systematic flat bias while preserving the daily shape.  This is the
+post-hoc correction for the failure `check_forecast_level` detects.
+
+The returned series keeps ``y``'s full index, name, and ordering; only the
+level is shifted.  The function is pure (no mutation of the inputs).
+
+## Parameters {.doc-section .doc-section-parameters}
+
+| Name        | Type                                     | Description                                                                                              | Default    |
+|-------------|------------------------------------------|----------------------------------------------------------------------------------------------------------|------------|
+| y           | [pd](`pandas`).[Series](`pandas.Series`) | Forecast series to correct.                                                                              | _required_ |
+| reference   | [pd](`pandas`).[Series](`pandas.Series`) | Reference whose level ``y`` should be aligned to.                                                        | _required_ |
+| statistic   | [str](`str`)                             | ``"median"`` (default) or ``"mean"`` — must match the estimator you would use in `check_forecast_level`. | `'median'` |
+| min_overlap | [int](`int`)                             | Minimum overlap required to estimate the offset.                                                         | `12`       |
+
+## Returns {.doc-section .doc-section-returns}
+
+| Name   | Type                                     | Description                                                             |
+|--------|------------------------------------------|-------------------------------------------------------------------------|
+|        | [pd](`pandas`).[Series](`pandas.Series`) | A new ``pd.Series`` equal to ``y - offset`` (same index/name as ``y``). |
+
+## Raises {.doc-section .doc-section-raises}
+
+| Name   | Type                       | Description                                                                                                                       |
+|--------|----------------------------|-----------------------------------------------------------------------------------------------------------------------------------|
+|        | [TypeError](`TypeError`)   | When ``y`` or ``reference`` is not a ``pd.Series``.                                                                               |
+|        | [ValueError](`ValueError`) | When ``y``/``reference`` is empty, ``statistic`` is invalid, or the overlap is smaller than ``min_overlap`` (no reliable offset). |
+
+## Examples {.doc-section .doc-section-examples}
+
+```{python}
+import pandas as pd
+from spotforecast2_safe.processing.shape_check import (
+    apply_level_correction, check_forecast_level,
+)
+
+idx = pd.date_range("2026-06-13 00:00", periods=24, freq="h", tz="UTC")
+actual = pd.Series([43_000.0 + 3_000.0 * (i % 12) / 12 for i in range(24)],
+                   index=idx)
+biased = actual + 1_800.0  # flat over-prediction
+
+corrected = apply_level_correction(biased, actual)
+print("offset before:", round(check_forecast_level(biased, actual).offset))
+print("offset after :", round(check_forecast_level(corrected, actual).offset))
+assert abs(check_forecast_level(corrected, actual).offset) < 1.0
+```
\ No newline at end of file
diff --git a/docs/reference/processing.shape_check.check_forecast_level.qmd b/docs/reference/processing.shape_check.check_forecast_level.qmd
new file mode 100644
index 000000000..51b0dbffd
--- /dev/null
+++ b/docs/reference/processing.shape_check.check_forecast_level.qmd
@@ -0,0 +1,67 @@
+# processing.shape_check.check_forecast_level { #spotforecast2_safe.processing.shape_check.check_forecast_level }
+
+```python
+processing.shape_check.check_forecast_level(
+    y,
+    reference,
+    *,
+    statistic='median',
+    tol=0.02,
+    min_overlap=12,
+)
+```
+
+Measure the systematic level offset between a forecast and its reference.
+
+Complements `check_forecast_shape`: a forecast can track the daily profile
+perfectly (high correlation, good range ratio) yet sit at the wrong level —
+a flat over- or under-prediction.  This returns the signed offset of the
+forecast's central level against the reference's, in absolute and relative
+terms, and flags it as ``biased`` when the relative offset exceeds ``tol``.
+
+Like `check_forecast_shape`, this function is **pure**: no logging, no
+warning, no raising on a biased result.  Only invalid inputs raise.
+
+## Parameters {.doc-section .doc-section-parameters}
+
+| Name        | Type                                     | Description                                                                               | Default    |
+|-------------|------------------------------------------|-------------------------------------------------------------------------------------------|------------|
+| y           | [pd](`pandas`).[Series](`pandas.Series`) | Forecast series (e.g. the 24-h submission).                                               | _required_ |
+| reference   | [pd](`pandas`).[Series](`pandas.Series`) | Reference profile (e.g. ENTSO-E day-ahead forecast or actuals one week earlier).          | _required_ |
+| statistic   | [str](`str`)                             | Central-tendency statistic, ``"median"`` (default, robust) or ``"mean"``.                 | `'median'` |
+| tol         | [float](`float`)                         | Relative-offset tolerance for ``LevelCheckReport.biased``. Default ``0.02`` (2 %).        | `0.02`     |
+| min_overlap | [int](`int`)                             | Minimum overlap length to evaluate. Below this the report is ``skipped`` with NaN levels. | `12`       |
+
+## Returns {.doc-section .doc-section-returns}
+
+| Name   | Type                                                                             | Description                                                        |
+|--------|----------------------------------------------------------------------------------|--------------------------------------------------------------------|
+|        | [LevelCheckReport](`spotforecast2_safe.processing.shape_check.LevelCheckReport`) | `LevelCheckReport` with the computed levels, offsets, and ``tol``. |
+
+## Raises {.doc-section .doc-section-raises}
+
+| Name   | Type                       | Description                                                                              |
+|--------|----------------------------|------------------------------------------------------------------------------------------|
+|        | [TypeError](`TypeError`)   | When ``y`` or ``reference`` is not a ``pd.Series``.                                      |
+|        | [ValueError](`ValueError`) | When ``y`` or ``reference`` is empty, or ``statistic`` is not ``"median"`` / ``"mean"``. |
+
+## Examples {.doc-section .doc-section-examples}
+
+```{python}
+import pandas as pd
+from spotforecast2_safe.processing.shape_check import check_forecast_level
+
+idx = pd.date_range("2026-06-13 00:00", periods=24, freq="h", tz="UTC")
+actual = pd.Series([43_000.0 + 3_000.0 * (i % 12) / 12 for i in range(24)],
+                   index=idx)
+
+# Forecast that sits a flat 1_800 MW too high -> biased.
+high = actual + 1_800.0
+rep = check_forecast_level(high, actual, tol=0.02)
+print(f"offset={rep.offset:.0f} MW  rel={rep.rel_offset:.3f}  biased={rep.biased}")
+assert rep.biased and rep.offset > 0
+
+# A well-centred forecast -> not biased.
+ok = actual + 50.0
+print("small offset biased:", check_forecast_level(ok, actual).biased)
+```
\ No newline at end of file
diff --git a/src/spotforecast2_safe/processing/__init__.py b/src/spotforecast2_safe/processing/__init__.py
index c5ccddcd0..5fe51a4e1 100644
--- a/src/spotforecast2_safe/processing/__init__.py
+++ b/src/spotforecast2_safe/processing/__init__.py
@@ -4,14 +4,28 @@
 """Processing module for end-to-end forecasting pipelines."""
 
 from .agg_predict import agg_predict
+from .blend import blend_with_prior
+from .forecast_scoring import SUPPORTED_METRICS, score_forecasts
 from .n2n_predict import n2n_predict
 from .n2n_predict_with_covariates import n2n_predict_with_covariates
-from .shape_check import ShapeCheckReport, check_forecast_shape
+from .shape_check import (
+    LevelCheckReport,
+    ShapeCheckReport,
+    apply_level_correction,
+    check_forecast_level,
+    check_forecast_shape,
+)
 
 __all__ = [
     "agg_predict",
+    "blend_with_prior",
+    "score_forecasts",
+    "SUPPORTED_METRICS",
     "n2n_predict",
     "n2n_predict_with_covariates",
     "ShapeCheckReport",
     "check_forecast_shape",
+    "LevelCheckReport",
+    "check_forecast_level",
+    "apply_level_correction",
 ]
diff --git a/src/spotforecast2_safe/processing/blend.py b/src/spotforecast2_safe/processing/blend.py
new file mode 100644
index 000000000..d133f35f6
--- /dev/null
+++ b/src/spotforecast2_safe/processing/blend.py
@@ -0,0 +1,87 @@
+# SPDX-FileCopyrightText: 2026 bartzbeielstein
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+"""Post-hoc blend of a model forecast with an external prior — pure computation.
+
+Motivation (2026-06-13 team_4 post-mortem): the ``--entsoe`` variant fed the
+ENTSO-E day-ahead *Forecasted Load* in as a near-oracle model **feature** and
+did *worse* on a low-load Saturday because that prior was itself biased high.
+The obvious "down-weight the prior" idea — scaling the feature column — is a
+**no-op** for gradient-boosted trees: tree splits are invariant to any
+monotonic rescaling of a single feature.  The sound way to down-weight a prior
+is therefore a *post-hoc convex blend* of the trained model's forecast with the
+prior, which this module provides.  The operator keeps the prior out of the
+model (or in it) and tunes its influence at the output stage.
+"""
+
+from __future__ import annotations
+
+import pandas as pd
+
+
+def blend_with_prior(
+    model_forecast: pd.Series,
+    prior: pd.Series,
+    *,
+    weight: float,
+) -> pd.Series:
+    """Convex-blend a model forecast with an external prior.
+
+    Returns ``(1 - weight) * model_forecast + weight * prior`` on the index
+    intersection of the two series.  ``weight`` is the trust placed in the
+    prior: ``0.0`` returns the model forecast unchanged (prior ignored),
+    ``1.0`` returns the prior, and intermediate values interpolate.  This is the
+    correct lever for down-weighting a near-oracle prior whose influence on a
+    tree model cannot be tuned through feature scaling.
+
+    The function is **pure**: it does not mutate its inputs and emits no
+    warnings.  The result carries ``model_forecast``'s name.
+
+    Args:
+        model_forecast: The trained model's forecast.
+        prior: The external prior to blend in (e.g. the ENTSO-E day-ahead
+            forecast), aligned by index.
+        weight: Blend weight in ``[0.0, 1.0]`` — the trust placed in ``prior``.
+
+    Returns:
+        A new ``pd.Series`` over the index intersection, named like
+        ``model_forecast``.
+
+    Raises:
+        TypeError: When ``model_forecast`` or ``prior`` is not a ``pd.Series``.
+        ValueError: When ``weight`` is outside ``[0.0, 1.0]`` or the two series
+            share no index positions.
+
+    Examples:
+        ```{python}
+        import pandas as pd
+        from spotforecast2_safe.processing.blend import blend_with_prior
+
+        idx = pd.date_range("2026-06-13 00:00", periods=4, freq="h", tz="UTC")
+        model = pd.Series([100.0, 110.0, 120.0, 130.0], index=idx, name="y0")
+        prior = pd.Series([140.0, 140.0, 140.0, 140.0], index=idx)
+
+        # weight=0 -> model unchanged; weight=1 -> prior; 0.25 -> 75/25 mix.
+        print(blend_with_prior(model, prior, weight=0.0).tolist())
+        print(blend_with_prior(model, prior, weight=1.0).tolist())
+        print(blend_with_prior(model, prior, weight=0.25).tolist())
+        assert blend_with_prior(model, prior, weight=0.0).equals(model)
+        ```
+    """
+    if not isinstance(model_forecast, pd.Series):
+        raise TypeError(
+            f"model_forecast must be a pd.Series, got "
+            f"{type(model_forecast).__name__!r}."
+        )
+    if not isinstance(prior, pd.Series):
+        raise TypeError(f"prior must be a pd.Series, got {type(prior).__name__!r}.")
+    if not 0.0 <= weight <= 1.0:
+        raise ValueError(f"weight must be in [0.0, 1.0], got {weight}.")
+
+    common = model_forecast.index.intersection(prior.index)
+    if len(common) == 0:
+        raise ValueError("model_forecast and prior share no index positions.")
+
+    blended = (1.0 - weight) * model_forecast.loc[common] + weight * prior.loc[common]
+    blended.name = model_forecast.name
+    return blended
diff --git a/src/spotforecast2_safe/processing/forecast_scoring.py b/src/spotforecast2_safe/processing/forecast_scoring.py
new file mode 100644
index 000000000..f4fc92d9b
--- /dev/null
+++ b/src/spotforecast2_safe/processing/forecast_scoring.py
@@ -0,0 +1,132 @@
+# SPDX-FileCopyrightText: 2026 bartzbeielstein
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+"""Score and compare several forecasts against a shared actual — pure computation.
+
+Motivation (2026-06-13 team_4 post-mortem): the four-zone bottom-up sum lost to
+the single aggregate ("combined") model.  Deciding whether bottom-up aggregation
+helps or merely amplifies bias is an apples-to-apples comparison question: run a
+backtest for each modelling approach (with
+`spotforecast2_safe.backtesting.validation.backtesting_forecaster`), then score
+every approach's forecast against the same actual on the same metrics.
+
+`score_forecasts` is that second step — a pure, source-agnostic comparison
+primitive.  It takes the per-approach forecast series (e.g. the 4-zone bottom-up
+sum and the combined model's prediction) plus the actual, and returns a tidy
+"approach x metric" table sorted by the leading metric, so the better setup is
+read off directly.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+
+import numpy as np
+import pandas as pd
+
+#: Metrics ``score_forecasts`` can compute, in canonical order.
+SUPPORTED_METRICS: tuple[str, ...] = ("mae", "rmse", "bias", "mape")
+
+
+def _compute_metric(name: str, error: pd.Series, actual: pd.Series) -> float:
+    """Evaluate metric ``name`` on aligned ``error``/``actual``; NaN when empty."""
+    if error.empty:
+        return float("nan")
+    if name == "bias":
+        score = error.mean()
+    elif name == "mae":
+        score = error.abs().mean()
+    elif name == "rmse":
+        score = np.sqrt(error.pow(2).mean())
+    elif name == "mape":
+        # Zero actuals become NaN so those points drop out of the mean.
+        score = (error.abs() / actual.abs().replace(0.0, np.nan)).mean() * 100.0
+    else:  # pragma: no cover - guarded by caller
+        raise ValueError(f"unknown metric {name!r}; supported: {SUPPORTED_METRICS}.")
+    return float(score)
+
+
+def score_forecasts(
+    forecasts: Mapping[str, pd.Series],
+    actual: pd.Series,
+    *,
+    metrics: tuple[str, ...] = SUPPORTED_METRICS,
+) -> pd.DataFrame:
+    """Score several forecasts against a shared actual and rank them.
+
+    Each forecast is aligned to ``actual`` on the index intersection and scored
+    on the requested ``metrics``.  The result is a tidy table indexed by
+    approach name, with one column per metric plus an ``n`` column (overlap
+    length), sorted ascending by the *magnitude* of the first requested metric
+    (so a signed ``"bias"`` ranks by closeness to zero); the best row is on top.
+
+    This is **pure**: no logging, no plotting, no mutation.  Use it to compare,
+    for example, a four-zone bottom-up sum against a single combined model
+    (compute each approach's forecast first, e.g. via ``backtesting_forecaster``).
+
+    Args:
+        forecasts: Mapping of approach name to its forecast series.
+        actual: The ground-truth series every forecast is scored against.
+        metrics: Subset of `SUPPORTED_METRICS` to compute, in output order.
+            ``"mae"``, ``"rmse"``, and ``"bias"`` are in the units of the
+            series; ``"mape"`` is a percentage. Ranking uses ``abs(metrics[0])``.
+
+    Returns:
+        A ``pd.DataFrame`` indexed by approach name with columns
+        ``[*metrics, "n"]``, sorted ascending by ``abs(metrics[0])``.
+
+    Raises:
+        TypeError: When ``actual`` is not a ``pd.Series`` or a forecast value
+            is not a ``pd.Series``.
+        ValueError: When ``actual`` is empty, ``forecasts`` is empty, or
+            ``metrics`` contains an unsupported name / is empty.
+
+    Examples:
+        ```{python}
+        import pandas as pd
+        from spotforecast2_safe.processing.forecast_scoring import score_forecasts
+
+        idx = pd.date_range("2026-06-13 00:00", periods=24, freq="h", tz="UTC")
+        actual = pd.Series([43_858.0] * 24, index=idx)
+
+        forecasts = {
+            "combined": actual + 300.0,        # small mixed-ish offset
+            "four_zone_sum": actual + 1_780.0,  # flat over-prediction
+        }
+        table = score_forecasts(forecasts, actual, metrics=("mae", "bias"))
+        print(table.round(2).to_string())
+        # combined ranks first (lower MAE).
+        assert table.index[0] == "combined"
+        ```
+    """
+    if not isinstance(actual, pd.Series):
+        raise TypeError(f"actual must be a pd.Series, got {type(actual).__name__!r}.")
+    if actual.empty:
+        raise ValueError("actual is empty.")
+    if not forecasts:
+        raise ValueError("forecasts is empty; nothing to score.")
+    if not metrics:
+        raise ValueError("metrics is empty; request at least one metric.")
+    unknown = [m for m in metrics if m not in SUPPORTED_METRICS]
+    if unknown:
+        raise ValueError(
+            f"unsupported metric(s) {unknown}; supported: {SUPPORTED_METRICS}."
+        )
+
+    rows: dict[str, dict[str, float]] = {}
+    for name, forecast in forecasts.items():
+        if not isinstance(forecast, pd.Series):
+            raise TypeError(
+                f"forecast {name!r} must be a pd.Series, got "
+                f"{type(forecast).__name__!r}."
+            )
+        common = forecast.index.intersection(actual.index)
+        a = actual.loc[common]
+        error = forecast.loc[common] - a
+        rows[name] = {m: _compute_metric(m, error, a) for m in metrics}
+        rows[name]["n"] = float(len(common))
+
+    table = pd.DataFrame.from_dict(rows, orient="index")[list(metrics) + ["n"]]
+    table["n"] = table["n"].astype(int)
+    # Rank on magnitude: a signed bias of -900 must not outrank one of +10.
+    return table.sort_values(by=metrics[0], key=abs, kind="stable")
diff --git a/src/spotforecast2_safe/processing/shape_check.py b/src/spotforecast2_safe/processing/shape_check.py
index 96adf9abe..9e77b3c0a 100644
--- a/src/spotforecast2_safe/processing/shape_check.py
+++ b/src/spotforecast2_safe/processing/shape_check.py
@@ -215,3 +215,259 @@ def check_forecast_shape(
         min_corr=min_corr,
         min_range_ratio=min_range_ratio,
     )
+
+
+@dataclass(frozen=True)
+class LevelCheckReport:
+    """Frozen outcome of comparing a forecast's *level* with a reference.
+
+    `ShapeCheckReport` covers whether the forecast follows the daily
+    *profile*; this report covers whether it sits at the right *level*.  It
+    records a near-constant offset of the entire forecast from a reference —
+    the pattern of the 2026-06-13 team_4 miss, in which every hour was
+    over-predicted by a flat ~1.8 GW (``bias == MAE``).
+
+    Attributes:
+        n_overlap: Count of aligned (overlapping) index positions used. If
+            it is under the evaluable minimum, ``skipped`` is ``True`` and
+            the numeric fields hold ``float('nan')``.
+        statistic: Name of the central-tendency statistic behind both
+            levels: ``"median"`` (robust, the default) or ``"mean"``.
+        forecast_level: The forecast's ``statistic`` across the overlap.
+        reference_level: The reference's ``statistic`` across the overlap.
+        offset: Signed ``forecast_level - reference_level``; a positive value
+            means the forecast sits high (systematic over-prediction).
+        rel_offset: Signed ``offset / abs(reference_level)``; it is
+            ``float('nan')`` when the reference level is zero.
+        tol: Relative-offset tolerance used by ``biased`` (handed over
+            unchanged by `check_forecast_level`).
+
+    Examples:
+        ```{python}
+        from spotforecast2_safe.processing.shape_check import LevelCheckReport
+
+        # A forecast roughly 4 % above the reference is biased at tol=0.02.
+        r = LevelCheckReport(
+            n_overlap=24, statistic="median",
+            forecast_level=45_600.0, reference_level=43_858.0,
+            offset=1_742.0, rel_offset=0.0397, tol=0.02,
+        )
+        print("biased:", r.biased, "rel_offset:", round(r.rel_offset, 4))
+        assert r.biased and not r.skipped
+        ```
+    """
+
+    n_overlap: int
+    statistic: str
+    forecast_level: float
+    reference_level: float
+    offset: float
+    rel_offset: float
+    tol: float
+
+    @property
+    def biased(self) -> bool:
+        """Tell whether the relative offset magnitude is strictly above ``tol``.
+
+        An undefined (``NaN``) relative offset — zero reference level or a
+        skipped check — never counts as biased, so the result is ``False``.
+        """
+        magnitude = abs(self.rel_offset)
+        undefined = math.isnan(magnitude)
+        return not undefined and magnitude > self.tol
+
+    @property
+    def skipped(self) -> bool:
+        """Tell whether the check was skipped for lack of overlap.
+
+        A skipped report is recognised by the sentinel values that
+        `check_forecast_level` writes: ``n_overlap=0`` with NaN offsets.
+        """
+        if self.n_overlap != 0:
+            return False
+        if not math.isnan(self.offset):
+            return False
+        return math.isnan(self.rel_offset)
+
+
+def _resolve_level_statistic(statistic: str):
+    """Map ``statistic`` to a Series -> float reducer; reject unknown names."""
+    if statistic not in ("median", "mean"):
+        raise ValueError(f"statistic must be 'median' or 'mean', got {statistic!r}.")
+    if statistic == "mean":
+        return lambda s: float(s.mean())
+    return lambda s: float(s.median())
+
+
+def check_forecast_level(
+    y: pd.Series,
+    reference: pd.Series,
+    *,
+    statistic: str = "median",
+    tol: float = 0.02,
+    min_overlap: int = 12,
+) -> LevelCheckReport:
+    """Quantify how far a forecast's central level sits from a reference's.
+
+    This is the companion of `check_forecast_shape`.  A forecast may follow
+    the daily profile closely (high correlation, good range ratio) and still
+    be flatly too high or too low.  The report carries the signed gap between
+    the two central levels, both absolute and relative, and its ``biased``
+    property is true once the relative gap is larger than ``tol``.
+
+    As with `check_forecast_shape`, the function is **pure**: it neither logs
+    nor warns, and a biased result never raises.  Only invalid inputs raise.
+
+    Both levels are computed on the index intersection of ``y`` and
+    ``reference``.  If that intersection has fewer than ``min_overlap``
+    positions, nothing is computed and the report carries the skip sentinel:
+    ``n_overlap=0`` with every level and offset set to NaN.  A reference level
+    of exactly zero leaves ``rel_offset`` as NaN.
+
+    Args:
+        y: Forecast series (e.g. the 24-h submission).
+        reference: Reference profile (e.g. the ENTSO-E day-ahead forecast or
+            the actuals from one week earlier).
+        statistic: Central-tendency statistic: ``"median"`` (default, robust)
+            or ``"mean"``.
+        tol: Relative-offset tolerance stored for ``LevelCheckReport.biased``.
+            Defaults to ``0.02`` (2 %).
+        min_overlap: Smallest overlap length that is still evaluated. Anything
+            shorter yields a ``skipped`` report with NaN levels.
+
+    Returns:
+        A `LevelCheckReport` holding the levels, the offsets, and ``tol``.
+
+    Raises:
+        TypeError: If ``y`` or ``reference`` is not a ``pd.Series``.
+        ValueError: If ``y`` or ``reference`` is empty, or ``statistic`` is
+            neither ``"median"`` nor ``"mean"``.
+
+    Examples:
+        ```{python}
+        import pandas as pd
+        from spotforecast2_safe.processing.shape_check import check_forecast_level
+
+        idx = pd.date_range("2026-06-13 00:00", periods=24, freq="h", tz="UTC")
+        actual = pd.Series([43_000.0 + 3_000.0 * (i % 12) / 12 for i in range(24)],
+                           index=idx)
+
+        # Forecast that sits a flat 1_800 MW too high -> biased.
+        high = actual + 1_800.0
+        rep = check_forecast_level(high, actual, tol=0.02)
+        print(f"offset={rep.offset:.0f} MW  rel={rep.rel_offset:.3f}  biased={rep.biased}")
+        assert rep.biased and rep.offset > 0
+
+        # A well-centred forecast -> not biased.
+        ok = actual + 50.0
+        print("small offset biased:", check_forecast_level(ok, actual).biased)
+        ```
+    """
+    level_of = _resolve_level_statistic(statistic)
+    if not isinstance(y, pd.Series):
+        raise TypeError(f"y must be a pd.Series, got {type(y).__name__!r}.")
+    if not isinstance(reference, pd.Series):
+        raise TypeError(
+            f"reference must be a pd.Series, got {type(reference).__name__!r}."
+        )
+    if y.empty:
+        raise ValueError("y is empty.")
+    if reference.empty:
+        raise ValueError("reference is empty.")
+
+    shared = y.index.intersection(reference.index)
+    missing = float("nan")
+    if len(shared) < min_overlap:
+        # Skip sentinel: zero overlap and NaN levels, as ``skipped`` expects.
+        count, fc_level, ref_level = 0, missing, missing
+    else:
+        count = len(shared)
+        fc_level = level_of(y.loc[shared])
+        ref_level = level_of(reference.loc[shared])
+
+    delta = fc_level - ref_level
+    # A zero reference level makes the relative offset undefined.
+    relative = missing if ref_level == 0 else delta / abs(ref_level)
+
+    return LevelCheckReport(
+        n_overlap=count,
+        statistic=statistic,
+        forecast_level=fc_level,
+        reference_level=ref_level,
+        offset=delta,
+        rel_offset=relative,
+        tol=tol,
+    )
+
+
+def apply_level_correction(
+    y: pd.Series,
+    reference: pd.Series,
+    *,
+    statistic: str = "median",
+    min_overlap: int = 12,
+) -> pd.Series:
+    """Shift a forecast so its central level matches a reference (debias).
+
+    Estimates the constant offset ``statistic(y) - statistic(reference)`` over
+    the index overlap and subtracts it from **every** value of ``y``, removing
+    the flat bias `check_forecast_level` detects while keeping the daily shape
+    and ``y``'s full index, name, and order.  Pure: inputs are not mutated.  A
+    non-finite offset (e.g. all-NaN overlap) raises rather than NaN-ing ``y``.
+
+    Args:
+        y: Forecast series to correct.
+        reference: Reference whose level ``y`` should be aligned to.
+        statistic: ``"median"`` (default) or ``"mean"`` — must match the
+            estimator you would use in `check_forecast_level`.
+        min_overlap: Minimum overlap required to estimate the offset.
+
+    Returns:
+        A new ``pd.Series`` equal to ``y - offset`` (same index/name as ``y``).
+
+    Raises:
+        TypeError: When ``y`` or ``reference`` is not a ``pd.Series``.
+        ValueError: When ``y``/``reference`` is empty, ``statistic`` is invalid,
+            the overlap is below ``min_overlap``, or the offset is not finite.
+
+    Examples:
+        ```{python}
+        import pandas as pd
+        from spotforecast2_safe.processing.shape_check import (
+            apply_level_correction, check_forecast_level,
+        )
+
+        idx = pd.date_range("2026-06-13 00:00", periods=24, freq="h", tz="UTC")
+        actual = pd.Series([43_000.0 + 3_000.0 * (i % 12) / 12 for i in range(24)],
+                           index=idx)
+        biased = actual + 1_800.0  # flat over-prediction
+
+        corrected = apply_level_correction(biased, actual)
+        print("offset before:", round(check_forecast_level(biased, actual).offset))
+        print("offset after :", round(check_forecast_level(corrected, actual).offset))
+        assert abs(check_forecast_level(corrected, actual).offset) < 1.0
+        ```
+    """
+    reducer = _resolve_level_statistic(statistic)
+    if not isinstance(y, pd.Series):
+        raise TypeError(f"y must be a pd.Series, got {type(y).__name__!r}.")
+    if not isinstance(reference, pd.Series):
+        raise TypeError(
+            f"reference must be a pd.Series, got {type(reference).__name__!r}."
+        )
+    if y.empty:
+        raise ValueError("y is empty.")
+    if reference.empty:
+        raise ValueError("reference is empty.")
+
+    common = y.index.intersection(reference.index)
+    if len(common) < min_overlap:
+        raise ValueError(
+            f"overlap {len(common)} < min_overlap {min_overlap}; cannot "
+            f"estimate a level offset."
+        )
+
+    offset = reducer(y.loc[common]) - reducer(reference.loc[common])
+    if not math.isfinite(offset):  # e.g. all-NaN overlap: y - NaN wipes y
+        raise ValueError(f"level offset {offset} is not finite; cannot debias.")
+    return y - offset
diff --git a/tests/processing/test_blend.py b/tests/processing/test_blend.py
new file mode 100644
index 000000000..fa5ebee5a
--- /dev/null
+++ b/tests/processing/test_blend.py
@@ -0,0 +1,63 @@
+# SPDX-FileCopyrightText: 2026 bartzbeielstein
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+"""Tests for processing.blend.blend_with_prior (pure convex blend)."""
+
+import pandas as pd
+import pytest
+
+from spotforecast2_safe.processing.blend import blend_with_prior
+
+IDX = pd.date_range("2026-06-13 00:00", periods=4, freq="h", tz="UTC")
+MODEL = pd.Series([100.0, 110.0, 120.0, 130.0], index=IDX, name="y0")
+PRIOR = pd.Series([140.0, 140.0, 140.0, 140.0], index=IDX)
+
+
+class TestBlendWithPrior:
+    def test_weight_zero_returns_model(self):
+        blended = blend_with_prior(MODEL, PRIOR, weight=0.0)
+        pd.testing.assert_series_equal(blended, MODEL)
+
+    def test_weight_one_returns_prior_values(self):
+        blended = blend_with_prior(MODEL, PRIOR, weight=1.0)
+        assert blended.to_list() == PRIOR.to_list()
+
+    def test_half_weight_is_midpoint(self):
+        blended = blend_with_prior(MODEL, PRIOR, weight=0.5)
+        assert blended.to_list() == [120.0, 125.0, 130.0, 135.0]
+
+    def test_name_preserved_from_model(self):
+        blended = blend_with_prior(MODEL, PRIOR, weight=0.3)
+        assert blended.name == "y0"
+
+    def test_intersection_only(self):
+        shorter_prior = PRIOR.iloc[1:]
+        blended = blend_with_prior(MODEL, shorter_prior, weight=0.5)
+        assert len(blended) == 3
+        assert blended.index.equals(shorter_prior.index)
+
+    def test_does_not_mutate_inputs(self):
+        model_snapshot = MODEL.copy()
+        prior_snapshot = PRIOR.copy()
+        blend_with_prior(MODEL, PRIOR, weight=0.4)
+        pd.testing.assert_series_equal(MODEL, model_snapshot)
+        pd.testing.assert_series_equal(PRIOR, prior_snapshot)
+
+    @pytest.mark.parametrize("bad", [-0.1, 1.1, 2.0])
+    def test_weight_out_of_range_raises(self, bad):
+        with pytest.raises(ValueError, match=r"\[0.0, 1.0\]"):
+            blend_with_prior(MODEL, PRIOR, weight=bad)
+
+    def test_non_series_model_raises(self):
+        with pytest.raises(TypeError, match="pd.Series"):
+            blend_with_prior([1, 2], PRIOR, weight=0.5)  # type: ignore[arg-type]
+
+    def test_non_series_prior_raises(self):
+        with pytest.raises(TypeError, match="pd.Series"):
+            blend_with_prior(MODEL, [1, 2], weight=0.5)  # type: ignore[arg-type]
+
+    def test_disjoint_index_raises(self):
+        far_idx = pd.date_range("2027-01-01", periods=2, freq="h", tz="UTC")
+        other = pd.Series([1.0, 2.0], index=far_idx)
+        with pytest.raises(ValueError, match="no index positions"):
+            blend_with_prior(MODEL, other, weight=0.5)
diff --git a/tests/processing/test_forecast_scoring.py b/tests/processing/test_forecast_scoring.py
new file mode 100644
index 000000000..03a5e7c38
--- /dev/null
+++ b/tests/processing/test_forecast_scoring.py
@@ -0,0 +1,82 @@
+# SPDX-FileCopyrightText: 2026 bartzbeielstein
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+"""Tests for processing.forecast_scoring.score_forecasts (pure comparison)."""
+
+import math
+
+import pandas as pd
+import pytest
+
+from spotforecast2_safe.processing.forecast_scoring import (
+    SUPPORTED_METRICS,
+    score_forecasts,
+)
+
+IDX = pd.date_range("2026-06-13 00:00", periods=24, freq="h", tz="UTC")
+ACTUAL = pd.Series([43_858.0] * 24, index=IDX)
+
+
+class TestScoreForecasts:
+    def test_ranks_lower_mae_first(self):
+        candidates = {
+            "four_zone_sum": ACTUAL + 1_780.0,  # flat over-prediction
+            "combined": ACTUAL + 300.0,
+        }
+        ranking = score_forecasts(candidates, ACTUAL)
+        assert ranking.index.tolist() == ["combined", "four_zone_sum"]
+
+    def test_metric_values_correct(self):
+        shift = 1_000.0
+        scores = score_forecasts({"high": ACTUAL + shift}, ACTUAL).loc["high"]
+        # A constant positive error makes MAE, RMSE and bias coincide.
+        assert scores["mae"] == pytest.approx(shift)
+        assert scores["rmse"] == pytest.approx(shift)
+        assert scores["bias"] == pytest.approx(shift)
+        assert scores["mape"] == pytest.approx(shift / 43_858.0 * 100.0)
+        assert scores["n"] == 24
+
+    def test_metric_subset_and_order(self):
+        table = score_forecasts({"a": ACTUAL + 1.0}, ACTUAL, metrics=("bias", "mae"))
+        assert table.columns.tolist() == ["bias", "mae", "n"]
+
+    def test_n_reflects_overlap(self):
+        first_ten = (ACTUAL + 5.0).head(10)
+        table = score_forecasts({"p": first_ten}, ACTUAL)
+        assert table.at["p", "n"] == 10
+
+    def test_supported_metrics_constant(self):
+        assert {"mae", "rmse", "bias", "mape"} == set(SUPPORTED_METRICS)
+
+    def test_no_overlap_yields_nan_metrics(self):
+        far_idx = pd.date_range("2027-01-01", periods=2, freq="h", tz="UTC")
+        disjoint = pd.Series([1.0, 2.0], index=far_idx)
+        table = score_forecasts({"x": disjoint}, ACTUAL)
+        # No shared timestamps: zero overlap and undefined metrics.
+        assert table.at["x", "n"] == 0
+        assert math.isnan(table.at["x", "mae"])
+
+    def test_unknown_metric_raises(self):
+        requested = ("mae", "smape")  # "smape" is not a supported metric
+        with pytest.raises(ValueError, match="unsupported metric"):
+            score_forecasts({"a": ACTUAL}, ACTUAL, metrics=requested)
+
+    def test_empty_metrics_raises(self):
+        with pytest.raises(ValueError, match="at least one metric"):
+            score_forecasts({"a": ACTUAL}, ACTUAL, metrics=())
+
+    def test_empty_forecasts_raises(self):
+        with pytest.raises(ValueError, match="nothing to score"):
+            score_forecasts({}, ACTUAL)
+
+    def test_empty_actual_raises(self):
+        with pytest.raises(ValueError, match="empty"):
+            score_forecasts({"a": ACTUAL}, pd.Series(dtype=float))
+
+    def test_non_series_actual_raises(self):
+        with pytest.raises(TypeError, match="pd.Series"):
+            score_forecasts({"a": ACTUAL}, [1, 2, 3])  # type: ignore[arg-type]
+
+    def test_non_series_forecast_raises(self):
+        with pytest.raises(TypeError, match="pd.Series"):
+            score_forecasts({"a": [1, 2, 3]}, ACTUAL)  # type: ignore[arg-type]
diff --git a/tests/processing/test_level_check.py b/tests/processing/test_level_check.py
new file mode 100644
index 000000000..27216ecfe
--- /dev/null
+++ b/tests/processing/test_level_check.py
@@ -0,0 +1,166 @@
+# SPDX-FileCopyrightText: 2026 bartzbeielstein
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+"""Tests for processing.shape_check level / debias functions.
+
+check_forecast_level measures a systematic flat offset; apply_level_correction
+removes it.  Both are pure: no logging, no raising on a biased result.
+"""
+
+import math
+
+import pandas as pd
+import pytest
+
+from spotforecast2_safe.processing.shape_check import (
+    LevelCheckReport,
+    apply_level_correction,
+    check_forecast_level,
+)
+
+IDX = pd.date_range("2026-06-13 00:00", periods=24, freq="h", tz="UTC")
+# A non-flat reference so median != endpoints and the range is non-zero.
+ACTUAL = pd.Series([43_000.0 + 250.0 * (i % 12) for i in range(24)], index=IDX)
+
+
+class TestLevelCheckReport:
+    def test_biased_when_rel_offset_exceeds_tol(self):
+        r = LevelCheckReport(
+            n_overlap=24,
+            statistic="median",
+            forecast_level=45_600.0,
+            reference_level=43_858.0,
+            offset=1_742.0,
+            rel_offset=0.04,  # rounded 1_742 / 43_858
+            tol=0.02,
+        )
+        assert r.biased and not r.skipped
+
+    def test_not_biased_at_or_below_tol(self):
+        r = LevelCheckReport(
+            n_overlap=24,
+            statistic="median",
+            forecast_level=102.0,  # reference_level + offset, keeps fixture coherent
+            reference_level=100.0,
+            offset=2.0,
+            rel_offset=0.02,  # exactly at tol
+            tol=0.02,
+        )
+        assert not r.biased  # strictly greater-than required
+
+    def test_nan_rel_offset_not_biased(self):
+        r = LevelCheckReport(
+            n_overlap=0,
+            statistic="median",
+            forecast_level=float("nan"),
+            reference_level=float("nan"),
+            offset=float("nan"),
+            rel_offset=float("nan"),
+            tol=0.02,
+        )
+        assert not r.biased
+        assert r.skipped  # n_overlap == 0 with NaN offsets is the skip sentinel
+
+    def test_frozen(self):
+        r = LevelCheckReport(
+            n_overlap=24,
+            statistic="mean",
+            forecast_level=1.0,
+            reference_level=1.0,
+            offset=0.0,
+            rel_offset=0.0,
+            tol=0.02,
+        )
+        with pytest.raises((AttributeError, TypeError)):
+            r.offset = 9.0  # type: ignore[misc]
+
+
+class TestCheckForecastLevel:
+    def test_flat_high_forecast_is_biased(self):
+        report = check_forecast_level(ACTUAL + 1_800.0, ACTUAL, tol=0.02)
+        assert report.n_overlap == 24
+        assert report.biased
+        assert report.rel_offset > 0
+        assert report.offset == pytest.approx(1_800.0, abs=1e-6)
+
+    def test_flat_low_forecast_negative_offset(self):
+        report = check_forecast_level(ACTUAL - 1_800.0, ACTUAL)
+        assert report.offset < 0
+        assert report.rel_offset < 0
+        assert report.biased
+
+    def test_well_centred_not_biased(self):
+        report = check_forecast_level(ACTUAL + 10.0, ACTUAL, tol=0.02)
+        assert not report.biased
+
+    def test_mean_statistic(self):
+        report = check_forecast_level(ACTUAL + 500.0, ACTUAL, statistic="mean")
+        assert report.offset == pytest.approx(500.0, abs=1e-6)
+        assert report.statistic == "mean"
+
+    def test_zero_reference_level_nan_rel(self):
+        flat_zero = pd.Series(0.0, index=IDX)
+        report = check_forecast_level(ACTUAL, flat_zero)
+        assert not report.biased  # NaN rel offset -> not biased
+        assert math.isnan(report.rel_offset)
+
+    def test_short_overlap_skipped(self):
+        report = check_forecast_level(ACTUAL.iloc[:5], ACTUAL, min_overlap=12)
+        assert math.isnan(report.offset)
+        assert report.skipped
+
+    def test_invalid_statistic_raises(self):
+        with pytest.raises(ValueError, match="median"):
+            check_forecast_level(ACTUAL, ACTUAL, statistic="mode")
+
+    def test_non_series_raises(self):
+        with pytest.raises(TypeError, match="pd.Series"):
+            check_forecast_level([1, 2, 3], ACTUAL)  # type: ignore[arg-type]
+
+    def test_empty_raises(self):
+        with pytest.raises(ValueError, match="empty"):
+            check_forecast_level(pd.Series(dtype=float), ACTUAL)
+
+    def test_deterministic(self):
+        first, second = (check_forecast_level(ACTUAL + 5.0, ACTUAL) for _ in range(2))
+        assert first == second
+
+
+class TestApplyLevelCorrection:
+    def test_removes_flat_offset(self):
+        shifted = ACTUAL + 1_800.0
+        debiased = apply_level_correction(shifted, ACTUAL)
+        residual = check_forecast_level(debiased, ACTUAL).offset
+        # After debiasing, the level gap to the reference should vanish.
+        assert residual == pytest.approx(0.0, abs=1e-6)
+
+    def test_preserves_index_name_and_shape(self):
+        shifted = (ACTUAL + 1_800.0).rename("y0")
+        debiased = apply_level_correction(shifted, ACTUAL)
+        assert len(debiased) == len(shifted)
+        assert debiased.name == "y0"
+        assert debiased.index.equals(shifted.index)
+
+    def test_does_not_mutate_input(self):
+        shifted = ACTUAL + 1_800.0
+        snapshot = shifted.copy()
+        apply_level_correction(shifted, ACTUAL)
+        pd.testing.assert_series_equal(shifted, snapshot)
+
+    def test_shape_is_preserved_only_level_shifts(self):
+        shifted = ACTUAL + 1_800.0
+        debiased = apply_level_correction(shifted, ACTUAL)
+        # A constant shift leaves every step-to-step difference untouched.
+        pd.testing.assert_series_equal(shifted.diff(), debiased.diff())
+
+    def test_short_overlap_raises(self):
+        with pytest.raises(ValueError, match="min_overlap"):
+            apply_level_correction(ACTUAL.iloc[:5], ACTUAL, min_overlap=12)
+
+    def test_invalid_statistic_raises(self):
+        with pytest.raises(ValueError, match="median"):
+            apply_level_correction(ACTUAL, ACTUAL, statistic="mode")
+
+    def test_non_series_raises(self):
+        with pytest.raises(TypeError, match="pd.Series"):
+            apply_level_correction(ACTUAL, [1, 2, 3])  # type: ignore[arg-type]