From 27bdf1edba6bea182e63617b1db95d7282bcdfc9 Mon Sep 17 00:00:00 2001 From: bartzbeielstein <32470350+bartzbeielstein@users.noreply.github.com> Date: Sun, 14 Jun 2026 16:42:54 +0200 Subject: [PATCH] feat: add forecast level/debias, prior blend, and approach scoring to processing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three pure additions to spotforecast2_safe.processing, motivated by the 2026-06-13 team_4 post-mortem (a flat all-day over-prediction on a low-load Saturday the four-zone setup could not catch): - shape_check.check_forecast_level / LevelCheckReport / apply_level_correction: detect and remove a systematic flat level offset; complements the existing profile/shape check. - blend.blend_with_prior: convex post-hoc blend of a model forecast with an external prior — the correct lever for down-weighting a near-oracle prior, since tree models are invariant to monotonic feature scaling. - forecast_scoring.score_forecasts: tidy approach-by-metric table ranking competing forecasts (e.g. 4-zone bottom-up sum vs single combined model) against a shared actual. Pure pandas/numpy, no forbidden deps; registered in the quartodoc API reference; 67 new unit tests; full suite green (2755 passed). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- _quarto.yml | 15 + docs/reference/index.qmd | 5 + .../processing.blend.blend_with_prior.qmd | 56 ++++ ...ssing.forecast_scoring.score_forecasts.qmd | 63 +++++ ...rocessing.shape_check.LevelCheckReport.qmd | 48 ++++ ...ing.shape_check.apply_level_correction.qmd | 62 +++++ ...ssing.shape_check.check_forecast_level.qmd | 67 +++++ src/spotforecast2_safe/processing/__init__.py | 16 +- src/spotforecast2_safe/processing/blend.py | 87 ++++++ .../processing/forecast_scoring.py | 132 +++++++++ .../processing/shape_check.py | 256 ++++++++++++++++++ tests/processing/test_blend.py | 63 +++++ tests/processing/test_forecast_scoring.py | 82 ++++++ tests/processing/test_level_check.py | 166 ++++++++++++ 14 files changed, 1117 insertions(+), 1 deletion(-) create mode 100644 docs/reference/processing.blend.blend_with_prior.qmd create mode 100644 docs/reference/processing.forecast_scoring.score_forecasts.qmd create mode 100644 docs/reference/processing.shape_check.LevelCheckReport.qmd create mode 100644 docs/reference/processing.shape_check.apply_level_correction.qmd create mode 100644 docs/reference/processing.shape_check.check_forecast_level.qmd create mode 100644 src/spotforecast2_safe/processing/blend.py create mode 100644 src/spotforecast2_safe/processing/forecast_scoring.py create mode 100644 tests/processing/test_blend.py create mode 100644 tests/processing/test_forecast_scoring.py create mode 100644 tests/processing/test_level_check.py diff --git a/_quarto.yml b/_quarto.yml index aca6ea983..a4d021e88 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -230,6 +230,16 @@ website: file: docs/reference/processing.shape_check.ShapeCheckReport.qmd - text: "check_forecast_shape" file: docs/reference/processing.shape_check.check_forecast_shape.qmd + - text: "LevelCheckReport" + file: docs/reference/processing.shape_check.LevelCheckReport.qmd + - text: "check_forecast_level" + file: 
docs/reference/processing.shape_check.check_forecast_level.qmd + - text: "apply_level_correction" + file: docs/reference/processing.shape_check.apply_level_correction.qmd + - text: "blend_with_prior" + file: docs/reference/processing.blend.blend_with_prior.qmd + - text: "score_forecasts" + file: docs/reference/processing.forecast_scoring.score_forecasts.qmd - section: "Forecaster" contents: @@ -672,6 +682,11 @@ quartodoc: - processing.n2n_predict_with_covariates.n2n_predict_with_covariates - processing.shape_check.ShapeCheckReport - processing.shape_check.check_forecast_shape + - processing.shape_check.LevelCheckReport + - processing.shape_check.check_forecast_level + - processing.shape_check.apply_level_correction + - processing.blend.blend_with_prior + - processing.forecast_scoring.score_forecasts # ── Forecaster ──────────────────────────────────────────────────────────── - title: "Forecaster" diff --git a/docs/reference/index.qmd b/docs/reference/index.qmd index 01f56df75..201a201e1 100644 --- a/docs/reference/index.qmd +++ b/docs/reference/index.qmd @@ -88,6 +88,11 @@ Utilities for aggregated and n-to-n predictions. | [processing.n2n_predict_with_covariates.n2n_predict_with_covariates](processing.n2n_predict_with_covariates.n2n_predict_with_covariates.qmd#spotforecast2_safe.processing.n2n_predict_with_covariates.n2n_predict_with_covariates) | End-to-end recursive forecasting with exogenous covariates. | | [processing.shape_check.ShapeCheckReport](processing.shape_check.ShapeCheckReport.qmd#spotforecast2_safe.processing.shape_check.ShapeCheckReport) | Immutable result of a forecast shape plausibility check. | | [processing.shape_check.check_forecast_shape](processing.shape_check.check_forecast_shape.qmd#spotforecast2_safe.processing.shape_check.check_forecast_shape) | Measure correlation and daily-range ratio between a forecast and its reference. 
| +| [processing.shape_check.LevelCheckReport](processing.shape_check.LevelCheckReport.qmd#spotforecast2_safe.processing.shape_check.LevelCheckReport) | Immutable result of a forecast *level* (systematic-bias) check. | +| [processing.shape_check.check_forecast_level](processing.shape_check.check_forecast_level.qmd#spotforecast2_safe.processing.shape_check.check_forecast_level) | Measure the systematic level offset between a forecast and its reference. | +| [processing.shape_check.apply_level_correction](processing.shape_check.apply_level_correction.qmd#spotforecast2_safe.processing.shape_check.apply_level_correction) | Shift a forecast so its central level matches a reference (debias). | +| [processing.blend.blend_with_prior](processing.blend.blend_with_prior.qmd#spotforecast2_safe.processing.blend.blend_with_prior) | Convex-blend a model forecast with an external prior. | +| [processing.forecast_scoring.score_forecasts](processing.forecast_scoring.score_forecasts.qmd#spotforecast2_safe.processing.forecast_scoring.score_forecasts) | Score several forecasts against a shared actual and rank them. | ## Forecaster diff --git a/docs/reference/processing.blend.blend_with_prior.qmd b/docs/reference/processing.blend.blend_with_prior.qmd new file mode 100644 index 000000000..1cc7f0b50 --- /dev/null +++ b/docs/reference/processing.blend.blend_with_prior.qmd @@ -0,0 +1,56 @@ +# processing.blend.blend_with_prior { #spotforecast2_safe.processing.blend.blend_with_prior } + +```python +processing.blend.blend_with_prior(model_forecast, prior, *, weight) +``` + +Convex-blend a model forecast with an external prior. + +Returns ``(1 - weight) * model_forecast + weight * prior`` on the index +intersection of the two series. ``weight`` is the trust placed in the +prior: ``0.0`` returns the model forecast unchanged (prior ignored), +``1.0`` returns the prior, and intermediate values interpolate. 
This is the +correct lever for down-weighting a near-oracle prior whose influence a +tree model cannot be tuned through feature scaling. + +The function is **pure**: it does not mutate its inputs and emits no +warnings. The result carries ``model_forecast``'s name. + +## Parameters {.doc-section .doc-section-parameters} + +| Name | Type | Description | Default | +|----------------|------------------------------------------|-----------------------------------------------------------------------------------------|------------| +| model_forecast | [pd](`pandas`).[Series](`pandas.Series`) | The trained model's forecast. | _required_ | +| prior | [pd](`pandas`).[Series](`pandas.Series`) | The external prior to blend in (e.g. the ENTSO-E day-ahead forecast), aligned by index. | _required_ | +| weight | [float](`float`) | Blend weight in ``[0.0, 1.0]`` — the trust placed in ``prior``. | _required_ | + +## Returns {.doc-section .doc-section-returns} + +| Name | Type | Description | +|--------|------------------------------------------|-------------------------------------------------------------| +| | [pd](`pandas`).[Series](`pandas.Series`) | A new ``pd.Series`` over the index intersection, named like | +| | [pd](`pandas`).[Series](`pandas.Series`) | ``model_forecast``. | + +## Raises {.doc-section .doc-section-raises} + +| Name | Type | Description | +|--------|----------------------------|---------------------------------------------------------------------------------------| +| | [TypeError](`TypeError`) | When ``model_forecast`` or ``prior`` is not a ``pd.Series``. | +| | [ValueError](`ValueError`) | When ``weight`` is outside ``[0.0, 1.0]`` or the two series share no index positions. 
| + +## Examples {.doc-section .doc-section-examples} + +```{python} +import pandas as pd +from spotforecast2_safe.processing.blend import blend_with_prior + +idx = pd.date_range("2026-06-13 00:00", periods=4, freq="h", tz="UTC") +model = pd.Series([100.0, 110.0, 120.0, 130.0], index=idx, name="y0") +prior = pd.Series([140.0, 140.0, 140.0, 140.0], index=idx) + +# weight=0 -> model unchanged; weight=1 -> prior; 0.25 -> 75/25 mix. +print(blend_with_prior(model, prior, weight=0.0).tolist()) +print(blend_with_prior(model, prior, weight=1.0).tolist()) +print(blend_with_prior(model, prior, weight=0.25).tolist()) +assert blend_with_prior(model, prior, weight=0.0).equals(model) +``` \ No newline at end of file diff --git a/docs/reference/processing.forecast_scoring.score_forecasts.qmd b/docs/reference/processing.forecast_scoring.score_forecasts.qmd new file mode 100644 index 000000000..302f86e91 --- /dev/null +++ b/docs/reference/processing.forecast_scoring.score_forecasts.qmd @@ -0,0 +1,63 @@ +# processing.forecast_scoring.score_forecasts { #spotforecast2_safe.processing.forecast_scoring.score_forecasts } + +```python +processing.forecast_scoring.score_forecasts( + forecasts, + actual, + *, + metrics=SUPPORTED_METRICS, +) +``` + +Score several forecasts against a shared actual and rank them. + +Each forecast is aligned to ``actual`` on the index intersection and scored +on the requested ``metrics``. The result is a tidy table indexed by +approach name, with one column per metric plus an ``n`` column (overlap +length), sorted ascending by the first requested metric so the best +approach is the top row. + +This is **pure**: no logging, no plotting, no mutation. Use it to compare, +for example, a four-zone bottom-up sum against a single combined model +(compute each approach's forecast first, e.g. via ``backtesting_forecaster``). 
+ +## Parameters {.doc-section .doc-section-parameters} + +| Name | Type | Description | Default | +|-----------|------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------| +| forecasts | [Mapping](`collections.abc.Mapping`)\[[str](`str`), [pd](`pandas`).[Series](`pandas.Series`)\] | Mapping of approach name to its forecast series. | _required_ | +| actual | [pd](`pandas`).[Series](`pandas.Series`) | The ground-truth series every forecast is scored against. | _required_ | +| metrics | [tuple](`tuple`)\[[str](`str`), ...\] | Subset of `SUPPORTED_METRICS` to compute, in output order. ``"mae"``, ``"rmse"``, and ``"bias"`` are in the units of the series; ``"mape"`` is a percentage. The ranking uses ``metrics[0]``. | `SUPPORTED_METRICS` | + +## Returns {.doc-section .doc-section-returns} + +| Name | Type | Description | +|--------|------------------------------------------------|----------------------------------------------------------| +| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | A ``pd.DataFrame`` indexed by approach name with columns | +| | [pd](`pandas`).[DataFrame](`pandas.DataFrame`) | ``[*metrics, "n"]``, sorted ascending by ``metrics[0]``. | + +## Raises {.doc-section .doc-section-raises} + +| Name | Type | Description | +|--------|----------------------------|-----------------------------------------------------------------------------------------------------------| +| | [TypeError](`TypeError`) | When ``actual`` is not a ``pd.Series`` or a forecast value is not a ``pd.Series``. | +| | [ValueError](`ValueError`) | When ``actual`` is empty, ``forecasts`` is empty, or ``metrics`` contains an unsupported name / is empty. 
| + +## Examples {.doc-section .doc-section-examples} + +```{python} +import pandas as pd +from spotforecast2_safe.processing.forecast_scoring import score_forecasts + +idx = pd.date_range("2026-06-13 00:00", periods=24, freq="h", tz="UTC") +actual = pd.Series([43_858.0] * 24, index=idx) + +forecasts = { + "combined": actual + 300.0, # small mixed-ish offset + "four_zone_sum": actual + 1_780.0, # flat over-prediction +} +table = score_forecasts(forecasts, actual, metrics=("mae", "bias")) +print(table.round(2).to_string()) +# combined ranks first (lower MAE). +assert table.index[0] == "combined" +``` \ No newline at end of file diff --git a/docs/reference/processing.shape_check.LevelCheckReport.qmd b/docs/reference/processing.shape_check.LevelCheckReport.qmd new file mode 100644 index 000000000..d65266101 --- /dev/null +++ b/docs/reference/processing.shape_check.LevelCheckReport.qmd @@ -0,0 +1,48 @@ +# processing.shape_check.LevelCheckReport { #spotforecast2_safe.processing.shape_check.LevelCheckReport } + +```python +processing.shape_check.LevelCheckReport( + n_overlap, + statistic, + forecast_level, + reference_level, + offset, + rel_offset, + tol, +) +``` + +Immutable result of a forecast *level* (systematic-bias) check. + +Where `ShapeCheckReport` answers "does the forecast track the daily +*profile*", this answers "does the forecast sit at the right *level*". It +captures a near-constant offset of the whole forecast against a reference — +the failure mode behind the 2026-06-13 team_4 miss, where the forecast +over-predicted every hour by a flat ~1.8 GW (``bias == MAE``). + +## Attributes {.doc-section .doc-section-attributes} + +| Name | Type | Description | +|-----------------|------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------| +| n_overlap | [int](`int`) | Number of aligned (overlapping) index positions used. 
When below the evaluable minimum, ``skipped`` is ``True`` and the numeric fields are ``float('nan')``. | +| statistic | [str](`str`) | Central-tendency statistic used for both levels — either ``"median"`` (robust, the default) or ``"mean"``. | +| forecast_level | [float](`float`) | ``statistic`` of the forecast over the overlap. | +| reference_level | [float](`float`) | ``statistic`` of the reference over the overlap. | +| offset | [float](`float`) | ``forecast_level - reference_level`` (signed; positive means the forecast sits high, i.e. systematic over-prediction). | +| rel_offset | [float](`float`) | ``offset / abs(reference_level)`` (signed). ``float('nan')`` when the reference level is zero. | +| tol | [float](`float`) | Relative-offset tolerance for ``biased`` (passed through from `check_forecast_level`). | + +## Examples {.doc-section .doc-section-examples} + +```{python} +from spotforecast2_safe.processing.shape_check import LevelCheckReport + +# Forecast sits 4 % high vs the reference -> biased at tol=0.02. +r = LevelCheckReport( + n_overlap=24, statistic="median", + forecast_level=45_600.0, reference_level=43_858.0, + offset=1_742.0, rel_offset=0.0397, tol=0.02, +) +print("biased:", r.biased, "rel_offset:", round(r.rel_offset, 4)) +assert r.biased and not r.skipped +``` \ No newline at end of file diff --git a/docs/reference/processing.shape_check.apply_level_correction.qmd b/docs/reference/processing.shape_check.apply_level_correction.qmd new file mode 100644 index 000000000..fa2436e07 --- /dev/null +++ b/docs/reference/processing.shape_check.apply_level_correction.qmd @@ -0,0 +1,62 @@ +# processing.shape_check.apply_level_correction { #spotforecast2_safe.processing.shape_check.apply_level_correction } + +```python +processing.shape_check.apply_level_correction( + y, + reference, + *, + statistic='median', + min_overlap=12, +) +``` + +Shift a forecast so its central level matches a reference (debias). 
+ +Estimates the constant offset ``statistic(y) - statistic(reference)`` over +the index overlap and subtracts it from **every** value of ``y``, removing a +systematic flat bias while preserving the daily shape. This is the +post-hoc correction for the failure `check_forecast_level` detects. + +The returned series keeps ``y``'s full index, name, and ordering; only the +level is shifted. The function is pure (no mutation of the inputs). + +## Parameters {.doc-section .doc-section-parameters} + +| Name | Type | Description | Default | +|-------------|------------------------------------------|----------------------------------------------------------------------------------------------------------|------------| +| y | [pd](`pandas`).[Series](`pandas.Series`) | Forecast series to correct. | _required_ | +| reference | [pd](`pandas`).[Series](`pandas.Series`) | Reference whose level ``y`` should be aligned to. | _required_ | +| statistic | [str](`str`) | ``"median"`` (default) or ``"mean"`` — must match the estimator you would use in `check_forecast_level`. | `'median'` | +| min_overlap | [int](`int`) | Minimum overlap required to estimate the offset. | `12` | + +## Returns {.doc-section .doc-section-returns} + +| Name | Type | Description | +|--------|------------------------------------------|-------------------------------------------------------------------------| +| | [pd](`pandas`).[Series](`pandas.Series`) | A new ``pd.Series`` equal to ``y - offset`` (same index/name as ``y``). | + +## Raises {.doc-section .doc-section-raises} + +| Name | Type | Description | +|--------|----------------------------|-----------------------------------------------------------------------------------------------------------------------------------| +| | [TypeError](`TypeError`) | When ``y`` or ``reference`` is not a ``pd.Series``. 
| +| | [ValueError](`ValueError`) | When ``y``/``reference`` is empty, ``statistic`` is invalid, or the overlap is smaller than ``min_overlap`` (no reliable offset). | + +## Examples {.doc-section .doc-section-examples} + +```{python} +import pandas as pd +from spotforecast2_safe.processing.shape_check import ( + apply_level_correction, check_forecast_level, +) + +idx = pd.date_range("2026-06-13 00:00", periods=24, freq="h", tz="UTC") +actual = pd.Series([43_000.0 + 3_000.0 * (i % 12) / 12 for i in range(24)], + index=idx) +biased = actual + 1_800.0 # flat over-prediction + +corrected = apply_level_correction(biased, actual) +print("offset before:", round(check_forecast_level(biased, actual).offset)) +print("offset after :", round(check_forecast_level(corrected, actual).offset)) +assert abs(check_forecast_level(corrected, actual).offset) < 1.0 +``` \ No newline at end of file diff --git a/docs/reference/processing.shape_check.check_forecast_level.qmd b/docs/reference/processing.shape_check.check_forecast_level.qmd new file mode 100644 index 000000000..51b0dbffd --- /dev/null +++ b/docs/reference/processing.shape_check.check_forecast_level.qmd @@ -0,0 +1,67 @@ +# processing.shape_check.check_forecast_level { #spotforecast2_safe.processing.shape_check.check_forecast_level } + +```python +processing.shape_check.check_forecast_level( + y, + reference, + *, + statistic='median', + tol=0.02, + min_overlap=12, +) +``` + +Measure the systematic level offset between a forecast and its reference. + +Complements `check_forecast_shape`: a forecast can track the daily profile +perfectly (high correlation, good range ratio) yet sit at the wrong level — +a flat over- or under-prediction. This returns the signed offset of the +forecast's central level against the reference's, in absolute and relative +terms, and flags it as ``biased`` when the relative offset exceeds ``tol``. 
+ +Like `check_forecast_shape`, this function is **pure**: no logging, no +warning, no raising on a biased result. Only invalid inputs raise. + +## Parameters {.doc-section .doc-section-parameters} + +| Name | Type | Description | Default | +|-------------|------------------------------------------|-------------------------------------------------------------------------------------------|------------| +| y | [pd](`pandas`).[Series](`pandas.Series`) | Forecast series (e.g. the 24-h submission). | _required_ | +| reference | [pd](`pandas`).[Series](`pandas.Series`) | Reference profile (e.g. ENTSO-E day-ahead forecast or actuals one week earlier). | _required_ | +| statistic | [str](`str`) | Central-tendency statistic, ``"median"`` (default, robust) or ``"mean"``. | `'median'` | +| tol | [float](`float`) | Relative-offset tolerance for ``LevelCheckReport.biased``. Default ``0.02`` (2 %). | `0.02` | +| min_overlap | [int](`int`) | Minimum overlap length to evaluate. Below this the report is ``skipped`` with NaN levels. | `12` | + +## Returns {.doc-section .doc-section-returns} + +| Name | Type | Description | +|--------|----------------------------------------------------------------------------------|--------------------------------------------------------------------| +| | [LevelCheckReport](`spotforecast2_safe.processing.shape_check.LevelCheckReport`) | `LevelCheckReport` with the computed levels, offsets, and ``tol``. | + +## Raises {.doc-section .doc-section-raises} + +| Name | Type | Description | +|--------|----------------------------|------------------------------------------------------------------------------------------| +| | [TypeError](`TypeError`) | When ``y`` or ``reference`` is not a ``pd.Series``. | +| | [ValueError](`ValueError`) | When ``y`` or ``reference`` is empty, or ``statistic`` is not ``"median"`` / ``"mean"``. 
| + +## Examples {.doc-section .doc-section-examples} + +```{python} +import pandas as pd +from spotforecast2_safe.processing.shape_check import check_forecast_level + +idx = pd.date_range("2026-06-13 00:00", periods=24, freq="h", tz="UTC") +actual = pd.Series([43_000.0 + 3_000.0 * (i % 12) / 12 for i in range(24)], + index=idx) + +# Forecast that sits a flat 1_800 MW too high -> biased. +high = actual + 1_800.0 +rep = check_forecast_level(high, actual, tol=0.02) +print(f"offset={rep.offset:.0f} MW rel={rep.rel_offset:.3f} biased={rep.biased}") +assert rep.biased and rep.offset > 0 + +# A well-centred forecast -> not biased. +ok = actual + 50.0 +print("small offset biased:", check_forecast_level(ok, actual).biased) +``` \ No newline at end of file diff --git a/src/spotforecast2_safe/processing/__init__.py b/src/spotforecast2_safe/processing/__init__.py index c5ccddcd0..5fe51a4e1 100644 --- a/src/spotforecast2_safe/processing/__init__.py +++ b/src/spotforecast2_safe/processing/__init__.py @@ -4,14 +4,28 @@ """Processing module for end-to-end forecasting pipelines.""" from .agg_predict import agg_predict +from .blend import blend_with_prior +from .forecast_scoring import SUPPORTED_METRICS, score_forecasts from .n2n_predict import n2n_predict from .n2n_predict_with_covariates import n2n_predict_with_covariates -from .shape_check import ShapeCheckReport, check_forecast_shape +from .shape_check import ( + LevelCheckReport, + ShapeCheckReport, + apply_level_correction, + check_forecast_level, + check_forecast_shape, +) __all__ = [ "agg_predict", + "blend_with_prior", + "score_forecasts", + "SUPPORTED_METRICS", "n2n_predict", "n2n_predict_with_covariates", "ShapeCheckReport", "check_forecast_shape", + "LevelCheckReport", + "check_forecast_level", + "apply_level_correction", ] diff --git a/src/spotforecast2_safe/processing/blend.py b/src/spotforecast2_safe/processing/blend.py new file mode 100644 index 000000000..d133f35f6 --- /dev/null +++ 
b/src/spotforecast2_safe/processing/blend.py @@ -0,0 +1,87 @@ +# SPDX-FileCopyrightText: 2026 bartzbeielstein +# SPDX-License-Identifier: AGPL-3.0-or-later + +"""Post-hoc blend of a model forecast with an external prior — pure computation. + +Motivation (2026-06-13 team_4 post-mortem): the ``--entsoe`` variant fed the +ENTSO-E day-ahead *Forecasted Load* in as a near-oracle model **feature** and +did *worse* on a low-load Saturday because that prior was itself biased high. +The obvious "down-weight the prior" idea — scaling the feature column — is a +**no-op** for gradient-boosted trees: tree splits are invariant to any +monotonic rescaling of a single feature. The sound way to down-weight a prior +is therefore a *post-hoc convex blend* of the trained model's forecast with the +prior, which this module provides. The operator keeps the prior out of the +model (or in it) and tunes its influence at the output stage. +""" + +from __future__ import annotations + +import pandas as pd + + +def blend_with_prior( + model_forecast: pd.Series, + prior: pd.Series, + *, + weight: float, +) -> pd.Series: + """Convex-blend a model forecast with an external prior. + + Returns ``(1 - weight) * model_forecast + weight * prior`` on the index + intersection of the two series. ``weight`` is the trust placed in the + prior: ``0.0`` returns the model forecast unchanged (prior ignored), + ``1.0`` returns the prior, and intermediate values interpolate. This is the + correct lever for down-weighting a near-oracle prior whose influence a + tree model cannot be tuned through feature scaling. + + The function is **pure**: it does not mutate its inputs and emits no + warnings. The result carries ``model_forecast``'s name. + + Args: + model_forecast: The trained model's forecast. + prior: The external prior to blend in (e.g. the ENTSO-E day-ahead + forecast), aligned by index. + weight: Blend weight in ``[0.0, 1.0]`` — the trust placed in ``prior``. 
+ + Returns: + A new ``pd.Series`` over the index intersection, named like + ``model_forecast``. + + Raises: + TypeError: When ``model_forecast`` or ``prior`` is not a ``pd.Series``. + ValueError: When ``weight`` is outside ``[0.0, 1.0]`` or the two series + share no index positions. + + Examples: + ```{python} + import pandas as pd + from spotforecast2_safe.processing.blend import blend_with_prior + + idx = pd.date_range("2026-06-13 00:00", periods=4, freq="h", tz="UTC") + model = pd.Series([100.0, 110.0, 120.0, 130.0], index=idx, name="y0") + prior = pd.Series([140.0, 140.0, 140.0, 140.0], index=idx) + + # weight=0 -> model unchanged; weight=1 -> prior; 0.25 -> 75/25 mix. + print(blend_with_prior(model, prior, weight=0.0).tolist()) + print(blend_with_prior(model, prior, weight=1.0).tolist()) + print(blend_with_prior(model, prior, weight=0.25).tolist()) + assert blend_with_prior(model, prior, weight=0.0).equals(model) + ``` + """ + if not isinstance(model_forecast, pd.Series): + raise TypeError( + f"model_forecast must be a pd.Series, got " + f"{type(model_forecast).__name__!r}." 
+ ) + if not isinstance(prior, pd.Series): + raise TypeError(f"prior must be a pd.Series, got {type(prior).__name__!r}.") + if not 0.0 <= weight <= 1.0: + raise ValueError(f"weight must be in [0.0, 1.0], got {weight}.") + + common = model_forecast.index.intersection(prior.index) + if len(common) == 0: + raise ValueError("model_forecast and prior share no index positions.") + + blended = (1.0 - weight) * model_forecast.loc[common] + weight * prior.loc[common] + blended.name = model_forecast.name + return blended diff --git a/src/spotforecast2_safe/processing/forecast_scoring.py b/src/spotforecast2_safe/processing/forecast_scoring.py new file mode 100644 index 000000000..f4fc92d9b --- /dev/null +++ b/src/spotforecast2_safe/processing/forecast_scoring.py @@ -0,0 +1,132 @@ +# SPDX-FileCopyrightText: 2026 bartzbeielstein +# SPDX-License-Identifier: AGPL-3.0-or-later + +"""Score and compare several forecasts against a shared actual — pure computation. + +Motivation (2026-06-13 team_4 post-mortem): the four-zone bottom-up sum lost to +the single aggregate ("combined") model. Deciding whether bottom-up aggregation +helps or merely amplifies bias is an apples-to-apples comparison question: run a +backtest for each modelling approach (with +`spotforecast2_safe.backtesting.validation.backtesting_forecaster`), then score +every approach's forecast against the same actual on the same metrics. + +`score_forecasts` is that second step — a pure, source-agnostic comparison +primitive. It takes the per-approach forecast series (e.g. the 4-zone bottom-up +sum and the combined model's prediction) plus the actual, and returns a tidy +"approach x metric" table sorted by the leading metric, so the better setup is +read off directly. +""" + +from __future__ import annotations + +from collections.abc import Mapping + +import numpy as np +import pandas as pd + +#: Metrics ``score_forecasts`` can compute, in canonical order. +SUPPORTED_METRICS: tuple[str, ...] 
SUPPORTED_METRICS: tuple[str, ...] = ("mae", "rmse", "bias", "mape")


def _compute_metric(name: str, error: pd.Series, actual: pd.Series) -> float:
    """Return one error metric over an aligned ``error``/``actual`` pair.

    Args:
        name: One of ``SUPPORTED_METRICS``.
        error: ``forecast - actual`` on the shared index.
        actual: Ground truth on the same index (only read for ``"mape"``).

    Returns:
        The metric as a ``float``; ``NaN`` when ``error`` is empty.

    Raises:
        ValueError: When ``name`` is not a supported metric.
    """
    if len(error) == 0:
        return float("nan")
    if name == "mae":
        return float(error.abs().mean())
    if name == "rmse":
        return float(np.sqrt((error**2).mean()))
    if name == "bias":
        return float(error.mean())
    if name == "mape":
        # Zero actuals would divide by zero; mask them so they drop out of
        # the mean instead of producing inf.
        denom = actual.abs().replace(0.0, np.nan)
        return float((error.abs() / denom).mean() * 100.0)
    raise ValueError(  # pragma: no cover - guarded by caller
        f"unknown metric {name!r}; supported: {SUPPORTED_METRICS}."
    )


def score_forecasts(
    forecasts: Mapping[str, pd.Series],
    actual: pd.Series,
    *,
    metrics: tuple[str, ...] = SUPPORTED_METRICS,
) -> pd.DataFrame:
    """Score several forecasts against a shared actual and rank them.

    Each forecast is aligned to ``actual`` on the index intersection and scored
    on the requested ``metrics``. The result is a tidy table indexed by
    approach name, with one column per metric plus an ``n`` column (overlap
    length), ordered so the best approach is the top row.

    Ranking uses the *magnitude* of ``metrics[0]``. For the non-negative
    metrics (``"mae"``, ``"rmse"``, ``"mape"``) this is a plain ascending sort.
    For the signed ``"bias"`` it ranks the forecast closest to zero first, so a
    large under-prediction is not mistaken for the best approach. Reported
    values keep their sign; only the row order is affected. Rows whose ranking
    metric is ``NaN`` (e.g. no overlap with ``actual``) are placed last, and
    ties keep the insertion order of ``forecasts``.

    This is **pure**: no logging, no plotting, no mutation. Use it to compare,
    for example, a four-zone bottom-up sum against a single combined model
    (compute each approach's forecast first, e.g. via ``backtesting_forecaster``).

    Args:
        forecasts: Mapping of approach name to its forecast series.
        actual: The ground-truth series every forecast is scored against.
        metrics: Subset of `SUPPORTED_METRICS` to compute, in output order and
            without repeats. ``"mae"``, ``"rmse"``, and ``"bias"`` are in the
            units of the series; ``"mape"`` is a percentage. The ranking uses
            ``metrics[0]``.

    Returns:
        A ``pd.DataFrame`` indexed by approach name with columns
        ``[*metrics, "n"]``, sorted ascending by ``abs(metrics[0])``.

    Raises:
        TypeError: When ``actual`` is not a ``pd.Series`` or a forecast value
            is not a ``pd.Series``.
        ValueError: When ``actual`` is empty, ``forecasts`` is empty, or
            ``metrics`` is empty, contains an unsupported name, or repeats a
            name.

    Examples:
        ```{python}
        import pandas as pd
        from spotforecast2_safe.processing.forecast_scoring import score_forecasts

        idx = pd.date_range("2026-06-13 00:00", periods=24, freq="h", tz="UTC")
        actual = pd.Series([43_858.0] * 24, index=idx)

        forecasts = {
            "combined": actual + 300.0,         # small flat offset
            "four_zone_sum": actual + 1_780.0,  # flat over-prediction
        }
        table = score_forecasts(forecasts, actual, metrics=("mae", "bias"))
        print(table.round(2).to_string())
        # combined ranks first (lower MAE).
        assert table.index[0] == "combined"
        ```
    """
    if not isinstance(actual, pd.Series):
        raise TypeError(f"actual must be a pd.Series, got {type(actual).__name__!r}.")
    if actual.empty:
        raise ValueError("actual is empty.")
    if not forecasts:
        raise ValueError("forecasts is empty; nothing to score.")
    if not metrics:
        raise ValueError("metrics is empty; request at least one metric.")
    unknown = [m for m in metrics if m not in SUPPORTED_METRICS]
    if unknown:
        raise ValueError(
            f"unsupported metric(s) {unknown}; supported: {SUPPORTED_METRICS}."
        )
    if len(set(metrics)) != len(metrics):
        # Repeated names would yield duplicate columns and an opaque pandas
        # "label is not unique" error when sorting; fail early and clearly.
        raise ValueError(f"metrics contains duplicate names: {list(metrics)}.")

    rows: dict[str, dict[str, float]] = {}
    for name, forecast in forecasts.items():
        if not isinstance(forecast, pd.Series):
            raise TypeError(
                f"forecast {name!r} must be a pd.Series, got "
                f"{type(forecast).__name__!r}."
            )
        common = forecast.index.intersection(actual.index)
        a = actual.loc[common]
        error = forecast.loc[common] - a
        row: dict[str, float] = {m: _compute_metric(m, error, a) for m in metrics}
        row["n"] = float(len(common))
        rows[name] = row

    table = pd.DataFrame.from_dict(rows, orient="index")[list(metrics) + ["n"]]
    table["n"] = table["n"].astype(int)
    # Rank on magnitude: a no-op for the non-negative metrics, and the only
    # meaningful notion of "best" for the signed bias.
    return table.sort_values(
        by=metrics[0], key=lambda col: col.abs(), kind="stable"
    )
@dataclass(frozen=True)
class LevelCheckReport:
    """Immutable result of a forecast *level* (systematic-bias) check.

    Where `ShapeCheckReport` answers "does the forecast track the daily
    *profile*", this answers "does the forecast sit at the right *level*". It
    captures a near-constant offset of the whole forecast against a reference —
    the failure mode behind the 2026-06-13 team_4 miss, where the forecast
    over-predicted every hour by a flat ~1.8 GW (``bias == MAE``).

    Attributes:
        n_overlap: Number of aligned (overlapping) index positions used. When
            below the evaluable minimum, `check_forecast_level` stores ``0``
            here, ``skipped`` is ``True`` and the numeric fields are
            ``float('nan')``.
        statistic: Central-tendency statistic used for both levels — either
            ``"median"`` (robust, the default) or ``"mean"``.
        forecast_level: ``statistic`` of the forecast over the overlap.
        reference_level: ``statistic`` of the reference over the overlap.
        offset: ``forecast_level - reference_level`` (signed; positive means
            the forecast sits high, i.e. systematic over-prediction).
        rel_offset: ``offset / abs(reference_level)`` (signed). ``float('nan')``
            when the reference level is zero.
        tol: Relative-offset tolerance for ``biased`` (passed through from
            `check_forecast_level`).

    Examples:
        ```{python}
        from spotforecast2_safe.processing.shape_check import LevelCheckReport

        # Forecast sits 4 % high vs the reference -> biased at tol=0.02.
        r = LevelCheckReport(
            n_overlap=24, statistic="median",
            forecast_level=45_600.0, reference_level=43_858.0,
            offset=1_742.0, rel_offset=0.0397, tol=0.02,
        )
        print("biased:", r.biased, "rel_offset:", round(r.rel_offset, 4))
        assert r.biased and not r.skipped
        ```
    """

    n_overlap: int
    statistic: str
    forecast_level: float
    reference_level: float
    offset: float
    rel_offset: float
    tol: float

    @property
    def biased(self) -> bool:
        """Return ``True`` when ``abs(rel_offset)`` strictly exceeds ``tol``.

        A ``NaN`` relative offset (zero reference level or skipped check) is
        treated as *not* biased (returns ``False``).
        """
        if math.isnan(self.rel_offset):
            return False
        return abs(self.rel_offset) > self.tol

    @property
    def skipped(self) -> bool:
        """Return ``True`` when the overlap was too small to evaluate.

        By construction `check_forecast_level` stores ``n_overlap=0`` and NaN
        levels when skipping.
        """
        return (
            self.n_overlap == 0
            and math.isnan(self.offset)
            and math.isnan(self.rel_offset)
        )


def _resolve_level_statistic(statistic: str):
    """Return a ``Series -> float`` reducer for ``statistic``.

    Raises:
        ValueError: When ``statistic`` is neither ``"median"`` nor ``"mean"``.
    """
    if statistic == "median":
        return lambda s: float(s.median())
    if statistic == "mean":
        return lambda s: float(s.mean())
    raise ValueError(f"statistic must be 'median' or 'mean', got {statistic!r}.")


def _validated_level_overlap(y: pd.Series, reference: pd.Series) -> pd.Index:
    """Validate both inputs and return the index labels they share.

    Shared by `check_forecast_level` and `apply_level_correction` so both
    reject bad inputs with identical messages.

    Raises:
        TypeError: When ``y`` or ``reference`` is not a ``pd.Series``.
        ValueError: When ``y`` or ``reference`` is empty.
    """
    if not isinstance(y, pd.Series):
        raise TypeError(f"y must be a pd.Series, got {type(y).__name__!r}.")
    if not isinstance(reference, pd.Series):
        raise TypeError(
            f"reference must be a pd.Series, got {type(reference).__name__!r}."
        )
    if y.empty:
        raise ValueError("y is empty.")
    if reference.empty:
        raise ValueError("reference is empty.")
    return y.index.intersection(reference.index)


def check_forecast_level(
    y: pd.Series,
    reference: pd.Series,
    *,
    statistic: str = "median",
    tol: float = 0.02,
    min_overlap: int = 12,
) -> LevelCheckReport:
    """Measure the systematic level offset between a forecast and its reference.

    Complements `check_forecast_shape`: a forecast can track the daily profile
    perfectly (high correlation, good range ratio) yet sit at the wrong level —
    a flat over- or under-prediction. This returns the signed offset of the
    forecast's central level against the reference's, in absolute and relative
    terms, and flags it as ``biased`` when the relative offset exceeds ``tol``.

    Like `check_forecast_shape`, this function is **pure**: no logging, no
    warning, no raising on a biased result. Only invalid inputs raise.

    Args:
        y: Forecast series (e.g. the 24-h submission).
        reference: Reference profile (e.g. ENTSO-E day-ahead forecast or
            actuals one week earlier).
        statistic: Central-tendency statistic, ``"median"`` (default, robust)
            or ``"mean"``.
        tol: Relative-offset tolerance for ``LevelCheckReport.biased``. Default
            ``0.02`` (2 %).
        min_overlap: Minimum number of shared index labels to evaluate. Below
            this the report is ``skipped`` with ``n_overlap=0`` and NaN levels.

    Returns:
        `LevelCheckReport` with the computed levels, offsets, and ``tol``.

    Raises:
        TypeError: When ``y`` or ``reference`` is not a ``pd.Series``.
        ValueError: When ``y`` or ``reference`` is empty, or ``statistic`` is
            not ``"median"`` / ``"mean"``.

    Examples:
        ```{python}
        import pandas as pd
        from spotforecast2_safe.processing.shape_check import check_forecast_level

        idx = pd.date_range("2026-06-13 00:00", periods=24, freq="h", tz="UTC")
        actual = pd.Series([43_000.0 + 3_000.0 * (i % 12) / 12 for i in range(24)],
                           index=idx)

        # Forecast that sits a flat 1_800 MW too high -> biased.
        high = actual + 1_800.0
        rep = check_forecast_level(high, actual, tol=0.02)
        print(f"offset={rep.offset:.0f} MW  rel={rep.rel_offset:.3f}  biased={rep.biased}")
        assert rep.biased and rep.offset > 0

        # A well-centred forecast -> not biased.
        ok = actual + 50.0
        print("small offset biased:", check_forecast_level(ok, actual).biased)
        ```
    """
    # Resolve the statistic first so an invalid name is reported before any
    # other input problem.
    reducer = _resolve_level_statistic(statistic)
    common = _validated_level_overlap(y, reference)
    n_overlap = len(common)

    nan = float("nan")
    if n_overlap < min_overlap:
        # The skip sentinel is n_overlap=0 plus NaN levels; this is what
        # LevelCheckReport.skipped tests for.
        return LevelCheckReport(
            n_overlap=0,
            statistic=statistic,
            forecast_level=nan,
            reference_level=nan,
            offset=nan,
            rel_offset=nan,
            tol=tol,
        )

    forecast_level = reducer(y.loc[common])
    reference_level = reducer(reference.loc[common])
    offset = forecast_level - reference_level
    # A zero reference level has no meaningful relative offset.
    rel_offset = offset / abs(reference_level) if reference_level != 0 else nan

    return LevelCheckReport(
        n_overlap=n_overlap,
        statistic=statistic,
        forecast_level=forecast_level,
        reference_level=reference_level,
        offset=offset,
        rel_offset=rel_offset,
        tol=tol,
    )


def apply_level_correction(
    y: pd.Series,
    reference: pd.Series,
    *,
    statistic: str = "median",
    min_overlap: int = 12,
) -> pd.Series:
    """Shift a forecast so its central level matches a reference (debias).

    Estimates the constant offset ``statistic(y) - statistic(reference)`` over
    the index overlap and subtracts it from **every** value of ``y``, removing a
    systematic flat bias while preserving the daily shape. This is the
    post-hoc correction for the failure `check_forecast_level` detects.

    The returned series keeps ``y``'s full index, name, and ordering; only the
    level is shifted. The function is pure (no mutation of the inputs).

    Args:
        y: Forecast series to correct.
        reference: Reference whose level ``y`` should be aligned to.
        statistic: ``"median"`` (default) or ``"mean"`` — must match the
            estimator you would use in `check_forecast_level`.
        min_overlap: Minimum number of shared index labels required to
            estimate the offset.

    Returns:
        A new ``pd.Series`` equal to ``y - offset`` (same index/name as ``y``).

    Raises:
        TypeError: When ``y`` or ``reference`` is not a ``pd.Series``.
        ValueError: When ``y``/``reference`` is empty, ``statistic`` is invalid,
            the overlap is smaller than ``min_overlap``, or the estimated
            offset is NaN (for example, one side holds only NaN over the
            overlap) — in each case no reliable offset exists.

    Examples:
        ```{python}
        import pandas as pd
        from spotforecast2_safe.processing.shape_check import (
            apply_level_correction, check_forecast_level,
        )

        idx = pd.date_range("2026-06-13 00:00", periods=24, freq="h", tz="UTC")
        actual = pd.Series([43_000.0 + 3_000.0 * (i % 12) / 12 for i in range(24)],
                           index=idx)
        biased = actual + 1_800.0           # flat over-prediction

        corrected = apply_level_correction(biased, actual)
        print("offset before:", round(check_forecast_level(biased, actual).offset))
        print("offset after :", round(check_forecast_level(corrected, actual).offset))
        assert abs(check_forecast_level(corrected, actual).offset) < 1.0
        ```
    """
    reducer = _resolve_level_statistic(statistic)
    common = _validated_level_overlap(y, reference)
    if len(common) < min_overlap:
        raise ValueError(
            f"overlap {len(common)} < min_overlap {min_overlap}; cannot "
            f"estimate a level offset."
        )

    offset = reducer(y.loc[common]) - reducer(reference.loc[common])
    if math.isnan(offset):
        # Subtracting NaN would silently turn the whole forecast into NaN;
        # refuse instead of returning a destroyed series.
        raise ValueError(
            "level offset is NaN over the overlap; cannot estimate a level "
            "offset."
        )
    return y - offset
# SPDX-FileCopyrightText: 2026 bartzbeielstein
# SPDX-License-Identifier: AGPL-3.0-or-later

"""Tests for processing.forecast_scoring.score_forecasts (pure comparison)."""

import math

import pandas as pd
import pytest

from spotforecast2_safe.processing.forecast_scoring import (
    SUPPORTED_METRICS,
    score_forecasts,
)

# One UTC day at hourly resolution with a constant ground truth.
IDX = pd.date_range("2026-06-13 00:00", periods=24, freq="h", tz="UTC")
ACTUAL = pd.Series([43_858.0] * 24, index=IDX)


class TestScoreForecasts:
    """Ranking, metric values, overlap handling, and input validation."""

    def test_ranks_lower_mae_first(self):
        # The worse approach is inserted first, so the final order can only
        # come from the sort.
        worse = ACTUAL + 1_780.0  # flat over-prediction
        better = ACTUAL + 300.0
        scored = score_forecasts({"four_zone_sum": worse, "combined": better}, ACTUAL)
        assert list(scored.index) == ["combined", "four_zone_sum"]

    def test_metric_values_correct(self):
        scored = score_forecasts({"high": ACTUAL + 1_000.0}, ACTUAL)
        high = scored.loc["high"]
        # A constant, all-positive error makes MAE, RMSE and bias coincide.
        for metric in ("mae", "rmse", "bias"):
            assert high[metric] == pytest.approx(1_000.0)
        assert high["mape"] == pytest.approx(1_000.0 / 43_858.0 * 100.0)
        assert high["n"] == 24

    def test_metric_subset_and_order(self):
        requested = ("bias", "mae")
        scored = score_forecasts({"a": ACTUAL + 1.0}, ACTUAL, metrics=requested)
        assert list(scored.columns) == ["bias", "mae", "n"]

    def test_n_reflects_overlap(self):
        first_ten = (ACTUAL + 5.0).iloc[:10]
        scored = score_forecasts({"p": first_ten}, ACTUAL)
        assert scored.loc["p", "n"] == 10

    def test_supported_metrics_constant(self):
        assert set(SUPPORTED_METRICS) == {"mae", "rmse", "bias", "mape"}

    def test_no_overlap_yields_nan_metrics(self):
        # A forecast living entirely in another year shares no timestamps.
        far_index = pd.date_range("2027-01-01", periods=2, freq="h", tz="UTC")
        unrelated = pd.Series([1.0, 2.0], index=far_index)
        scored = score_forecasts({"x": unrelated}, ACTUAL)
        assert scored.loc["x", "n"] == 0
        assert math.isnan(scored.loc["x", "mae"])

    def test_unknown_metric_raises(self):
        with pytest.raises(ValueError, match="unsupported metric"):
            score_forecasts({"a": ACTUAL}, ACTUAL, metrics=("mae", "smape"))

    def test_empty_metrics_raises(self):
        with pytest.raises(ValueError, match="at least one metric"):
            score_forecasts({"a": ACTUAL}, ACTUAL, metrics=())

    def test_empty_forecasts_raises(self):
        with pytest.raises(ValueError, match="nothing to score"):
            score_forecasts({}, ACTUAL)

    def test_empty_actual_raises(self):
        no_actual = pd.Series(dtype=float)
        with pytest.raises(ValueError, match="empty"):
            score_forecasts({"a": ACTUAL}, no_actual)

    def test_non_series_actual_raises(self):
        with pytest.raises(TypeError, match="pd.Series"):
            score_forecasts({"a": ACTUAL}, [1, 2, 3])  # type: ignore[arg-type]

    def test_non_series_forecast_raises(self):
        with pytest.raises(TypeError, match="pd.Series"):
            score_forecasts({"a": [1, 2, 3]}, ACTUAL)  # type: ignore[arg-type]
"""Tests for the level / debias helpers in processing.shape_check.

``check_forecast_level`` measures a systematic flat offset and
``apply_level_correction`` removes it. Both are pure: no logging, no raising
on a biased result.
"""

import math

import pandas as pd
import pytest

from spotforecast2_safe.processing.shape_check import (
    LevelCheckReport,
    apply_level_correction,
    check_forecast_level,
)

IDX = pd.date_range("2026-06-13 00:00", periods=24, freq="h", tz="UTC")
# A non-flat reference so median != endpoints and the range is non-zero.
ACTUAL = pd.Series([43_000.0 + 250.0 * (i % 12) for i in range(24)], index=IDX)


def _report(n_overlap, statistic, forecast_level, reference_level, offset,
            rel_offset, tol=0.02):
    """Build a ``LevelCheckReport`` positionally to keep the cases compact."""
    return LevelCheckReport(
        n_overlap=n_overlap,
        statistic=statistic,
        forecast_level=forecast_level,
        reference_level=reference_level,
        offset=offset,
        rel_offset=rel_offset,
        tol=tol,
    )


class TestLevelCheckReport:
    """Properties of the frozen report dataclass."""

    def test_biased_when_rel_offset_exceeds_tol(self):
        report = _report(24, "median", 45_600.0, 43_858.0, 1_742.0, 0.04)
        assert report.biased and not report.skipped

    def test_not_biased_at_or_below_tol(self):
        report = _report(24, "median", 1.0, 100.0, 2.0, 0.02)
        assert not report.biased  # strictly greater-than required

    def test_nan_rel_offset_not_biased(self):
        missing = float("nan")
        report = _report(0, "median", missing, missing, missing, missing)
        assert not report.biased
        assert report.skipped

    def test_frozen(self):
        report = _report(24, "mean", 1.0, 1.0, 0.0, 0.0)
        with pytest.raises((AttributeError, TypeError)):
            report.offset = 9.0  # type: ignore[misc]


class TestCheckForecastLevel:
    """Offset measurement, skipping, and input validation."""

    def test_flat_high_forecast_is_biased(self):
        result = check_forecast_level(ACTUAL + 1_800.0, ACTUAL, tol=0.02)
        assert result.biased
        assert result.offset == pytest.approx(1_800.0, abs=1e-6)
        assert result.rel_offset > 0
        assert result.n_overlap == 24

    def test_flat_low_forecast_negative_offset(self):
        result = check_forecast_level(ACTUAL - 1_800.0, ACTUAL)
        assert result.offset < 0 and result.rel_offset < 0
        assert result.biased

    def test_well_centred_not_biased(self):
        nearly_exact = ACTUAL + 10.0
        assert not check_forecast_level(nearly_exact, ACTUAL, tol=0.02).biased

    def test_mean_statistic(self):
        result = check_forecast_level(ACTUAL + 500.0, ACTUAL, statistic="mean")
        assert result.statistic == "mean"
        assert result.offset == pytest.approx(500.0, abs=1e-6)

    def test_zero_reference_level_nan_rel(self):
        flat_zero = pd.Series(0.0, index=IDX)
        result = check_forecast_level(ACTUAL, flat_zero)
        assert math.isnan(result.rel_offset)
        assert not result.biased  # NaN rel offset -> not biased

    def test_short_overlap_skipped(self):
        five_hours = ACTUAL.iloc[:5]
        result = check_forecast_level(five_hours, ACTUAL, min_overlap=12)
        assert result.skipped
        assert math.isnan(result.offset)

    def test_invalid_statistic_raises(self):
        with pytest.raises(ValueError, match="median"):
            check_forecast_level(ACTUAL, ACTUAL, statistic="mode")

    def test_non_series_raises(self):
        with pytest.raises(TypeError, match="pd.Series"):
            check_forecast_level([1, 2, 3], ACTUAL)  # type: ignore[arg-type]

    def test_empty_raises(self):
        nothing = pd.Series(dtype=float)
        with pytest.raises(ValueError, match="empty"):
            check_forecast_level(nothing, ACTUAL)

    def test_deterministic(self):
        first = check_forecast_level(ACTUAL + 5.0, ACTUAL)
        second = check_forecast_level(ACTUAL + 5.0, ACTUAL)
        assert first == second


class TestApplyLevelCorrection:
    """Debiasing shifts the level and nothing else."""

    def test_removes_flat_offset(self):
        debiased = apply_level_correction(ACTUAL + 1_800.0, ACTUAL)
        residual = check_forecast_level(debiased, ACTUAL).offset
        assert residual == pytest.approx(0.0, abs=1e-6)

    def test_preserves_index_name_and_shape(self):
        shifted = (ACTUAL + 1_800.0).rename("y0")
        debiased = apply_level_correction(shifted, ACTUAL)
        assert debiased.index.equals(shifted.index)
        assert debiased.name == "y0"
        assert len(debiased) == len(shifted)

    def test_does_not_mutate_input(self):
        shifted = ACTUAL + 1_800.0
        snapshot = shifted.copy()
        apply_level_correction(shifted, ACTUAL)
        pd.testing.assert_series_equal(shifted, snapshot)

    def test_shape_is_preserved_only_level_shifts(self):
        shifted = ACTUAL + 1_800.0
        debiased = apply_level_correction(shifted, ACTUAL)
        # constant shift => differences between consecutive points unchanged
        pd.testing.assert_series_equal(shifted.diff(), debiased.diff())

    def test_short_overlap_raises(self):
        five_hours = ACTUAL.iloc[:5]
        with pytest.raises(ValueError, match="min_overlap"):
            apply_level_correction(five_hours, ACTUAL, min_overlap=12)

    def test_invalid_statistic_raises(self):
        with pytest.raises(ValueError, match="median"):
            apply_level_correction(ACTUAL, ACTUAL, statistic="mode")

    def test_non_series_raises(self):
        with pytest.raises(TypeError, match="pd.Series"):
            apply_level_correction(ACTUAL, [1, 2, 3])  # type: ignore[arg-type]